| Chief Blingologist @ 2007-10-17 22:50:00 |
MySecondSpider.vb
' Next gen would keep track of recursion depth, multithreading, page object model
Imports System.Data.SqlClient
Imports System.Net
Imports System.IO
Imports System.Text.RegularExpressions
Module Module1
    ' Simple single-site web spider.
    ' Spider downloads a page, stores every not-yet-known link that belongs to
    ' the site in the Pages table, then repeatedly takes the next row whose
    ' LastVisit is NULL, stamps it, and crawls it, until no such row is left.

    ' Double-quote character; one of the delimiters cut off the end of a matched URL.
    Dim Quotes As String = Chr(34)

    ' Connection string for the database that holds the Pages table.
    Private Const ConnectionString As String = "Data Source=BURROUGHS\SQLEXPRESS;" & _
                                               "Initial Catalog=Spider;" & _
                                               "Integrated Security=SSPI;"

    ' Only links that start with this prefix are stored (and therefore crawled).
    Private Const SiteRoot As String = "http://dummy.com"

    ' scheme://host/rest-of-url. The group names are required syntax: a bare
    ' "(?\w+)" is an invalid grouping construct and makes Regex.Matches throw.
    Private Const URLPattern As String = "(?<Protocol>\w+):\/\/(?<Domain>[\w.]+\/?)\S*"

    ' Entry point: starts the crawl at the site root.
    Sub Main()
        Dim URL As String = SiteRoot
        Call Spider(URL)
    End Sub

    ' Crawls URL, then keeps crawling unvisited pages from the Pages table
    ' until none remain, and finally prints "Done!".
    '
    ' URL: address of the first page to download.
    '
    ' Errors raised while downloading or harvesting one page are printed and
    ' the crawl moves on; errors raised while selecting or stamping the next
    ' page propagate to the caller.
    Sub Spider(ByVal URL As String)
        ' One connection for the whole crawl, closed even when an error escapes.
        Using Connection As New SqlConnection(ConnectionString)
            Connection.Open()

            ' A loop instead of self-recursion: stack depth no longer grows
            ' with the number of pages crawled.
            Dim CurrentURL As String = URL
            Do
                Try
                    HarvestLinks(Connection, CurrentURL)
                Catch ex As Exception
                    ' Best effort: a page that fails must not stop the crawl.
                    Console.WriteLine(ex.Message)
                End Try
                CurrentURL = ClaimNextURL(Connection)
            Loop Until String.IsNullOrEmpty(CurrentURL)
        End Using

        Console.WriteLine("Done!")
    End Sub

    ' Downloads URL and returns the response body as text, releasing the
    ' response and its stream before returning.
    Private Function FetchPage(ByVal URL As String) As String
        Dim Request As HttpWebRequest = CType(WebRequest.Create(URL), HttpWebRequest)
        Using Response As HttpWebResponse = CType(Request.GetResponse(), HttpWebResponse)
            Using Reader As New StreamReader(Response.GetResponseStream())
                Return Reader.ReadToEnd()
            End Using
        End Using
    End Function

    ' Cuts Candidate at the last occurrence of each delimiter in turn, because
    ' the \S* tail of the URL pattern also swallows quotes, tag brackets, query
    ' strings and fragments. A delimiter at index 0 is left alone.
    Private Function TrimTrailingMarkup(ByVal Candidate As String) As String
        For Each Delimiter As String In New String() {Quotes, "'", "?", "#", ">", "<", ","}
            Dim Cut As Integer = Candidate.LastIndexOf(Delimiter, StringComparison.Ordinal)
            If Cut > 0 Then Candidate = Candidate.Remove(Cut)
        Next
        Return Candidate
    End Function

    ' Downloads URL and inserts every in-site link found on it that is not
    ' already in the Pages table, echoing each newly stored link.
    Private Sub HarvestLinks(ByVal Connection As SqlConnection, ByVal URL As String)
        Dim Page As String = FetchPage(URL)
        For Each URLMatch As Match In Regex.Matches(Page, URLPattern)
            Dim Link As String = TrimTrailingMarkup(URLMatch.Value)

            ' Check the prefix first so off-site links cost no database round trip.
            If Not Link.StartsWith(SiteRoot, StringComparison.Ordinal) Then Continue For
            If IsKnown(Connection, Link) Then Continue For

            ' Link is scraped, untrusted text: always pass it as a parameter.
            Using Insert As New SqlCommand("INSERT INTO Pages (URL) VALUES (@URL)", Connection)
                Insert.Parameters.AddWithValue("@URL", Link)
                Insert.ExecuteNonQuery()
            End Using
            Console.WriteLine(Link)
        Next
    End Sub

    ' Returns True when Link already has a row in the Pages table.
    Private Function IsKnown(ByVal Connection As SqlConnection, ByVal Link As String) As Boolean
        Using Lookup As New SqlCommand("SELECT COUNT(*) FROM Pages WHERE URL=@URL", Connection)
            Lookup.Parameters.AddWithValue("@URL", Link)
            Return CInt(Lookup.ExecuteScalar()) > 0
        End Using
    End Function

    ' Picks one page whose LastVisit is NULL, stamps it with the current time
    ' and returns its URL. Returns Nothing when no unvisited page is left.
    Private Function ClaimNextURL(ByVal Connection As SqlConnection) As String
        Dim NextURL As String
        Using Pick As New SqlCommand("SELECT TOP 1 URL FROM Pages WHERE LastVisit IS NULL", Connection)
            ' TryCast turns both "no row" (Nothing) and a NULL URL (DBNull) into Nothing.
            NextURL = TryCast(Pick.ExecuteScalar(), String)
        End Using
        If String.IsNullOrEmpty(NextURL) Then Return Nothing

        Using Stamp As New SqlCommand("UPDATE Pages SET LastVisit=@LastVisit WHERE URL=@URL", Connection)
            ' A typed date parameter avoids culture-dependent date strings.
            Stamp.Parameters.AddWithValue("@LastVisit", Now())
            Stamp.Parameters.AddWithValue("@URL", NextURL)
            Stamp.ExecuteNonQuery()
        End Using
        Return NextURL
    End Function
End Module
' Next gen would keep track of recursion depth, multithreading, page object model
Imports System.Data.SqlClient
Imports System.Net
Imports System.IO
Imports System.Text.RegularExpressions
Module Module1
    ' Simple single-site web spider.
    ' Spider downloads a page, stores every not-yet-known link that belongs to
    ' the site in the Pages table, then repeatedly takes the next row whose
    ' LastVisit is NULL, stamps it, and crawls it, until no such row is left.

    ' Double-quote character; one of the delimiters cut off the end of a matched URL.
    Dim Quotes As String = Chr(34)

    ' Connection string for the database that holds the Pages table.
    Private Const ConnectionString As String = "Data Source=BURROUGHS\SQLEXPRESS;" & _
                                               "Initial Catalog=Spider;" & _
                                               "Integrated Security=SSPI;"

    ' Only links that start with this prefix are stored (and therefore crawled).
    Private Const SiteRoot As String = "http://dummy.com"

    ' scheme://host/rest-of-url. The group names are required syntax: a bare
    ' "(?\w+)" is an invalid grouping construct and makes Regex.Matches throw.
    Private Const URLPattern As String = "(?<Protocol>\w+):\/\/(?<Domain>[\w.]+\/?)\S*"

    ' Entry point: starts the crawl at the site root.
    Sub Main()
        Dim URL As String = SiteRoot
        Call Spider(URL)
    End Sub

    ' Crawls URL, then keeps crawling unvisited pages from the Pages table
    ' until none remain, and finally prints "Done!".
    '
    ' URL: address of the first page to download.
    '
    ' Errors raised while downloading or harvesting one page are printed and
    ' the crawl moves on; errors raised while selecting or stamping the next
    ' page propagate to the caller.
    Sub Spider(ByVal URL As String)
        ' One connection for the whole crawl, closed even when an error escapes.
        Using Connection As New SqlConnection(ConnectionString)
            Connection.Open()

            ' A loop instead of self-recursion: stack depth no longer grows
            ' with the number of pages crawled.
            Dim CurrentURL As String = URL
            Do
                Try
                    HarvestLinks(Connection, CurrentURL)
                Catch ex As Exception
                    ' Best effort: a page that fails must not stop the crawl.
                    Console.WriteLine(ex.Message)
                End Try
                CurrentURL = ClaimNextURL(Connection)
            Loop Until String.IsNullOrEmpty(CurrentURL)
        End Using

        Console.WriteLine("Done!")
    End Sub

    ' Downloads URL and returns the response body as text, releasing the
    ' response and its stream before returning.
    Private Function FetchPage(ByVal URL As String) As String
        Dim Request As HttpWebRequest = CType(WebRequest.Create(URL), HttpWebRequest)
        Using Response As HttpWebResponse = CType(Request.GetResponse(), HttpWebResponse)
            Using Reader As New StreamReader(Response.GetResponseStream())
                Return Reader.ReadToEnd()
            End Using
        End Using
    End Function

    ' Cuts Candidate at the last occurrence of each delimiter in turn, because
    ' the \S* tail of the URL pattern also swallows quotes, tag brackets, query
    ' strings and fragments. A delimiter at index 0 is left alone.
    Private Function TrimTrailingMarkup(ByVal Candidate As String) As String
        For Each Delimiter As String In New String() {Quotes, "'", "?", "#", ">", "<", ","}
            Dim Cut As Integer = Candidate.LastIndexOf(Delimiter, StringComparison.Ordinal)
            If Cut > 0 Then Candidate = Candidate.Remove(Cut)
        Next
        Return Candidate
    End Function

    ' Downloads URL and inserts every in-site link found on it that is not
    ' already in the Pages table, echoing each newly stored link.
    Private Sub HarvestLinks(ByVal Connection As SqlConnection, ByVal URL As String)
        Dim Page As String = FetchPage(URL)
        For Each URLMatch As Match In Regex.Matches(Page, URLPattern)
            Dim Link As String = TrimTrailingMarkup(URLMatch.Value)

            ' Check the prefix first so off-site links cost no database round trip.
            If Not Link.StartsWith(SiteRoot, StringComparison.Ordinal) Then Continue For
            If IsKnown(Connection, Link) Then Continue For

            ' Link is scraped, untrusted text: always pass it as a parameter.
            Using Insert As New SqlCommand("INSERT INTO Pages (URL) VALUES (@URL)", Connection)
                Insert.Parameters.AddWithValue("@URL", Link)
                Insert.ExecuteNonQuery()
            End Using
            Console.WriteLine(Link)
        Next
    End Sub

    ' Returns True when Link already has a row in the Pages table.
    Private Function IsKnown(ByVal Connection As SqlConnection, ByVal Link As String) As Boolean
        Using Lookup As New SqlCommand("SELECT COUNT(*) FROM Pages WHERE URL=@URL", Connection)
            Lookup.Parameters.AddWithValue("@URL", Link)
            Return CInt(Lookup.ExecuteScalar()) > 0
        End Using
    End Function

    ' Picks one page whose LastVisit is NULL, stamps it with the current time
    ' and returns its URL. Returns Nothing when no unvisited page is left.
    Private Function ClaimNextURL(ByVal Connection As SqlConnection) As String
        Dim NextURL As String
        Using Pick As New SqlCommand("SELECT TOP 1 URL FROM Pages WHERE LastVisit IS NULL", Connection)
            ' TryCast turns both "no row" (Nothing) and a NULL URL (DBNull) into Nothing.
            NextURL = TryCast(Pick.ExecuteScalar(), String)
        End Using
        If String.IsNullOrEmpty(NextURL) Then Return Nothing

        Using Stamp As New SqlCommand("UPDATE Pages SET LastVisit=@LastVisit WHERE URL=@URL", Connection)
            ' A typed date parameter avoids culture-dependent date strings.
            Stamp.Parameters.AddWithValue("@LastVisit", Now())
            Stamp.Parameters.AddWithValue("@URL", NextURL)
            Stamp.ExecuteNonQuery()
        End Using
        Return NextURL
    End Function
End Module