Chief Blingologist ([info]alexmizell) wrote,
@ 2007-10-17 22:50:00
Previous Entry  Add to memories!  Tell a Friend  Next Entry
MySecondSpider.vb
' Next gen would keep track of recursion depth, multithreading, page object model

Imports System.Data.SqlClient
Imports System.Net
Imports System.IO
Imports System.Text.RegularExpressions

Module Module1

    ' Double-quote character as a named value; used when building the delimiter set below.
    Dim Quotes As String = Chr(34)

    ' Root of the site being crawled. Only links that start with this prefix are queued,
    ' which keeps the crawl from wandering off to other hosts.
    Private Const RootUrl As String = "http://dummy.com"

    ' SQL Server database holding the Pages table (columns used here: URL, LastVisit).
    Private Const ConnectionString As String = "Data Source=BURROUGHS\SQLEXPRESS;" & _
                                               "Initial Catalog=Spider;" & _
                                               "Integrated Security=SSPI;"

    ' Matches scheme://host/... up to the next whitespace character.
    ' The pattern previously read "(?\w+)" / "(?[\w.]+\/?)", which is not a valid .NET
    ' grouping construct and made Regex.Matches throw on every page. The captures are
    ' never read, so plain non-capturing groups are used.
    Private ReadOnly UrlPattern As New Regex("(?:\w+):\/\/(?:[\w.]+\/?)\S*")

    ' Characters that cannot be part of the URL we want to store: surrounding markup
    ' (quotes, angle brackets, commas) plus the start of a query string or fragment.
    Private ReadOnly UrlDelimiters As Char() = (Quotes & "'?#><,").ToCharArray()

    ' Program entry point: crawls the site rooted at RootUrl.
    Sub Main()

        Call Spider(RootUrl)

    End Sub

    ' Crawls starting at URL, then keeps visiting pages recorded in the Pages table
    ' whose LastVisit is still NULL until none remain, and finally prints "Done!".
    '
    ' URL: address of the first page to download.
    '
    ' A failure while downloading or harvesting one page is printed to the console and
    ' the crawl moves on to the next unvisited page. Failures while picking or marking
    ' the next page are not caught and propagate to the caller.
    '
    ' The crawl is a loop over one shared connection rather than one recursive call
    ' (and one new connection) per page, so a large site cannot exhaust the stack or
    ' the connection pool.
    Sub Spider(ByVal URL As String)

        Using Connection As New SqlConnection(ConnectionString)

            Connection.Open()

            Dim CurrentURL As String = URL

            Do
                Try
                    HarvestLinks(Connection, CurrentURL)
                Catch ex As Exception
                    ' Best effort: one bad page must not stop the whole crawl.
                    Console.WriteLine(ex.Message)
                End Try

                CurrentURL = ClaimNextUnvisited(Connection)

            Loop Until String.IsNullOrEmpty(CurrentURL)

            Console.WriteLine("Done!")

        End Using

    End Sub

    ' Downloads the page at URL and inserts every not-yet-known link under RootUrl
    ' into the Pages table, echoing each newly inserted link to the console.
    Private Sub HarvestLinks(ByVal Connection As SqlConnection, ByVal URL As String)

        Dim Page As String = DownloadPage(URL)

        ' Links come from untrusted remote HTML, so they are always passed as
        ' parameters and never concatenated into the SQL text.
        Using Exists As New SqlCommand("SELECT COUNT(*) FROM Pages WHERE URL=@URL", Connection)
            Using Insert As New SqlCommand("INSERT INTO Pages (URL) VALUES (@URL)", Connection)

                Dim ExistsUrl As SqlParameter = Exists.Parameters.AddWithValue("@URL", String.Empty)
                Dim InsertUrl As SqlParameter = Insert.Parameters.AddWithValue("@URL", String.Empty)

                For Each URLMatch As Match In UrlPattern.Matches(Page)

                    Dim Link As String = TrimAtDelimiter(URLMatch.Value)

                    ' Cheap in-memory filter first; only same-site links reach the database.
                    If Link.StartsWith(RootUrl, StringComparison.Ordinal) Then

                        ExistsUrl.Value = Link

                        If CInt(Exists.ExecuteScalar()) = 0 Then

                            InsertUrl.Value = Link
                            Insert.ExecuteNonQuery()
                            Console.WriteLine(Link)

                        End If

                    End If

                Next

            End Using
        End Using

    End Sub

    ' Returns the body of the HTTP response for URL as text, releasing the response
    ' and its stream before returning.
    Private Function DownloadPage(ByVal URL As String) As String

        Dim Request As HttpWebRequest = DirectCast(WebRequest.Create(URL), HttpWebRequest)

        Using Response As HttpWebResponse = DirectCast(Request.GetResponse(), HttpWebResponse)
            Using Reader As New StreamReader(Response.GetResponseStream())

                Return Reader.ReadToEnd()

            End Using
        End Using

    End Function

    ' Cuts Candidate at the first delimiter character (see UrlDelimiters) and returns
    ' the part before it; returns Candidate unchanged when it contains none.
    Private Function TrimAtDelimiter(ByVal Candidate As String) As String

        Dim Cut As Integer = Candidate.IndexOfAny(UrlDelimiters)

        If Cut > 0 Then Return Candidate.Remove(Cut)

        Return Candidate

    End Function

    ' Picks one page whose LastVisit is NULL, stamps it with the current local time and
    ' returns its URL. Returns Nothing when no unvisited page is left.
    Private Function ClaimNextUnvisited(ByVal Connection As SqlConnection) As String

        Dim NextURL As String

        Using Pick As New SqlCommand("SELECT TOP 1 URL FROM Pages WHERE LastVisit IS NULL", Connection)

            ' TryCast maps both "no row" (Nothing) and a NULL URL (DBNull) to Nothing.
            NextURL = TryCast(Pick.ExecuteScalar(), String)

        End Using

        If String.IsNullOrEmpty(NextURL) Then Return Nothing

        Using Mark As New SqlCommand("UPDATE Pages SET LastVisit=@Visited WHERE URL=@URL", Connection)

            ' Sent as a typed DateTime so the stored value does not depend on the
            ' machine's culture-specific date format.
            ' NOTE(review): assumes LastVisit is a date/time column - confirm against the schema.
            Mark.Parameters.AddWithValue("@Visited", DateTime.Now)
            Mark.Parameters.AddWithValue("@URL", NextURL)
            Mark.ExecuteNonQuery()

        End Using

        Return NextURL

    End Function

End Module


Create an Account
Forgot your login or password?
Login w/ OpenID
English • Español • Deutsch • Русский…