Jump to content
Sign in to follow this  
kenzu

Help updating a web crawler

Recommended Posts

kenzu

<p>Ok, so i found this script and i have some questions

; ----------------------------------------------------------------------------
;
; AutoIt Version: 3.1.1.87
; Author:        AcidicChip <acidicchip@acidicchip.com>
;
; Script Name:  Web Media Spider
; Script Version: 0.21
;
; Script Function:
;   Spider the web and gather media file URLs
;
; ----------------------------------------------------------------------------

Opt("GUIOnEventMode", 1)
Opt("TrayIconDebug", 1)

#include <Array.au3>
#include <GUIConstants.au3>

Dim $collected[1]
Dim $urls[1]
Dim $urlon = 0
Dim $urlnum = 0
Dim $imagenum = 0
Dim $audionum = 0
Dim $videonum = 0

#region "GUI"
GUICreate("Media Spider", 600, 100)
$lblAction = GUICtrlCreateLabel("Action:", 0, 3, 35, 20)
$txtAction = GUICtrlCreateInput("", 40, 0, 560, 20)
GUICtrlSetState($txtAction, $GUI_DISABLE)
$lblURL = GUICtrlCreateLabel("URL:", 0, 23, 35, 20)
$txtURL = GUICtrlCreateInput("", 40, 20, 560, 20)
GUICtrlSetState($txtURL, $GUI_DISABLE)
$prgPercent = GUICtrlCreateProgress(0, 40, 560, 20)
$txtPercent = GUICtrlCreateInput("0%", 560, 40, 40, 20)
GUICtrlSetState($txtPercent, $GUI_DISABLE)
$lblURLs = GUICtrlCreateLabel("URLs:", 0, 63, 35, 20)
$txtURLs = GUICtrlCreateInput("0", 40, 60, 75, 20)
GUICtrlSetState($txtURLs, $GUI_DISABLE)
$lblAudio = GUICtrlCreateLabel("Audio:", 125, 63, 35, 20)
$txtAudio = GUICtrlCreateInput("0", 160, 60, 75, 20)
GUICtrlSetState($txtAudio, $GUI_DISABLE)
$lblImages = GUICtrlCreateLabel("Images:", 245, 63, 36, 20)
$txtImages = GUICtrlCreateInput("0", 285, 60, 75, 20)
GUICtrlSetState($txtImages, $GUI_DISABLE)
$lblVideos = GUICtrlCreateLabel("Videos:", 370, 63, 35, 20)
$txtVideos = GUICtrlCreateInput("0", 410, 60, 75, 20)
GUICtrlSetState($txtVideos, $GUI_DISABLE)
$lblHistory = GUICtrlCreateLabel("History:", 490, 63, 35, 20)
$txtHistory = GUICtrlCreateInput("0", 530, 60, 75, 20)
GUICtrlSetState($txtHistory, $GUI_DISABLE)
$lblStartURL = GUICtrlCreateLabel("Start URL:", 0, 83, 50, 20)
$txtStartURL = GUICtrlCreateInput("http://www.myspace.com/acidicchip", 55, 80, 490, 20)
$btnStartStop = GUICtrlCreateButton("Start", 550, 80, 50, 20)
GUISetState(@SW_SHOW)

GUISetOnEvent($GUI_EVENT_CLOSE, "GUIClose")
GUICtrlSetOnEvent($btnStartStop, "GUIStartStop")
#endregion "GUI"

Func GUIClose()
    Exit
EndFunc ;==>GUIClose

Func GUIStartStop()
    If GUICtrlRead($btnStartStop) == "Start" Then
        GUICtrlSetData($btnStartStop, "Stop")
        GUICtrlSetState($txtStartURL, $GUI_DISABLE)
        FileDelete("spider.urls.txt")
        GetURLs(GUICtrlRead($txtStartURL))
        Do
        ;$url = $urls[1]
            $urlon = $urlon + 1
            $url = FileReadLine("spider.urls.txt", $urlon)
        ;_ArrayDelete($urls, 1)
            $urlnum = $urlnum - 1
            GetURLs($url)
        Until $urlnum <= 0 Or GUICtrlRead($btnStartStop) == "Start"
    ;Until UBound($urls) <= 1 Or GUICtrlRead($btnStartStop) == "Start"
    Else
        GUICtrlSetData($btnStartStop, "Start")
        GUICtrlSetState($txtStartURL, $GUI_ENABLE)
    EndIf
EndFunc ;==>GUIStartStop

While 1
    Sleep(250)
Wend

Func Status($action, $url, $percent)
    GUICtrlSetData($txtAction, $action)
    If $url <> "" Then GUICtrlSetData($txtURL, $url)
    GUICtrlSetData($prgPercent, $percent)
    GUICtrlSetData($txtPercent, $percent & "%")
    
    GUICtrlSetData($txtURLs, $urlnum)
;GUICtrlSetData($txtURLs, UBound($urls))
    GUICtrlSetData($txtAudio, $audionum)
    GUICtrlSetData($txtImages, $imagenum)
    GUICtrlSetData($txtVideos, $videonum)
    GUICtrlSetData($txtHistory, UBound($collected))
EndFunc ;==>Status

Func _ArrayParse($str, $before, $after)
    Return StringRegExp($str, "(?i)" & $before & "(.*?)" & $after, 3)
EndFunc ;==>_ArrayParse

Func AddURL($url)
    If Not WasCollected($url) Then
        _ArrayAdd($collected, $url)
    ;_ArrayAdd($urls, $url)
        FileWriteLine("spider.urls.txt", $url)
        $urlnum = $urlnum + 1
    EndIf
EndFunc ;==>AddURL

Func WasCollected($url)
    $return = False
    For $i = 1 To Ubound($collected) - 1 Step 1
        If $collected[$i] == $url Then
            $return = True
            ExitLoop
        EndIf
    Next
    If Not $return And UBound($collected) >= 1024 Then _ArrayDelete($collected, 1)
    Return $return
EndFunc ;==>WasCollected

Func GetURI($url)
    $uri = StringMid($url, 1, StringInStr($url, "://")) & "//"
    $turl = StringMid($url, StringLen($uri) + 1)
    If StringInStr($turl, "?") Then
        $temp = StringSplit($turl, "?")
        $turl = $temp[1]
        $temp = StringSplit($turl, "/")
        $uri = $uri & $temp[1] & "/"
        For $i = 2 To UBound($temp) - 1 Step 1
            If StringInStr($temp[$i], ".") Or Not StringLen($temp[$i]) Then ExitLoop
            $uri = $uri & $temp[$i] & "/"
        Next
        If Not InetGetSize(StringLeft($uri, StringLen($uri) - 1)) Then
            $uri = StringMid($url, 1, StringInStr($url, "://")) & "//"
            $temp = StringSplit($turl, "?")
            $turl = $temp[1]
            $temp = StringSplit($turl, "/")
            $uri = $uri & $temp[1] & "/"
            For $i = 2 To UBound($temp) - 2 Step 1
                If StringInStr($temp[$i], ".") Or Not StringLen($temp[$i]) Then ExitLoop
                $uri = $uri & $temp[$i] & "/"
            Next
        EndIf
    Else
        $temp = StringSplit($turl, "/")
        $uri = $uri & $temp[1] & "/"
        For $i = 2 To UBound($temp) - 1 Step 1
            If StringInStr($temp[$i], ".") Or Not StringLen($temp[$i]) Then ExitLoop
            $uri = $uri & $temp[$i] & "/"
        Next
    EndIf
    
    Return $uri
EndFunc ;==>GetURI

Func GetURLs($url)
    $uri = GetURI($url)
    
    $file = "spider.html.txt"
    Status("Downloading", $url, 0)
    $filesize = InetGetSize($url)
    $lastsize = 0
    $strikes = 0
    InetGet($url, $file, 1, 1)
    While @InetGetActive
        If $lastsize == @InetGetBytesRead Then $strikes = $strikes + 1
        If $strikes >= 30 Then ExitLoop
        $lastsize = @InetGetBytesRead
        Status("Downloading", $url, Round(($lastsize / $filesize) * 100))
        Sleep(250)
    Wend
    $html = FileRead($file, FileGetSize($file))
    FileDelete($file)
    
    Status("Parsing URLs", $url, 0)
    $tags = _ArrayParse($html, "<a", ">")
    For $i = 0 To UBound($tags) - 1 Step 1
        Status("Checking <A> Tags for URLs", $url, Round(($i / (UBound($tags) - 1)) * 100))
        CheckURL($uri, $tags[$i], $url)
    Next
    $tags = _ArrayParse($html, "<img", ">")
    For $i = 0 To UBound($tags) - 1 Step 1
        Status("Checking <IMG> Tags for URLs", $url, Round(($i / (UBound($tags) - 1)) * 100))
        CheckURL($uri, $tags[$i], $url)
    Next
    $tags = _ArrayParse($html, "<embed", ">")
    For $i = 0 To UBound($tags) - 1 Step 1
        Status("Checking <EMBED> Tags for URLs", $url, Round(($i / (UBound($tags) - 1)) * 100))
        CheckURL($uri, $tags[$i], $url)
    Next
EndFunc ;==>GetURLs

Func CheckURL($uri, $str, $ref)
    If StringInStr($str, "href=") Then
        $turl = GetAttr($str, "href=")
        If Not StringInStr(StringLeft($turl, 10), "://") Then
            If StringLeft($turl, 1) == "/" Then
                $turl = $uri & StringMid($turl, 2)
            Else
                $turl = $uri & $turl
            EndIf
        EndIf
        CheckType($turl, $ref)
    EndIf
    If StringInStr($str, "src=") Then
        $turl = GetAttr($str, "src=")
        If Not StringInStr(StringLeft($turl, 10), "://") Then
            If StringLeft($turl, 1) == "/" Then
                $turl = $uri & StringMid($turl, 2)
            Else
                $turl = $uri & $turl
            EndIf
        EndIf
        CheckType($turl, $ref)
    EndIf
EndFunc ;==>CheckURL

Func GetAttr($str, $attr)
    If StringInStr($str, $attr & '"') Then
        $temp = _ArrayParse($str, $attr & '"', '"')
        If UBound($temp) == 1 Then Return $temp[0]
    ElseIf StringInStr($str, $attr & "'") Then
        $temp = _ArrayParse($str, $attr & "'", "'")
        If UBound($temp) == 1 Then Return $temp[0]
    ElseIf StringInStr($str, $attr) Then
        $temp = StringMid($str, StringInStr($str, $attr) + StringLen($attr))
        If StringInStr($temp, " ") Then
            $temp = StringMid($temp, 1, StringInStr($temp, " ") - 1)
        EndIf
        Return $temp
    EndIf
EndFunc ;==>GetAttr

Func CheckType($url, $ref)
    If StringRight($url, 4) == ".jpg" Or _
            StringRight($url, 4) == ".gif" Or _
            StringRight($url, 4) == ".png" Or _
            StringRight($url, 4) == "bmp" Then
        
        FileWriteLine("spider.images.log", $url & @TAB & $ref)
        $imagenum = $imagenum + 1
    ElseIf StringRight($url, 4) == ".mp3" Or _
            StringRight($url, 4) == ".rbs" Then
        
        FileWriteLine("spider.audio.log", $url & @TAB & $ref)
        $audionum = $audionum + 1
        AddURL(GetURI($url))
    ElseIf StringRight($url, 4) == ".avi" Or _
            StringRight($url, 4) == ".wmv" Or _
            StringRight($url, 4) == ".mpg" Or _
            StringRight($url, 5) == ".mpeg" Then
        
        FileWriteLine("spider.video.log", $url & @TAB & $ref)
        $videonum = $videonum + 1
        AddURL(GetURI($url))
    ElseIf StringRight($url, 4) == ".exe" Or _
            StringRight($url, 4) == ".zip" Or _
            StringRight($url, 4) == ".rar" Or _
            StringRight($url, 4) == ".tar" Then
        
    ;Do Nothing
    Else
        AddURL($url)
    EndIf
EndFunc ;==>CheckType

As you see it saves the .mp3 url(URL1) after that it tabs and saves the URL of the page (URL2). Is it possible to save the Page Title istead of URL2 ?

Edited by kenzu

Share this post


Link to post
Share on other sites
kenzu

yes i looked, but i cant figure it out. i think the answer is in here

$tags = _ArrayParse($html, "<img", ">")
    For $i = 0 To UBound($tags) - 1 Step 1
        Status("Checking <IMG> Tags for URLs", $url, Round(($i / (UBound($tags) - 1)) * 100))
        CheckURL($uri, $tags[$i], $url)

by changing _arrayPhrase($html, "<img" to ($html, "<title") but i dont know how to save the titles in the txt files

Edited by kenzu

Share this post


Link to post
Share on other sites
BrewManNH

JohnOne was trying to tell you that the script won't run on the latest version of AutoIt. What version are YOU running?


If I posted any code, assume that code was written using the latest release version unless stated otherwise. Also, if it doesn't work on XP I can't help with that because I don't have access to XP, and I'm not going to.
Give a programmer the correct code and he can do his work for a day. Teach a programmer to debug and he can do his work for a lifetime - by Chirag Gude
How to ask questions the smart way!

I hereby grant any person the right to use any code I post, that I am the original author of, on the autoitscript.com forums, unless I've specifically stated otherwise in the code or the thread post. If you do use my code all I ask, as a courtesy, is to make note of where you got it from.

Back up and restore Windows user files _Array.au3 - Modified array functions that include support for 2D arrays.  -  ColorChooser - An add-on for SciTE that pops up a color dialog so you can select and paste a color code into a script.  -  Customizable Splashscreen GUI w/Progress Bar - Create a custom "splash screen" GUI with a progress bar and custom label.  -  _FileGetProperty - Retrieve the properties of a file  -  SciTE Toolbar - A toolbar demo for use with the SciTE editor  -  GUIRegisterMsg demo - Demo script to show how to use the Windows messages to interact with controls and your GUI.  -   Latin Square password generator

Share this post


Link to post
Share on other sites
Mat

JohnOne was trying to tell you that the script won't run on the latest version of AutoIt. What version are YOU running?

The header of the script says: 3.1.1.87

Share this post


Link to post
Share on other sites
BrewManNH

The header of the script says: 3.1.1.87

I understand that. My point was that it won't run on the current version of AutoIt, I don't think the header has any bearing on what I said. That is why I asked what version the OP was running.

If I posted any code, assume that code was written using the latest release version unless stated otherwise. Also, if it doesn't work on XP I can't help with that because I don't have access to XP, and I'm not going to.
Give a programmer the correct code and he can do his work for a day. Teach a programmer to debug and he can do his work for a lifetime - by Chirag Gude
How to ask questions the smart way!

I hereby grant any person the right to use any code I post, that I am the original author of, on the autoitscript.com forums, unless I've specifically stated otherwise in the code or the thread post. If you do use my code all I ask, as a courtesy, is to make note of where you got it from.

Back up and restore Windows user files _Array.au3 - Modified array functions that include support for 2D arrays.  -  ColorChooser - An add-on for SciTE that pops up a color dialog so you can select and paste a color code into a script.  -  Customizable Splashscreen GUI w/Progress Bar - Create a custom "splash screen" GUI with a progress bar and custom label.  -  _FileGetProperty - Retrieve the properties of a file  -  SciTE Toolbar - A toolbar demo for use with the SciTE editor  -  GUIRegisterMsg demo - Demo script to show how to use the Windows messages to interact with controls and your GUI.  -   Latin Square password generator

Share this post


Link to post
Share on other sites
kenzu

im using autoit3, the script works fine, it saves the .mp3,jpg,avi files just fine, i am trying to save the page title too in the audio.txt file.

I tryed to add this

$html = FileRead($file, FileGetSize($file))
    
For $line = 1 To 500
    If StringInStr(FileReadLine($file, $line), "<title>") Then
        $Fchartid = StringRegExpReplace(FileReadLine($file, $line), "(\D.*|\d.*|)(<title>)=(\d*)(\D.*|\d.*|)", "$3")
        $Online = $line
     EndIf

and this to save it

ElseIf StringRight($url, 4) == ".swf" Or _
            StringRight($url, 4) == ".mp3" Then
        
        FileWriteLine("spider.audio.log", $url & @TAB & $Fchartid)

but it's not working

Edited by kenzu

Share this post


Link to post
Share on other sites
BrewManNH

That's not a version number, that's the name of the program. If you're trying to run this on any version above 3.3.2.0 it won't run correctly at all.


If I posted any code, assume that code was written using the latest release version unless stated otherwise. Also, if it doesn't work on XP I can't help with that because I don't have access to XP, and I'm not going to.
Give a programmer the correct code and he can do his work for a day. Teach a programmer to debug and he can do his work for a lifetime - by Chirag Gude
How to ask questions the smart way!

I hereby grant any person the right to use any code I post, that I am the original author of, on the autoitscript.com forums, unless I've specifically stated otherwise in the code or the thread post. If you do use my code all I ask, as a courtesy, is to make note of where you got it from.

Back up and restore Windows user files _Array.au3 - Modified array functions that include support for 2D arrays.  -  ColorChooser - An add-on for SciTE that pops up a color dialog so you can select and paste a color code into a script.  -  Customizable Splashscreen GUI w/Progress Bar - Create a custom "splash screen" GUI with a progress bar and custom label.  -  _FileGetProperty - Retrieve the properties of a file  -  SciTE Toolbar - A toolbar demo for use with the SciTE editor  -  GUIRegisterMsg demo - Demo script to show how to use the Windows messages to interact with controls and your GUI.  -   Latin Square password generator

Share this post


Link to post
Share on other sites
kenzu

i run v3.3.8.1 .. as i sayd the script works but i dont understand how to save the page title instead of the url

Edited by kenzu

Share this post


Link to post
Share on other sites
kenzu

my code is this

; ----------------------------------------------------------------------------
;
; AutoIt Version: 3.1.1.87
; Author:         AcidicChip <acidicchip@acidicchip.com>
;
; Script Name:    Web Media Spider
; Script Version: 0.21
;
; Script Function:
;    Spider the web and gather media file URLs
;
; ----------------------------------------------------------------------------

Opt("GUIOnEventMode", 1)
Opt("TrayIconDebug", 1)

#include <Array.au3>
#include <GUIConstants.au3>

Dim $collected[1]
Dim $urls[1]
Dim $urlon = 0
Dim $urlnum = 0
Dim $imagenum = 0
Dim $audionum = 0
Dim $videonum = 0

#region "GUI"
GUICreate("Media Spider", 600, 100)
$lblAction = GUICtrlCreateLabel("Action:", 0, 3, 35, 20)
$txtAction = GUICtrlCreateInput("", 40, 0, 560, 20)
GUICtrlSetState($txtAction, $GUI_DISABLE)
$lblURL = GUICtrlCreateLabel("URL:", 0, 23, 35, 20)
$txtURL = GUICtrlCreateInput("", 40, 20, 560, 20)
GUICtrlSetState($txtURL, $GUI_DISABLE)
$prgPercent = GUICtrlCreateProgress(0, 40, 560, 20)
$txtPercent = GUICtrlCreateInput("0%", 560, 40, 40, 20)
GUICtrlSetState($txtPercent, $GUI_DISABLE)
$lblURLs = GUICtrlCreateLabel("URLs:", 0, 63, 35, 20)
$txtURLs = GUICtrlCreateInput("0", 40, 60, 75, 20)
GUICtrlSetState($txtURLs, $GUI_DISABLE)
$lblAudio = GUICtrlCreateLabel("Audio:", 125, 63, 35, 20)
$txtAudio = GUICtrlCreateInput("0", 160, 60, 75, 20)
GUICtrlSetState($txtAudio, $GUI_DISABLE)
$lblImages = GUICtrlCreateLabel("Images:", 245, 63, 36, 20)
$txtImages = GUICtrlCreateInput("0", 285, 60, 75, 20)
GUICtrlSetState($txtImages, $GUI_DISABLE)
$lblVideos = GUICtrlCreateLabel("Videos:", 370, 63, 35, 20)
$txtVideos = GUICtrlCreateInput("0", 410, 60, 75, 20)
GUICtrlSetState($txtVideos, $GUI_DISABLE)
$lblHistory = GUICtrlCreateLabel("History:", 490, 63, 35, 20)
$txtHistory = GUICtrlCreateInput("0", 530, 60, 75, 20)
GUICtrlSetState($txtHistory, $GUI_DISABLE)
$lblStartURL = GUICtrlCreateLabel("Start URL:", 0, 83, 50, 20)
$txtStartURL = GUICtrlCreateInput("http://www.myspace.com/acidicchip", 55, 80, 490, 20)
$btnStartStop = GUICtrlCreateButton("Start", 550, 80, 50, 20)
GUISetState(@SW_SHOW)

GUISetOnEvent($GUI_EVENT_CLOSE, "GUIClose")
GUICtrlSetOnEvent($btnStartStop, "GUIStartStop")
#endregion "GUI"

Func GUIClose()
    Exit
EndFunc  ;==>GUIClose

Func GUIStartStop()
    If GUICtrlRead($btnStartStop) == "Start" Then
        GUICtrlSetData($btnStartStop, "Stop")
        GUICtrlSetState($txtStartURL, $GUI_DISABLE)
        FileDelete("spider.urls.txt")
        GetURLs(GUICtrlRead($txtStartURL))
        Do
        ;$url = $urls[1]
            $urlon = $urlon + 1
            $url = FileReadLine("spider.urls.txt", $urlon)
        ;_ArrayDelete($urls, 1)
            $urlnum = $urlnum - 1
            GetURLs($url)
        Until $urlnum <= 0 Or GUICtrlRead($btnStartStop) == "Start"
    ;Until UBound($urls) <= 1 Or GUICtrlRead($btnStartStop) == "Start"
    Else
        GUICtrlSetData($btnStartStop, "Start")
        GUICtrlSetState($txtStartURL, $GUI_ENABLE)
    EndIf
EndFunc  ;==>GUIStartStop

While 1
    Sleep(250)
Wend

Func Status($action, $url, $percent)
    GUICtrlSetData($txtAction, $action)
    If $url <> "" Then GUICtrlSetData($txtURL, $url)
    GUICtrlSetData($prgPercent, $percent)
    GUICtrlSetData($txtPercent, $percent & "%")
    
    GUICtrlSetData($txtURLs, $urlnum)
;GUICtrlSetData($txtURLs, UBound($urls))
    GUICtrlSetData($txtAudio, $audionum)
    GUICtrlSetData($txtImages, $imagenum)
    GUICtrlSetData($txtVideos, $videonum)
    GUICtrlSetData($txtHistory, UBound($collected))
EndFunc  ;==>Status

Func _ArrayParse($str, $before, $after)
    Return StringRegExp($str, "(?i)" & $before & "(.*?)" & $after, 3)
EndFunc  ;==>_ArrayParse

Func AddURL($url)
    If Not WasCollected($url) Then
        _ArrayAdd($collected, $url)
    ;_ArrayAdd($urls, $url)
        FileWriteLine("spider.urls.txt", $url)
        $urlnum = $urlnum + 1
    EndIf
EndFunc  ;==>AddURL

Func WasCollected($url)
    $return = False
    For $i = 1 To Ubound($collected) - 1 Step 1
        If $collected[$i] == $url Then
            $return = True
            ExitLoop
        EndIf
    Next
    If Not $return And UBound($collected) >= 1024 Then _ArrayDelete($collected, 1)
    Return $return
EndFunc  ;==>WasCollected

Func GetURI($url)
    $uri = StringMid($url, 1, StringInStr($url, "://")) & "//"
    $turl = StringMid($url, StringLen($uri) + 1)
    If StringInStr($turl, "?") Then
        $temp = StringSplit($turl, "?")
        $turl = $temp[1]
        $temp = StringSplit($turl, "/")
        $uri = $uri & $temp[1] & "/"
        For $i = 2 To UBound($temp) - 1 Step 1
            If StringInStr($temp[$i], ".") Or Not StringLen($temp[$i]) Then ExitLoop
            $uri = $uri & $temp[$i] & "/"
        Next
        If Not InetGetSize(StringLeft($uri, StringLen($uri) - 1)) Then
            $uri = StringMid($url, 1, StringInStr($url, "://")) & "//"
            $temp = StringSplit($turl, "?")
            $turl = $temp[1]
            $temp = StringSplit($turl, "/")
            $uri = $uri & $temp[1] & "/"
            For $i = 2 To UBound($temp) - 2 Step 1
                If StringInStr($temp[$i], ".") Or Not StringLen($temp[$i]) Then ExitLoop
                $uri = $uri & $temp[$i] & "/"
            Next
        EndIf
    Else
        $temp = StringSplit($turl, "/")
        $uri = $uri & $temp[1] & "/"
        For $i = 2 To UBound($temp) - 1 Step 1
            If StringInStr($temp[$i], ".") Or Not StringLen($temp[$i]) Then ExitLoop
            $uri = $uri & $temp[$i] & "/"
        Next
    EndIf
    
    Return $uri
EndFunc  ;==>GetURI

Func GetURLs($url)
    $uri = GetURI($url)
    
    $file = "spider.html.txt"
    Status("Downloading", $url, 0)
    $filesize = InetGetSize($url)
    $lastsize = 0
    $strikes = 0
    InetGet($url, $file, 1, 1)
    While @InetGetActive
        If $lastsize == @InetGetBytesRead Then $strikes = $strikes + 1
        If $strikes >= 30 Then ExitLoop
        $lastsize = @InetGetBytesRead
        Status("Downloading", $url, Round(($lastsize / $filesize) * 100))
        Sleep(250)
    Wend
    $html = FileRead($file, FileGetSize($file))
    
For $line = 1 To 500
    If StringInStr(FileReadLine($file, $line), "<title>") Then
        $Fchartid = StringRegExpReplace(FileReadLine($file, $line), "(\D.*|\d.*|)(<title>)=(\d*)(\D.*|\d.*|)", "$3")
        $Online = $line
     EndIf

 Next
 
    FileDelete($file)
    
    Status("Parsing URLs", $url, 0)
    $tags = _ArrayParse($html, "<a", ">")
    For $i = 0 To UBound($tags) - 1 Step 1
        Status("Checking <A> Tags for URLs", $url, Round(($i / (UBound($tags) - 1)) * 100))
        CheckURL($uri, $tags[$i], $url, $Fchartid)
    Next
    $tags = _ArrayParse($html, "<EMBED", ">")
    For $i = 0 To UBound($tags) - 1 Step 1
        Status("Checking <EMBED> Tags for URLs", $url, Round(($i / (UBound($tags) - 1)) * 100))
        CheckURL($uri, $tags[$i], $url, $Fchartid)
     Next

EndFunc  ;==>GetURLs

Func CheckURL($uri, $str, $ref, $Fchartid)
    If StringInStr($str, "href=") Then
        $turl = GetAttr($str, "href=")
        If Not StringInStr(StringLeft($turl, 10), "://") Then
            If StringLeft($turl, 1) == "/" Then
                $turl = $uri & StringMid($turl, 2)
            Else
                $turl = $uri & $turl
            EndIf
        EndIf
        CheckType($turl, $ref, $Fchartid)
    EndIf
    If StringInStr($str, "src=") Then
        $turl = GetAttr($str, "src=")
        If Not StringInStr(StringLeft($turl, 10), "://") Then
            If StringLeft($turl, 1) == "/" Then
                $turl = $uri & StringMid($turl, 2)
            Else
                $turl = $uri & $turl
            EndIf
        EndIf
        CheckType($turl, $ref, $Fchartid)
    EndIf
EndFunc  ;==>CheckURL

Func GetAttr($str, $attr)
    If StringInStr($str, $attr & '"') Then
        $temp = _ArrayParse($str, $attr & '"', '"')
        If UBound($temp) == 1 Then Return $temp[0]
    ElseIf StringInStr($str, $attr & "'") Then
        $temp = _ArrayParse($str, $attr & "'", "'")
        If UBound($temp) == 1 Then Return $temp[0]
    ElseIf StringInStr($str, $attr) Then
        $temp = StringMid($str, StringInStr($str, $attr) + StringLen($attr))
        If StringInStr($temp, " ") Then
            $temp = StringMid($temp, 1, StringInStr($temp, " ") - 1)
        EndIf
        Return $temp
    EndIf
EndFunc  ;==>GetAttr

Func CheckType($url, $ref, $Fchartid)
    If StringRight($url, 4) == ".jpg" Or _
            StringRight($url, 4) == ".bmp" Then
        
        FileWriteLine("spider.images.log", $url & @TAB & $ref)
        $imagenum = $imagenum + 1
    ElseIf StringRight($url, 4) == ".mp3" Or _
            StringRight($url, 4) == ".rbs" Then
        
        FileWriteLine("spider.audio.log", $url & @TAB & $Fchartid)
        $audionum = $audionum + 1
        AddURL(GetURI($url))
    ElseIf StringRight($url, 4) == ".exe" Or _
            StringRight($url, 4) == ".zip" Or _
            StringRight($url, 4) == ".rar" Or _
            StringRight($url, 4) == ".tar" Then
        
    ;Do Nothing
    Else
        AddURL($url)
    EndIf
EndFunc ;==>CheckType

It works fine for 5-10 url's after that i get

new.au3 (198) : ==> Variable used without being declared.:
CheckURL($uri, $tags[$i], $url, $Fchartid)
CheckURL($uri, $tags[$i], $url, ^ ERROR

Share this post


Link to post
Share on other sites
BrewManNH

You should post your new code, because you cannot be running the code in first post, it would error and not run.

I ran the code, it doesn't error on those macros, it just sees them as being 0 so the While loop never runs.
  • Like 1

If I posted any code, assume that code was written using the latest release version unless stated otherwise. Also, if it doesn't work on XP I can't help with that because I don't have access to XP, and I'm not going to.
Give a programmer the correct code and he can do his work for a day. Teach a programmer to debug and he can do his work for a lifetime - by Chirag Gude
How to ask questions the smart way!

I hereby grant any person the right to use any code I post, that I am the original author of, on the autoitscript.com forums, unless I've specifically stated otherwise in the code or the thread post. If you do use my code all I ask, as a courtesy, is to make note of where you got it from.

Back up and restore Windows user files _Array.au3 - Modified array functions that include support for 2D arrays.  -  ColorChooser - An add-on for SciTE that pops up a color dialog so you can select and paste a color code into a script.  -  Customizable Splashscreen GUI w/Progress Bar - Create a custom "splash screen" GUI with a progress bar and custom label.  -  _FileGetProperty - Retrieve the properties of a file  -  SciTE Toolbar - A toolbar demo for use with the SciTE editor  -  GUIRegisterMsg demo - Demo script to show how to use the Windows messages to interact with controls and your GUI.  -   Latin Square password generator

Share this post


Link to post
Share on other sites
kenzu

I ran the code, it doesn't error on those macros, it just sees them as being 0 so the While loop never runs.

You just need to press 2-3 times the start button so that it starts crawling

Share this post


Link to post
Share on other sites
BrewManNH

Doesn't matter how many times you press the button, the While loop will never run. This will ALWAYS evaluate to zero because the macro @InetGetActive doesn't exist any longer. If you run it and run Au3Check on it first, it will never run at all because that will cause it to fail.

While @InetGetActive

If I posted any code, assume that code was written using the latest release version unless stated otherwise. Also, if it doesn't work on XP I can't help with that because I don't have access to XP, and I'm not going to.
Give a programmer the correct code and he can do his work for a day. Teach a programmer to debug and he can do his work for a lifetime - by Chirag Gude
How to ask questions the smart way!

I hereby grant any person the right to use any code I post, that I am the original author of, on the autoitscript.com forums, unless I've specifically stated otherwise in the code or the thread post. If you do use my code all I ask, as a courtesy, is to make note of where you got it from.

Back up and restore Windows user files _Array.au3 - Modified array functions that include support for 2D arrays.  -  ColorChooser - An add-on for SciTE that pops up a color dialog so you can select and paste a color code into a script.  -  Customizable Splashscreen GUI w/Progress Bar - Create a custom "splash screen" GUI with a progress bar and custom label.  -  _FileGetProperty - Retrieve the properties of a file  -  SciTE Toolbar - A toolbar demo for use with the SciTE editor  -  GUIRegisterMsg demo - Demo script to show how to use the Windows messages to interact with controls and your GUI.  -   Latin Square password generator

Share this post


Link to post
Share on other sites
guinness

kenzu,

Sorry to come in at short notice, but why the reluctance to use V3.3.8.1?


UDF List:

 
_AdapterConnections()_AlwaysRun()_AppMon()_AppMonEx()_ArrayFilter/_ArrayReduce_BinaryBin()_CheckMsgBox()_CmdLineRaw()_ContextMenu()_ConvertLHWebColor()/_ConvertSHWebColor()_DesktopDimensions()_DisplayPassword()_DotNet_Load()/_DotNet_Unload()_Fibonacci()_FileCompare()_FileCompareContents()_FileNameByHandle()_FilePrefix/SRE()_FindInFile()_GetBackgroundColor()/_SetBackgroundColor()_GetConrolID()_GetCtrlClass()_GetDirectoryFormat()_GetDriveMediaType()_GetFilename()/_GetFilenameExt()_GetHardwareID()_GetIP()_GetIP_Country()_GetOSLanguage()_GetSavedSource()_GetStringSize()_GetSystemPaths()_GetURLImage()_GIFImage()_GoogleWeather()_GUICtrlCreateGroup()_GUICtrlListBox_CreateArray()_GUICtrlListView_CreateArray()_GUICtrlListView_SaveCSV()_GUICtrlListView_SaveHTML()_GUICtrlListView_SaveTxt()_GUICtrlListView_SaveXML()_GUICtrlMenu_Recent()_GUICtrlMenu_SetItemImage()_GUICtrlTreeView_CreateArray()_GUIDisable()_GUIImageList_SetIconFromHandle()_GUIRegisterMsg()_GUISetIcon()_Icon_Clear()/_Icon_Set()_IdleTime()_InetGet()_InetGetGUI()_InetGetProgress()_IPDetails()_IsFileOlder()_IsGUID()_IsHex()_IsPalindrome()_IsRegKey()_IsStringRegExp()_IsSystemDrive()_IsUPX()_IsValidType()_IsWebColor()_Language()_Log()_MicrosoftInternetConnectivity()_MSDNDataType()_PathFull/GetRelative/Split()_PathSplitEx()_PrintFromArray()_ProgressSetMarquee()_ReDim()_RockPaperScissors()/_RockPaperScissorsLizardSpock()_ScrollingCredits_SelfDelete()_SelfRename()_SelfUpdate()_SendTo()_ShellAll()_ShellFile()_ShellFolder()_SingletonHWID()_SingletonPID()_Startup()_StringCompact()_StringIsValid()_StringRegExpMetaCharacters()_StringReplaceWholeWord()_StringStripChars()_Temperature()_TrialPeriod()_UKToUSDate()/_USToUKDate()_WinAPI_Create_CTL_CODE()_WinAPI_CreateGUID()_WMIDateStringToDate()/_DateToWMIDateString()Au3 script parsingAutoIt SearchAutoIt3 PortableAutoIt3WrapperToPragmaAutoItWinGetTitle()/AutoItWinSetTitle()CodingDirToHTML5FileInstallrFileReadLastChars()GeoIP databaseGUI - Only Close ButtonGUI ExamplesGUICtrlDeleteImage()GUICtrlGetBkColor()GUICtrlGetStyle()GUIEventsGUIGetBkColor()Int_Parse() & Int_TryParse()IsISBN()LockFile()Mapping CtrlIDsOOP in AutoItParseHeadersToSciTE()PasswordValidPasteBinPosts Per DayPreExpandProtect GlobalsQueue()Resource UpdateResourcesExSciTE JumpSettings INISHELLHOOKShunting-YardSignature CreatorStack()Stopwatch()StringAddLF()/StringStripLF()StringEOLToCRLF()VSCROLLWM_COPYDATAMore Examples...

Updated: 22/04/2018

Share this post


Link to post
Share on other sites
kenzu

a am a begginter at autoit and programming..

Share this post


Link to post
Share on other sites
guinness

Whoops, missed your last post about your version. Anyway, JohnOne mentioned what to do already, go to the help file, also look in my signature for Inet.


UDF List:

 
_AdapterConnections()_AlwaysRun()_AppMon()_AppMonEx()_ArrayFilter/_ArrayReduce_BinaryBin()_CheckMsgBox()_CmdLineRaw()_ContextMenu()_ConvertLHWebColor()/_ConvertSHWebColor()_DesktopDimensions()_DisplayPassword()_DotNet_Load()/_DotNet_Unload()_Fibonacci()_FileCompare()_FileCompareContents()_FileNameByHandle()_FilePrefix/SRE()_FindInFile()_GetBackgroundColor()/_SetBackgroundColor()_GetConrolID()_GetCtrlClass()_GetDirectoryFormat()_GetDriveMediaType()_GetFilename()/_GetFilenameExt()_GetHardwareID()_GetIP()_GetIP_Country()_GetOSLanguage()_GetSavedSource()_GetStringSize()_GetSystemPaths()_GetURLImage()_GIFImage()_GoogleWeather()_GUICtrlCreateGroup()_GUICtrlListBox_CreateArray()_GUICtrlListView_CreateArray()_GUICtrlListView_SaveCSV()_GUICtrlListView_SaveHTML()_GUICtrlListView_SaveTxt()_GUICtrlListView_SaveXML()_GUICtrlMenu_Recent()_GUICtrlMenu_SetItemImage()_GUICtrlTreeView_CreateArray()_GUIDisable()_GUIImageList_SetIconFromHandle()_GUIRegisterMsg()_GUISetIcon()_Icon_Clear()/_Icon_Set()_IdleTime()_InetGet()_InetGetGUI()_InetGetProgress()_IPDetails()_IsFileOlder()_IsGUID()_IsHex()_IsPalindrome()_IsRegKey()_IsStringRegExp()_IsSystemDrive()_IsUPX()_IsValidType()_IsWebColor()_Language()_Log()_MicrosoftInternetConnectivity()_MSDNDataType()_PathFull/GetRelative/Split()_PathSplitEx()_PrintFromArray()_ProgressSetMarquee()_ReDim()_RockPaperScissors()/_RockPaperScissorsLizardSpock()_ScrollingCredits_SelfDelete()_SelfRename()_SelfUpdate()_SendTo()_ShellAll()_ShellFile()_ShellFolder()_SingletonHWID()_SingletonPID()_Startup()_StringCompact()_StringIsValid()_StringRegExpMetaCharacters()_StringReplaceWholeWord()_StringStripChars()_Temperature()_TrialPeriod()_UKToUSDate()/_USToUKDate()_WinAPI_Create_CTL_CODE()_WinAPI_CreateGUID()_WMIDateStringToDate()/_DateToWMIDateString()Au3 script parsingAutoIt SearchAutoIt3 PortableAutoIt3WrapperToPragmaAutoItWinGetTitle()/AutoItWinSetTitle()CodingDirToHTML5FileInstallrFileReadLastChars()GeoIP databaseGUI - Only Close ButtonGUI ExamplesGUICtrlDeleteImage()GUICtrlGetBkColor()GUICtrlGetStyle()GUIEventsGUIGetBkColor()Int_Parse() & Int_TryParse()IsISBN()LockFile()Mapping CtrlIDsOOP in AutoItParseHeadersToSciTE()PasswordValidPasteBinPosts Per DayPreExpandProtect GlobalsQueue()Resource UpdateResourcesExSciTE JumpSettings INISHELLHOOKShunting-YardSignature CreatorStack()Stopwatch()StringAddLF()/StringStripLF()StringEOLToCRLF()VSCROLLWM_COPYDATAMore Examples...

Updated: 22/04/2018

Share this post


Link to post
Share on other sites

Create an account or sign in to comment

You need to be a member in order to leave a comment

Create an account

Sign up for a new account in our community. It's easy!

Register a new account

Sign in

Already have an account? Sign in here.

Sign In Now
Sign in to follow this  

×