Jump to content

Sitemap crawler


Recommended Posts

Just sharing this in case anyone else has the same need. I was looking for something to warm my site's image cache and generate its WebP images, and oddly enough I couldn't find anything that was simple and didn't require server access. Then I realized I was overcomplicating things: I could just generate a sitemap and write a small app to crawl it. I have only tested this on Windows 10 64-bit, but it is simple and should work in any environment. You will need ChromeDriver and the WebDriver UDF for it.

For "URL Element", enter the name of the tag that wraps each URL in your sitemap file, without the "< >" (for example, loc for a standard XML sitemap).

#include <String.au3>
#include <Array.au3>
#include <GUIConstantsEx.au3>
#include <StaticConstants.au3>
#include <WindowsConstants.au3>

; non standard UDF's
#include "wd_helper.au3"
#include "wd_capabilities.au3"


; --- Script-wide state -------------------------------------------------------

; WebDriver session id; assigned by _Crawl() for the duration of one crawl.
Global $sSession

; Set the UDF debug level BEFORE SetupChrome() runs, so that the level is
; already in effect while the driver options and capabilities are configured.
Global $_WD_DEBUG = $_WD_DEBUG_None

; Capabilities string passed to _WD_CreateSession(); built once at start-up.
; SetupChrome is called directly rather than through Call().
Global $sDesiredCapabilities = SetupChrome()

; Last-used sitemap file and URL element name, persisted by _Crawl() in the
; registry. RegRead yields an empty string when the value does not exist yet.
Global $sFile = RegRead("HKEY_CURRENT_USER\Software\Crawler", "CrawlFile")
Global $sSite = RegRead("HKEY_CURRENT_USER\Software\Crawler", "CrawlSite")

; Pressing ESC must not close the window.
Opt('GUICloseOnESC', 0)

; --- Main window -------------------------------------------------------------
; 355x130 window at the default position, created with the accept-files
; extended style so that files can be dropped onto it.
GUICreate("Crawler", 355, 130, -1, -1, -1, $WS_EX_ACCEPTFILES)
GUISetFont(10)

; Row 1: sitemap file path (the input is flagged as a drop target).
Global $gFileL = GUICtrlCreateLabel("File:", 10, 13, 30, 20)
Global $gFile = GUICtrlCreateInput($sFile, 95, 10, 200, 25)
GUICtrlSetState($gFile, $GUI_DROPACCEPTED)

; Row 2: name of the element that wraps each URL, plus a hint label.
Global $gBetweenL = GUICtrlCreateLabel("URL Element:", 10, 43, 95, 20)
Global $gSite = GUICtrlCreateInput($sSite, 95, 40, 200, 25)
Global $gBetweeni = GUICtrlCreateLabel("< ??? >", 300, 43, 55, 20)

; Row 3: speed option and its explanatory label.
Global $gSpeed = GUICtrlCreateCheckbox("Speed", 10, 70)
Global $gExperimentali = GUICtrlCreateLabel("(Experimental: Does not wait on Load)", 75, 74, 230, 20)

; Row 4: crawl button, then an etched frame, a progress bar and a status label
; stacked on the same rectangle.
Global $gCrawl = GUICtrlCreateButton("Crawl", 10, 95)
Global $gFrame = GUICtrlCreateLabel("dds", 49, 96, 295, 27, $SS_ETCHEDFRAME)
Global $gProgress = GUICtrlCreateProgress(50, 97, 292, 24)
; Replace the visual theme on the progress bar with blank theme names, then
; apply style value 1 to it.
DllCall("UxTheme.dll", "int", "SetWindowTheme", "hwnd", GUICtrlGetHandle($gProgress), "wstr", " ", "wstr", " ")
GUICtrlSetStyle($gProgress, 1)
; Status text, centred horizontally and vertically, with a transparent background.
Global $gPages = GUICtrlCreateLabel("", 49, 96, 295, 26, BitOR($SS_CENTER, $SS_CENTERIMAGE), $WS_EX_TOPMOST)
GUICtrlSetBkColor($gPages, $GUI_BKCOLOR_TRANSPARENT)

GUISetState()

; --- Main message loop -------------------------------------------------------
; GUIGetMsg() idles the CPU on its own when there is nothing to process, so no
; extra Sleep() is needed here; adding one only delays the handling of events.
While 1

    Switch GUIGetMsg()
        Case $gCrawl
            ; Runs the whole crawl before returning to this loop.
            _Crawl()
        Case $GUI_EVENT_CLOSE
            Exit
    EndSwitch

WEnd


; _Crawl
; Reads the sitemap file named in the "File" input, extracts every URL found
; between <element> and </element> (element name taken from the "URL Element"
; input), and visits each URL in a WebDriver session, updating the progress
; bar and status label as it goes.
;
; Parameters: none (reads $gFile, $gSite, $gSpeed; uses $sDesiredCapabilities).
; Side effects: stores both input values under HKCU\Software\Crawler, assigns
;               the global $sSession, starts and shuts down the web driver.
; Return/@error: returns 0. @error is 0 on success, otherwise
;               1 = sitemap file could not be read
;               2 = no URL elements found in the file
;               3 = _WD_Startup failed
;               4 = _WD_CreateSession failed
Func _Crawl()

    Local $sSitemapPath = GUICtrlRead($gFile)
    Local $sTag = GUICtrlRead($gSite)

    ; Remember the inputs for the next run.
    RegWrite("HKEY_CURRENT_USER\Software\Crawler", "CrawlFile", "REG_SZ", $sSitemapPath)
    RegWrite("HKEY_CURRENT_USER\Software\Crawler", "CrawlSite", "REG_SZ", $sTag)

    ; Give FileRead the path itself instead of a FileOpen handle, so there is
    ; no handle that has to be closed afterwards.
    Local $sContent = FileRead($sSitemapPath)
    If @error > 0 Then
        GUICtrlSetData($gPages, "Cannot read file.")
        Return SetError(1, 0, 0)
    EndIf

    ; _StringBetween returns a non-array and sets @error when nothing matches;
    ; stop here rather than launching the driver for an empty crawl.
    Local $aPages = _StringBetween($sContent, "<" & $sTag & ">", "</" & $sTag & ">")
    If @error Or Not IsArray($aPages) Then
        GUICtrlSetData($gPages, "0 Pages.")
        Return SetError(2, 0, 0)
    EndIf
    Local $iTotal = UBound($aPages)

    _WD_Startup()
    Local $iErr = @error
    If $iErr <> $_WD_ERROR_Success Then
        MsgBox(0, 9, $iErr)
        ; Without a running driver there is nothing to crawl with.
        Return SetError(3, 0, 0)
    EndIf

    $sSession = _WD_CreateSession($sDesiredCapabilities)
    If @error <> $_WD_ERROR_Success Then
        _WD_Shutdown()
        GUICtrlSetData($gPages, "Session failed.")
        Return SetError(4, 0, 0)
    EndIf

    ConsoleWrite("! Pages to scan: " & $iTotal & @CRLF)
    GUICtrlSetData($gProgress, 0)
    GUICtrlSetData($gPages, $iTotal & " Pages.")

    Local $sUrl
    For $i = 0 To $iTotal - 1

        ; Trim surrounding whitespace (flag 3 = leading + trailing) and undo the
        ; XML escaping of "&" so the browser receives the real URL.
        $sUrl = StringReplace(StringStripWS($aPages[$i], 3), "&amp;", "&")

        _WD_Navigate($sSession, $sUrl)

        ; "Speed" is labelled "Does not wait on Load": when it is checked, skip
        ; the load wait and only pause briefly. This is experimental, as the
        ; page may not have finished generating everything. When it is
        ; unchecked, wait for the page via _WD_LoadWait.
        If GUICtrlRead($gSpeed) = $GUI_CHECKED Then
            Sleep(10)
        Else
            _WD_LoadWait($sSession, 250, 6000)
        EndIf

        ; $i is zero-based: use $i + 1 so the bar reaches 100% on the last page.
        GUICtrlSetData($gProgress, ($i + 1) / $iTotal * 100)
        GUICtrlSetData($gPages, $iTotal & " Pages.")

    Next

    GUICtrlSetData($gPages, "Complete.")
    _WD_DeleteSession($sSession)
    _WD_Shutdown()

EndFunc   ;==>_Crawl

; SetupChrome
; Configures the WebDriver UDF to use chromedriver on port 9515 with verbose
; logging to chrome.log in the script directory, builds the Chrome capabilities
; (headless, automation switch excluded) and returns them as produced by
; _WD_CapabilitiesGet().
Func SetupChrome()
    ; Driver executable, port and command-line parameters.
    Local $sLogFile = @ScriptDir & '\chrome.log'
    _WD_Option('Driver', 'chromedriver.exe')
    _WD_Option('Port', 9515)
    _WD_Option('DriverParams', '--verbose --log-path="' & $sLogFile & '"')

    ; Capabilities requested for the browser session.
    _WD_CapabilitiesStartup()
    _WD_CapabilitiesAdd('alwaysMatch', 'chrome')
    _WD_CapabilitiesAdd('w3c', True)
    _WD_CapabilitiesAdd('args', '--headless')
    _WD_CapabilitiesAdd('excludeSwitches', 'enable-automation')

    ; Testing aid: dump the current capabilities, tagged with this line number.
    _WD_CapabilitiesDump(@ScriptLineNumber)

    Return _WD_CapabilitiesGet()
EndFunc   ;==>SetupChrome

 

Link to comment
Share on other sites

Create an account or sign in to comment

You need to be a member in order to leave a comment

Create an account

Sign up for a new account in our community. It's easy!

Register a new account

Sign in

Already have an account? Sign in here.

Sign In Now
 Share

  • Recently Browsing   0 members

    • No registered users viewing this page.
×
×
  • Create New...