#include-once

; Search mode used by __HTML_Search() to turn a search value into a pattern:
;   0 = compare   - the value is used as a regular expression as-is
;   1 = substring - the value is used as a regular expression, wrapped in .*?
;   2 = compare   - the value is used literally (regex characters are masked)
;   3 = substring - the value is used literally, wrapped in .*?
Global $_HTML_SEARCHMODE = 1 ; (0 = Compare / 1 = Substring / RegExp) (2 = Compare / 3 = Substring / String-compare)

#Region #current#
; _HTML_ExtractURLVar
; _HTML_Get
; _HTML_GetAllImageSrc
; _HTML_GetAllLinks
; _HTML_GetImageSrc
; _HTML_GetLink
; _HTML_GetSource
; _HTML_GetTable
; _HTML_GetText
; _HTML_GetURLVar
; _HTML_ImageSave
; _HTML_Search
#EndRegion #current#

; #FUNCTION# ===================================================================
; Name ..........: _HTML_GetSource
; Description ...: Downloads a page and returns its source as one flat string
;                  (line breaks and tabs are replaced by spaces).
; AutoIt Version : V3.3.2.0
; Syntax ........: _HTML_GetSource($sURL)
; Parameter(s): .: $sURL        - URL of the page to download
; Return Value ..: Success      - string
;                  Failure      - empty string
;                  @ERROR       - @error / @extended of InetRead
; Author(s) .....: Thorsten Willert
; Date ..........: Wed Jan 27 23:12:10 CET 2010
; Link ..........:
; Related .......:
; Example .......: No
; ==============================================================================
Func _HTML_GetSource($sURL)
	Local $bData = InetRead($sURL, 1)
	If @error Then Return SetError(@error, @extended, "")

	Local $sHTML = BinaryToString($bData)

	; Flatten the document so that the '.' based patterns of this library can
	; match across what used to be several lines.
	$sHTML = StringRegExpReplace($sHTML, '[\r\n\t]', " ")

	; NOTE(review): the pattern found here was '(?i).*?', which only matches
	; empty strings and therefore removed nothing. It looks like an HTML tag
	; pair was lost from the literal; script blocks are assumed - confirm.
	$sHTML = StringRegExpReplace($sHTML, '(?i)<script\b.*?</script>', "")

	Return $sHTML
EndFunc   ;==>_HTML_GetSource

; #FUNCTION# ===================================================================
; Name ..........: _HTML_ExtractURLVar
; Description ...: Extracts an URL variable from an URL
; AutoIt Version : V3.3.0.0
; Syntax ........: _HTML_ExtractURLVar($sURL, $sVar)
; Parameter(s): .: $sURL        - URL
;                  $sVar        - variable-name (taken literally)
; Return Value ..: Success      - string (value up to the next "&" or "#")
;                  Failure      - empty string
;                  @ERROR       - (see help-file: StringRegExp)
; Author(s) .....: Thorsten Willert
; Date ..........: Thu Dec 24 13:28:27 CET 2009
; ==============================================================================
Func _HTML_ExtractURLVar($sURL, $sVar)
	; Mask regex meta characters so names like "a[]" or "user.name" work.
	Local $sName = StringRegExpReplace($sVar, '([\\.^$|?*+()\[\]{}\-])', '\\$1')

	; The name must directly follow "?", "&" or "&amp;" inside the query
	; string; otherwise "id" would also be found inside "xid=".
	Local $aMatch = StringRegExp($sURL, '\?(?:[^#]*?&(?:amp;)?)?' & $sName & '=([^&#]+)', 3)

	; Save the status before any other call (e.g. UBound) overwrites it.
	Local $iErr = @error, $iExt = @extended
	If $iErr Or Not IsArray($aMatch) Then
		If $iErr = 0 Then $iErr = 1
		Return SetError($iErr, $iExt, "")
	EndIf

	Return $aMatch[0]
EndFunc   ;==>_HTML_ExtractURLVar

; #FUNCTION# ===================================================================
; Name ..........: _HTML_Get
; Description ...: Returns the value of one attribute of the tag that is
;                  identified by the value of another (or the same) attribute.
; AutoIt Version : V3.3.0.0
; Syntax ........: _HTML_Get($sHTML, $sTag, $sAttributeGet, $sValue[, $sAttribute = "id"[, $iIndex = 0]])
; Parameter(s): .: $sHTML        - HTML-Source
;                  $sTag         - HTML-tag
;                  $sAttributeGet - attribute to get the value from
;                  $sValue       - value of the attribute to search ($_HTML_SEARCHMODE)
;                  $sAttribute   - Optional: (Default = "id") : attribute to search
;                  $iIndex       - Optional: (Default = 0) : zero-based index among
;                                  the tags whose $sAttribute matches $sValue
; Return Value ..: Success      - string
;                  Failure      - empty string
;                  @ERROR       - 1
; Author(s) .....: Thorsten Willert
; Date ..........: Wed Jan 27 22:12:38 CET 2010
; ==============================================================================
Func _HTML_Get($sHTML, $sTag, $sAttributeGet, $sValue, $sAttribute = "id", $iIndex = 0)
	; The look-ahead keeps "<a" from matching "<abbr" or "<area".
	Local Const $sE1 = '(?i)<' & $sTag & '(?=[\s/>])([^>]*)>'

	; The look-behind keeps "id" from matching inside "data-id" or "guid".
	; It is only added for non-empty names, because callers may pass "".
	Local $sE2 = '(?i)', $sE3 = '(?i)'
	If $sAttribute <> "" Then $sE2 &= '(?<![\w\-])'
	If $sAttributeGet <> "" Then $sE3 &= '(?<![\w\-])'
	$sE2 &= $sAttribute & '\s*=\s*("|''|)' & __HTML_Search($sValue) & '\1'
	$sE3 &= $sAttributeGet & '\s*=\s*("|''|)(.*?)\1'

	ConsoleWrite("_HTML_Get: " & @CRLF & $sE1 & @CRLF & $sE2 & @CRLF & $sE3 & @CRLF)

	Local $aTags = StringRegExp($sHTML, $sE1, 3)
	If @error = 2 Then
		ConsoleWriteError("_HTML_Get: Error in expression: " & $sE1 & @CRLF)
		Return SetError(1, 0, "")
	EndIf

	Local $iHit = 0, $aVal
	For $i = 0 To UBound($aTags) - 1
		ConsoleWrite($aTags[$i] & @CRLF)

		If Not StringRegExp($aTags[$i], $sE2) Then
			; A broken pattern also yields 0, so the check belongs here.
			If @error = 2 Then
				ConsoleWriteError("_HTML_Get: Error in expression: " & $sE2 & @CRLF)
				Return SetError(1, 0, "")
			EndIf
			ContinueLoop
		EndIf

		; Skip matching tags until the requested one is reached.
		If $iHit < $iIndex Then
			$iHit += 1
			ContinueLoop
		EndIf

		$aVal = StringRegExp($aTags[$i], $sE3, 3)
		If @error = 2 Then
			ConsoleWriteError("_HTML_Get: Error in expression: " & $sE3 & @CRLF)
			Return SetError(1, 0, "")
		EndIf
		; Element 0 is the quote character, element 1 the attribute value.
		If UBound($aVal) < 2 Then Return SetError(1, 0, "")
		Return $aVal[1]
	Next

	; Fewer than $iIndex + 1 matching tags: fail instead of returning the
	; value of an earlier match.
	Return SetError(1, 0, "")
EndFunc   ;==>_HTML_Get

; #FUNCTION#
; ==============================================================================
; Name ..........: _HTML_GetAllImageSrc
; Description ...: Returns an array with all image-srcs of the source-code
; AutoIt Version : V3.3.0.0
; Syntax ........: _HTML_GetAllImageSrc($sHTML[, $sFilter = '.*?'])
; Parameter(s): .: $sHTML       - HTML-Source
;                  $sFilter     - Optional: (Default = '.*?') : RegEx-filter for the src
; Return Value ..: Success      - Array (one element per quoted src value)
;                  Failure      -
;                  @ERROR       - (see help-file: StringRegExp)
; Author(s) .....: Thorsten Willert
; Date ..........: Wed Jan 27 22:12:54 CET 2010
; ==============================================================================
Func _HTML_GetAllImageSrc($sHTML, $sFilter = '.*?')
	Local $aSrc = __HTML_GetAllAttr($sHTML, "img", "src", $sFilter, "_HTML_GetAllImageSrc")
	Return SetError(@error, @extended, $aSrc)
EndFunc   ;==>_HTML_GetAllImageSrc

; #FUNCTION# ===================================================================
; Name ..........: _HTML_GetAllLinks
; Description ...: Returns an array with all links of the source-code
; AutoIt Version : V3.3.0.0
; Syntax ........: _HTML_GetAllLinks($sHTML[, $sFilter = '.*?'])
; Parameter(s): .: $sHTML       - HTML-Source
;                  $sFilter     - Optional: (Default = '.*?') : RegEx-filter for the href
; Return Value ..: Success      - Array (one element per quoted href value)
;                  Failure      -
;                  @ERROR       - (see help-file: StringRegExp)
; Author(s) .....: Thorsten Willert
; Date ..........: Wed Jan 27 22:13:08 CET 2010
; ==============================================================================
Func _HTML_GetAllLinks($sHTML, $sFilter = '.*?')
	Local $aHref = __HTML_GetAllAttr($sHTML, "a", "href", $sFilter, "_HTML_GetAllLinks")
	Return SetError(@error, @extended, $aHref)
EndFunc   ;==>_HTML_GetAllLinks

; #INTERNAL_USE_ONLY# ==========================================================
; Name ..........: __HTML_GetAllAttr
; Description ...: Collects the quoted values of $sAttr from every $sTag tag
;                  whose value matches $sFilter.
; Parameter(s): .: $sHTML       - HTML-Source
;                  $sTag        - tag name
;                  $sAttr       - attribute name
;                  $sFilter     - RegEx-filter for the value ($_HTML_SEARCHMODE)
;                  $sCaller     - function name used for the console output
; Return Value ..: Success      - Array of values
;                  Failure      - result of StringRegExp, @error / @extended
;                                 of StringRegExp
; Remarks .......: NOTE(review): the pattern found in both public functions was
;                  only '(?i)' and $sFilter was never used, so the tag part of
;                  the literal appears to be lost. The pattern below is a
;                  reconstruction - confirm against the original library.
; ==============================================================================
Func __HTML_GetAllAttr($sHTML, $sTag, $sAttr, $sFilter, $sCaller)
	$sFilter = __HTML_Search($sFilter)

	; Group 1 is the quote character, group 2 the value ending at that quote.
	Local $sE1 = '(?i)<' & $sTag & '\s[^>]*?(?<![\w\-])' & $sAttr & '\s*=\s*("|'')(' & $sFilter & ')\1'
	ConsoleWrite($sCaller & ":" & @CRLF & $sE1 & @CRLF)

	Local $aRaw = StringRegExp($sHTML, $sE1, 3)
	If @error Then Return SetError(@error, @extended, $aRaw)

	; Flag 3 returns both groups per match; keep only the values.
	Local $iCount = Int(UBound($aRaw) / 2)
	Local $aValues[$iCount]
	For $i = 0 To $iCount - 1
		$aValues[$i] = $aRaw[$i * 2 + 1]
	Next
	Return $aValues
EndFunc   ;==>__HTML_GetAllAttr

; #FUNCTION# ===================================================================
; Name ..........: _HTML_GetImageSrc
; Description ...: Returns the img-src of the
;                  specified image
; AutoIt Version : V3.3.0.0
; Syntax ........: _HTML_GetImageSrc($sHTML, $sValue[, $sAttribute = "id"[, $iIndex = 0]])
; Parameter(s): .: $sHTML       - HTML-Source
;                  $sValue      - The value of the attribute
;                  $sAttribute  - Optional: (Default = "id") : The attribute of the image
;                  $iIndex      - Optional: (Default = 0) :
; Return Value ..: Success      - img-src
;                  Failure      - empty string
;                  @ERROR       - 1
; Author(s) .....: Thorsten Willert
; Date ..........: Wed Jan 06 20:52:52 CET 2010
; ==============================================================================
Func _HTML_GetImageSrc($sHTML, $sValue, $sAttribute = "id", $iIndex = 0)
	; Thin wrapper: read "src" from the <img> tag selected by $sAttribute/$sValue.
	Local $sSrc = _HTML_Get($sHTML, "img", "src", $sValue, $sAttribute, $iIndex)
	Local $iErr = @error
	Return SetError($iErr, 0, $sSrc)
EndFunc   ;==>_HTML_GetImageSrc

; #FUNCTION# ===================================================================
; Name ..........: _HTML_GetLink
; Description ...: Returns the href of the specified link
; AutoIt Version : V3.3.0.0
; Syntax ........: _HTML_GetLink($sHTML, $sValue[, $sAttribute = "id"[, $iIndex = 0]])
; Parameter(s): .: $sHTML       - HTML-Source
;                  $sValue      - The value of the attribute
;                  $sAttribute  - Optional: (Default = "id") : Attribute of the link
;                  $iIndex      - Optional: (Default = 0) :
; Return Value ..: Success      - href
;                  Failure      - empty string
;                  @ERROR       - 1
; Author(s) .....: Thorsten Willert
; Date ..........: Wed Jan 06 20:52:56 CET 2010
; ==============================================================================
Func _HTML_GetLink($sHTML, $sValue, $sAttribute = "id", $iIndex = 0)
	; Thin wrapper: read "href" from the <a> tag selected by $sAttribute/$sValue.
	Local $sHref = _HTML_Get($sHTML, "a", "href", $sValue, $sAttribute, $iIndex)
	Local $iErr = @error
	Return SetError($iErr, 0, $sHref)
EndFunc   ;==>_HTML_GetLink

; #FUNCTION# ===================================================================
; Name ..........: _HTML_GetTable
; Description ...: Returns a HTML-table as 2-dim array.
; AutoIt Version : V3.3.2.0
; Syntax ........: _HTML_GetTable($sHTML[, $sValue = ""[, $sAttribute = "id"[, $iIndex = 0[, $iFilter = 30]]]])
; Parameter(s): .: $sHTML       - HTML-source
;                  $sValue      - Optional: (Default = "") : value of $sAttribute of the table
;                  $sAttribute  - Optional: (Default = "id") : attribute of the table
;                  $iIndex      - Optional: (Default = 0) : index of the table
;                  $iFilter     - Optional: (Default = 30) : filter applied to every cell
;                  - 0 = no filter
;                  - 1 = removes non ascii characters
;                  - 2 = removes all double whitespaces
;                  - 4 = removes all double linefeeds
;                  - 8 = removes all html-tags
;                  - 16 = simple html-tag / entities convertor
; Return Value ..: Success      - array [rows][widest row]; cells a row does
;                                 not have stay empty
;                  Failure      - array with one empty element
;                  @ERROR       - 1
; Author(s) .....: Thorsten Willert
; Date ..........: Thu Feb 24 22:51:43 CET 2010
; Link ..........:
; Related .......:
; Example .......: No
; ==============================================================================
Func _HTML_GetTable($sHTML, $sValue = "", $sAttribute = "id", $iIndex = 0, $iFilter = 30)
	Local $aRet[1][1]

	; Fetch the unfiltered inner HTML of the table; cells are filtered below.
	$sHTML = _HTML_GetText($sHTML, "table", $sValue, $sAttribute, $iIndex, 0)
	If @error Then Return SetError(1, 0, $aRet)

	; NOTE(review): the row and cell patterns found here ended in '(.*?)'
	; without a closing tag, so every capture was empty. The closing tags
	; are reconstructed. The look-ahead keeps "<tr"/"<th" from matching
	; "<track"/"<thead".
	Local $aRows = StringRegExp($sHTML, '(?is)<tr(?=[\s>])[^>]*>(.*?)</tr\s*>', 3)
	If @error Then Return SetError(1, 0, $aRet)

	Local $iRows = UBound($aRows), $iCols = 1
	Local $aCells, $iCells, $sCell
	ReDim $aRet[$iRows][$iCols]

	For $j = 0 To $iRows - 1
		$aCells = StringRegExp($aRows[$j], '(?is)<(?:td|th)(?=[\s>])[^>]*>(.*?)</(?:td|th)\s*>', 3)
		; A row without cells stays empty instead of aborting the table.
		If @error Then ContinueLoop

		; Only ever grow the array: shrinking it to a shorter row would
		; drop cells already stored for wider rows.
		$iCells = UBound($aCells)
		If $iCells > $iCols Then
			$iCols = $iCells
			ReDim $aRet[$iRows][$iCols]
		EndIf

		For $k = 0 To $iCells - 1
			; __HTML_Filter takes its string ByRef, so hand it a plain variable.
			$sCell = $aCells[$k]
			$aRet[$j][$k] = StringStripWS(__HTML_Filter($sCell, $iFilter), 3)
		Next
	Next

	Return $aRet
EndFunc   ;==>_HTML_GetTable

; #FUNCTION# ===================================================================
; Name ..........: _HTML_GetText
; Description ...:
; AutoIt Version : V3.3.0.0
; Syntax ........: _HTML_GetText($sHTML, $sTag, $sValue[, $sAttribute = "id"[, $iIndex = 0[, $iFilter = 30]]])
; Parameter(s): .: $sHTML       - HTML-Source
;                  $sTag        - HTML-tag
;                  $sValue      - Optional: (Default = "") : value of this attribute ($_HTML_SEARCHMODE)
;                  $sAttribute  - Optional: (Default = "id") : attribute in this tag
;                  $iIndex      - Optional:
;                  (Default = 0) : index of the tag
;                  $iFilter     - Optional: (Default = 30) : String filter (you can add them)
;                  - 0 = no filter
;                  - 1 = removes non ascii characters
;                  - 2 = removes all double whitespaces
;                  - 4 = removes all double linefeeds
;                  - 8 = removes all html-tags
;                  - 16 = simple html-tag / entities convertor
; Return Value ..: Success      - string (content between the opening tag and
;                                 the next matching closing tag)
;                  Failure      - empty string
;                  @ERROR       - 1
; Remarks .......: Nested tags of the same name are not balanced; the content
;                  ends at the first closing tag.
; Author(s) .....: Thorsten Willert
; Date ..........: Wed Jan 27 20:26:32 CET 2010
; ==============================================================================
Func _HTML_GetText($sHTML, $sTag, $sValue = "", $sAttribute = "id", $iIndex = 0, $iFilter = 30)
	If $iIndex < 0 Then Return SetError(1, 0, "")

	; NOTE(review): the patterns found here ended in '(.*?)' with nothing
	; after it, which always captures an empty string. The closing tag is
	; reconstructed. "(?s)" inside the group lets only the content span
	; line breaks.
	Local Const $sContent = '((?s).*?)</' & $sTag & '\s*>'
	Local $sE1
	If $sValue And $sAttribute Then
		; Two groups per match (quote character, content): the content of
		; match n is element n * 2 + 1 of the result.
		$iIndex = $iIndex * 2 + 1
		; [^>] keeps the search for the attribute inside the opening tag.
		$sE1 = '(?i)<' & $sTag & '\s[^>]*?(?<![\w\-])' & $sAttribute & '\s*=\s*("|''|)' & __HTML_Search($sValue) & '\1[^>]*>' & $sContent
	Else
		; The look-ahead keeps e.g. "<p" from matching "<pre".
		$sE1 = '(?i)<' & $sTag & '(?=[\s>])[^>]*>' & $sContent
	EndIf

	ConsoleWrite("_HTML_GetText: " & $sE1 & @CRLF)

	Local $aHits = StringRegExp($sHTML, $sE1, 3)
	If @error = 2 Then
		ConsoleWriteError("_HTML_GetText: Error in expression: " & $sE1 & @CRLF)
		Return SetError(1, 0, "")
	EndIf

	Local $iE = UBound($aHits)
	If $iE = 0 Or $iIndex >= $iE Then Return SetError(1, 0, "")

	; An array element cannot be modified through a ByRef parameter, so the
	; text is copied to a variable and the filter's return value is used.
	Local $sText = $aHits[$iIndex]
	If $iFilter Then $sText = __HTML_Filter($sText, $iFilter)
	Return $sText
EndFunc   ;==>_HTML_GetText

; #FUNCTION# ===================================================================
; Name ..........: _HTML_GetURLVar
; Description ...:
; AutoIt Version : V3.3.0.0
; Syntax ........: _HTML_GetURLVar($sHTML, $sVar, $sValue[, $sAttribute = "id"[, $iIndex = 0]])
; Parameter(s): .: $sHTML       - HTML-source
;                  $sVar        - the variable in the URL
;                  $sValue      - the value of the attribute in $sMode
;                  $sAttribute  - Optional: (Default = "id") : attribute of the link
;                  $iIndex      - Optional: (Default = 0) : index for the attribute
; Return Value ..: Success      - string
;                  Failure      - empty string
;                  @ERROR       - (see help-file: StringRegExp)
; Author(s) .....: Thorsten Willert
; Date ..........: Fri Dec 25 10:26:26 CET 2009
;
; ==============================================================================
Func _HTML_GetURLVar($sHTML, $sVar, $sValue, $sAttribute = "id", $iIndex = 0)
	; Find the href of the selected link, then pull the variable out of it.
	Local $sURL = _HTML_Get($sHTML, "a", "href", $sValue, $sAttribute, $iIndex)
	If @error Then Return SetError(@error, @extended, "")

	Local $sVarValue = _HTML_ExtractURLVar($sURL, $sVar)
	Return SetError(@error, @extended, $sVarValue)
EndFunc   ;==>_HTML_GetURLVar

; #FUNCTION# ===================================================================
; Name ..........: _HTML_ImageSave
; Description ...: Downloads the specified image of the source-code to disk.
; AutoIt Version : V3.3.2.0
; Syntax ........: _HTML_ImageSave($sHTML, $sValue[, $sAttribute = "id"[, $iIndex = 0[, $sBaseURL = ""[, $sDestDir = @SCRIPTDIR[, $sDestFile = ""]]]]])
; Parameter(s): .: $sHTML       - HTML-source
;                  $sValue      - value of $sAttribute
;                  $sAttribute  - Optional: (Default = "id") : attribute of the image
;                  $iIndex      - Optional: (Default = 0) : index of the attribute
;                  $sBaseURL    - Optional: (Default = "") : base url of the image (if there is no full path in the src)
;                  $sDestDir    - Optional: (Default = @SCRIPTDIR) : directory where the image is saved
;                  $sDestFile   - Optional: (Default = "") : file name (default is the original name)
; Return Value ..: Success      - 1
;                  Failure      - 0
;                  @ERROR       - 1 = image / src not found
;                                 2 = src is not absolute and no base URL is known
;                                 3 = no file name could be derived from the src
;                                 otherwise @error of InetGet
; Author(s) .....: Thorsten Willert
; Date ..........: Fri Jan 22 10:58:46 CET 2010
; Link ..........:
; Related .......:
; Remarks .......: You can not use it, if relative paths are used in the image-src
; Example .......: Yes
#cs
	#include <_html.au3>
	#include <Inet.au3>
	$HTML = _INetGetSource("http://autoit.de/index.php?page=Portal")
	_HTML_ImageSave($HTML, "registerS.png", "src", 0, "www.autoit.de", "c:\\")
#ce
; ==============================================================================
Func _HTML_ImageSave($sHTML, $sValue, $sAttribute = "id", $iIndex = 0, $sBaseURL = "", $sDestDir = @ScriptDir, $sDestFile = "")
	Local $sSrc = _HTML_GetImageSrc($sHTML, $sValue, $sAttribute, $iIndex)
	; Without a src there is nothing to download.
	If @error Or $sSrc = "" Then Return SetError(1, 0, 0)

	; An absolute src is used as it is; only other paths need the base URL.
	Local $sURL = $sSrc
	If Not StringRegExp($sSrc, '(?i)^(?:http|ftp)s?://') Then
		; Read the href of the <base> tag, selected by the presence of href.
		If Not $sBaseURL Then $sBaseURL = _HTML_Get($sHTML, "base", "href", "", "href")
		If Not $sBaseURL Then Return SetError(2, 0, 0)

		If Not StringRegExp($sBaseURL, '^(?:http|ftp)s?://') Then $sBaseURL = "http://" & $sBaseURL

		; Join base and src with exactly one slash.
		Local $bBaseSlash = (StringRight($sBaseURL, 1) = "/")
		Local $bSrcSlash = (StringLeft($sSrc, 1) = "/")
		If $bBaseSlash And $bSrcSlash Then
			$sBaseURL = StringTrimRight($sBaseURL, 1)
		ElseIf Not $bBaseSlash And Not $bSrcSlash Then
			$sBaseURL &= "/"
		EndIf
		$sURL = $sBaseURL & $sSrc
	EndIf

	If Not FileExists($sDestDir) Then $sDestDir = @ScriptDir
	If Not $sDestFile Then
		; Derive the file name from the path of the src: drop query and
		; fragment first, then everything up to the last slash.
		Local $sName = StringRegExpReplace($sSrc, '[?#].*$', "")
		$sName = StringRegExpReplace($sName, '^.*/', "")
		If $sName = "" Then Return SetError(3, 0, 0)
		If StringRight($sDestDir, 1) <> "\" Then $sDestDir &= "\"
		$sDestFile = $sDestDir & $sName
	EndIf

	If InetGet($sURL, $sDestFile, 1) Then
		ConsoleWrite("_HTML_ImageSave:" & @CRLF & "from:" & @TAB & $sURL & @CRLF & "to:" & @TAB & $sDestFile & @CRLF)
		Return FileExists($sDestFile)
	EndIf
	Return SetError(@error, 0, 0)
EndFunc   ;==>_HTML_ImageSave

; #FUNCTION# ===================================================================
; Name ..........: _HTML_Search
; Description ...: Searches only in the text of the HTML-source
; AutoIt Version : V3.3.0.0
; Syntax ........: _HTML_Search($sHTML, $sSearch)
; Parameter(s): .: $sHTML       - HTML-source
;                  $sSearch     - the string to search ($_HTML_SEARCHMODE)
; Return Value ..: Success      - 1
;                  Failure      - 0
; Author(s) .....: Thorsten Willert
; Date ..........: Wed Jan 06 21:19:29 CET 2010
; ==============================================================================
Func _HTML_Search($sHTML, $sSearch)
	; Remove all tags first so that only the visible text is searched.
	Local $sText = StringRegExpReplace($sHTML, '<[^>]*>', "")
	Return StringRegExp($sText, __HTML_Search($sSearch))
EndFunc   ;==>_HTML_Search

; #INTERNAL_USE_ONLY# ==========================================================
; Name ..........: __HTML_RegExMask
; Description ...: Masks regular expression meta characters with a backslash
; Parameter(s): .: $s           - string to mask
; Return Value ..: the masked string
; ==============================================================================
Func __HTML_RegExMask($s)
	; One character per match: a repeated group would keep only the last
	; character of a run (".." would become "\."). "?", "^" and "|" are
	; masked as well.
	Return StringRegExpReplace($s, '([\\.^$|?*+()\[\]{}\-])', '\\$1')
EndFunc   ;==>__HTML_RegExMask

; #INTERNAL_USE_ONLY# ==========================================================
; Name ..........: __HTML_Search
; Description ...: Turns a search value into a pattern according to
;                  $_HTML_SEARCHMODE
; Parameter(s): .: $s           - search value
; Return Value ..: pattern fragment; '.*?' is always returned unchanged
; ==============================================================================
Func __HTML_Search($s)
	; "==" compares as strings; with "=" a numeric 0 would be equal to '.*?'.
	If $s == '.*?' Then Return $s

	Switch $_HTML_SEARCHMODE
		Case 1
			Return '.*?' & $s & '.*?'
		Case 2
			Return __HTML_RegExMask($s)
		Case 3
			Return '.*?' & __HTML_RegExMask($s) & '.*?'
		Case Else ; mode 0 and unknown modes: use the value as it is
			Return $s
	EndSwitch
EndFunc   ;==>__HTML_Search

; #INTERNAL_USE_ONLY# ==========================================================
; Name ..........: __HTML_Filter
; Description ...: Filter for strings
; AutoIt Version : V3.3.0.0
; Syntax ........: __HTML_Filter(ByRef $sString[, $iMode = 0])
; Parameter(s): .: $sString     - String to filter (is modified in place, too)
;                  $iMode       - Optional: (Default = 0) : removes nothing;
;                                 the values can be added
;                  - 0 = no filter
;                  - 1 = removes non ascii characters
;                  - 2 = removes all double whitespaces
;                  - 4 = removes all double linefeeds
;                  - 8 = removes all html-tags
;                  - 16 = simple html-tag / entities convertor
; Return Value ..: Success      - Filtered String
;                  Failure      - Input String
; Remarks .......: The filters run in the order 16, 8, 4, 2, 1.
; Author(s) .....: Thorsten Willert, Stephen Podhajecki {gehossafats at netmdc. com} _ConvertEntities
; Date ..........: Wed Jan 27 20:49:59 CET 2010
; ==============================================================================
Func __HTML_Filter(ByRef $sString, $iMode = 0)
	If $iMode = 0 Then Return $sString

	; 16 simple HTML tag / entities converter
	If BitAND($iMode, 16) Then
		; NOTE(review): both tag patterns found here were empty apart from
		; '(?i)'; the second one contained a raw line break, which points
		; to <br>. <p> is assumed for the first one - confirm.
		$sString = StringRegExpReplace($sString, '(?i)<p(?=[\s/>])[^>]*>', @CRLF & @CRLF)
		$sString = StringRegExpReplace($sString, '(?i)<br(?=[\s/>])[^>]*>', @CRLF)

		; Numeric references first, "&amp;" last: that way a text such as
		; "&amp;lt;" or "&amp;#65;" is decoded exactly once.
		If StringInStr($sString, "&#") Then
			For $x = 32 To 255
				$sString = StringReplace($sString, "&#" & $x & ";", Chr($x))
			Next
		EndIf

		; NOTE(review): the entity table found here held already decoded
		; characters (and the invalid literal """), so the names are
		; rebuilt from the standard Latin-1 entity set, code points 161 to
		; 255 in order. One further whitespace entry mapped to Chr(3) could
		; not be identified and is left out.
		Local $aNames = StringSplit( _
				"iexcl|cent|pound|curren|yen|brvbar|sect|uml|copy|ordf|laquo|not|shy|reg|macr|" & _
				"deg|plusmn|sup2|sup3|acute|micro|para|middot|cedil|sup1|ordm|raquo|" & _
				"frac14|frac12|frac34|iquest|" & _
				"Agrave|Aacute|Acirc|Atilde|Auml|Aring|AElig|Ccedil|Egrave|Eacute|Ecirc|Euml|" & _
				"Igrave|Iacute|Icirc|Iuml|ETH|Ntilde|Ograve|Oacute|Ocirc|Otilde|Ouml|times|" & _
				"Oslash|Ugrave|Uacute|Ucirc|Uuml|Yacute|THORN|szlig|" & _
				"agrave|aacute|acirc|atilde|auml|aring|aelig|ccedil|egrave|eacute|ecirc|euml|" & _
				"igrave|iacute|icirc|iuml|eth|ntilde|ograve|oacute|ocirc|otilde|ouml|divide|" & _
				"oslash|ugrave|uacute|ucirc|uuml|yacute|thorn|yuml", "|", 2)
		For $x = 0 To UBound($aNames) - 1
			; Entity names are case sensitive (&Agrave; is not &agrave;).
			$sString = StringReplace($sString, "&" & $aNames[$x] & ";", ChrW(161 + $x), 0, 1)
		Next

		$sString = StringReplace($sString, "&nbsp;", Chr(32), 0, 1)
		$sString = StringReplace($sString, "&quot;", Chr(34), 0, 1)
		$sString = StringReplace($sString, "&lt;", Chr(60), 0, 1)
		$sString = StringReplace($sString, "&gt;", Chr(62), 0, 1)
		$sString = StringReplace($sString, "&amp;", Chr(38), 0, 1)
	EndIf

	; 8 Tag filter
	If BitAND($iMode, 8) Then
		$sString = StringRegExpReplace($sString, "<[^>]*>", "")
	EndIf

	; 4 remove all double cr, lf
	If BitAND($iMode, 4) Then
		$sString = StringRegExpReplace($sString, "([ \t]*[\n\r]+[ \t]*)", @CRLF)
		$sString = StringRegExpReplace($sString, "[\n\r]+", @CRLF)
	EndIf

	; 2 remove all double whitespaces
	If BitAND($iMode, 2) Then
		$sString = StringRegExpReplace($sString, "[[:blank:]]+", " ")
		; Drop blanks after and before a line break, keeping the break
		; itself (a preceding CR must not be doubled, lines must not be
		; joined).
		$sString = StringRegExpReplace($sString, "(\r?\n)[[:blank:]]+", "$1")
		$sString = StringRegExpReplace($sString, "[[:blank:]]+(?=\r?\n)", "")
	EndIf

	; 1 remove all non ASCII
	If BitAND($iMode, 1) Then
		$sString = StringRegExpReplace($sString, "[^\x00-\x7F]", " ")
	EndIf

	Return $sString
EndFunc   ;==>__HTML_Filter