parsing tables from raw HTML

Gianni · February 16, 2015

I'm back on this topic again.

Here is a first, nearly complete raw listing that I came up with (and am still working on). It seems to work quite well, although only a little error checking is implemented so far.
It can extract tables from raw HTML, taking care of rowspan and colspan as well.
It can extract every single table from the HTML, even when tables are nested.
Nested tables are extracted individually (that is, the lower-level tables contained in the extracted table are not extracted together with it; in this way you can extract each wanted table on its own).

Could someone please provide links to some web pages containing complex and nested tables, just to do some testing?
Any help and suggestions are appreciated.
Thanks

#include <Array.au3>
;
; -----------------------------------------------------------------------------------------
; Demo: downloads a sample page, extracts its first table and shows it with _ArrayDisplay.
; -----------------------------------------------------------------------------------------
Local $sSourcePage = "http://html.cita.illinois.edu/nav/dtable/dtable-example-complex.php"
; the page is opened in the default browser only so that the table can be compared by eye
ShellExecute($sSourcePage)
Sleep(3000) ; short pause before the message box (presumably to let the browser window show up)
MsgBox(64, "info", "This web page is opened just to show you the content of the table" & @CRLF & "otherwise it is not necessary")

Local $sHtml, $iWantedTable, $aResult
; read the raw HTML directly from the web server
Local $dPage = InetRead($sSourcePage)
If @error Then ; nothing was downloaded: there is no HTML to parse
    MsgBox(16, "error", "Unable to download the page:" & @CRLF & $sSourcePage)
    Exit 1
EndIf
$sHtml = BinaryToString($dPage)
;
; extract the first table from the HTML
$iWantedTable = 1
$aResult = ExtractTable($sHtml, $iWantedTable)
If @error Then
    ; save both macros at once: any function call would reset them
    Local $iError = @error, $iExtended = @extended
    MsgBox(16, "error", "ExtractTable failed." & @CRLF & "@error = " & $iError & ", @extended = " & $iExtended)
    Exit 2
EndIf
;
_ArrayDisplay($aResult)
Exit


Func ExtractTable($sHtml, $iWantedTable, $bExtractTH = True, $bFillSpan = False)
    ;
    ; Extracts one <table> of a raw HTML string into a zero-based 2D array of cell texts
    ; (all tags inside a cell are stripped), taking care of rowspan and colspan.
    ;
    ; $sHtml:           the raw HTML that contains the table(s)
    ; $iWantedTable:    the 1-based nr. of the table to extract from the HTML
    ;                   (tables are numbered in the order of their opening tags)
    ; $bExtractTH:      True  = header cells (<th>) are extracted like normal cells (<td>)
    ;                   False = header cells are ignored
    ; $bFillSpan:       True  = the cell text is copied into every cell of its span area
    ;                   False = the text is stored only in the top-left cell of the span area
    ;
    ; Success: returns the 2D array [rows][columns]
    ; Failure: returns "" and sets @error:
    ;          1, 2, 3 = error reported by ParseTags for the tables, the rows or the cells
    ;          4       = $iWantedTable is out of range (@extended = nr. of tables in this HTML)
    ;          5       = the wanted table does not contain any cell
    ;
    ; This will find all tables in the HTML page (even if nested)
    Local $aTables = ParseTags($sHtml, "<table", "</table>")
    If @error Then Return SetError(@error, 0, "")
    ; element [0] holds the nr. of tables, so valid table numbers start from 1
    If $iWantedTable < 1 Or $iWantedTable > $aTables[0] Then Return SetError(4, $aTables[0], "")
    ;
    If $bExtractTH Then ; extract also TableHeaders as normal data?
        ; th becomes td. The lookahead keeps <thead ...> untouched: a plain replacement of "<th"
        ; would turn it into "<tdead", which would then be counted as an unclosed <td cell
        $aTables[$iWantedTable] = StringRegExpReplace($aTables[$iWantedTable], "(?i)<(/?)th(?=[\s>/])", "<${1}td")
    EndIf
    ;
    ; rows of the wanted table
    Local $aRows = ParseTags($aTables[$iWantedTable], "<tr", "</tr>") ; $aRows[0] = nr. of rows
    If @error Then Return SetError(@error, 0, "")
    ;
    ; cells of each row: $aCols[$i] holds the array of cells of row $i
    Local $iRowCount = $aRows[0], $iColCount = 0, $iErr
    Local $aCols[$iRowCount + 1], $aCells
    Local $aNoCells[1] = [0] ; stands for a row without cells
    For $i = 1 To $iRowCount
        $aCells = ParseTags($aRows[$i], "<td", "</td>")
        $iErr = @error
        If $iErr = 1 Then ; no cell in this row (e.g. a header-only row when $bExtractTH is False)
            $aCells = $aNoCells
        ElseIf $iErr Then ; cells of this row are not balanced
            Return SetError($iErr, 0, "")
        EndIf
        If $iColCount < $aCells[0] Then $iColCount = $aCells[0] ; max nr. of cells found in a row
        $aCols[$i] = $aCells
    Next
    If $iColCount = 0 Then Return SetError(5, 0, "") ; nothing to return (and a 0 sized array is not allowed)
    ;
    Local $aResult[$iRowCount][$iColCount]
    Local $aMirror[$iRowCount][$iColCount] ; occupancy map: a non empty element marks an already used cell
    Local $iMarkerCode = 0 ; code that marks a span area or a single cell in $aMirror
    Local $aSpan, $iSpanY, $iSpanX, $iY, $iCol, $iNeeded, $sText
    For $i = 1 To $iRowCount ;        scan all rows in this table
        $aCells = $aCols[$i] ; <td ..> xx </td> .....
        $iY = $i - 1 ; zero based index for vertical ref
        For $ii = 1 To $aCells[0] ;   scan all cells in this row
            ;
            ; nr. of extra rows covered by this cell
            $iSpanY = 0
            $aSpan = StringRegExp($aCells[$ii], "(?i)rowspan\s*=\s*[""']?\s*(\d+)", 1) ; check presence of rowspan
            If IsArray($aSpan) Then $iSpanY = $aSpan[0] - 1
            If $iSpanY < 0 Then $iSpanY = 0 ; rowspan="0" is handled as a single row instead of dropping the cell
            If $iY + $iSpanY > $iRowCount - 1 Then $iSpanY = $iRowCount - 1 - $iY ; a span can't go past the last row
            ;
            ; nr. of extra columns covered by this cell
            $iSpanX = 0
            $aSpan = StringRegExp($aCells[$ii], "(?i)colspan\s*=\s*[""']?\s*(\d+)", 1) ; check presence of colspan
            If IsArray($aSpan) Then $iSpanX = $aSpan[0] - 1
            If $iSpanX < 0 Then $iSpanX = 0 ; colspan="0" is handled as a single column
            ;
            ; search the first free column of this row (columns can be already used by the cells
            ; on the left of this one or by the rowspan of cells of the rows above)
            $iCol = $ii - 1
            While $iCol < UBound($aMirror, 2)
                If Not $aMirror[$iY][$iCol] Then ExitLoop
                $iCol += 1
            WEnd
            ;
            ; make room if the cell (or its colspan) goes past the right border
            $iNeeded = $iCol + $iSpanX + 1
            If $iNeeded > UBound($aMirror, 2) Then
                ReDim $aResult[$iRowCount][$iNeeded]
                ReDim $aMirror[$iRowCount][$iNeeded]
            EndIf
            ;
            $iMarkerCode += 1
            $sText = StringRegExpReplace($aCells[$ii], '<[^>]+>', "") ; the cell content without tags
            For $iSpY = 0 To $iSpanY
                For $iSpX = 0 To $iSpanX
                    $aMirror[$iY + $iSpY][$iCol + $iSpX] = $iMarkerCode ; this cell is now in use
                    If $bFillSpan Then $aResult[$iY + $iSpY][$iCol + $iSpX] = $sText
                Next
            Next
            $aResult[$iY][$iCol] = $sText ; the top-left cell of the area always gets the text
        Next
    Next
    ; _ArrayDisplay($aMirror)
    Return $aResult
EndFunc   ;==>ExtractTable
;
; -----------------------------------------------------------------------------------------
; Returns a 1-based array containing the collection of <tag ...... </tag> blocks found in
; $sHtml, one block in each element; element [0] holds the nr. of blocks.
; Nested blocks of the same tag are returned individually: an inner block is cut out of
; the block that encloses it, so the outer block is returned without its inner blocks.
; Blocks are numbered in the order in which their opening tags appear in $sHtml.
;
; $sHtml:    the HTML to scan
; $sOpening: the beginning of the opening tag, example: '<table'
; $sClosing: the whole closing tag,            example: '</table>'
;
; Failure: returns 0 and sets @error:
;          1 = there are no such tags in this HTML
;          2 = opening and closing tags are not balanced
;          3 = a closing tag was found while no opening tag was pending
; -----------------------------------------------------------------------------------------
Func ParseTags($sHtml, $sOpening, $sClosing) ; example: $sOpening = '<table', $sClosing = '</table>'
    ; it finds how many of such tags are in the HTML
    StringReplace($sHtml, $sOpening, $sOpening) ; in @extended nr. of occurrences
    Local $iNrOfThisTag = @extended
    If Not $iNrOfThisTag Then Return SetError(1, 0, 0) ; there are no such tags in this HTML
    ;
    ; every opening tag needs its own closing tag: for a missing one StringInStr() below would
    ; return 0, and a wrong end position would be used to cut the blocks out of the HTML
    StringReplace($sHtml, $sClosing, $sClosing) ; in @extended nr. of occurrences
    If @extended < $iNrOfThisTag Then Return SetError(2, 0, 0) ; opening and closing tags are not balanced
    ;
    ; $aThisTagsPositions will contain the positions of the opening <tag and closing </tag> tags
    ; [n][0] = position in the HTML (first char of an opening tag, last char of a closing tag)
    ; [n][1] = kind of tag ($sOpening or $sClosing)
    ; [n][2] = occurrence nr. (opening tags only)
    Local $aThisTagsPositions[$iNrOfThisTag * 2 + 1][3] ; 1 based (make room for all open and close tags)
    For $i = 1 To $iNrOfThisTag
        $aThisTagsPositions[$i][0] = StringInStr($sHtml, $sOpening, 0, $i) ; start position of $i^ occurrence of <tag
        $aThisTagsPositions[$i][1] = $sOpening
        $aThisTagsPositions[$i][2] = $i
        $aThisTagsPositions[$iNrOfThisTag + $i][0] = StringInStr($sHtml, $sClosing, 0, $i) + StringLen($sClosing) - 1 ; end position of $i^ occurrence of </tag>
        $aThisTagsPositions[$iNrOfThisTag + $i][1] = $sClosing
    Next
    _ArraySort($aThisTagsPositions, 0, 1) ; now all opening and closing tags are in the same sequence as they appear in the HTML
    ;
    Local $aOpenStack[$iNrOfThisTag + 1], $iDepth = 0 ; indexes (in $aThisTagsPositions) of the opening tags still waiting for their closing tag
    Local $aTags[Ceiling(UBound($aThisTagsPositions) / 2)] ; will contain the collection of <tag ..... </tag> from the html
    Local $iOpenIdx, $iOpenPos, $iClosePos, $iTagNr, $iTagLen
    For $i = 1 To UBound($aThisTagsPositions) - 1
        If $aThisTagsPositions[$i][1] = $sOpening Then ; an opening <tag was found
            $iDepth += 1
            $aOpenStack[$iDepth] = $i
        ElseIf $aThisTagsPositions[$i][1] = $sClosing Then ; a closing </tag> was found
            If Not $iDepth Then Return SetError(3, 0, 0) ; Open/Close mismatch error
            ; pair detected: this closing tag belongs to the last opening tag still pending
            $iOpenIdx = $aOpenStack[$iDepth]
            $iOpenPos = $aThisTagsPositions[$iOpenIdx][0]
            $iClosePos = $aThisTagsPositions[$i][0]
            $iTagNr = $aThisTagsPositions[$iOpenIdx][2]
            ; 1) extract this <tag ..... </tag> from the html to the array
            $aTags[$iTagNr] = StringMid($sHtml, $iOpenPos, 1 + $iClosePos - $iOpenPos)
            ; 2) remove that <tag ..... </tag> from the html
            $sHtml = StringLeft($sHtml, $iOpenPos - 1) & StringMid($sHtml, $iClosePos + 1)
            ; 3) adjust the positions of the remaining tags (all of them are after the removed block)
            $iTagLen = StringLen($aTags[$iTagNr])
            For $ii = $i To UBound($aThisTagsPositions) - 1
                $aThisTagsPositions[$ii][0] -= $iTagLen
            Next
            $iDepth -= 1 ; one pending opening tag less
        EndIf
    Next
    If $iDepth Then Return SetError(2, 0, 0) ; some opening tags have not been closed
    $aTags[0] = $iNrOfThisTag
    Return $aTags ; OK
EndFunc   ;==>ParseTags

Edited February 16, 2015 by Chimp

Sign In

parsing tables from raw HTML

Recommended Posts

Gianni

Link to comment

Share on other sites

Create an account or sign in to comment

Create an account

Sign in

Recently Browsing 0 members

Browse

AutoIt Resources

Release

Beta