Jump to content

Scrape Google Search results using Google APIs


Recommended Posts

Has anyone tried to use Google APIs for scraping search results?

I've built this simple script to demonstrate my problem I'm having with Google results scraping:

#include <Array.au3>

Global $oHTTP = ObjCreate("WinHttp.WinHttpRequest.5.1")

_PerformSearch();

Func _PerformSearch()
    dim $ShowResults[0][3];
    $searchPages = 3

    for $j = 1 to $searchPages*8 Step 8
        $SearchString = 'Apple+Juice';              Disable this line...
;~      $SearchString = 'intitle:"crazy+stink"';    ...And enable this one
        ;http://ajax.googleapis.com/ajax/services/search/web?v=1.0&start=1&rsz=large&q=intitle:%22crazy+stink%22
        $sURL = "http://ajax.googleapis.com/ajax/services/search/web?v=1.0&start=" & $j & "&rsz=large&q=" & $SearchString
        $oHTTP.Open("GET", $sURL, False)
        $oHTTP.SetRequestHeader("Referer", @IPAddress1)
        If (@error) Then Return SetError(1, 0, 0)
        $oHTTP.Send()
        If (@error) Then Return SetError(2, 0, 0)
        $retVal = $oHTTP.ResponseText
        If (@error) Then Return SetError(3, 0, 0)

        $aReturn = _JSON_Decode($retVal)
        if NOT @error then
            $responseData = $aReturn[0][1]
            $results = $responseData[0][1]
            for $i = 0 to UBound($results)-1
                $oneResult = $results[$i]
                $title = _OnlyBoldedDecode(_getJSonValue($oneResult, "title"))
                $url = _getJSonValue($oneResult, "url")
                $content = _OnlyBoldedDecode(_getJSonValue($oneResult, "content"))

                ReDim $ShowResults[UBound($ShowResults)+1][3]
                $arIndex = UBound($ShowResults)-1
                $ShowResults[$arIndex][0] = $title
                $ShowResults[$arIndex][1] = $url
                $ShowResults[$arIndex][2] = $content
            Next
        EndIf
    Next
    _ArrayDisplay($ShowResults);
EndFunc

Func _OnlyBoldedDecode($sData);decoding only most common code
    Return StringReplace(StringReplace($sData, "\u003c", "<"), "\u003e", ">");
EndFunc

Func _getJSonValue($_res, $getData)
    for $i = 0 to UBound($_res)-1
        if $_res[$i][0] == $getData then Return $_res[$i][1]
    Next
    Return "";
EndFunc

Func _JSON_Decode($sString)
    Local $iIndex, $aVal, $sOldStr = $sString, $b

    $sString = StringStripCR(StringStripWS($sString, 7))
    If Not StringRegExp($sString, "(?i)^\{.+}$") Then Return SetError(1, 0, 0)
    Local $aArray[1][2], $iIndex = 0
    $sString = StringMid($sString, 2)

    Do
        $b = False

        $aVal = StringRegExp($sString, '^"([^"]+)"\s*:\s*(["{[]|[-+]?\d+(?:(?:\.\d+)?[eE][+-]\d+)?|true|false|null)', 2) ; Get value & next token
        If @error Then
            ConsoleWrite("!> StringRegExp Error getting next Value." & @CRLF)
            ConsoleWrite($sString & @CRLF)
            $sString = StringMid($sString, 2) ; maybe it works when the string is trimmed by 1 char from the left ?
            ContinueLoop
        EndIf

        $aArray[$iIndex][0] = $aVal[1] ; Key
        $sString = StringMid($sString, StringLen($aVal[0]))

        Switch $aVal[2] ; Value Type (Array, Object, String) ?
            Case '"' ; String
                ; Value -> Array subscript. Trim String after that.

                $aArray[$iIndex][1] = StringMid($sString, 2, StringInStr($sString, """", 1, 2) - 2)
                $sString = StringMid($sString, StringLen($aArray[$iIndex][1]) + 3)

                ReDim $aArray[$iIndex + 2][2]
                $iIndex += 1

            Case '{' ; Object
                ; Recursive function call which will decode the object and return it.
                ; Object -> Array subscript. Trim String after that.

                $aArray[$iIndex][1] = _JSON_Decode($sString)
                $sString = StringMid($sString, @extended + 2)
                If StringLeft($sString, 1) = "," Then $sString = StringMid($sString, 2)

                $b = True
                ReDim $aArray[$iIndex + 2][2]
                $iIndex += 1

            Case '[' ; Array
                ; Decode Array
                $sString = StringMid($sString, 2)
                Local $aRet[1], $iArIndex = 0 ; create new array which will contain the Json-Array.

                Do
                    $sString = StringStripWS($sString, 3) ; Trim Leading & trailing spaces
                    $aNextArrayVal = StringRegExp($sString, '^\s*(["{[]|\d+(?:(?:\.\d+)?[eE]\+\d+)?|true|false|null)', 2)
                    if @error Then Return SetError(@error, 0, 0);
                    Switch $aNextArrayVal[1]
                        Case '"' ; String
                            ; Value -> Array subscript. Trim String after that.
                            $aRet[$iArIndex] = StringMid($sString, 2, StringInStr($sString, """", 1, 2) - 2)
                            $sString = StringMid($sString, StringLen($aRet[$iArIndex]) + 3)

                        Case "{" ; Object
                            ; Recursive function call which will decode the object and return it.
                            ; Object -> Array subscript. Trim String after that.
                            $aRet[$iArIndex] = _JSON_Decode($sString)
                            $sString = StringMid($sString, @extended + 2)

                        Case "["
                            MsgBox(0, "", "Array in Array. WTF is up with this JSON shit?")
                            MsgBox(0, "", "This should not happen! Please post this!")
                            Exit 0xDEADBEEF

                        Case Else
                            ConsoleWrite("Array Else (maybe buggy?)" & @CRLF)
                            $aRet[$iArIndex] = $aNextArrayVal[1]
                    EndSwitch

                    ReDim $aRet[$iArIndex + 2]
                    $iArIndex += 1

                    $sString = StringStripWS($sString, 3) ; Leading & trailing
                    If StringLeft($sString, 1) = "]" Then ExitLoop
                    $sString = StringMid($sString, 2)
                Until False

                $sString = StringMid($sString, 2)
                ReDim $aRet[$iArIndex]
                $aArray[$iIndex][1] = $aRet

                ReDim $aArray[$iIndex + 2][2]
                $iIndex += 1

            Case Else ; Number, bool
                ; Value (number (int/flaot), boolean, null) -> Array subscript. Trim String after that.
                $aArray[$iIndex][1] = $aVal[2]
                ReDim $aArray[$iIndex + 2][2]
                $iIndex += 1
                $sString = StringMid($sString, StringLen($aArray[$iIndex][1]) + 2)
        EndSwitch

        If StringLeft($sString, 1) = "}" Then
            StringMid($sString, 2)
            ExitLoop
        EndIf
        If Not $b Then $sString = StringMid($sString, 2)
    Until False

    ReDim $aArray[$iIndex][2]
    Return SetError(0, StringLen($sOldStr) - StringLen($sString), $aArray)
EndFunc   ;==>_JSON_Decode

This works as long as you're not using "intelligent search placeholders" like using "intitle", "inurl", "site", and other placeholders with sentences (single word works, like: intitle:cake, but with sentence like: intitle:"crazy+stink" it doesn't, while searching this on google will give you approx. 35 results:    https://www.google.com/search?q=intitle:"crazy+stink"     )

Has anyone found a better way to legally scrape Google? This JSON API was built to be free, without big limitations (max results you get from a single query is 64), but it's not working properly, it doesn't give me results on "intelligent search placeholders". I'm aware of the Google Custom Search API, which requires API Key (which I have) but this API can search only specific website, and I need to scrape results from Google's search results.

Any thoughts, suggestions, ideas?

Edit: July 4th 2014:

I have found a way how to use Google Custom Search API with API Key, and still search entire web (instead of only single page). I have found this: https://support.google.com/customsearch/answer/2631040?hl=en and I have followed the instructions. I got my CX code, and I formatted the URL:

https://www.googleapis.com/customsearch/v1?key=[MY_API_KEY]&cx=017576662512468239146:omuauf_lfve&q=intitle:%22crazy+stink%22

(the CX in this example is the one that Google provides as an example for the API here: https://developers.google.com/custom-search/json-api/v1/using_rest, however, even with my own CX I get the same results)
Here are the results:

{
 "kind": "customsearch#search",
 "url": {
  "type": "application/json",
  "template": "https://www.googleapis.com/customsearch/v1?q={searchTerms}&num={count?}&start={startIndex?}&lr={language?}&safe={safe?}&cx={cx?}&cref={cref?}&sort={sort?}&filter={filter?}&gl={gl?}&cr={cr?}&googlehost={googleHost?}&c2coff={disableCnTwTranslation?}&hq={hq?}&hl={hl?}&siteSearch={siteSearch?}&siteSearchFilter={siteSearchFilter?}&exactTerms={exactTerms?}&excludeTerms={excludeTerms?}&linkSite={linkSite?}&orTerms={orTerms?}&relatedSite={relatedSite?}&dateRestrict={dateRestrict?}&lowRange={lowRange?}&highRange={highRange?}&searchType={searchType}&fileType={fileType?}&rights={rights?}&imgSize={imgSize?}&imgType={imgType?}&imgColorType={imgColorType?}&imgDominantColor={imgDominantColor?}&alt=json"
 },
 "queries": {
  "request": [
   {
    "title": "Google Custom Search - intitle:\"crazy stink\"",
    "totalResults": "0",
    "searchTerms": "intitle:\"crazy stink\"",
    "count": 10,
    "inputEncoding": "utf8",
    "outputEncoding": "utf8",
    "safe": "off",
    "cx": "017576662512468239146:omuauf_lfve"
   }
  ]
 },
 "searchInformation": {
  "searchTime": 0.35068,
  "formattedSearchTime": "0.35",
  "totalResults": "0",
  "formattedTotalResults": "0"
 }
}

The results are almost the same as I get them from AJAX JSON Api http://ajax.googleapis.com/ajax/services/search/web?v=1.0&start=1&rsz=large&q=intitle:%22crazy+stink%22:

{"responseData": {"results":[],"cursor":{"moreResultsUrl":"http://www.google.com/search?oe\u003dutf8\u0026ie\u003dutf8\u0026source\u003duds\u0026start\u003d1\u0026hl\u003den\u0026q\u003dintitle:%22crazy+stink%22","searchResultTime":"0.10"}}, "responseDetails": null, "responseStatus": 200}

Which is 0.

So... maybe there isn't any error on my part, but there is on Google's?

I'm just curious if anyone encountered an issue like the one I have, or if anyone have any better suggestion, but bare in mind that I want to keep this legal (scraping results from IE object is not something I want to do).

Edited by dragan
Link to comment
Share on other sites

Create an account or sign in to comment

You need to be a member in order to leave a comment

Create an account

Sign up for a new account in our community. It's easy!

Register a new account

Sign in

Already have an account? Sign in here.

Sign In Now
 Share

×
×
  • Create New...