Jump to content
Sign in to follow this  

Scrape Google Search results using Google APIs

Recommended Posts


Has anyone tried to use Google APIs for scraping search results?

I've built this simple script to demonstrate my problem I'm having with Google results scraping:

#include <Array.au3>

Global $oHTTP = ObjCreate("WinHttp.WinHttpRequest.5.1")


Func _PerformSearch()
    dim $ShowResults[0][3];
    $searchPages = 3

    for $j = 1 to $searchPages*8 Step 8
        $SearchString = 'Apple+Juice';              Disable this line...
;~      $SearchString = 'intitle:"crazy+stink"';    ...And enable this one
        $sURL = "http://ajax.googleapis.com/ajax/services/search/web?v=1.0&start=" & $j & "&rsz=large&q=" & $SearchString
        $oHTTP.Open("GET", $sURL, False)
        $oHTTP.SetRequestHeader("Referer", @IPAddress1)
        If (@error) Then Return SetError(1, 0, 0)
        If (@error) Then Return SetError(2, 0, 0)
        $retVal = $oHTTP.ResponseText
        If (@error) Then Return SetError(3, 0, 0)

        $aReturn = _JSON_Decode($retVal)
        if NOT @error then
            $responseData = $aReturn[0][1]
            $results = $responseData[0][1]
            for $i = 0 to UBound($results)-1
                $oneResult = $results[$i]
                $title = _OnlyBoldedDecode(_getJSonValue($oneResult, "title"))
                $url = _getJSonValue($oneResult, "url")
                $content = _OnlyBoldedDecode(_getJSonValue($oneResult, "content"))

                ReDim $ShowResults[UBound($ShowResults)+1][3]
                $arIndex = UBound($ShowResults)-1
                $ShowResults[$arIndex][0] = $title
                $ShowResults[$arIndex][1] = $url
                $ShowResults[$arIndex][2] = $content

Func _OnlyBoldedDecode($sData);decoding only most common code
    Return StringReplace(StringReplace($sData, "\u003c", "<"), "\u003e", ">");

Func _getJSonValue($_res, $getData)
    for $i = 0 to UBound($_res)-1
        if $_res[$i][0] == $getData then Return $_res[$i][1]
    Return "";

Func _JSON_Decode($sString)
    Local $iIndex, $aVal, $sOldStr = $sString, $b

    $sString = StringStripCR(StringStripWS($sString, 7))
    If Not StringRegExp($sString, "(?i)^\{.+}$") Then Return SetError(1, 0, 0)
    Local $aArray[1][2], $iIndex = 0
    $sString = StringMid($sString, 2)

        $b = False

        $aVal = StringRegExp($sString, '^"([^"]+)"\s*:\s*(["{[]|[-+]?\d+(?:(?:\.\d+)?[eE][+-]\d+)?|true|false|null)', 2) ; Get value & next token
        If @error Then
            ConsoleWrite("!> StringRegExp Error getting next Value." & @CRLF)
            ConsoleWrite($sString & @CRLF)
            $sString = StringMid($sString, 2) ; maybe it works when the string is trimmed by 1 char from the left ?

        $aArray[$iIndex][0] = $aVal[1] ; Key
        $sString = StringMid($sString, StringLen($aVal[0]))

        Switch $aVal[2] ; Value Type (Array, Object, String) ?
            Case '"' ; String
                ; Value -> Array subscript. Trim String after that.

                $aArray[$iIndex][1] = StringMid($sString, 2, StringInStr($sString, """", 1, 2) - 2)
                $sString = StringMid($sString, StringLen($aArray[$iIndex][1]) + 3)

                ReDim $aArray[$iIndex + 2][2]
                $iIndex += 1

            Case '{' ; Object
                ; Recursive function call which will decode the object and return it.
                ; Object -> Array subscript. Trim String after that.

                $aArray[$iIndex][1] = _JSON_Decode($sString)
                $sString = StringMid($sString, @extended + 2)
                If StringLeft($sString, 1) = "," Then $sString = StringMid($sString, 2)

                $b = True
                ReDim $aArray[$iIndex + 2][2]
                $iIndex += 1

            Case '[' ; Array
                ; Decode Array
                $sString = StringMid($sString, 2)
                Local $aRet[1], $iArIndex = 0 ; create new array which will contain the Json-Array.

                    $sString = StringStripWS($sString, 3) ; Trim Leading & trailing spaces
                    $aNextArrayVal = StringRegExp($sString, '^\s*(["{[]|\d+(?:(?:\.\d+)?[eE]\+\d+)?|true|false|null)', 2)
                    if @error Then Return SetError(@error, 0, 0);
                    Switch $aNextArrayVal[1]
                        Case '"' ; String
                            ; Value -> Array subscript. Trim String after that.
                            $aRet[$iArIndex] = StringMid($sString, 2, StringInStr($sString, """", 1, 2) - 2)
                            $sString = StringMid($sString, StringLen($aRet[$iArIndex]) + 3)

                        Case "{" ; Object
                            ; Recursive function call which will decode the object and return it.
                            ; Object -> Array subscript. Trim String after that.
                            $aRet[$iArIndex] = _JSON_Decode($sString)
                            $sString = StringMid($sString, @extended + 2)

                        Case "["
                            MsgBox(0, "", "Array in Array. WTF is up with this JSON shit?")
                            MsgBox(0, "", "This should not happen! Please post this!")
                            Exit 0xDEADBEEF

                        Case Else
                            ConsoleWrite("Array Else (maybe buggy?)" & @CRLF)
                            $aRet[$iArIndex] = $aNextArrayVal[1]

                    ReDim $aRet[$iArIndex + 2]
                    $iArIndex += 1

                    $sString = StringStripWS($sString, 3) ; Leading & trailing
                    If StringLeft($sString, 1) = "]" Then ExitLoop
                    $sString = StringMid($sString, 2)
                Until False

                $sString = StringMid($sString, 2)
                ReDim $aRet[$iArIndex]
                $aArray[$iIndex][1] = $aRet

                ReDim $aArray[$iIndex + 2][2]
                $iIndex += 1

            Case Else ; Number, bool
                ; Value (number (int/flaot), boolean, null) -> Array subscript. Trim String after that.
                $aArray[$iIndex][1] = $aVal[2]
                ReDim $aArray[$iIndex + 2][2]
                $iIndex += 1
                $sString = StringMid($sString, StringLen($aArray[$iIndex][1]) + 2)

        If StringLeft($sString, 1) = "}" Then
            StringMid($sString, 2)
        If Not $b Then $sString = StringMid($sString, 2)
    Until False

    ReDim $aArray[$iIndex][2]
    Return SetError(0, StringLen($sOldStr) - StringLen($sString), $aArray)
EndFunc   ;==>_JSON_Decode

This works as long as you're not using "intelligent search placeholders" like using "intitle", "inurl", "site", and other placeholders with sentences (single word works, like: intitle:cake, but with sentence like: intitle:"crazy+stink" it doesn't, while searching this on google will give you approx. 35 results:    https://www.google.com/search?q=intitle:"crazy+stink"     )

Has anyone found a better way to legally scrape Google? This JSON API was built to be free, without big limitations (max results you get from a single query is 64), but it's not working properly, it doesn't give me results on "intelligent search placeholders". I'm aware of the Google Custom Search API, which requires API Key (which I have) but this API can search only specific website, and I need to scrape results from Google's search results.

Any thoughts, suggestions, ideas?

Edit: July 4th 2014:

I have found a way how to use Google Custom Search API with API Key, and still search entire web (instead of only single page). I have found this: https://support.google.com/customsearch/answer/2631040?hl=en and I have followed the instructions. I got my CX code, and I formatted the URL:


(the CX in this example is the one that Google provides as an example for the API here: https://developers.google.com/custom-search/json-api/v1/using_rest, however, even with my own CX I get the same results)
Here are the results:

 "kind": "customsearch#search",
 "url": {
  "type": "application/json",
  "template": "https://www.googleapis.com/customsearch/v1?q={searchTerms}&num={count?}&start={startIndex?}&lr={language?}&safe={safe?}&cx={cx?}&cref={cref?}&sort={sort?}&filter={filter?}&gl={gl?}&cr={cr?}&googlehost={googleHost?}&c2coff={disableCnTwTranslation?}&hq={hq?}&hl={hl?}&siteSearch={siteSearch?}&siteSearchFilter={siteSearchFilter?}&exactTerms={exactTerms?}&excludeTerms={excludeTerms?}&linkSite={linkSite?}&orTerms={orTerms?}&relatedSite={relatedSite?}&dateRestrict={dateRestrict?}&lowRange={lowRange?}&highRange={highRange?}&searchType={searchType}&fileType={fileType?}&rights={rights?}&imgSize={imgSize?}&imgType={imgType?}&imgColorType={imgColorType?}&imgDominantColor={imgDominantColor?}&alt=json"
 "queries": {
  "request": [
    "title": "Google Custom Search - intitle:\"crazy stink\"",
    "totalResults": "0",
    "searchTerms": "intitle:\"crazy stink\"",
    "count": 10,
    "inputEncoding": "utf8",
    "outputEncoding": "utf8",
    "safe": "off",
    "cx": "017576662512468239146:omuauf_lfve"
 "searchInformation": {
  "searchTime": 0.35068,
  "formattedSearchTime": "0.35",
  "totalResults": "0",
  "formattedTotalResults": "0"

The results are almost the same as I get them from AJAX JSON Api http://ajax.googleapis.com/ajax/services/search/web?v=1.0&start=1&rsz=large&q=intitle:%22crazy+stink%22:

{"responseData": {"results":[],"cursor":{"moreResultsUrl":"http://www.google.com/search?oe\u003dutf8\u0026ie\u003dutf8\u0026source\u003duds\u0026start\u003d1\u0026hl\u003den\u0026q\u003dintitle:%22crazy+stink%22","searchResultTime":"0.10"}}, "responseDetails": null, "responseStatus": 200}

Which is 0.

So... maybe there isn't any error on my part, but there is on Google's?

I'm just curious if anyone encountered an issue like the one I have, or if anyone have any better suggestion, but bare in mind that I want to keep this legal (scraping results from IE object is not something I want to do).

Edited by dragan

Share this post

Link to post
Share on other sites

Create an account or sign in to comment

You need to be a member in order to leave a comment

Create an account

Sign up for a new account in our community. It's easy!

Register a new account

Sign in

Already have an account? Sign in here.

Sign In Now

Sign in to follow this  

  • Similar Content

    • Ascer
      By Ascer
      1. Description.
      oAuth 2.0 is security system implemented by Google a few years ago. You are able to connect into your Google accounts and manage documents. In this UDF i show you how to pass first authorization process., this allow you to automate most of functions using API interface. 2. Requirements.
      Google account. oAuth.au3 Download 3. Possibilities
      ;============================================================================================================ ; Date: 2018-02-10, 14:21 ; ; Description: UDF for authorize your app with oAuth 2.0 Google. ; ; Function(s): ; oAuth2GetAuthorizationCode() -> Get Code for "grant". ; oAuth2GetAccessToken() -> Get "access_token" and "refresh_token" first time. ; oAuth2RefreshAccessToken() -> Get current "access_token" using "refresh_token". ; ; Author(s): Ascer ;============================================================================================================ 4. Enable your Google API.
          4.1. Video Tutorial not mine!
       YouTube     4.2 Screenshots from authorization process (Polish language) 
      Go to https://console.developers.google.com/apis/dashboard and accept current rules.  

      Next create an new project  

      Enter name of you new project and click Create  

      Google will working now, please wait until finish. Next go to enable your API interface, we make if for Google  

      Take "Gmail" in search input and after click in found result.  

      Click Enable interface, Google will working now.  

      Create your login credentials  

      Select Windows Interface (combobox), User credentials (radio) and click button what is need bla bla  

      Type name of a new client id for oAuth 2.0 and click Create a new Client ID.  

      Next configure screen aplication, type some name and click Next. Google will working now.  

      Last step on this website is download source with your credentials in *Json format.  

      Now you received a file named client_id.json, it's how it look in Sublime Text:  

      5. Coding.
      Now we need to call a some function to get access code.  
      #include <oAuth.au3> Local $sClientId = "167204758184-vpeues0uk6b0g4jrnv0ipq5fapoig2v8.apps.googleusercontent.com" Local $sRedirectUri = "http://localhost" oAuth2GetAuthorizationCode($sClientId, $sRedirectUri)  
      Function will execute default browser for ask you to permission.  

      Next Google ask you to permission for access to your personal details by application Autoit   

      Now you can thing is something wrong but all is ok, you need to copy all after  code= . It your access code.  

      Let's now ask Google about our Access Token and Refresh Token  
      #include <oAuth.au3> Local $sClientId = "167204758184-vpeues0uk6b0g4jrnv0ipq5fapoig2v8.apps.googleusercontent.com" Local $sClientSecret = "cWalvFr3WxiE6cjUkdmKEPo8" Local $sAuthorizationCode = "4/AAAPXJOZ-Tz0s6mrx7JbV6nthXSfcxaszFh_aH0azVqHkSHkfiwE8uamcabn4eMbEWg1eAuUw7AU0PQ0XeWUFRo#" Local $sRedirectUri = "http://localhost" Local $aRet = oAuth2GetAccessToken($sClientId, $sClientSecret, $sAuthorizationCode, $sRedirectUri) If Ubound($aRet) <> 4 then ConsoleWrite("+++ Something wrong with reading ResponseText." & @CRLF) Exit EndIf ConsoleWrite("Successfully received data from Google." & @CRLF) ConsoleWrite("access_token: " & $aRet[0] & @CRLF) ConsoleWrite("expires_in: " & $aRet[1] & @CRLF) ConsoleWrite("refresh_token: " & $aRet[2] & @CRLF) ConsoleWrite("token_type: " & $aRet[3] & @CRLF)  
      Important! When you received error 400 and output says: Invalid grant it means that your previous generated access_code lost validity and you need to generate new calling previus code. When everything is fine you should received a 4 informations about your: access_token, expires_in, refresh_token and token_type. Access_Token time is a little short so you need to know fuction possible to refresh it (tell Google that he should generate a new Token for you)  
      #include <oAuth.au3> Local $sRefreshToken = "1/ba8JpW7TjQH3-UI1BvPaXhSf-oTQ4BmZAbBfhcKgKfY" Local $sClientId = "167204758184-vpeues0uk6b0g4jrnv0ipq5fapoig2v8.apps.googleusercontent.com" Local $sClientSecret = "cWalvFr3WxiE6cjUkdmKEPo8" Local $sRedirectUri = "http://localhost" Local $aRet = oAuth2RefreshAccessToken($sRefreshToken, $sClientId, $sClientSecret) If Ubound($aRet) <> 3 then ConsoleWrite("+++ Something wrong with reading ResponseText." & @CRLF) Exit EndIf ConsoleWrite("Successfully received data from Google." & @CRLF) ConsoleWrite("access_token: " & $aRet[0] & @CRLF) ConsoleWrite("expires_in: " & $aRet[1] & @CRLF) ConsoleWrite("token_type: " & $aRet[2] & @CRLF)  
      6. Finish words
      If you followed all this above steps im sure that you received all informations required for coding your Google API (Gmail, Dropbox, YouTube, Calender etc. See next thread: [UDF] Gmail API - Email automation with AutoIt!
    • Ascer
      By Ascer
      1. Description.
      Automate communication with Gmail API using oAuth 2.0 security. 2. Requirements.
      Google Gmail account. Finished Authorization process. Look here 3. Possibilities.
      ;======================================================================================================================== ; Date: 2018-02-12, 11:46 ; ; Bug Fixs: 2018-02-17, 7:31 -> Fixed problems with adding items to array and minor bugs. ; ; Description: UDF for using Gmail API interface. This UDF requires oAuth.au3 and Gmail account. ; ; Function(s): ; gmailUsersGetProfile() -> Information about your account. ; gmailUsersLabelsList() -> Get all available labels ids. ex. "INBOX", "UNREAD" ; gmailUsersLabelsGet() -> Get information about specific label id. ; gmailUsersMessagesBatchDelete() -> Delete many messages emails by id. ; gmailUsersMessagesBatchModify() -> Set status for many messages ex. "INBOX", "UNREAD" ; gmailUsersMessagesDelete() -> Totaly delete email from ur account. ; gmailUsersMessagesGet() -> Get all information about specific email. ; gmailUsersMessagesList() -> Get list of last ~100 emails. ; gmailUsersMessagesModify() -> Modify single message. ; gmailUsersMessagesTrash() -> Put email in trash. ; gmailUsersMessagesUntrash() -> Restore email from trash. ; gmailUsersMessagesSend() -> Send email to single or group recipients. ; gmailUsersMessagesAttachmentsGet() -> Download attachment by id. ; ; Author(s): Ascer ;======================================================================================================================== 4. Downloads.
      oAuth.au3 Gmail API.au3 5. Examples.
      Sending emails
    • islandspapand
      By islandspapand
      Hi All
      i am currently trying to add a function to my project that can send SMS, i have gone with Twilio for the sms service that use a REST API.
      I have never worked with an API before, and could use some help.
      I can get my function working with using cURL.exe and copy past command from the website with the following code. And thats great unfortunately i am have issue with character like æøå when sending a SMS appears like a box or ?. this does not happen if i do it from the website so it looks like a Unicode issue in curl.exe.
      I have done some searching on the forum and understand that i should be able to implement this curl command with the WinHTTP UDF from @trancexx so i don't need a third part exe and it might fix my charater issue.
      Unfortunately i really don't understand how i am to change curl commands to the WinHTTP and i was hoping some good maybe give me an example i could learn from.
      Thanks in advanced
      i have removed the AuthToken number from the script.
      _SendSMS("00000000","SomeOne","SMS body info") Func _SendSMS($SendTo,$SendFrom,$Msgtxt) $AccountSID = "ACbb765b3180d5938229eff8b8f63ed1bc" $AuthToken = "Auth Token number" $Data = '"https://api.twilio.com/2010-04-01/Accounts/'&$AccountSID&'/Messages.json"'& _ '-X POST \ --data-urlencode "To=+45'&$SendTo&'" \ --data-urlencode "From='&$SendFrom&'" \ --data-urlencode "Body='&$Msgtxt&'" \ -u '&$AccountSID&':'&$AuthToken&'' ShellExecute(@ScriptDir&"\curl.exe","-k "&$Data) ;~ curl 'https://api.twilio.com/2010-04-01/Accounts/ACbb765b3180d5938229eff8b8f63ed1bc/Messages.json' -X POST \ ;~ --data-urlencode 'To=+4500000000' \ ;~ --data-urlencode 'From=Reception' \ ;~ --data-urlencode 'Body=Test Body' \ ;~ -u ACbb765b3180d5938229eff8b8f63ed1bc:[AuthToken] EndFunc  
    • jesus40
      By jesus40
      Hello friends, i have a working curl command that show informations about my account on binance.com, but_it dont work with autoit code without curl.exe.
      I want to do it without curl, because the whole process much Slower_ with StdoutRead (I want get the response in variable.)
      My Curl command in Autoit:
      This 2 are works, but_ i would like to do it without curl.exe
      $apikey="XYZ" sCommand = @ScriptDir & '\curl.exe -k -H "X-MBX-APIKEY: ' & $apikey & '" -X GET "https://api.binance.com/api/v3/account?' & $request the same in .bat  file
      curl.exe -k -H "X-MBX-APIKEY: XYZ" -X GET "https://api.binance.com/api/v3/account?timestamp=1514917812000&signature=85bdee77e53cd521e1d5229fbfb459d53799c42b3fa4596d73f1520fad5f965a" (I use curl with -k option which allows curl to make insecure connections, because there is problem with the sites certificate, (cURL error 60))
      I tried many variations, this is the latest... I cant get the same response.
      curl $error message (I changed ): {"code":-2015,"msg":"Invalid API-key, IP, or permissions for action."}
      autoit version $error message (Response code:400): Mandatory parameter 'timestamp' was not sent, was empty/null, or malformed.
      $request = $query & '&signature=' & $signature $oHTTP = ObjCreate("winhttp.winhttprequest.5.1") $oHTTP.Open("GET", "https://api.binance.com/api/v3/account", False) $oHTTP.SetRequestHeader("X-MBX-APIKEY", $apikey) $oHTTP.Send($request) $oReceived = $oHTTP.ResponseText $oStatusCode = $oHTTP.Status If $oStatusCode <> 200 then MsgBox(4096, "Response code", $oStatusCode) EndIf  
    • IMRafferty
      By IMRafferty
      Hi, I posted this to StackOverflow already (see below) and they said to 'AutoIt' it (see bottom) and gave me two down-votes. Can anybody help me get started?  I'm happy to paypal my gratitude in advance.  I'm brand new to AutoIt .  Have been watching tutorial videos on Youtube, but still climbing learning curve to get any initial momentum going.  Thank you for at least reading -Ian :
      Batch searching/verifying Mailing/Resident Addresses using Google - possible?
      Good tidings, St'overflow-ers!
      So I have the unenviable task of verifying keyed/transcribed hand-written mailing addresses from paper fishing licenses. We use Smartsoft/Frameworks as it catches/verifies about 90% (domestic, some Canada) of what we get from licensing, but for the 'leftovers', it's then time for the one-by-one Googling-steps below this 'graph with pics, rinse/repeat times 20,000+. Question: can I somehow batch/automate this (by feeding Google Maps/Places a google sheet or *.csv maybe?) to save time, or nope? or perhaps you fell asleep reading this q? Beuller? Thanks! -Ian
      (1) Copy 'rough' address from MS Excel (or Google Sheet): 

      (2) Paste & search in Google (usually goes right into searching Google_Places/Earth):
      (3) (Hopefully) get a 'hit' and copy more correct/'verified' address:

      (4) Make corrections/tweaks back in Excel/GoogleSheet (faster than copy/pasting):

      5) rinse/repeat times 20k (why I'm asking for help)
      Stackoverflow comments:
      "I think you should look into something like AutoIt instead of batch for something to do that complicated stuff. I use AutoIt for some works tasks that are the same over and over again. And since batch is so limited i dont think batch would be worth using here, but if you insist on using a build in program then i'd think VBS would work to an extend." – NizonRox 
      "You'd have to interact with EXCEL and a Browser. Batch can't do that. Autoit supports both". – Stephan

Important Information

We have placed cookies on your device to help make this website better. You can adjust your cookie settings, otherwise we'll assume you're okay to continue.