Grigoriy Petukhov committed f1ff390

Fix tools.google module

Comments (0)

Files changed (1)

grab/tools/google.py

+# coding: utf-8
 """
 Google parser.
 
 import base64
 
 from grab.tools.html import decode_entities
-from grab.tools.lxml import get_node_text
+from grab.tools.lxml_tools import get_node_text
+from grab.tools.http import urlencode
 
 ANONYMIZER_ARG = re.compile(r'q=([^&"]+)')
 
     Raised in case of standard anonymizer error.
     """
 
-
-def build_search_url(query, page=1, per_page=None, lang='en', filter=True):
+def build_search_url(query, page=1, per_page=None, lang='en', filter=True, **kwargs):
     """
     Build google search url with specified query and pagination options.
+
+    :param per_page: 10, 20, 30, 50, 100
+    kwargs:
+        tbs=qdr:h
+        tbs=qdr:d
+        tbs=qdr:w
+        tbs=qdr:m
+        tbs=qdr:y
     """
 
     if per_page is None:
         url += '&num=%d' % per_page
     if not filter:
         url += '&filter=0'
+    if kwargs:
+        url += '&' + urlencode(kwargs)
     return url
 
 
 
 
 
-def parse_search_results(grab, parse_index_size=False, anonymizer=False):
+def parse_search_results(grab, parse_index_size=False, anonymizer=False,
+                         strict_query=False):
     """
     Parse google search results page content.
     """
         raise AnonymizerNetworkError('URL Error (0)')
 
     elif grab.css_exists('#ires'):
-        if len(grab.css_list('#ires h3')):
+        if (strict_query and (
+            grab.search(u'Нет результатов для') or grab.search(u'No results found for'))):
+            pass
+            logging.debug('Query modified')
+        else:
+            if len(grab.css_list('#ires h3')):
 
-            # Something was found
-            if parse_index_size:
-                index_size = parse_index_size(grab)
+                # Something was found
+                if parse_index_size:
+                    index_size = parse_index_size(grab)
+                else:
+                    index_size = None
+
+                # Yield found results
+                for elem in grab.css_list('h3.r a'):
+                    url = elem.get('href')
+                    if url.startswith('/url?'):
+                        url = url.split('?q=')[1].split('&')[0]
+                        url = urllib.unquote_plus(url)
+                    if anonymizer:
+                        match = ANONYMIZER_ARG.search(url)
+                        if match:
+                            token = urllib.unquote(match.group(1))
+                            url = decode_entities(base64.b64decode(token))
+                        else:
+                            url = None
+                            logging.error('Could not parse url encoded by anonymizer')
+
+                    snippet = get_node_text(
+                        elem.getparent().getparent().xpath('div[@class="s"]')[0])
+                    if url:
+                        yield {'url': url, 'title': get_node_text(elem),
+                                'index_size': index_size, 'snippet': snippet}
             else:
-                index_size = None
-
-            # Yield found results
-            for elem in grab.css_list('h3.r a'):
-                url = elem.get('href')
-                if anonymizer:
-                    match = ANONYMIZER_ARG.search(url)
-                    if match:
-                        token = urllib.unquote(match.group(1))
-                        url = decode_entities(base64.b64decode(token))
-                    else:
-                        url = None
-                        logging.error('Could not parse url encoded by anonymizer')
-
-                if url:
-                    yield {'url': url, 'title': get_node_text(elem),
-                           'index_size': index_size}
-        else:
-            pass
-            #return []
+                pass
+                #return []
+    elif grab.css_exists('#res'):
+        # Could be search results here?
+        # or just message "nothing was found"?
+        pass
     else:
         raise ParsingError('Could not identify google page format')
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.