Rafał Kos committed 2e36685

Add scripts for fetching Google search result counts
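Both scripts expose the same GoogleSearch class; a minimal usage sketch, mirroring the commented-out sample at the bottom of each file (the query string is illustrative):

    gs = GoogleSearch('example query', random_agent=True)
    gs.results_per_page = 50
    results = gs.get_results()
    print gs.num_results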


Files changed (2)

google_results/google_v1.py

+import re
+import urllib
+from htmlentitydefs import name2codepoint
+from BeautifulSoup import BeautifulSoup
+
+import random
+import socket
+import urllib2
+import httplib
+
+BROWSERS = (
+    # Most popular browsers in my access.log on 2009.02.12:
+    # tail -50000 access.log |
+    #  awk -F\" '{B[$6]++} END { for (b in B) { print B[b] ": " b } }' |
+    #  sort -rn |
+    #  head -20
+    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.6) Gecko/2009011913 Firefox/3.0.6',
+    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.0.6) Gecko/2009011912 Firefox/3.0.6',
+    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.6) Gecko/2009011913 Firefox/3.0.6 (.NET CLR 3.5.30729)',
+    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.6) Gecko/2009020911 Ubuntu/8.10 (intrepid) Firefox/3.0.6',
+    'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.6) Gecko/2009011913 Firefox/3.0.6',
+    'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.6) Gecko/2009011913 Firefox/3.0.6 (.NET CLR 3.5.30729)',
+    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.48 Safari/525.19',
+    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648)',
+    'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.0.6) Gecko/2009020911 Ubuntu/8.10 (intrepid) Firefox/3.0.6',
+    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.5) Gecko/2008121621 Ubuntu/8.04 (hardy) Firefox/3.0.5',
+    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-us) AppleWebKit/525.27.1 (KHTML, like Gecko) Version/3.2.1 Safari/525.27.1',
+    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
+    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)',
+    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
+)
+
+TIMEOUT = 5  # socket timeout
+
+class BrowserError(Exception):
+    def __init__(self, url, error):
+        self.url = url
+        self.error = error
+
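+# Custom connection/handler pair so the module-level TIMEOUT is applied to
+# every socket (urllib2 of this era had no timeout parameter of its own).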
+class PoolHTTPConnection(httplib.HTTPConnection):
+    def connect(self):
+        """Connect to the host and port specified in __init__."""
+        msg = "getaddrinfo returns an empty list"
+        for res in socket.getaddrinfo(self.host, self.port, 0,
+                                      socket.SOCK_STREAM):
+            af, socktype, proto, canonname, sa = res
+            try:
+                self.sock = socket.socket(af, socktype, proto)
+                if self.debuglevel > 0:
+                    print "connect: (%s, %s)" % (self.host, self.port)
+                self.sock.settimeout(TIMEOUT)
+                self.sock.connect(sa)
+            except socket.error, msg:
+                if self.debuglevel > 0:
+                    print 'connect fail:', (self.host, self.port)
+                if self.sock:
+                    self.sock.close()
+                self.sock = None
+                continue
+            break
+        if not self.sock:
+            raise socket.error, msg
+
+class PoolHTTPHandler(urllib2.HTTPHandler):
+    def http_open(self, req):
+        return self.do_open(PoolHTTPConnection, req)
+
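+# Thin urllib2 wrapper: sends browser-like headers and normalizes all network
+# failures into BrowserError.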
+class Browser(object):
+    def __init__(self, user_agent=BROWSERS[0], debug=False, use_pool=False):
+        self.headers = {
+            'User-Agent': user_agent,
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+            'Accept-Language': 'pl-PL,pl;q=0.5'
+        }
+        self.debug = debug
+
+    def get_page(self, url, data=None):
+        handlers = [PoolHTTPHandler]
+        opener = urllib2.build_opener(*handlers)
+        if data: data = urllib.urlencode(data)
+        request = urllib2.Request(url, data, self.headers)
+        try:
+            response = opener.open(request)
+            #f = open('test.html', 'w')
+            #f.write(response.read())
+            #f.close()
+            return response.read()
+        except (urllib2.HTTPError, urllib2.URLError), e:
+            raise BrowserError(url, str(e))
+        except socket.timeout:
+            # must precede socket.error: socket.timeout is a subclass of it
+            raise BrowserError(url, "timeout")
+        except (socket.error, socket.sslerror), msg:
+            raise BrowserError(url, msg)
+        except KeyboardInterrupt:
+            raise
+        except:
+            raise BrowserError(url, "unknown error")
+
+    def set_random_user_agent(self):
+        self.headers['User-Agent'] = random.choice(BROWSERS)
+        return self.headers['User-Agent']
+
+class SearchError(Exception):
+    """
+    Base class for Google Search exceptions.
+    """
+    pass
+
+class ParseError(SearchError):
+    """
+    Parse error in Google results.
+    self.msg attribute contains explanation why parsing failed
+    self.tag attribute contains BeautifulSoup object with the most relevant tag that failed to parse
+    Thrown only in debug mode
+    """
+     
+    def __init__(self, msg, tag):
+        self.msg = msg
+        self.tag = tag
+
+    def __str__(self):
+        return self.msg
+
+    def html(self):
+        return self.tag.prettify()
+
+class SearchResult(object):
+    def __init__(self, title, url, desc):
+        self.title = title
+        self.url = url
+        self.desc = desc
+
+    def __str__(self):
+        return 'Google Search Result: "%s"' % self.title
+
+class GoogleSearch(object):
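+    # The *_0 templates rely on Google's default of 10 results per page; the
+    # *_1 templates pass num explicitly; NEXT_PAGE_* add the start offset.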
+    SEARCH_URL_0 = "http://www.google.com/search?hl=pl&q=%(query)s"
+    NEXT_PAGE_0 = "http://www.google.com/search?hl=pl&q=%(query)s&start=%(start)d"
+    SEARCH_URL_1 = "http://www.google.com/search?hl=pl&q=%(query)s&num=%(num)d"
+    NEXT_PAGE_1 = "http://www.google.com/search?hl=pl&q=%(query)s&num=%(num)d&start=%(start)d"
+
+    def __init__(self, query, random_agent=False, debug=False):
+        self.query = query
+        self.debug = debug
+        self.browser = Browser(debug=debug)
+        self.results_info = None
+        self.eor = False # end of results
+        self._page = 0
+        self._results_per_page = 10
+        self._last_from = 0
+
+        if random_agent:
+            self.browser.set_random_user_agent()
+
+    @property
+    def num_results(self):
+        if not self.results_info:
+            page = self._get_results_page()
+            self.results_info = self._extract_info(page)
+            if self.results_info['total'] == 0:
+                self.eor = True
+        return self.results_info['total']
+
+    def _get_page(self):
+        return self._page
+
+    def _set_page(self, page):
+        self._page = page
+
+    page = property(_get_page, _set_page)
+
+    def _get_results_per_page(self):
+        return self._results_per_page
+
+    def _set_results_per_page(self, rpp):
+        self._results_per_page = rpp
+
+    results_per_page = property(_get_results_per_page, _set_results_per_page)
+
+    def get_results(self):
+        """ Gets a page of results """
+        if self.eor:
+            return []
+
+        page = self._get_results_page()
+        search_info = self._extract_info(page)
+        if not self.results_info:
+            self.results_info = search_info
+            if self.num_results == 0:
+                self.eor = True
+                return []
+        results = self._extract_results(page)
+        if not results:
+            self.eor = True
+            return []
+        # if the 'from' offset did not advance, Google re-served the same
+        # page, which means we ran past the end of the result set
+        if self._page > 0 and search_info['from'] == self._last_from:
+            self.eor = True
+            return []
+        if search_info['to'] == search_info['total']:
+            self.eor = True  # last visible result reached
+        self._page += 1
+        self._last_from = search_info['from']
+        return results
+
+    def _maybe_raise(self, cls, *arg):
+        if self.debug:
+            raise cls(*arg)
+
+    def _get_results_page(self):
+        if self._page == 0:
+            if self._results_per_page == 10:
+                url = GoogleSearch.SEARCH_URL_0
+            else:
+                url = GoogleSearch.SEARCH_URL_1
+        else:
+            if self._results_per_page == 10:
+                url = GoogleSearch.NEXT_PAGE_0
+            else:
+                url = GoogleSearch.NEXT_PAGE_1
+
+        safe_url = url % { 'query': urllib.quote_plus(self.query),
+                           'start': self._page * self._results_per_page,
+                           'num': self._results_per_page }        
+        try:
+            page = self.browser.get_page(safe_url)
+        except BrowserError, e:
+            raise SearchError, "Failed getting %s: %s" % (e.url, e.error)
+
+        return BeautifulSoup(page)
+
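+    # v1 parses the older results layout: div#ssb carries Polish text of the
+    # form "Wyniki X - Y ... Z dla zapytania ..." ("Results X - Y ... of Z
+    # for the query ...").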
+    def _extract_info(self, soup):
+        empty_info = {'from': 0, 'to': 0, 'total': 0}
+        div_ssb = soup.find('div', id='ssb')
+        if not div_ssb:
+            self._maybe_raise(ParseError, "Div with number of results was not found on Google search page", soup)
+            return empty_info
+        p = div_ssb.find('p')
+        if not p:
+            self._maybe_raise(ParseError, """<p> tag within <div id="ssb"> was not found on Google search page""", soup)
+            return empty_info
+        txt = ''.join(p.findAll(text=True))
+        txt = txt.replace(',', '')
+        
+        matches = re.search(r'Wyniki (\d+) - (\d+)', txt, re.U)        
+        if not matches:            
+            return empty_info
+        result = {'from': int(matches.group(1)), 'to': int(matches.group(2))}
+
+        matches = re.search(r'(\d+) dla zapytania', txt, re.U)        
+        if not matches:
+            return empty_info
+        
+        result.update({'total': int(matches.group(1))})
+        return result
+
+    def _extract_results(self, soup):
+        results = soup.findAll('li', {'class': 'g'})
+        ret_res = []
+        for result in results:
+            eres = self._extract_result(result)
+            if eres:
+                ret_res.append(eres)
+        return ret_res
+
+    def _extract_result(self, result):
+        title, url = self._extract_title_url(result)
+        desc = self._extract_description(result)
+        if not title or not url or not desc:
+            return None
+        return SearchResult(title, url, desc)
+
+    def _extract_title_url(self, result):
+        #title_a = result.find('a', {'class': re.compile(r'\bl\b')})
+        title_a = result.find('a')
+        if not title_a:
+            self._maybe_raise(ParseError, "Title tag in Google search result was not found", result)
+            return None, None
+        title = ''.join(title_a.findAll(text=True))
+        title = self._html_unescape(title)
+        url = title_a['href']
+        match = re.match(r'/url\?q=(http[^&]+)&', url)
+        if match:
+            url = urllib.unquote(match.group(1))
+        return title, url
+
+    def _extract_description(self, result):
+        desc_div = result.find('div', {'class': re.compile(r'\bs\b')})
+        if not desc_div:
+            self._maybe_raise(ParseError, "Description tag in Google search result was not found", result)
+            return None
+
+        desc_strs = []
+        def looper(tag):
+            if not tag: return
+            for t in tag:
+                try:
+                    if t.name == 'br': break
+                except AttributeError:
+                    pass
+
+                try:
+                    desc_strs.append(t.string)
+                except AttributeError:
+                    desc_strs.append(t)
+
+        looper(desc_div)
+        looper(desc_div.find('wbr')) # BeautifulSoup does not self-close <wbr>
+
+        desc = ''.join(s for s in desc_strs if s)
+        return self._html_unescape(desc)
+
+    def _html_unescape(self, text):
+        def entity_replacer(m):
+            entity = m.group(1)
+            if entity in name2codepoint:
+                return unichr(name2codepoint[entity])
+            else:
+                return m.group(0)
+
+        def ascii_replacer(m):
+            cp = int(m.group(1))
+            if cp <= 255:
+                return unichr(cp)
+            else:
+                return m.group(0)
+
+        # note: re.sub's fourth positional argument is a count, not flags,
+        # so the Unicode flag goes inline in the pattern
+        s = re.sub(r'(?u)&#(\d+);', ascii_replacer, text)
+        return re.sub(r'(?u)&([^;]+);', entity_replacer, s)
+
+
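+# Driver: repeatedly pick unprocessed BadWords rows (modDate IS NULL), ask
+# Google how many results each name has and store the count.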
+import MySQLdb
+conn = MySQLdb.connect(host="rigel.poczta5.com", user="literowki",
+                       passwd="QW6vsYfZz7nehQmN", db="literowki",
+                       use_unicode=False, charset='cp1250')
+cursor = conn.cursor(MySQLdb.cursors.DictCursor)
+
+while True:
+    sql = "SELECT id, name FROM BadWords WHERE modDate IS NULL LIMIT 10"
+    cursor.execute(sql)
+    result = cursor.fetchall()
+    if not result:
+        break  # every row has been processed; stop polling
+
+    for record in result:
+        gs = GoogleSearch(record['name'], random_agent=True, debug=False)
+        gs.results_per_page = 50
+        results = gs.get_results()
+        print record['name'] + ' - ' + str(gs.num_results)
+
+        # parameterized query instead of building SQL by string concatenation
+        cursor.execute("UPDATE BadWords SET googleCount = %s, modDate = NOW() WHERE id = %s",
+                       (gs.num_results, record['id']))
+        conn.commit()
+'''
+for res in results:
+    print res.title.encode('utf8')
+    print res.desc.encode('utf8')
+    print res.url.encode('utf8')
+    print
+'''

google_results/google_v2.py

+# -*- coding: utf-8 -*-
+import re
+import urllib
+from htmlentitydefs import name2codepoint
+from BeautifulSoup import BeautifulSoup
+
+import random
+import socket
+import urllib2
+import httplib
+
+BROWSERS = (
+    # Most popular browsers in my access.log on 2009.02.12:
+    # tail -50000 access.log |
+    #  awk -F\" '{B[$6]++} END { for (b in B) { print B[b] ": " b } }' |
+    #  sort -rn |
+    #  head -20
+    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.6) Gecko/2009011913 Firefox/3.0.6',
+    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.0.6) Gecko/2009011912 Firefox/3.0.6',
+    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.6) Gecko/2009011913 Firefox/3.0.6 (.NET CLR 3.5.30729)',
+    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.6) Gecko/2009020911 Ubuntu/8.10 (intrepid) Firefox/3.0.6',
+    'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.6) Gecko/2009011913 Firefox/3.0.6',
+    'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.6) Gecko/2009011913 Firefox/3.0.6 (.NET CLR 3.5.30729)',
+    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.48 Safari/525.19',
+    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648)',
+    'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.0.6) Gecko/2009020911 Ubuntu/8.10 (intrepid) Firefox/3.0.6',
+    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.5) Gecko/2008121621 Ubuntu/8.04 (hardy) Firefox/3.0.5',
+    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-us) AppleWebKit/525.27.1 (KHTML, like Gecko) Version/3.2.1 Safari/525.27.1',
+    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
+    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)',
+    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
+)
+
+TIMEOUT = 5  # socket timeout
+
+class BrowserError(Exception):
+    def __init__(self, url, error):
+        self.url = url
+        self.error = error
+
+class PoolHTTPConnection(httplib.HTTPConnection):
+    def connect(self):
+        """Connect to the host and port specified in __init__."""
+        msg = "getaddrinfo returns an empty list"
+        for res in socket.getaddrinfo(self.host, self.port, 0,
+                                      socket.SOCK_STREAM):
+            af, socktype, proto, canonname, sa = res
+            try:
+                self.sock = socket.socket(af, socktype, proto)
+                if self.debuglevel > 0:
+                    print "connect: (%s, %s)" % (self.host, self.port)
+                self.sock.settimeout(TIMEOUT)
+                self.sock.connect(sa)
+            except socket.error, msg:
+                if self.debuglevel > 0:
+                    print 'connect fail:', (self.host, self.port)
+                if self.sock:
+                    self.sock.close()
+                self.sock = None
+                continue
+            break
+        if not self.sock:
+            raise socket.error, msg
+
+class PoolHTTPHandler(urllib2.HTTPHandler):
+    def http_open(self, req):
+        return self.do_open(PoolHTTPConnection, req)
+
+class Browser(object):
+    def __init__(self, user_agent=BROWSERS[0], debug=False, use_pool=False):
+        self.headers = {
+            'User-Agent': user_agent,
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+            'Accept-Language': 'pl-PL,pl;q=0.5'
+        }
+        self.debug = debug
+
+    def get_page(self, url, data=None):
+        handlers = [PoolHTTPHandler]
+        opener = urllib2.build_opener(*handlers)
+        if data: data = urllib.urlencode(data)
+        request = urllib2.Request(url, data, self.headers)
+        try:
+            response = opener.open(request)
+            #f = open('test.html', 'w')
+            #f.write(response.read())
+            #f.close()
+            return response.read()
+        except (urllib2.HTTPError, urllib2.URLError), e:
+            raise BrowserError(url, str(e))
+        except socket.timeout:
+            # must precede socket.error: socket.timeout is a subclass of it
+            raise BrowserError(url, "timeout")
+        except (socket.error, socket.sslerror), msg:
+            raise BrowserError(url, msg)
+        except KeyboardInterrupt:
+            raise
+        except:
+            raise BrowserError(url, "unknown error")
+
+    def set_random_user_agent(self):
+        self.headers['User-Agent'] = random.choice(BROWSERS)
+        return self.headers['User-Agent']
+
+class SearchError(Exception):
+    """
+    Base class for Google Search exceptions.
+    """
+    pass
+
+class ParseError(SearchError):
+    """
+    Parse error in Google results.
+    self.msg attribute contains explanation why parsing failed
+    self.tag attribute contains BeautifulSoup object with the most relevant tag that failed to parse
+    Thrown only in debug mode
+    """
+     
+    def __init__(self, msg, tag):
+        self.msg = msg
+        self.tag = tag
+
+    def __str__(self):
+        return self.msg
+
+    def html(self):
+        return self.tag.prettify()
+
+class SearchResult(object):
+    def __init__(self, title, url, desc):
+        self.title = title
+        self.url = url
+        self.desc = desc
+
+    def __str__(self):
+        return 'Google Search Result: "%s"' % self.title
+
+class GoogleSearch(object):
+    SEARCH_URL_0 = "http://www.google.com/search?hl=pl&q=%(query)s"
+    NEXT_PAGE_0 = "http://www.google.com/search?hl=pl&q=%(query)s&start=%(start)d"
+    SEARCH_URL_1 = "http://www.google.com/search?hl=pl&q=%(query)s&num=%(num)d"
+    NEXT_PAGE_1 = "http://www.google.com/search?hl=pl&q=%(query)s&num=%(num)d&start=%(start)d"
+
+    def __init__(self, query, random_agent=False, debug=False):
+        self.query = query
+        self.debug = debug
+        self.browser = Browser(debug=debug)
+        self.results_info = None
+        self.eor = False # end of results
+        self._page = 0
+        self._results_per_page = 10
+        self._last_from = 0
+
+        if random_agent:
+            self.browser.set_random_user_agent()
+
+    @property
+    def num_results(self):
+        if not self.results_info:
+            page = self._get_results_page()
+            self.results_info = self._extract_info(page)
+            if self.results_info['total'] == 0:
+                self.eor = True
+        return self.results_info['total']
+
+    def _get_page(self):
+        return self._page
+
+    def _set_page(self, page):
+        self._page = page
+
+    page = property(_get_page, _set_page)
+
+    def _get_results_per_page(self):
+        return self._results_per_page
+
+    def _set_results_per_page(self, rpp):
+        self._results_per_page = rpp
+
+    results_per_page = property(_get_results_per_page, _set_results_per_page)
+
+    def get_results(self):
+        """ Gets a page of results """
+        if self.eor:
+            return []
+
+        page = self._get_results_page()
+        search_info = self._extract_info(page)
+        if not self.results_info:
+            self.results_info = search_info
+            if self.num_results == 0:
+                self.eor = True
+                return []
+        results = self._extract_results(page)
+        if not results:
+            self.eor = True
+            return []
+        if self._page > 0 and search_info['from'] == self._last_from:
+            self.eor = True
+            return []
+        self._page += 1
+        self._last_from = search_info['from']
+        return results
+
+    def _maybe_raise(self, cls, *arg):
+        if self.debug:
+            raise cls(*arg)
+
+    def _get_results_page(self):
+        if self._page == 0:
+            if self._results_per_page == 10:
+                url = GoogleSearch.SEARCH_URL_0
+            else:
+                url = GoogleSearch.SEARCH_URL_1
+        else:
+            if self._results_per_page == 10:
+                url = GoogleSearch.NEXT_PAGE_0
+            else:
+                url = GoogleSearch.NEXT_PAGE_1
+
+        safe_url = url % { 'query': urllib.quote_plus(self.query),
+                           'start': self._page * self._results_per_page,
+                           'num': self._results_per_page }        
+        try:
+            page = self.browser.get_page(safe_url)
+        except BrowserError, e:
+            raise SearchError, "Failed getting %s: %s" % (e.url, e.error)
+
+        return BeautifulSoup(page)
+
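+    # v2 targets the newer results layout: the count sits in div#resultStats
+    # as Polish text "Około N wyników" ("About N results"); only the total is
+    # recoverable, so 'to' stays 0.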
+    def _extract_info(self, soup):
+        empty_info = {'from': 0, 'to': 0, 'total': 0}
+        div_ssb = soup.find('div', id='resultStats')
+        if not div_ssb:
+            self._maybe_raise(ParseError, "Div with number of results was not found on Google search page", soup)
+            return empty_info
+
+        txt = ''.join(div_ssb.findAll(text=True))
+        # match the unicode page text directly; the pattern contains Polish
+        # letters, so a ur'' literal is used instead of re-encoding to cp1250
+        matches = re.search(ur'Około ([0-9,]+) wyników', txt, re.U)
+        if not matches:
+            return empty_info
+
+        total = int(matches.group(1).replace(',', ''))
+        return {'from': total, 'to': 0, 'total': total}
+
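+    # note: this version scans div#resultStats rather than the result list,
+    # so individual hits are effectively not extracted; only num_results is
+    # reliable here.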
+    def _extract_results(self, soup):
+        results = soup.findAll('div', {'id': 'resultStats'})
+        ret_res = []
+        for result in results:
+            eres = self._extract_result(result)
+            if eres:
+                ret_res.append(eres)
+        return ret_res
+
+    def _extract_result(self, result):
+        title, url = self._extract_title_url(result)
+        desc = self._extract_description(result)
+        if not title or not url or not desc:
+            return None
+        return SearchResult(title, url, desc)
+
+    def _extract_title_url(self, result):
+        #title_a = result.find('a', {'class': re.compile(r'\bl\b')})
+        title_a = result.find('a')
+        if not title_a:
+            self._maybe_raise(ParseError, "Title tag in Google search result was not found", result)
+            return None, None
+        title = ''.join(title_a.findAll(text=True))
+        title = self._html_unescape(title)
+        url = title_a['href']
+        match = re.match(r'/url\?q=(http[^&]+)&', url)
+        if match:
+            url = urllib.unquote(match.group(1))
+        return title, url
+
+    def _extract_description(self, result):
+        desc_div = result.find('div', {'class': re.compile(r'\bs\b')})
+        if not desc_div:
+            self._maybe_raise(ParseError, "Description tag in Google search result was not found", result)
+            return None
+
+        desc_strs = []
+        def looper(tag):
+            if not tag: return
+            for t in tag:
+                try:
+                    if t.name == 'br': break
+                except AttributeError:
+                    pass
+
+                try:
+                    desc_strs.append(t.string)
+                except AttributeError:
+                    desc_strs.append(t)
+
+        looper(desc_div)
+        looper(desc_div.find('wbr')) # BeautifulSoup does not self-close <wbr>
+
+        desc = ''.join(s for s in desc_strs if s)
+        return self._html_unescape(desc)
+
+    def _html_unescape(self, text):
+        def entity_replacer(m):
+            entity = m.group(1)
+            if entity in name2codepoint:
+                return unichr(name2codepoint[entity])
+            else:
+                return m.group(0)
+
+        def ascii_replacer(m):
+            cp = int(m.group(1))
+            if cp <= 255:
+                return unichr(cp)
+            else:
+                return m.group(0)
+
+        # note: re.sub's fourth positional argument is a count, not flags,
+        # so the Unicode flag goes inline in the pattern
+        s = re.sub(r'(?u)&#(\d+);', ascii_replacer, text)
+        return re.sub(r'(?u)&([^;]+);', entity_replacer, s)
+
+
+import MySQLdb
+conn = MySQLdb.connect(host="rigel.poczta5.com", user="literowki",
+                       passwd="QW6vsYfZz7nehQmN", db="literowki",
+                       use_unicode=False, charset='cp1250')
+cursor = conn.cursor(MySQLdb.cursors.DictCursor)
+
+while True:
+    sql = "SELECT id, name FROM BadWords WHERE modDate IS NULL LIMIT 10"
+    cursor.execute(sql)
+    result = cursor.fetchall()
+    if not result:
+        break  # every row has been processed; stop polling
+
+    for record in result:
+        gs = GoogleSearch(record['name'], random_agent=True, debug=False)
+        gs.results_per_page = 50
+        results = gs.get_results()
+        print record['name'] + ' - ' + str(gs.num_results)
+
+        # parameterized query instead of building SQL by string concatenation
+        cursor.execute("UPDATE BadWords SET googleCount = %s, modDate = NOW() WHERE id = %s",
+                       (gs.num_results, record['id']))
+        conn.commit()
+"""
+gs = GoogleSearch('allergro zasilacz', random_agent=True, debug=True)
+gs.results_per_page = 50
+results = gs.get_results()
+print gs.num_results
+"""
+'''
+for res in results:
+    print res.title.encode('utf8')
+    print res.desc.encode('utf8')
+    print res.url.encode('utf8')
+    print
+'''