Commits

Gregory Petukhov committed 3e089fd

Initial commit

Comments (0)

Files changed (5)

+syntax:glob
+
+# Common
+*.pyc
+*.pyo
+*.swp
+*.swo
+*.orig
+
+# Project specific
+.env
+pip-log.txt
+Installation
+============
+
+virtualenv .env
+pip install -E .env -r requirements.txt
+mkdir img logs
+#!.env/bin/python
+import urllib
+from grab import Grab
+import logging
+import re
+from decimal import Decimal
+from hashlib import sha1
+from urlparse import urljoin
+
+BASE_URL = 'http://odesk.com'
+RE_NUMBER = re.compile('[.,\d]+')
+
+def extract_number(data):
+    """
+    Find number in the given text data.
+    """
+
+    return Decimal(RE_NUMBER.search(data).group(0).replace(',', ''))
+
+
+def fetch_photo(g, elem, item):
+    """
+    Download and save the photo of contractor.
+
+    Args:
+        g - Grab instance which was used to fetch search results page
+        elem - HTML node with details about the person
+    """
+
+    g2 = g.clone()
+    photo_url = urljoin(BASE_URL, elem.xpath('.//div[@class="providerPortrait"]/img')[0].get('src'))
+    g2.go(photo_url)
+    fname = 'img/%s.png' % sha1(item['url']).hexdigest()
+    open(fname, 'w').write(g2.response.body)
+    return fname
+
+
+def parse_contractors(query, pages=3):
+    """
+    Iterate over search results for ``query``.
+    """
+
+    g = Grab(log_dir='logs')
+    g.go('https://www.odesk.com/contractors/?clear_button=1')
+    g.set_input('q', query)
+    g.submit()
+
+    # Iterate over pages
+    for x in xrange(0, pages):
+        logging.debug('Parsing page #%d' % (x + 1))
+        for elem in g.xpath('//div[@class="searchResult"]'):
+            item = {}
+
+            header = elem.xpath('.//h3/a')[0]
+            item['name'] = header.text
+            item['url'] = BASE_URL + header.get('href')
+
+            details = elem.xpath('.//p[@class="details"]')[0]
+            item['rate'] = extract_number(details.xpath('./strong[contains(@name, "rate_")]')[0].text)
+            item['hours'] = extract_number(details.xpath('./strong[contains(@name, "tot_hrs")]')[0].text)
+            item['country'] = details.xpath('./strong[contains(@name, "country_")]')[0].text
+            item['description'] = elem.xpath('.//div[contains(@class, "description")]')[0]\
+                                      .text_content().strip()
+            item['photo'] = fetch_photo(g, elem, item)
+            yield item
+
+        # Try to find link on next page of search results
+        try:
+            url = g.xpath('//li[@class="nextPage"]/a')[0].get('href')
+            g.go(url)
+        except IndexError:
+            return
+
+
+def main():
+    """
+    Demo of parse_contractors function
+    """
+
+    for item in parse_contractors('django'):
+        print item['name'], item['rate'], '$/hour', item['hours'], 'hours', item['country']
+
+
+# Script entry point: turn on debug logging, then run the demo.
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.DEBUG)
+    main()
+#!.env/bin/python
+import cPickle as pickle
+import logging
+import xlwt
+from PIL import Image
+
+from odesk_parser import parse_contractors
+
+def get_contractors():
+    try:
+        items = pickle.load(open('.cache'))
+        logging.debug('Using cached data')
+    except IOError:
+        items = list(parse_contractors('django'))
+        pickle.dump(items, open('.cache', 'w'))
+    return items
+
+
+def generate_excel(items):
+    """
+    Generate excel report.
+    """
+
+    print 'Generating excel report'
+    wb = xlwt.Workbook()
+    ws = wb.add_sheet('Contractors')
+
+    ws.col(0).width = 5000
+
+    headers = ['Name', 'Rate', 'Hours', 'Country', 'Photo']
+    for count, header in enumerate(headers):
+        ws.write(0, count, header)
+
+    for count, item in enumerate(items):
+        row = ws.row(count + 1)
+        row.height = 1000
+
+        ws.write(count + 1, 0, item['name'])
+        ws.write(count + 1, 1, item['rate'])
+        ws.write(count + 1, 2, item['hours'])
+        ws.write(count + 1, 3, item['country'])
+        img = Image.open(item['photo']).convert('RGB')
+        path = item['photo'] + '.bmp'
+        img.save(path)
+        ws.insert_bitmap(path, count + 1, 4, scale_y=0.35)
+
+
+
+    wb.save('report.xls')
+
+
+def generate_pdf(items):
+    """
+    Generate pdf report.
+    """
+
+    print 'Generating pdf report'
+
+    from reportlab.lib import colors
+    from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Image
+     
+    doc = SimpleDocTemplate('report.pdf')
+    # container for the 'Flowable' objects
+    elements = []
+     
+    data= []
+    headers = ['Name', 'Rate', 'Hours', 'Country', 'Photo']
+    data.append(headers)
+    for item in items:
+        img = Image(item['photo'])
+        data.append((item['name'], item['rate'], item['hours'],
+                     item['country'], img))
+    table = Table(data)
+    table.setStyle(TableStyle((
+        ('INNERGRID', (0,0), (-1,-1), 0.25, colors.gray),
+        ('BOX', (0,0), (-1,-1), 0.25, colors.gray),
+    )))
+                               
+    elements.append(table)
+    doc.build(elements)
+
+
+def main():
+    contractors = get_contractors()
+    contractors.sort(key=lambda x: x['rate'], reverse=True)
+    for item in contractors:
+        print item['name'], item['rate']
+    generate_excel(contractors)
+    generate_pdf(contractors)
+
+
+# Script entry point: turn on debug logging, then build the reports.
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.DEBUG)
+    main()
+grab
+pycurl
+lxml
+xlwt
+PIL
+reportlab