Source

woocode / py / crawler / crawler / scripts / download_img.py

import errno
import hashlib
import os
import sqlite3
import time
from urllib2 import Request, urlopen

from selenium import webdriver

# Module-level sqlite connection/cursor shared by main() and the
# __main__ block.  NOTE(review): the db path is relative to the cwd the
# script is launched from -- confirm the script is always run from
# crawler/scripts/.
conn = sqlite3.connect('../data.db')
cur = conn.cursor()

# Hashed on-disk layout: `level` directory levels, each named with `step`
# hex chars of the image url's sha1 (consumed by get_path_by_url below).
step = 2
level = 2
# Spoofed browser User-Agent sent with every image download request.
USER_AGENT = ('Mozilla/5.0 X11 Linux x86_64 AppleWebKit/535.19 KHTML, '
              'like Gecko Ubuntu/12.04 Chromium/18.0.1025.151 Chrome/1'
              '8.0.1025.151 Safari/535.19')

def permu(li):
    """Return the consecutive overlapping pairs of *li* as 2-element lists.

    e.g. permu([0, 2, 4]) -> [[0, 2], [2, 4]].  Sequences with fewer
    than two elements yield [].
    """
    return [[a, b] for a, b in zip(li, li[1:])]

indexs = permu(range(0, 50, step))[:level]

def mkdir(path):
    """Create *path* (including intermediate directories) if missing.

    Uses EAFP try/except rather than the original exists()+makedirs
    pair, which had a TOCTOU race: two crawler processes creating the
    same hash directory concurrently could crash each other.
    """
    try:
        os.makedirs(path)
    except OSError as e:
        # Directory already created (possibly by a concurrent run) is
        # fine; any other error (permissions, bad path) is re-raised.
        if e.errno != errno.EEXIST:
            raise

def get_path_by_url(url):
    '''hash url to get path'''
    sha = hashlib.sha1(url).hexdigest()
    fs = [sha[i[0]:i[1]] for i in indexs]
    fs.append(sha[level*step:])
    fs = os.sep.join(fs)
    ext_index = url.rfind('.')
    ext = url[ext_index:]
    if not ext.lower() in ['.jpg', '.gif', '.png']:
        print 'invalid url', url
        return
    fs += ext
    return fs

def crawl_img(r, fp):
    try:
        resp = urlopen(r)
    except Exception, e:
        print '%s --> %s' % (r.get_full_url(), e)
        return None
    size = 512 * 1024
    with open(fp, 'wb') as fb:
        while True:
            chunk = resp.read(size)
            if not chunk: break
            fb.write(chunk)
    print '[done] %s saved to %s' % (r.get_full_url(), fp)

# Thumbnail-size -> full-size substring replacements applied to image
# urls in main() to also fetch the large variant of each thumbnail.
size_map = {'200x200': '550x550',
            '116x86': '980x1200'}
# NOTE(review): the two names below appear unused at module level
# (main() rebinds small_size as a loop variable); kept as-is for
# backward compatibility.
small_size = '200x200'
big_size = '550x550'
# small_pat = re.compile('.+%s' % small_size)

def main(url):
    """Crawl one gallery page: record its <img> src urls in sqlite,
    then download every image (plus larger size variants) to disk.

    Pages with no images are skipped after the browser is closed.
    """
    headers = {'Referer': url, 'User-Agent': USER_AGENT}
    driver = webdriver.Firefox(timeout=15)
    try:
        driver.get(url)
        # crude fixed wait for the page's lazy-loaded images to render
        time.sleep(8)
        img_links = [t.get_attribute('src')
                     for t in driver.find_elements_by_tag_name('img')]
    finally:
        # Always quit: the original leaked a Firefox process on the
        # empty-page early return and on any exception before quit().
        driver.quit()
    if not img_links:
        return
    # Parameterized query: the original interpolated url/src values into
    # the SQL string with %, which breaks on quotes and is injectable.
    cur.execute('UPDATE tencent SET image_urls=? WHERE url=?',
                ('|'.join(img_links), url))
    conn.commit()
    # Also queue the full-size variant of every recognized thumbnail.
    for small, big in size_map.items():
        img_links.extend([l.replace(small, big)
                          for l in img_links if small in l])
    requests = [Request(u, headers=headers) for u in img_links]

    for req in requests:
        fp = get_path_by_url(req.get_full_url())
        if not fp:
            continue
        fp = '/data/crawl/image/full/' + fp
        if os.path.exists(fp):
            # already downloaded on a previous run
            continue
        mkdir(os.path.dirname(fp))
        crawl_img(req, fp)

if __name__ == '__main__':
    # Crawl only rows whose image_urls column is still empty/NULL,
    # so re-runs resume where the last crawl stopped.
    cur.execute('SELECT url, `image_urls` from tencent')
    pending = [row[0] for row in cur.fetchall() if not row[1]]
    for page_url in pending:
        main(page_url)
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.