Commits

yang xiaoyong committed b9995d4 Draft

add crawler

  • Parent commits e612268

Comments (0)

Files changed (20)

py/crawler/crawler/__init__.py

Empty file added.

py/crawler/crawler/backup_db.sh

+#!/bin/sh
+
+DST="/data/backups"
+
+cp data.db $DST/$(date +%Y-%m-%d)-data.db

py/crawler/crawler/init_db.py

+import os
+import sqlite3
+
+db = 'data.db'
+def create_table(table_name):
+    #if os.path.exists(db):
+        #print 'db already was initialized.'
+        #return
+    conn = sqlite3.connect(db)
+    conn.execute("""CREATE TABLE IF NOT EXISTS %s (
+         url TEXT,
+         title TEXT,
+         content TEXT,
+         image_urls TEXT,
+         category TEXT,
+         download_urls TEXT)""" % table_name)
+    conn.commit()
+    conn.close()
+
+def add_column(tbl, cl):
+    conn = sqlite3.connect(db)
+    conn.execute("""ALTER table %s add column %s TEXT;""" % (tbl, cl))
+    conn.commit()
+    conn.close()
+
+# domains = ['gamersky', 'dmgame', 'tencent', 'sina', 'duowan',
+domains = ['dmgame', 'tencent', 'sina', 'duowan',
+           'sgamer', 'a9vg', 'g178', 'net163', 'youba', 'pcgames',
+           'verycd', 'g91', 'd_cn']
+
+# Override: only the ali213 table is created on this run; the full domain
+# list above is kept for reference.
+domains = ['ali213']
+for d in domains:
+    create_table(d)
+    # add_column(d, 'download_urls')
+    #create_table(d)
+
+
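A quick way to check what these helpers produced (a sketch for reference, not part of the commit; it assumes the script has been run and data.db sits in the working directory):

    import sqlite3

    conn = sqlite3.connect('data.db')
    # sqlite_master keeps the original CREATE TABLE statement for each table.
    row = conn.execute("SELECT sql FROM sqlite_master WHERE name='ali213'").fetchone()
    print row[0] if row else 'table not found'
    conn.close()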

py/crawler/crawler/items.py

+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/topics/items.html
+
+from scrapy.item import Item, Field
+
+class ImageItem(Item):
+    url = Field()
+    image_urls = Field()
+    images = Field()
+
+class CrawlerItem(Item):
+    table_name = Field()
+    url = Field()
+    title = Field()
+    content = Field()
+    image_urls = Field()
+    images = Field()
+    category = Field()
+    download_urls = Field()
+
+class DownloadItem(Item):
+    table_name = Field()
+    url = Field()
+    title = Field()
+    content = Field()
+    image_urls = Field()
+    images = Field()
+    category = Field()
+    download_urls = Field()
+

py/crawler/crawler/pipelines.py

+# -*- encoding:utf-8 -*-
+
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/topics/item-pipeline.html
+import os
+import hashlib
+import sqlite3
+
+from scrapy.contrib.pipeline.images import ImagesPipeline
+from scrapy import signals
+from scrapy.xlib.pydispatch import dispatcher
+
+from utils import get_pathname_from_url, writefile, mkdir
+from items import CrawlerItem
+
+def unicode_to_str(raw, enc='utf8', sep='|'):
+    '''
+    Encode raw to a byte string using enc.
+    If raw is an iterable, its elements must be unicode; they are joined with sep.
+    '''
+    if hasattr(raw, '__iter__'):
+        return sep.join([i.encode(enc) for i in raw])
+
+    elif isinstance(raw, unicode):
+        return raw.encode(enc)
+    elif isinstance(raw, str):
+        return raw
+    else:
+        raise TypeError('Unsupported type: %r <%r>' % (raw, type(raw)))
+
+class CrawlerPipeline(object):
+    filename = 'data.db'
+    def __init__(self):
+        self.conn = None
+        dispatcher.connect(self.initialize, signals.engine_started)
+        dispatcher.connect(self.finalize, signals.engine_stopped)
+
+    def process_item(self, item, spider):
+        d = {}
+        for k, v in item.items():
+            sep = '|'
+            if k == 'content':
+                sep = '\n'
+            d[k] = unicode_to_str(v, sep=sep)
+
+        values = (d['url'], d['title'], d['content'], d['image_urls'], d['category'], d['download_urls'])
+        self.conn.execute('INSERT INTO %s values (?, ?, ?, ?, ?, ?)' % d['table_name'], values)
+
+        self.conn.commit()
+        # Writing pages to the filesystem is not handled for now
+        #if isinstance(item, CrawlerItem):
+            #fn = '/data/crawl/sites/' + get_pathname_from_url(item['url'])
+            #mkdir(os.path.dirname(fn))
+            ## content = '\n'.join([e.encode('utf8') for e in item['content']])
+            #writefile(fn, d['content'])
+        return item
+
+
+    def initialize(self):
+        self.conn = sqlite3.connect(self.filename)
+        self.conn.text_factory = str
+
+    def finalize(self):
+        if self.conn is not None:
+            self.conn.commit()
+            self.conn.close()
+            self.conn = None
+
+#class CrawlerPipeline(object):
+    #def process_item(self, item, spider):
+        #if isinstance(item, CrawlerItem):
+            #fn = os.path.join('dat', get_pathname_from_url(item['url']))
+            #fd = os.path.dirname(fn)
+            #mkdir(fd)
+            #content = '\n'.join([e.encode('utf8') for e in item['content']])
+            #writefile(fn, content)
+
+        #return item
+
+class MyImagePipeline(ImagesPipeline):
+    def _get_hash_key(self, s):
+        v = hashlib.sha1(s).hexdigest()
+        return (v[:2], v[2:4], v[4:])
+
+    def image_key(self, url):
+        fp = os.sep.join(self._get_hash_key(url))
+        path = 'full/%s.jpg' % fp
+        return path
+
+    def thumb_key(self, url, thumb_id):
+        fp = os.sep.join(self._get_hash_key(url))
+        path = 'thumbs/%s/%s.jpg' % (thumb_id, fp)
+        return path
+
+
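For reference, MyImagePipeline shards stored images by the SHA-1 of their URL: two hex characters per directory level, with the remainder as the file name. A minimal standalone sketch of the same mapping (not part of the commit; the URL is hypothetical):

    import hashlib
    import os

    def image_path(url):
        # Mirrors MyImagePipeline.image_key: full/<sha[:2]>/<sha[2:4]>/<sha[4:]>.jpg
        v = hashlib.sha1(url).hexdigest()
        return 'full/%s.jpg' % os.sep.join((v[:2], v[2:4], v[4:]))

    print image_path('http://example.com/sample.jpg')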

py/crawler/crawler/restruct.py

+#!/usr/bin/env python
+# -*- encoding:utf-8 -*-
+import os
+import shutil
+
+DST = '/data/image/full'
+
+def get_file(path):
+    for root, dirs, files in os.walk(path):
+        for fn in files:
+            yield os.path.join(root, fn)
+
+def copyfile(src, dst):
+    shutil.copyfile(src, dst)
+
+def mkdir(path):
+    if not os.path.exists(path):
+        os.makedirs(path)
+
+def split(v):
+    return (v[:2], v[2:4], v[4:])
+
+
+def main():
+    for fn in get_file('.'):
+        if os.path.basename(fn) == os.path.basename(__file__):
+            continue
+        sha = fn[2:].replace('/', '')
+        dst = DST + '/' + '/'.join(split(sha))
+        mkdir(os.path.dirname(dst))
+        print 'copy %s to %s' % (fn, dst)
+        copyfile(fn, dst)
+
+if __name__ == '__main__':
+    main()
+
+

py/crawler/crawler/scripts/download_img.py

+import os
+import hashlib
+import time
+import sqlite3
+from urllib2 import Request, urlopen
+
+from selenium import webdriver
+
+conn = sqlite3.connect('../data.db')
+cur = conn.cursor()
+
+# two-level hash directory layout, 2 hex characters per level
+step = 2
+level = 2
+USER_AGENT = ('Mozilla/5.0 X11 Linux x86_64 AppleWebKit/535.19 KHTML, '
+              'like Gecko Ubuntu/12.04 Chromium/18.0.1025.151 Chrome/1'
+              '8.0.1025.151 Safari/535.19')
+
+def permu(li):
+    ret = []
+    for i, e in enumerate(li):
+        if i + 1 >= len(li):
+            break
+        ret.append([e, li[i+1]])
+    return ret
+
+indexs = permu(range(0, 50, step))[:level]
+
+def mkdir(path):
+    if not os.path.exists(path):
+        os.makedirs(path)
+
+def get_path_by_url(url):
+    '''hash url to get path'''
+    sha = hashlib.sha1(url).hexdigest()
+    fs = [sha[i[0]:i[1]] for i in indexs]
+    fs.append(sha[level*step:])
+    fs = os.sep.join(fs)
+    ext_index = url.rfind('.')
+    ext = url[ext_index:]
+    if not ext.lower() in ['.jpg', '.gif', '.png']:
+        print 'invalid url', url
+        return
+    fs += ext
+    return fs
+
+def crawl_img(r, fp):
+    try:
+        resp = urlopen(r)
+    except Exception, e:
+        print '%s --> %s' % (r.get_full_url(), e)
+        return None
+    size = 512 * 1024
+    with open(fp, 'wb') as fb:
+        while True:
+            chunk = resp.read(size)
+            if not chunk: break
+            fb.write(chunk)
+    print '[done] %s saved to %s' % (r.get_full_url(), fp)
+
+size_map = {'200x200': '550x550',
+            '116x86': '980x1200'}
+small_size = '200x200'
+big_size = '550x550'
+# small_pat = re.compile('.+%s' % small_size)
+
+def main(url):
+    headers = {'Referer': url, 'User-Agent': USER_AGENT}
+    driver = webdriver.Firefox(timeout=15)
+    driver.get(url)
+    time.sleep(8)
+    img_links = [t.get_attribute('src') for t in driver.find_elements_by_tag_name('img')]
+    if not img_links:
+        return
+    sql = 'UPDATE tencent SET image_urls=? WHERE url=?'
+    cur.execute(sql, ('|'.join(img_links), url))
+    conn.commit()
+    driver.quit()
+    for small_size in size_map:
+        img_links.extend([l.replace(small_size, size_map[small_size]) for l in img_links if small_size in l])
+    requests = [Request(u, headers=headers) for u in img_links]
+
+    for req in requests:
+        fp = get_path_by_url(req.get_full_url())
+        if not fp: continue
+        fp = '/data/crawl/image/full/' + fp
+        if os.path.exists(fp):
+            continue
+        mkdir(os.path.dirname(fp))
+        crawl_img(req, fp)
+
+if __name__ == '__main__':
+    cur.execute('SELECT url, `image_urls` from tencent')
+    urls = [l[0] for l in cur.fetchall() if not l[1]]
+    for l in urls:
+        main(l)

py/crawler/crawler/settings.py

+# Scrapy settings for crawler project
+#
+# For simplicity, this file contains only the most important settings by
+# default. All the other settings are documented here:
+#
+#     http://doc.scrapy.org/topics/settings.html
+#
+
+BOT_NAME = 'crawler'
+BOT_VERSION = '1.0'
+
+SPIDER_MODULES = ['crawler.spiders']
+NEWSPIDER_MODULE = 'crawler.spiders'
+# USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
+
+USER_AGENT = 'Mozilla/5.0 X11 Linux x86_64 AppleWebKit/535.19 KHTML, like Gecko Ubuntu/12.04 Chromium/18.0.1025.151 Chrome/18.0.1025.151 Safari/535.19'
+
+ITEM_PIPELINES = ['crawler.pipelines.CrawlerPipeline',
+                  'crawler.pipelines.MyImagePipeline'
+                  ]
+
+SPIDER_MIDDLEWARES = {'scrapy.contrib.spidermiddleware.referer.RefererMiddleware': 10}
+IMAGES_STORE = '/data/crawl/image'
+
+SITES = {'duowan': {'rules': [(r'com/1206/\d+\.html', 'parse_item')],
+                    'start_urls': ['http://pc.duowan.com/tag/184669959747.html'],
+                    'allowed_domains': ['duowan.com'],
+                    'table_name': 'duowan',
+                    },
+         'qq': {},
+         }
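The duowan spider in this commit reads its start_urls, link rules and table name from this SITES mapping, so a new site can be wired up here instead of in spider code. A hypothetical entry might look like this (the regex and URLs are placeholders, not tested values):

    SITES['ali213'] = {'rules': [(r'html/2012-6/\d+\.html', 'parse_item')],
                       'start_urls': ['http://www.ali213.net/news/'],
                       'allowed_domains': ['ali213.net'],
                       'table_name': 'ali213',
                       }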

py/crawler/crawler/spiders/__init__.py

+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.

py/crawler/crawler/spiders/a3dmgame.py

+# -*- encoding:utf-8 -*-
+import re
+from urlparse import urljoin
+
+from scrapy.selector import HtmlXPathSelector
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+from scrapy.contrib.spiders import CrawlSpider, Rule
+from crawler.items import CrawlerItem
+
+date = '201206'
+thunder_pat = re.compile(r'href=["\'](thunder://.+?)["\']\s*')
+
+class A3dmSpider(CrawlSpider):
+    name = 'a3dmgame'
+    allowed_domains = ['3dmgame.com']
+    start_urls = [
+                  'http://www.3dmgame.com/news/',
+                  'http://www.3dmgame.com/preview/',
+                  'http://www.3dmgame.com/review/',
+                  'http://dl.3dmgame.com/SoftList_18.html',
+                  ]
+
+    rules = (
+        Rule(SgmlLinkExtractor(allow=r'com/%s/\d+(_\d+)?\.html' % date), callback='parse_soft', follow=True),
+        Rule(SgmlLinkExtractor(allow=r'news/%s/\d+(_\d+)?\.html' % date), callback='parse_news', follow=True),
+        Rule(SgmlLinkExtractor(allow=r'preview/%s/\d+(_\d+)?\.html' % date), callback='parse_news', follow=True),
+        Rule(SgmlLinkExtractor(allow=r'review/%s/\d+(_\d+)?\.html' % date), callback='parse_news', follow=True),
+    )
+
+    def _parse_rel_link(self, url, link):
+        # Resolve relative and root-relative links against the page URL;
+        # absolute links are returned unchanged.
+        return urljoin(url, link)
+
+    def parse_news(self, response):
+        hxs = HtmlXPathSelector(response)
+
+        i = CrawlerItem()
+        i['table_name'] = self.name
+        i['url'] = response.url
+        i['title'] = hxs.select('//h1/text()').extract()
+        i['image_urls'] = [self._parse_rel_link(response.url.strip(), u.strip()) for u in hxs.select('//img/@src').extract()
+                          if u.strip()[-4:].lower() in ['.jpg', '.png', '.gif']] # TODO .jpeg support
+        i['category'] = hxs.select('//div[@id="Location"]/div[@class="LoTitle"]/text()').extract()[1]
+        i['content'] = hxs.select('//div[@class="DeContent"]').re('.+')
+        i['download_urls'] = ''
+        return i
+
+    def parse_soft(self, response):
+        hxs = HtmlXPathSelector(response)
+
+        i = CrawlerItem()
+        i['table_name'] = self.name
+        i['url'] = response.url
+        i['title'] = hxs.select('//h1/text()').extract()
+        i['image_urls'] = [self._parse_rel_link(response.url.strip(), u.strip()) for u in hxs.select('//img/@src').extract()]
+        i['content'] = hxs.select('//div[@class="gameContent"]').re('.+')
+        i['category'] = hxs.select('//div[@class="gameContent"]/div[@class="jbContentBOttom"]/dl/ul/li[3]/dd/div[@class="wenziRight"]/text()').extract()[0].strip()
+        i['download_urls'] = [thunder_pat.search(t).group(1) for t in hxs.select('//div[@class="xiazaiList"]/a').extract()
+                              if thunder_pat.search(t)]
+
+        return i
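For reference, thunder_pat above extracts the thunder:// URI from each download anchor; a quick check with a made-up link (not part of the commit):

    import re

    thunder_pat = re.compile(r'href=["\'](thunder://.+?)["\']\s*')
    tag = '<a href="thunder://QUZha2VMaW5r">download</a>'  # hypothetical link
    m = thunder_pat.search(tag)
    print m.group(1) if m else 'no match'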

py/crawler/crawler/spiders/duowan.py

+from scrapy.selector import HtmlXPathSelector
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+from scrapy.contrib.spiders import CrawlSpider, Rule
+from scrapy.conf import settings
+from crawler.items import CrawlerItem
+
+site_name = 'duowan'
+site_conf = settings['SITES'][site_name]
+
+class DuowanSpider(CrawlSpider):
+    name = site_name
+    allowed_domains = ['duowan.com']
+    start_urls = site_conf['start_urls']
+
+    rules = [Rule(SgmlLinkExtractor(allow=regex), callback=func, follow=True) for regex, func in site_conf['rules']]
+
+    def parse_item(self, response):
+        hxs = HtmlXPathSelector(response)
+        i = CrawlerItem()
+        i['table_name'] = site_conf['table_name']
+        i['url'] = response.url
+        i['title'] = hxs.select('//h1/text()').extract()
+        i['content'] = hxs.select('//div[@id="text"]/p/text()').extract()
+        i['image_urls'] = ''
+        i['category'] = hxs.select('//div[@class="mod-crumb"]/a/text()').extract()
+        i['download_urls'] = ''
+
+        return i

py/crawler/crawler/spiders/gamersky.py

+# -*- encoding:utf-8 -*-
+
+from scrapy.selector import HtmlXPathSelector
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+from scrapy.contrib.spiders import CrawlSpider, Rule
+from crawler.items import CrawlerItem
+
+class GamerskySpider(CrawlSpider):
+    name = 'gamersky'
+    allowed_domains = ['gamersky.com']
+    start_urls = ['http://www.gamersky.com/news/pc/zx/',     # latest news
+                  'http://www.gamersky.com/news/pc/qz/',
+                  'http://www.gamersky.com/handbook/pc/gl/', # guides
+                  'http://www.gamersky.com/news/pc/dp/',     # reviews
+                  'http://www.gamersky.com/news/tv/zx/',
+                  ]
+
+    rules = (
+        Rule(SgmlLinkExtractor(allow=r'news/201206/\d+(_\d+)?\.shtml'), callback='parse_article', follow=True),
+        # TODO: crawl the full-size images
+        # Rule(SgmlLinkExtractor(allow=r'.+\.(jpg|png|gif)$'), callback='parse_img', follow=False),
+    )
+
+    def parse_img(self, response):
+        # hxs = HtmlXPathSelector(response)
+        i = CrawlerItem()
+        i['image_urls'] = response.url  # hxs('//div[@id="gspaging"]/p/a/@href').extract()
+        return i
+
+    def parse_article(self, response):
+        hxs = HtmlXPathSelector(response)
+
+        i = CrawlerItem()
+
+        i['table_name'] = self.name
+        i['url'] = response.url
+        i['image_urls'] = hxs.select('//a/img/@src').extract()
+        i['title'] = hxs.select('//h1/b/text()').extract()
+        i['category'] = hxs.select('//div[@class="tit1 mid"]/a/text()').extract()
+        i['content'] = hxs.select('//div[@id="gspaging"]').re(r'(.+)')
+
+        return i

py/crawler/crawler/spiders/gamersky_dw.py

+# -*- encoding:utf-8 -*-
+
+from scrapy.selector import HtmlXPathSelector
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+from scrapy.contrib.spiders import CrawlSpider, Rule
+from crawler.items import CrawlerItem, DownloadItem
+
+class GamerskySpider(CrawlSpider):
+    name = 'dw'
+    allowed_domains = ['gamersky.com']
+    start_urls = ['http://www.gamersky.com/Soft/te/',
+                  ]
+
+    rules = (
+        Rule(SgmlLinkExtractor(allow=r'Soft/201206/\d+(_\d+)?\.shtml'), callback='parse_article', follow=True),
+    )
+
+    def parse_article(self, response):
+        hxs = HtmlXPathSelector(response)
+
+        i = DownloadItem()
+
+        i['table_name'] = 'gamersky_dw'
+        i['url'] = response.url
+        i['image_urls'] = hxs.select('//a/img/@src').extract()
+        i['title'] = hxs.select('//h1/b/text()').extract()
+        i['category'] = hxs.select('//div[@class="tit1 mid"]/a/text()').extract()
+        i['content'] = hxs.select('//div[@class="actdl"]').re('.+')
+        i['download_urls'] = hxs.select('//div[@class="dvurl1"]/p/a/@href').extract()
+
+        return i

py/crawler/crawler/spiders/qq.py

+from scrapy.selector import HtmlXPathSelector
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+from scrapy.contrib.spiders import CrawlSpider, Rule
+from crawler.items import CrawlerItem
+
+class QqSpider(CrawlSpider):
+    name = 'qq'
+    allowed_domains = ['qq.com']
+    start_urls = ['http://games.qq.com/pcgame/05pchotnews/list2009124183318.htm',
+                  'http://games.qq.com/l/tvgame/07zxzx/list2010098184725.htm',
+                  ]
+
+    rules = (
+        Rule(SgmlLinkExtractor(allow=r'a/201206\d+/\d+\.htm'), callback='parse_item', follow=True),
+    )
+
+    def parse_item(self, response):
+        hxs = HtmlXPathSelector(response)
+        i = CrawlerItem()
+        i['table_name'] = 'tencent'
+        i['title'] = hxs.select('//h1/text()').extract()
+        i['content'] = hxs.select('//div[@bosszone="content"]/p/text()').extract()
+        i['url'] = response.url
+        i['image_urls'] = ''
+        i['category'] = hxs.select('//span[@bosszone="crumbNav"]/a/text()').extract()
+        i['download_urls'] = ''
+
+        return i

py/crawler/crawler/spiders/test.py

+from scrapy.spider import BaseSpider
+
+class TestSpider(BaseSpider):
+    name = "test"
+    allowed_domains = ["google.com"]
+    start_urls = (
+        'http://www.google.com/',
+        'http://www.baidu.com/',
+        'http://www.bing.com/',
+        )
+
+    def parse(self, response):
+        self.log('A response from %s just arrived!' % response.url)

py/crawler/crawler/test.sh

+#!/bin/sh
+
+scrapy crawl gamersky
+# scrapy crawl 3dmgame

py/crawler/crawler/utils.py

+import os
+from urlparse import urlparse
+
+
+def writefile(fn, content):
+    with open(fn, 'wb') as fb:
+        fb.write(content)
+
+def mkdir(path):
+    if not os.path.exists(path):
+        os.makedirs(path)
+
+def get_pathname_from_url(url):
+    u = urlparse(url)
+    fn = '/'.join([u.netloc, u.path])
+    #mkdir(os.path.dirname(fn))
+    return fn

py/crawler/scrapy.cfg

+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# http://doc.scrapy.org/topics/scrapyd.html
+
+[settings]
+default = crawler.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = crawler

py/pil/add_watermark.py

+#!/usr/bin/env python
+# -*- encoding:utf-8 -*-
+
+import os
+import Image
+
+
+def batch(infolder, outfolder, watermark):
+    mark = Image.open(watermark)
+    for root, dirs, files in os.walk(infolder):
+        for name in files:
+            try:
+                im = Image.open(os.path.join(root, name))
+                if im.mode != 'RGBA':
+                    im = im.convert('RGBA')
+                layer = Image.new('RGBA', im.size, (0, 0, 0, 0))
+                position = (im.size[0] - mark.size[0],
+                            im.size[1] - mark.size[1])
+                layer.paste(mark, position)
+                Image.composite(layer, im, layer).save(os.path.join(outfolder, name))
+            except Exception, e:
+                print e
+
+def test():
+    batch('/home/yxy/Pictures', '/home/yxy/tmp', '/home/yxy/tmp/t.png')
+
+if __name__ == '__main__':
+    test()

py/sa/obj.pkl

Binary file modified.