woocode/py/crawler/crawler/pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/topics/item-pipeline.html
import os
import hashlib
import sqlite3

from scrapy.contrib.pipeline.images import ImagesPipeline
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher

from utils import get_pathname_from_url, writefile, mkdir
from items import CrawlerItem

def unicode_to_str(raw, enc='utf8', sep='|'):
    '''
    Encode unicode into a utf-8 (by default) byte string.

    If raw is iterable (e.g. a list), its elements must all be unicode;
    they are encoded individually and joined with sep.
    '''
    if hasattr(raw, '__iter__'):
        return sep.join([i.encode(enc) for i in raw])

    elif isinstance(raw, unicode):
        return raw.encode(enc)
    elif isinstance(raw, str):
        return raw
    else:
        raise TypeError('Unsupported type: %r <%r>' % (raw, type(raw)))
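
# A minimal usage sketch for unicode_to_str (illustration only, not part of
# the original module):
#
#   >>> unicode_to_str([u'a', u'b', u'c'])
#   'a|b|c'
#   >>> unicode_to_str([u'line1', u'line2'], sep='\n')
#   'line1\nline2'
#   >>> unicode_to_str(u'title')
#   'title'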

class CrawlerPipeline(object):
    filename = 'data.db'
    def __init__(self):
        self.conn = None
        # Open the database when the crawl engine starts, close it when it stops.
        dispatcher.connect(self.initialize, signals.engine_started)
        dispatcher.connect(self.finalize, signals.engine_stopped)

    def process_item(self, item, spider):
        # Encode every field to a utf-8 byte string; list-valued fields are
        # joined with '|', except 'content', which is joined with newlines.
        d = {}
        for k, v in item.items():
            sep = '|'
            if k == 'content':
                sep = '\n'
            d[k] = unicode_to_str(v, sep=sep)

        values = (d['url'], d['title'], d['content'], d['image_urls'],
                  d['category'], d['download_urls'])
        # sqlite3 cannot bind a table name as a parameter, so it is
        # interpolated directly; d['table_name'] must come from trusted code.
        self.conn.execute('INSERT INTO %s VALUES (?, ?, ?, ?, ?, ?)' % d['table_name'], values)

        self.conn.commit()
        # Writing items to the filesystem is skipped for now.
        #if isinstance(item, CrawlerItem):
            #fn = '/data/crawl/sites/' + get_pathname_from_url(item['url'])
            #mkdir(os.path.dirname(fn))
            ## content = '\n'.join([e.encode('utf8') for e in item['content']])
            #writefile(fn, d['content'])
        return item


    def initialize(self):
        self.conn = sqlite3.connect(self.filename)
        # Return/store plain byte strings rather than unicode objects.
        self.conn.text_factory = str

    def finalize(self):
        if self.conn is not None:
            self.conn.commit()
            self.conn.close()
            self.conn = None
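
# process_item above assumes the target table already exists with six
# columns in this order. The repository does not show the DDL, so the
# following CREATE TABLE is only a sketch of the schema the INSERT implies
# ('pages' stands in for whatever item['table_name'] carries):
#
#   conn = sqlite3.connect('data.db')
#   conn.execute('CREATE TABLE IF NOT EXISTS pages ('
#                'url TEXT, title TEXT, content TEXT, '
#                'image_urls TEXT, category TEXT, download_urls TEXT)')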

# An earlier, filesystem-only version of this pipeline, kept for reference:
#class CrawlerPipeline(object):
    #def process_item(self, item, spider):
        #if isinstance(item, CrawlerItem):
            #fn = os.path.join('dat', get_pathname_from_url(item['url']))
            #fd = os.path.dirname(fn)
            #mkdir(fd)
            #content = '\n'.join([e.encode('utf8') for e in item['content']])
            #writefile(fn, content)

        #return item

class MyImagePipeline(ImagesPipeline):
    def _get_hash_key(self, s):
        # Shard by the SHA-1 of the URL (e.g. 'ab', 'cd', 'ef01...') so no
        # single directory has to hold every downloaded image.
        v = hashlib.sha1(s).hexdigest()
        return (v[:2], v[2:4], v[4:])

    def image_key(self, url):
        # Storage path for full-size images, e.g. 'full/ab/cd/ef01....jpg'.
        fp = os.sep.join(self._get_hash_key(url))
        path = 'full/%s.jpg' % fp
        return path

    def thumb_key(self, url, thumb_id):
        # Storage path for thumbnails, one subtree per thumbnail size.
        fp = os.sep.join(self._get_hash_key(url))
        path = 'thumbs/%s/%s.jpg' % (thumb_id, fp)
        return path
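
# Per the note at the top of this file, the pipelines must be registered in
# the project's settings.py. A sketch for this (pre-1.0) Scrapy version; the
# IMAGES_STORE path and thumbnail size are assumptions, not from the repo:
#
#   ITEM_PIPELINES = [
#       'crawler.pipelines.MyImagePipeline',
#       'crawler.pipelines.CrawlerPipeline',
#   ]
#   IMAGES_STORE = '/data/crawl/images'   # root dir for full/ and thumbs/
#   IMAGES_THUMBS = {'small': (50, 50)}   # keys become thumb_id in thumb_key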