# Author: yang xiaoyong (woocode)
# woocode/py/crawler/pipelines.py

# Define your item pipelines here
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/topics/item-pipeline.html
import os

from scrapy.contrib.pipeline.images import ImagesPipeline

from utils import get_pathname_from_url, writefile, mkdir
from items import CrawlerItem, ImageItem

class CrawlerPipeline(object):
    """Persist crawled page content to disk under the local ``dat`` directory.

    Each ``CrawlerItem`` is written to a file whose path is derived from the
    item's URL; all other item types pass through untouched.
    """

    def process_item(self, item, spider):
        """Write ``item['content']`` (a list of unicode lines) to a file.

        :param item: the scraped item; only ``CrawlerItem`` instances are handled
        :param spider: the spider that produced the item (unused)
        :returns: the item, unchanged, for downstream pipelines
        """
        if isinstance(item, CrawlerItem):
            fn = os.path.join('dat', get_pathname_from_url(item['url']))
            fd = os.path.dirname(fn)
            # Bug fix: `fd` was computed but never used and the imported
            # `mkdir` helper was never called — create the target directory
            # before writing so `writefile` does not fail on a missing path.
            mkdir(fd)
            writefile(fn, '\n'.join([e.encode('utf8') for e in item['content']]))

        return item

class MyImagePipeline(ImagesPipeline):
    """Image pipeline that stores downloads under URL-derived paths.

    Overrides the default hash-based naming of ``ImagesPipeline`` so that
    each image's storage key mirrors the structure of its source URL.
    """

    def image_key(self, url):
        """Return the storage path for the image at ``url``."""
        storage_path = get_pathname_from_url(url)
        return storage_path