# woocode / py / crawler /

# Define your item pipelines here.
# Don't forget to add your pipeline to the ITEM_PIPELINES setting.
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import os

from scrapy.contrib.pipeline.images import ImagesPipeline

from utils import get_pathname_from_url, writefile, mkdir
from items import CrawlerItem, ImageItem

class CrawlerPipeline(object):
    """Item pipeline that hands the content of each CrawlerItem to writefile."""

    def process_item(self, item, spider):
        """Write a CrawlerItem's content under 'dat' and pass the item along.

        Items that are not CrawlerItem instances are returned untouched.

        Args:
            item: The scraped item. For a CrawlerItem, ``item['url']`` is
                mapped to a relative path by ``get_pathname_from_url`` and
                ``item['content']`` is iterated as a sequence of text
                fragments.
            spider: The spider that produced the item (unused here).

        Returns:
            The same ``item`` object, unchanged.
        """
        if not isinstance(item, CrawlerItem):
            return item

        target = os.path.join('dat', get_pathname_from_url(item['url']))
        # NOTE(review): the parent directory of ``target`` is not created
        # here; presumably ``writefile`` takes care of that -- confirm.
        payload = '\n'.join(part.encode('utf8') for part in item['content'])
        writefile(target, payload)
        return item

class MyImagePipeline(ImagesPipeline):
    """ImagesPipeline subclass whose image key is derived from the URL."""

    def image_key(self, url):
        """Return the key for ``url`` as computed by get_pathname_from_url."""
        storage_key = get_pathname_from_url(url)
        return storage_key