# Source
#
# woocode / py / crawler / crawler / settings.py
#
# Full commit
# Scrapy settings for crawler project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#

# Modules Scrapy searches for spiders, and the module in which the
# `genspider` command creates new ones.
SPIDER_MODULES = ['crawler.spiders']
NEWSPIDER_MODULE = 'crawler.spiders'

# Name and version of this bot.
BOT_NAME = 'crawler'
BOT_VERSION = '1.0'
# User-Agent override: a desktop Chromium-style string is used instead of the
# bot-derived form '%s/%s' % (BOT_NAME, BOT_VERSION).
# NOTE(review): the string lacks the parentheses and semicolons a browser
# normally sends (e.g. "(X11; Linux x86_64)") -- confirm this is intentional.
USER_AGENT = (
    'Mozilla/5.0 X11 Linux x86_64 '
    'AppleWebKit/535.19 KHTML, like Gecko '
    'Ubuntu/12.04 Chromium/18.0.1025.151 '
    'Chrome/18.0.1025.151 Safari/535.19'
)

# Item pipelines enabled for this project, given as dotted class paths.
# NOTE(review): newer Scrapy releases expect a dict of {path: order} here --
# confirm against the Scrapy version this project targets before changing it.
ITEM_PIPELINES = [
    'crawler.pipelines.CrawlerPipeline',
    'crawler.pipelines.MyImagePipeline',
]

# Enable the Referer spider middleware with order value 10.
SPIDER_MIDDLEWARES = {
    'scrapy.contrib.spidermiddleware.referer.RefererMiddleware': 10,
}

# Filesystem location for stored images.
# NOTE(review): presumably read by the image pipeline -- confirm.
IMAGES_STORE = '/data/crawl/image'

# Per-site crawl configuration, keyed by site name.
# NOTE(review): presumably consumed by the spiders in crawler.spiders --
# confirm how each key is read there.
SITES = {
    'duowan': {
        # (URL regex, callback name) pairs.
        'rules': [
            (r'com/1206/\d+\.html', 'parse_item'),
        ],
        'start_urls': [
            'http://pc.duowan.com/tag/184669959747.html',
        ],
        'allowed_domains': ['duowan.com'],
        'table_name': 'duowan',
    },
    # No configuration has been filled in for this site.
    'qq': {},
}