Author: yang xiaoyong (woocode)

Source: woocode/py/crawler/crawler/spiders/duowan.py

from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.conf import settings
from crawler.items import CrawlerItem

# Site identifier; doubles as the spider's `name` attribute below.
site_name = 'duowan'
# Per-site configuration pulled from the project settings' SITES mapping.
# NOTE(review): assumed to provide 'start_urls', 'rules' (list of
# (regex, callback) pairs) and 'table_name' — verify against settings.
site_conf = settings['SITES'][site_name]

class DuowanSpider(CrawlSpider):
    """Crawl spider for duowan.com, driven entirely by the SITES config entry."""

    name = site_name
    allowed_domains = ['duowan.com']
    start_urls = site_conf['start_urls']

    # Build one crawl rule per (url-regex, callback-name) pair from the config.
    rules = [
        Rule(SgmlLinkExtractor(allow=pattern), callback=handler, follow=True)
        for pattern, handler in site_conf['rules']
    ]

    def parse_item(self, response):
        """Extract article fields from a duowan.com page into a CrawlerItem.

        Title/content/category come from XPath queries; image and download
        URLs are left blank for this site.
        """
        selector = HtmlXPathSelector(response)

        item = CrawlerItem()
        item['table_name'] = site_conf['table_name']
        item['url'] = response.url
        item['title'] = selector.select('//h1/text()').extract()
        item['content'] = selector.select('//div[@id="text"]/p/text()').extract()
        item['image_urls'] = ''
        item['category'] = selector.select('//div[@class="mod-crumb"]/a/text()').extract()
        item['download_urls'] = ''

        return item