Source

woocode / py / crawler / crawler / spiders / gamersky_dw.py

Full commit
# -*- encoding:utf-8 -*-

from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from crawler.items import CrawlerItem, DownloadItem

class GamerskySpider(CrawlSpider):
    """Crawl spider for the software download pages of gamersky.com.

    Crawling starts at the ``Soft/te/`` listing. Every link whose URL
    matches the ``Soft/201206/<digits>[_<digits>].shtml`` pattern is handed
    to ``parse_article`` and, because the rule sets ``follow=True``, is also
    scanned for further matching links.
    """

    name = 'dw'
    allowed_domains = ['gamersky.com']
    start_urls = ['http://www.gamersky.com/Soft/te/']

    rules = (
        Rule(
            SgmlLinkExtractor(allow=r'Soft/201206/\d+(_\d+)?\.shtml'),
            callback='parse_article',
            follow=True,
        ),
    )

    def parse_article(self, response):
        """Build a ``DownloadItem`` from one download article page.

        The selector results are stored exactly as returned by
        ``extract()`` / ``re()``; nothing is unwrapped or cleaned here.

        :param response: the response object for a page matched by ``rules``.
        :returns: the populated ``DownloadItem``.
        """
        selector = HtmlXPathSelector(response)

        item = DownloadItem()

        # Bookkeeping fields: a fixed table name and the page address.
        # NOTE(review): 'table_name' is presumably read by an item pipeline
        # to pick the storage table -- confirm against the pipeline code.
        item['table_name'] = 'gamersky_dw'
        item['url'] = response.url

        # Fields scraped from the page markup.
        item['image_urls'] = selector.select('//a/img/@src').extract()
        item['title'] = selector.select('//h1/b/text()').extract()
        item['category'] = selector.select(
            '//div[@class="tit1 mid"]/a/text()').extract()
        # The description block is captured through a regex rather than
        # extract(); with '.+' this presumably yields the markup split on
        # line breaks -- confirm against real pages.
        item['content'] = selector.select('//div[@class="actdl"]').re('.+')
        item['download_urls'] = selector.select(
            '//div[@class="dvurl1"]/p/a/@href').extract()

        return item