# woocode / py / crawler / crawler / spiders / gamersky.py
# -*- encoding:utf-8 -*-

from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from crawler.items import CrawlerItem

class GamerskySpider(CrawlSpider):
    """Crawl gamersky.com news/guide/review listing pages.

    Follows article links of the form ``news/YYYYMM/<id>[_<page>].shtml``
    from the configured start pages and yields one CrawlerItem per article.
    """
    name = 'gamersky'
    allowed_domains = ['gamersky.com']
    start_urls = ['http://www.gamersky.com/news/pc/zx/',     # news express
                  'http://www.gamersky.com/news/pc/qz/',
                  'http://www.gamersky.com/handbook/pc/gl/', # guides
                  'http://www.gamersky.com/news/pc/dp/',     # reviews
                  'http://www.gamersky.com/news/tv/zx/',
                  ]

    rules = (
        # Generalized from the hard-coded month '201206' so that articles
        # from any year/month are followed, not only June 2012.
        Rule(SgmlLinkExtractor(allow=r'news/\d{6}/\d+(_\d+)?\.shtml'), callback='parse_article', follow=True),
        # TODO: crawl the full-size images
        # Rule(SgmlLinkExtractor(allow=r'.+\.(jpg|png|gif)$'), callback='parse_img', follow=False),
    )

    def parse_img(self, response):
        """Build an item for an image page.

        ``image_urls`` must be a *list* of URLs: Scrapy's images pipeline
        iterates the field, and a bare string would be iterated character
        by character (bug in the original).
        """
        i = CrawlerItem()
        i['image_urls'] = [response.url]
        return i

    def parse_article(self, response):
        """Extract title, category, inline images and raw body markup
        from a single article page and return it as a CrawlerItem."""
        hxs = HtmlXPathSelector(response)

        i = CrawlerItem()

        # Items are stored under the spider's name downstream.
        i['table_name'] = self.name
        i['url'] = response.url
        i['image_urls'] = hxs.select('//a/img/@src').extract()
        i['title'] = hxs.select('//h1/b/text()').extract()
        i['category'] = hxs.select('//div[@class="tit1 mid"]/a/text()').extract()
        # Grab the (paginated) article body container as raw text lines.
        i['content'] = hxs.select('//div[@id="gspaging"]').re(r'(.+)')

        return i