Source

woocode / py / crawler / spiders / gamersky.py

Full commit
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from crawler.items import GamerskyItem, ImageItem

class GamerskySpider(CrawlSpider):
    """Crawl gamersky.com news index pages and scrape the linked articles.

    The crawl starts from three news index pages.  Every extracted link whose
    URL matches ``news/201206/<digits>[_<digits>].shtml`` is passed to
    ``parse_article`` and is also followed to discover further links.
    """

    name = 'gamersky'
    allowed_domains = ['gamersky.com']
    # For a quick single-page run, point start_urls at one article instead,
    # e.g. ['http://www.gamersky.com/news/201206/203393.shtml'].
    start_urls = [
        'http://www.gamersky.com/news/pc/zx/',
        'http://www.gamersky.com/news/pc/qz/',
        'http://www.gamersky.com/news/tv/zx/',
    ]

    rules = (
        Rule(
            SgmlLinkExtractor(allow=r'news/201206/\d+(_\d+)?\.shtml'),
            callback='parse_article',
            follow=True
        ),
    )

    def parse_article(self, response):
        """Build a ``GamerskyItem`` from a single article response.

        Fields populated:

        * ``url``        -- the URL of the response.
        * ``image_urls`` -- ``src`` attributes of ``<img>`` elements that are
          direct children of an ``<a>`` element.
        * ``title``      -- text of ``<b>`` elements directly inside ``<h1>``.
        * ``category``   -- text of links directly inside the
          ``<div class="tit1 mid">`` element.
        * ``content``    -- matches of the ``(.+)`` regex over the
          ``<div id="gspaging">`` element.

        Returns the populated item.
        """
        page = HtmlXPathSelector(response)

        article = GamerskyItem()
        article['url'] = response.url

        linked_images = page.select('//a/img/@src')
        article['image_urls'] = linked_images.extract()

        headline = page.select('//h1/b/text()')
        article['title'] = headline.extract()

        breadcrumb_links = page.select('//div[@class="tit1 mid"]/a/text()')
        article['category'] = breadcrumb_links.extract()

        body_container = page.select('//div[@id="gspaging"]')
        article['content'] = body_container.re(r'(.+)')

        return article