Source

woocode / py / crawler / tutorial / tutorial / spiders / gamesky_spider.py

Full commit
# -*- encoding:utf-8 -*-

#from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders.crawl import Rule, CrawlSpider
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

from tutorial.items import TorrentItems, GameItems

class GameskySpider(CrawlSpider):
    """Crawl gamersky.com news articles and emit one ``GameItems`` per page.

    The crawl starts from a single article URL and follows every link whose
    path matches ``/<digits>/<digits>.shtml``; each matching page is handed
    to :meth:`parse_article`.
    """

    name = 'gamesky'
    # Must match the host used in ``start_urls``. The previous value
    # ('gamesky.com') did not match 'www.gamersky.com', so links extracted
    # by the rule below were treated as off-site and never followed.
    allowed_domains = ['gamersky.com']
    start_urls = ['http://www.gamersky.com/news/201206/203230.shtml']
    # Raw string so the regex escapes (\d, \.) reach the regex engine intact
    # instead of being interpreted as (invalid) string escapes.
    rules = [
        Rule(
            SgmlLinkExtractor(allow=[r'/\d+/\d+\.shtml']),
            callback='parse_article',
        ),
    ]

    def parse_article(self, response):
        """Build a ``GameItems`` from one article response.

        Args:
            response: the downloaded page passed in by the crawl rule.

        Returns:
            A ``GameItems`` with ``url`` set to ``response.url`` and
            ``title`` / ``description`` set to whatever the XPath queries
            extract from the page (the text of ``<h1>`` elements and the
            direct text of the ``div#gspaging`` element, respectively).
        """
        selector = HtmlXPathSelector(response)

        item = GameItems()
        item['url'] = response.url
        item['title'] = selector.select("//h1/text()").extract()
        item['description'] = selector.select(
            "//div[@id='gspaging']/text()"
        ).extract()
        # TODO: populate item['download']; an earlier attempt used
        # "//div[@id='description']" but was left disabled.
        return item