Source

woocode / py / crawler / tutorial / tutorial / spiders / gamesky_spider.py

# -*- encoding:utf-8 -*-

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector

from tutorial.items import DmozItems, GameItems

class DmozSpider(BaseSpider):
    """Spider that scrapes a single gamersky.com news article.

    The class name does not reflect the target site; it is kept unchanged so
    that existing references to it continue to work.
    """

    # Identifier of this spider; kept as-is for backward compatibility.
    name = "gamesky"
    # Must cover the host used in ``start_urls`` (www.gamersky.com). A domain
    # that does not match would cause followed links to the site to be
    # discarded when Scrapy's offsite filtering is enabled.
    allowed_domains = ['gamersky.com']
    start_urls = ['http://www.gamersky.com/news/201206/203230.shtml']

    def parse(self, response):
        """Build a ``GameItems`` from a downloaded article page.

        Args:
            response: The response for the article page.

        Returns:
            A ``GameItems`` with three fields populated:
            ``url`` is the response URL, ``title`` is the extraction result
            for the text nodes of ``<h1><b>``, and ``description`` is the
            extraction result for the ``<p>`` elements directly under the
            element with id ``gspaging``.
        """
        selector = HtmlXPathSelector(response)

        item = GameItems()
        item['url'] = response.url
        # extract() is applied to every XPath match, so both fields hold the
        # full set of matches rather than a single value.
        item['title'] = selector.select("//h1/b/text()").extract()
        item['description'] = selector.select(
            "//div[@id='gspaging']/p"
        ).extract()
        return item