Source

woocode / py / crawler / tutorial / tutorial / spiders / miniova_spider.py

# -*- encoding:utf-8 -*-

#from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders.crawl import Rule, CrawlSpider
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

from tutorial.items import TorrentItems

class MininovaSpider(CrawlSpider):
    """Crawl mininova.org and scrape one ``TorrentItems`` per torrent page.

    The crawl starts at the site's front page; every extracted link whose
    URL matches ``/tor/<digits>`` is fetched and handed to
    :meth:`parse_torrent`.
    """

    name = 'mininova.org'
    allowed_domains = ['mininova.org']
    # Single entry point for the crawl: the site's front page.
    start_urls = ['http://www.mininova.org/']
    # Raw string so that ``\d`` reaches the regex engine untouched instead of
    # being treated as a (deprecated) string escape sequence.
    rules = [
        Rule(SgmlLinkExtractor(allow=[r'/tor/\d+']), callback='parse_torrent'),
    ]

    def parse_torrent(self, response):
        """Build a ``TorrentItems`` from a single torrent detail page.

        Fields populated:

        * ``url`` -- the URL of the response.
        * ``name`` -- list of text nodes found directly under ``<h1>``
          elements.
        * ``size`` -- the second text node of the third ``<p>`` inside
          ``div#specifications``, or ``None`` when the page does not
          contain it.
        * ``description`` -- list with the serialized ``div#description``
          element(s).

        :param response: the downloaded torrent page.
        :returns: the populated ``TorrentItems`` instance.
        """
        selector = HtmlXPathSelector(response)

        torrent = TorrentItems()
        torrent['url'] = response.url
        torrent['name'] = selector.select("//h1/text()").extract()

        # The wanted value is the second text node of that paragraph. Guard
        # the lookup so a page with a different layout yields size=None
        # instead of raising IndexError and losing the whole item.
        size_nodes = selector.select(
            "//div[@id='specifications']/p[3]/text()").extract()
        torrent['size'] = size_nodes[1] if len(size_nodes) > 1 else None

        torrent['description'] = selector.select(
            "//div[@id='description']").extract()
        return torrent