Source

woocode / py / crawler / crawler / spiders / a3dmgame.py

# -*- encoding:utf-8 -*-
import re

try:  # Python 3
    from urllib.parse import urljoin
except ImportError:  # Python 2
    from urlparse import urljoin

from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector

from crawler.items import CrawlerItem

# Path segment interpolated into every crawl rule below, so only links whose
# URL contains this segment are followed (looks like a YYYYMM year-month
# stamp -- TODO confirm against the site's URL scheme).
date = '201206'
# Captures the thunder:// link held in the href attribute of an anchor tag;
# group(1) is the link itself without the surrounding quotes.
thunder_pat = re.compile(r'href=["\'](thunder://.+?)["\']\s*')

class A3dmSpider(CrawlSpider):
    """Spider for 3dmgame.com article pages and download pages.

    Starting from the news/preview/review listings and one download listing,
    it follows links whose path contains the module-level ``date`` segment.
    Article pages are handled by ``parse_news`` and download pages by
    ``parse_soft``; both return a populated ``CrawlerItem``.
    """

    name = 'a3dmgame'
    allowed_domains = ['3dmgame.com']
    start_urls = [
        'http://www.3dmgame.com/news/',
        'http://www.3dmgame.com/preview/',
        'http://www.3dmgame.com/review/',
        'http://dl.3dmgame.com/SoftList_18.html',
    ]

    # Suffixes (compared case-insensitively) that mark an <img> src as an
    # image worth collecting from an article page.
    _IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.gif')

    rules = (
        # Download pages sit directly under the host: <host>.com/<date>/<id>.html
        Rule(SgmlLinkExtractor(allow=r'com/%s/\d+(_\d+)?\.html' % date),
             callback='parse_soft', follow=True),
        # Article pages live under a section directory; the optional "_<n>"
        # part of the pattern also matches URLs such as <id>_2.html.
        Rule(SgmlLinkExtractor(allow=r'news/%s/\d+(_\d+)?\.html' % date),
             callback='parse_news', follow=True),
        Rule(SgmlLinkExtractor(allow=r'preview/%s/\d+(_\d+)?\.html' % date),
             callback='parse_news', follow=True),
        Rule(SgmlLinkExtractor(allow=r'review/%s/\d+(_\d+)?\.html' % date),
             callback='parse_news', follow=True),
    )

    def _parse_rel_link(self, url, link):
        """Resolve ``link`` found on the page at ``url`` to an absolute URL.

        Uses standard URL joining, so root-relative links (``/a.jpg``) resolve
        against the site root, ``..`` segments are collapsed, plain relative
        links are resolved against the page's directory, and links that are
        already absolute are returned unchanged.

        An empty ``link`` is returned as-is instead of being turned into the
        page URL itself.
        """
        if not link:
            return link
        return urljoin(url, link)

    def parse_news(self, response):
        """Build a ``CrawlerItem`` from a news/preview/review article page.

        Only images whose src ends in one of ``_IMAGE_EXTENSIONS`` are kept.
        ``download_urls`` is always the empty string for articles.

        Raises:
            IndexError: if the ``LoTitle`` breadcrumb has fewer than two text
                nodes, since the category is read from the second one.
        """
        hxs = HtmlXPathSelector(response)
        page_url = response.url.strip()

        image_urls = []
        for src in hxs.select('//img/@src').extract():
            src = src.strip()
            if src.lower().endswith(self._IMAGE_EXTENSIONS):
                image_urls.append(self._parse_rel_link(page_url, src))

        item = CrawlerItem()
        item['table_name'] = self.name
        item['url'] = response.url
        item['title'] = hxs.select('//h1/text()').extract()
        item['image_urls'] = image_urls
        item['category'] = hxs.select(
            '//div[@id="Location"]/div[@class="LoTitle"]/text()').extract()[1]
        item['content'] = hxs.select('//div[@class="DeContent"]').re('.+')
        item['download_urls'] = ''
        return item

    def parse_soft(self, response):
        """Build a ``CrawlerItem`` from a download page.

        Every ``<img>`` src on the page is collected (no extension filter),
        and ``download_urls`` holds the thunder:// links found in the anchors
        of the ``xiazaiList`` block.

        Raises:
            IndexError: if the category node inside ``gameContent`` is absent.
        """
        hxs = HtmlXPathSelector(response)
        page_url = response.url.strip()

        # Search each anchor once and reuse the match object.
        anchors = hxs.select('//div[@class="xiazaiList"]/a').extract()
        matches = [thunder_pat.search(anchor) for anchor in anchors]

        item = CrawlerItem()
        item['table_name'] = self.name
        item['url'] = response.url
        item['title'] = hxs.select('//h1/text()').extract()
        item['image_urls'] = [self._parse_rel_link(page_url, src.strip())
                              for src in hxs.select('//img/@src').extract()]
        item['content'] = hxs.select('//div[@class="gameContent"]').re('.+')
        item['category'] = hxs.select(
            '//div[@class="gameContent"]/div[@class="jbContentBOttom"]'
            '/dl/ul/li[3]/dd/div[@class="wenziRight"]/text()'
        ).extract()[0].strip()
        item['download_urls'] = [m.group(1) for m in matches if m]

        return item