# woocode/py/crawler/crawler/spiders/duowan.py

from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.conf import settings
from crawler.items import CrawlerItem

# Key under settings['SITES'] that holds this spider's per-site configuration;
# also reused as the spider's public name (see DuowanSpider.name below).
site_name = 'duowan'
# Site config dict read once at import time. The code below reads
# 'start_urls', 'rules' (pairs of (regex, callback-name)) and 'table_name'
# from it — the full schema lives in the project settings, not here.
site_conf = settings['SITES'][site_name]

class DuowanSpider(CrawlSpider):
    """Crawl spider for duowan.com, driven by settings['SITES']['duowan'].

    Link-following rules and start URLs come from the site config so the
    spider code itself stays generic.
    """

    name = site_name
    allowed_domains = ['duowan.com']
    start_urls = site_conf['start_urls']

    # One crawl rule per (url-regex, callback-name) pair from the config.
    rules = [
        Rule(SgmlLinkExtractor(allow=pattern), callback=handler, follow=True)
        for pattern, handler in site_conf['rules']
    ]

    def parse_item(self, response):
        """Extract one article page into a CrawlerItem.

        Each xpath-backed field is stored as the raw list returned by
        ``extract()``; the two URL-list fields are intentionally left blank.
        """
        selector = HtmlXPathSelector(response)

        item = CrawlerItem()
        item['table_name'] = site_conf['table_name']
        item['url'] = response.url

        # Field -> XPath mapping; all three are extracted the same way.
        field_xpaths = {
            'title': '//h1/text()',
            'content': '//div[@id="text"]/p/text()',
            'category': '//div[@class="mod-crumb"]/a/text()',
        }
        for field, xpath in field_xpaths.items():
            item[field] = selector.select(xpath).extract()

        item['image_urls'] = ''
        item['download_urls'] = ''
        return item