Commits

yang xiaoyong committed e612268

add crawler

  • Parent commits: 8afb40f

Files changed (9)

File py/crawler/__init__.py

Empty file added.

File py/crawler/items.py

+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/topics/items.html
+
+from scrapy.item import Item, Field
+
+class CrawlerItem(Item):
+    # Generic crawled article: URL, title, category and extracted text.
+    url = Field()
+    title = Field()
+    category = Field()
+    content = Field()
+
+class ImageItem(Item):
+    # Fields consumed and populated by the Scrapy images pipeline.
+    url = Field()
+    image_urls = Field()
+    images = Field()
+
+class GamerskyItem(Item):
+    # gamersky.com article, with image_urls/images for the images pipeline.
+    url = Field()
+    title = Field()
+    content = Field()
+    image_urls = Field()
+    images = Field()
+    category = Field()
+
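
As a reference, a minimal sketch of how these Item classes are used (dict-style access over a fixed set of declared fields; the values are illustrative):

    from crawler.items import GamerskyItem

    item = GamerskyItem()
    item['url'] = 'http://www.gamersky.com/news/201206/203393.shtml'
    item['title'] = [u'Example title']   # XPath extract() returns lists
    # Assigning a field that was not declared raises KeyError:
    # item['author'] = 'x'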

File py/crawler/pipelines.py

+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/topics/item-pipeline.html
+import os
+
+from scrapy.contrib.pipeline.images import ImagesPipeline
+
+from crawler.utils import get_pathname_from_url, writefile, mkdir
+from crawler.items import CrawlerItem
+
+class CrawlerPipeline(object):
+    def process_item(self, item, spider):
+        # Only plain CrawlerItem instances are written to disk here;
+        # image downloads are handled by MyImagePipeline below.
+        if isinstance(item, CrawlerItem):
+            fn = os.path.join('dat', get_pathname_from_url(item['url']))
+            fd = os.path.dirname(fn)
+            mkdir(fd)
+            writefile(fn, '\n'.join([e.encode('utf8') for e in item['content']]))
+
+        return item
+
+class MyImagePipeline(ImagesPipeline):
+    def image_key(self, url):
+        # Store images under their source URL layout instead of the
+        # default SHA1-based filenames.
+        return get_pathname_from_url(url)
+
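
A hypothetical way to exercise CrawlerPipeline outside a full crawl (the spider argument is unused by this pipeline, so None is passed):

    from crawler.pipelines import CrawlerPipeline
    from crawler.items import CrawlerItem

    item = CrawlerItem(url='http://example.com/a/b.html',
                       content=[u'first line', u'second line'])
    CrawlerPipeline().process_item(item, None)
    # writes the joined content to dat/example.com/a/b.html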

File py/crawler/settings.py

+# Scrapy settings for crawler project
+#
+# For simplicity, this file contains only the most important settings by
+# default. All the other settings are documented here:
+#
+#     http://doc.scrapy.org/topics/settings.html
+#
+
+BOT_NAME = 'crawler'
+BOT_VERSION = '1.0'
+
+SPIDER_MODULES = ['crawler.spiders']
+NEWSPIDER_MODULE = 'crawler.spiders'
+# USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
+
+USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.19 (KHTML, like Gecko) Ubuntu/12.04 Chromium/18.0.1025.151 Chrome/18.0.1025.151 Safari/535.19'
+
+ITEM_PIPELINES = ['crawler.pipelines.CrawlerPipeline',
+                  'crawler.pipelines.MyImagePipeline']
+
+IMAGES_STORE = 'dat/'
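
With these settings, the crawl would normally be started from the project root (assuming the usual scrapy.cfg is present):

    scrapy crawl gamersky

Every scraped item then passes through the two pipelines in the order they appear in ITEM_PIPELINES.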

File py/crawler/spiders/__init__.py

+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.

File py/crawler/spiders/gamersky.py

+from scrapy.selector import HtmlXPathSelector
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+from scrapy.contrib.spiders import CrawlSpider, Rule
+from crawler.items import GamerskyItem
+
+class GamerskySpider(CrawlSpider):
+    name = 'gamersky'
+    allowed_domains = ['gamersky.com']
+    start_urls = ['http://www.gamersky.com/news/pc/zx/',
+                  'http://www.gamersky.com/news/pc/qz/',
+                  'http://www.gamersky.com/news/tv/zx/']
+    # start_urls = ['http://www.gamersky.com/news/201206/203393.shtml']
+
+    # Follow links to article pages and parse each with parse_article.
+    # Note: the pattern is pinned to June 2012 URLs (news/201206/...).
+    rules = (
+        Rule(SgmlLinkExtractor(allow=r'news/201206/\d+(_\d+)?\.shtml'), callback='parse_article', follow=True),
+    )
+
+    def parse_article(self, response):
+        hxs = HtmlXPathSelector(response)
+
+        i = GamerskyItem()
+        i['url'] = response.url
+        i['image_urls'] = hxs.select('//a/img/@src').extract()
+        i['title'] = hxs.select('//h1/b/text()').extract()
+        i['category'] = hxs.select('//div[@class="tit1 mid"]/a/text()').extract()
+        i['content'] = hxs.select('//div[@id="gspaging"]').re(r'(.+)')
+
+        return i
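
A hypothetical offline check of the callback, feeding a saved article page into parse_article (sample.html and the URL are placeholders):

    from scrapy.http import HtmlResponse
    from crawler.spiders.gamersky import GamerskySpider

    body = open('sample.html').read()  # a locally saved article page
    response = HtmlResponse(url='http://www.gamersky.com/news/201206/203393.shtml',
                            body=body)
    item = GamerskySpider().parse_article(response)
    print item['title'], item['category']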

File py/crawler/spiders/test.py

+from scrapy.spider import BaseSpider
+
+class TestSpider(BaseSpider):
+    name = "test"
+    # Note: baidu.com and bing.com are not in allowed_domains, so links to
+    # them extracted during the crawl would be filtered as offsite.
+    allowed_domains = ["google.com"]
+    start_urls = (
+        'http://www.google.com/',
+        'http://www.baidu.com/',
+        'http://www.bing.com/',
+    )
+
+    def parse(self, response):
+        self.log('A response from %s just arrived!' % response.url)

File py/crawler/utils.py

+import os
+from urlparse import urlparse
+
+
+def writefile(fn, content):
+    with open(fn, 'wb') as fb:
+        fb.write(content)
+
+def mkdir(path):
+    if not os.path.exists(path):
+        os.makedirs(path)
+
+def get_pathname_from_url(url):
+    # Map e.g. http://host/a/b.html -> 'host/a/b.html', creating the
+    # parent directory as a side effect.
+    u = urlparse(url)
+    fn = u.netloc + u.path  # u.path already begins with '/'
+    mkdir(os.path.dirname(fn))
+    return fn
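
For illustration, the URL-to-path mapping behaves roughly like this (values hypothetical):

    from crawler.utils import get_pathname_from_url

    path = get_pathname_from_url('http://www.gamersky.com/news/201206/203393.shtml')
    # path == 'www.gamersky.com/news/201206/203393.shtml'
    # the parent directory www.gamersky.com/news/201206 now exists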

File py/sa/obj.pkl

Binary file modified.