Commits

story645 committed 3d79ce5

crawler pulls down sites quickly

  • Parent commits e1bd33d


Files changed (5)

         if not self._downloaded:
             self.download()
         return self._mime_type
-
+  
     @property
     def fail(self):
         """True if the download failed; False otherwise.
         is located.
         """
         return self._depth
-
+    
     @property
     def url(self):
         """The URL from where the webpage is downloaded.

newstracker/crawler.py

+# http://www.tryolabs.com/Blog/2011/09/27/calling-scrapy-python-script/
+
+import scrapy
+from scrapy import project, signals
+from scrapy.conf import settings
+from scrapy.crawler import CrawlerProcess
+from scrapy.xlib.pydispatch import dispatcher
+from multiprocessing import Queue
+import multiprocessing
+
+import news_spider as ns
+
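+# CrawlerWorker runs a Scrapy crawl in a child process, collecting the scraped items as they are produced.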
+class CrawlerWorker(multiprocessing.Process):
+
+	def __init__(self, spider, result_queue):
+		multiprocessing.Process.__init__(self)
+		self.result_queue = result_queue
+		
+		self.crawler = CrawlerProcess(settings)
+		if not hasattr(project, 'crawler'):
+			self.crawler.install()
+		self.crawler.configure()
+			
+		self.items = []
+		self.spider = spider
+		dispatcher.connect(self._item_passed, signals.item_passed)
+		
+	def _item_passed(self, item):
+		self.items.append(item)
+		
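+	# Start the crawl and, when it finishes, push the collected items onto the result queue.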
+	def run(self):
+		self.crawler.crawl(self.spider)
+		self.crawler.start()
+		self.crawler.stop()
+		self.result_queue.put(self.items)
+		
+		
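+# Run NewsSpider in a child process and return the list of items it scraped.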
+def crawler(**kwargs):
+	result_queue = Queue()
+	crawler = CrawlerWorker(ns.NewsSpider(**kwargs), result_queue)
+	crawler.start()
+	return [item for item in result_queue.get()]
+
+if __name__ == '__main__':
+	#items = crawler(start_urls=["http://www.nytimes.com/"])
+	items = crawler()
+	print "# crawled: ", len(items)

newstracker/news_spider.py

+import os
+
+from scrapy.http import Request
+from scrapy.item import Item, Field
+from scrapy.spider import BaseSpider
+from scrapy.contrib.spiders import CrawlSpider, Rule
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+from scrapy.selector import HtmlXPathSelector
+
+from newsites import sites
+
+
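+# Item holding the fields scraped from a single article page.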
+class Article(Item):
+	title = Field()
+	link = Field()
+	desc = Field()
+	data = Field()
+	text = Field()
+	
+	def __str__(self):
+		return "Article: title=%s url=%s" % (self['title'], self['link'])
+
+
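+# Spider that crawls out from the seed news sites, following every extracted link and building an Article per page.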
+class NewsSpider(CrawlSpider):
+	name = "news"
+	allowed_domains = []
+
+	rules = (Rule(SgmlLinkExtractor(), callback='parse_item', follow=True), ) 
+	
+	def __init__(self, *args, **kwargs): 
+		super(NewsSpider, self).__init__(*args, **kwargs) 
+		self.start_urls = kwargs.get('start_urls', sites) 
+		
+	def start_requests(self):
+		return [Request(su, callback=self.parse_item) for su in self.start_urls]
+
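+	# Pull the title, URL and raw page text out of a response and wrap them in an Article.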
+	def parse_item(self, response):
+		hxs = HtmlXPathSelector(response)
+		title = hxs.select('//title/text()').extract()
+		link = response.url
+		desc = hxs.select('text()').extract()
+		print title
+		if title and link and desc:
+			return make_item(title, desc, link)
+		return
+
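+# Pack the scraped title, description and link into an Article item.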
+def make_item(title, desc, link):
+	item = Article()
+	item['title'] = title
+	item['desc'] = desc
+	item['link'] = link
+	return item

newstracker/newsites.py

+# List obtained from https://en.wikipedia.org/wiki/Google_News
+
+sites = ["http://www.nytimes.com/",
+         "http://www.washingtonpost.com/",
+         "http://www.bloomberg.com/",
+         "http://www.latimes.com",
+         "http://www.reuters.com",
+         "http://www.forbes.com",
+         "http://www.guardian.co.uk/",
+         "http://www.bostonglobe.com/",
+         "http://www.bbc.co.uk/",
+         "http://www.sfgate.com/",
+         "http://www.cbsnews.com/",
+         "http://www.chinaview.cn/",
+         "http://www.usatoday.com/",
+         "http://www.fox.com/",
+         "http://www.cnn.com/",
+         "http://www.msnbc.msn.com/",
+         "http://www.abcnews.go.com/",
+         ]

setup_venv.py

-#!/usr/bin/env python
+#!/usr/bin/env python2.7
 #
 # setup_venv.py
 #
     # command for installing ccplib and ccpweb in develop mode:
     web = "".join(['-e'," ", 'file:', web_path])
 
-    prereqs = ['bottle', 'requests', 'beautifulsoup', 'numpy', 'nltk', web]
+    prereqs = ['bottle', 'requests', 'beautifulsoup', 'numpy', 'nltk', 'scrapy', web]
 
     # creates venv and installs prereqs in it
     setup_virtualenv(env_path, python_path)