Commits

story645 committed 5b1294b

recursive crawl works

  • Parent commit 3d79ce5


Files changed (3)

File newstracker/crawler.py

-# http://www.tryolabs.com/Blog/2011/09/27/calling-scrapy-python-script/
+"""Manages spider calls; approach adapted from
+http://www.tryolabs.com/Blog/2011/09/27/calling-scrapy-python-script/
+"""
 
 import multiprocessing
 from multiprocessing import Queue
 
 import scrapy
 from scrapy import project, signals
 
 import news_spider as ns
 
-class CrawlerWorker(multiprocessing.Process):
-	 
+class CrawlerWorker(multiprocessing.Process):
 	def __init__(self, spider, result_queue):
 		multiprocessing.Process.__init__(self)
 		self.result_queue = result_queue
…
 		self.crawler.start()
 		self.crawler.stop()
 		self.result_queue.put(self.items)
-		
-		
+				
 def crawler(**kwargs):
 	result_queue = Queue()
 	crawler = CrawlerWorker(ns.NewsSpider(**kwargs), result_queue)
 	return [item for item in result_queue.get()]
 
 if __name__ == '__main__':
-	#items = crawler(start_urls=["http://www.nytimes.com/"])
-	items = crawler()
+	items = crawler(start_urls=["https://www.nytimes.com/2012/01/02/world/middleeast/holocaust-images-in-ultra-orthodox-protest-anger-israeli-leaders.html?src=recg"])
+	#items = crawler()
 	print "# crawled: ", len(items)
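
The diff collapses most of the class body between the first lines of __init__ and the start/stop/put calls. For reference, the CrawlerWorker pattern from the tryolabs post cited in the new docstring looks roughly like this (Scrapy 0.x era API; the hidden hunks are assumed, so treat it as a sketch rather than the project's exact code):

import multiprocessing

from scrapy import project, signals
from scrapy.conf import settings
from scrapy.crawler import CrawlerProcess
from scrapy.xlib.pydispatch import dispatcher

class CrawlerWorker(multiprocessing.Process):

	def __init__(self, spider, result_queue):
		multiprocessing.Process.__init__(self)
		self.result_queue = result_queue
		# a fresh CrawlerProcess per worker: Twisted's reactor cannot be
		# restarted within one process, hence the multiprocessing wrapper
		self.crawler = CrawlerProcess(settings)
		if not hasattr(project, 'crawler'):
			self.crawler.install()
		self.crawler.configure()
		self.items = []
		self.spider = spider
		# collect every item the engine emits
		dispatcher.connect(self._item_passed, signals.item_passed)

	def _item_passed(self, item):
		self.items.append(item)

	def run(self):
		self.crawler.crawl(self.spider)
		self.crawler.start()
		self.crawler.stop()
		self.result_queue.put(self.items)

In the post's version the crawl runs in run(), so the caller starts the worker before blocking on result_queue.get(). In the hunks above, the start/stop/put calls appear at the end of __init__, which would run the crawl inline as soon as crawler() constructs the worker; either way result_queue.get() returns the scraped items.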

File newstracker/news_spider.py

 from scrapy.spider import BaseSpider
 from scrapy.http import Request
 from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
 from scrapy.selector import HtmlXPathSelector
 
+from BeautifulSoup import BeautifulSoup, SoupStrainer
+
 from newsites import sites
 
 
…
 		return "Article: title=%s url=%s" % (self['title'], self['link'])
 									
 
-class NewsSpider(CrawlSpider):
+class NewsSpider(BaseSpider):
 	name = "news"
 	allowed_domains = []
 
-	rules = (Rule(SgmlLinkExtractor(), callback='parse_item', follow=True), ) 
+	#rules = (Rule(SgmlLinkExtractor(), callback='parse_item', follow=True), ) 
 	
 	def __init__(self, *args, **kwargs): 
 		super(NewsSpider, self).__init__(*args, **kwargs) 
 		self.start_urls = kwargs.get('start_urls', sites) 
+	
+	
+	def start_requests(self):
+		"""Translate each seed URL into a request."""
+		return [Request(su, callback=self.parse_urls) for su in self.start_urls]
+	
+	
+	def parse_urls(self, response):
 		
-	def start_requests(self):
-		return [Request(su, callback=self.parse_item) for su in self.start_urls]
-			
+		hxs = HtmlXPathSelector(response)
+		links = hxs.select('//a[@href]/@href').extract()
+		for link in links:
+			if link.startswith('http'):
+				yield Request(link, callback=self.parse_item)
 		
 	
 	def parse_item(self, response):
 		hxs = HtmlXPathSelector(response)
+		# assumes the article title always lives in the <title> tag
 		title = hxs.select('//title/text()').extract()
 		link = response.url
 		desc = hxs.select('text()').extract()
-		print title
-		if title and link and desc:
-			return make_item(title, desc, link)
-		return
-	
-def make_item(title, desc, link):
-	item = Article()
-	item['title'] = title
-	item['desc'] = desc
-	item['link'] = link
-	return item
+		item = Article()
+		item['title'] = title
+		item['desc'] = desc
+		item['link'] = link
+		yield item
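
The change in this file swaps the rule-driven CrawlSpider for an explicit two-hop BaseSpider: start_requests seeds the crawl, parse_urls extracts the absolute links from each seed page (relative hrefs are dropped, since only links beginning with 'http' are followed), and parse_item scrapes each linked page into an Article. For comparison, the rule-driven form this commit comments out would look roughly like this (Scrapy 0.x API; the Article fields are assumed from the item keys used above):

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item, Field

from newsites import sites

class Article(Item):
	# assumed shape of the project's Article item
	title = Field()
	desc = Field()
	link = Field()

class NewsRuleSpider(CrawlSpider):
	name = "news_rules"
	start_urls = sites
	# with follow=True and no allowed_domains, this recurses to any depth
	# across any site, where the BaseSpider version above stops after one
	# hop (seed page -> extracted links -> items)
	rules = (Rule(SgmlLinkExtractor(), callback='parse_item', follow=True),)

	def parse_item(self, response):
		# same item-building logic as NewsSpider.parse_item above
		hxs = HtmlXPathSelector(response)
		item = Article()
		item['title'] = hxs.select('//title/text()').extract()
		item['desc'] = hxs.select('text()').extract()
		item['link'] = response.url
		return item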
 			
 			
 			

File newstracker/newsites.py

-#list obtained from https://en.wikipedia.org/wiki/Google_News
+"""List of sites used to seed the crawl when no seed URLs are given;
+list obtained from https://en.wikipedia.org/wiki/Google_News
+"""
 
 sites = ["http://www.nytimes.com/", 
          "http://www.washingtonpost.com/",