Commits

story645 committed 030e0fa

rough sketch of how dates should work

Comments (0)

Files changed (2)

newstracker/crawler.py

 
 if __name__ =='__main__':
 	items = crawler(start_urls=["https://www.nytimes.com/2012/01/02/world/middleeast/holocaust-images-in-ultra-orthodox-protest-anger-israeli-leaders.html?src=recg"])	
-	#items = crawler()
-	print "# crawled: ", len(items)
+	print "# crawled: ", len(items)
+	for item in items:
+		print item

newstracker/news_spider.py

 import os
+import re
+import calendar
+import datetime
 
 from scrapy.http import Request
 from scrapy.item import Item, Field
 from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
 from scrapy.selector import HtmlXPathSelector
 
+#import nltk
 from BeautifulSoup import BeautifulSoup, SoupStrainer
 
 from newsites import sites
 
-
 class Article(Item):
 	title = Field()
-	link = Field()
-	desc = Field()
-	data = Field()
-	text = Field()
+	date = Field()
+	body = Field()
 	
 	def __str__(self):
-		return "Article: title=%s url=%s" % (self['title'], self['link'])
-									
+		return "Article: title=%s url=%s" % (self['title'], self['body'])
 
 class NewsSpider(BaseSpider):
 	name = "news"
 		for link in links:
 			if link[:4] == 'http':
 				yield Request(link, callback=self.parse_item)
-		
+				
 	
 	def parse_item(self, response):
 		hxs = HtmlXPathSelector(response)
 		#assumes that article title is always the title attribute
 		title = hxs.select('//title/text()').extract()
 		link = response.url
-		desc = hxs.select('text()').extract()
+		body =  response.body
+		#these two should be global
+		date_expr = re.compile(r"(?:%s) \d{2}, \d{4}" % '|'.join(calendar.month_abbr[1:]))
+		date_expr2 = re.compile(r"(?:%s) \d{2}, \d{4}" % '|'.join(calendar.month_name[1:]))
+		dates = []
+		for match in date_expr.findall(body):
+			dates.append(datetime.strptime(match, '%b %d, %Y'))
+		for match in date_expr2.findall(body):
+			dates.append(datetime.strptime(match, '%B %d, %Y'))
+		print "dates: {}".format(dates)
+		if dates:
+			date = dates[0]
+		else:
+			date = 0
+		
 		item = Article()
 		item['title'] = title
-		item['desc'] = desc
-		item['link'] = link
+		item['date'] = date
+		#item['body'] = nltk.html_clean(response.body)
 		yield item