Commits

Jernej Virag committed b712e2c

Implemented scraper for Delo

Comments (0)

Files changed (4)

scrapers/delo_scraper.py

+import calendar
+from datetime import datetime
+import hashlib
+import bs4
+import feedparser
+import pytz
+import requests
+
+class DeloScraper(object):
+    """Scraper that collects news articles linked from the Delo RSS feed.
+
+    Each article is returned as a dict with the keys ``title``, ``text``,
+    ``published``, ``source``, ``source_url``, ``language`` and ``id``,
+    plus ``subtitles`` when an excerpt element is found on the page.
+    """
+    DELO_RSS_URL = "http://www.delo.si/rss/"
+    # Seconds to wait for an article download before giving up on it;
+    # without a timeout a single stalled connection blocks the scrape forever.
+    REQUEST_TIMEOUT = 30
+
+    def get_news(self):
+        """Download every article listed in the RSS feed.
+
+        Articles that cannot be downloaded, or whose page layout is not
+        recognised, are skipped.
+
+        Returns:
+            A list of article dicts (see the class docstring for the keys).
+        """
+        news = []
+        feed_content = feedparser.parse(self.DELO_RSS_URL)
+        for feed_entry in feed_content.entries:
+            link = feed_entry["link"]
+            try:
+                article = self.get_article_text(link)
+            except requests.RequestException as error:
+                # One unreachable article must not abort the whole scrape.
+                print "[Delo] Failed to download", link, error
+                continue
+            if article is None:
+                continue
+
+            # published_parsed is a UTC struct_time; timegm() converts it to
+            # a POSIX timestamp without applying the local timezone.
+            published_st = feed_entry["published_parsed"]
+            published_date = datetime.fromtimestamp(calendar.timegm(published_st), tz=pytz.utc)
+            article["published"] = published_date
+            article["source"] = "Delo"
+            article["source_url"] = link
+            # NOTE(review): the ISO 639-1 code for Slovenian is "sl" ("si" is
+            # Sinhala). Left as-is because consumers of this value are not
+            # visible here - confirm before changing.
+            article["language"] = "si"
+            article["id"] = self._make_id(link)
+            news.append(article)
+        return news
+
+    def _make_id(self, link):
+        """Return a stable hex MD5 identifier derived from the article link."""
+        digest = hashlib.md5()
+        digest.update(b"Delo")
+        # feedparser hands back unicode links; encode explicitly so a
+        # non-ASCII URL does not raise UnicodeEncodeError inside update().
+        if not isinstance(link, bytes):
+            link = link.encode("utf-8")
+        digest.update(link)
+        return digest.hexdigest()
+
+    def get_article(self, link):
+        """Fetch the page at ``link`` and return its decoded body text.
+
+        Raises:
+            requests.RequestException: if the download fails or times out.
+        """
+        print "[Delo] Grabbing article", link
+        response = requests.get(link, timeout=self.REQUEST_TIMEOUT)
+        return response.text
+
+    def get_article_text(self, link):
+        """Download an article page and extract its title, excerpt and text.
+
+        Returns:
+            A dict with ``title`` and ``text`` (and ``subtitles`` when an
+            excerpt element exists), or ``None`` when the page has no title
+            or none of the known content containers.
+        """
+        article_html = self.get_article(link)
+        # NOTE(review): no parser is named, so bs4 picks whichever one is
+        # installed; results may differ between environments.
+        article = bs4.BeautifulSoup(article_html)
+
+        if article.title is None:
+            # Pages without a <title> (e.g. error pages) are not articles.
+            print "[Delo] Missing title for", link
+            return None
+
+        result = {}
+        result["title"] = article.title.text.strip()
+
+        # News pages and opinion pages use different element ids.
+        subtitle = article.find(id="EXCERPT", text=True)
+        if subtitle is None:
+            subtitle = article.find(id="EXCERPT_mnenja", text=True)
+
+        if subtitle is not None:
+            result["subtitles"] = [subtitle.text.strip()]
+
+        content_item = article.find(id="D_NEWS")
+        if content_item is None:
+            content_item = article.find(id="D_NEWS_MNENJA")
+
+        if content_item is None:
+            print "Unknown article content for", link
+            return None
+
+        paragraphs = content_item.find_all('p', text=True)
+        result["text"] = " ".join([p_item.text.strip() for p_item in paragraphs])
+        return result
+

scrapers/rtv_scraper.py

 import feedparser
 import requests
 import hashlib
-from time import mktime
 from datetime import datetime
 
 class RTVScraper(object):
         return news
 
     def get_article(self, article_id):
-        print "Grabbing article ID", article_id
+        print "[RTVSlo] Grabbing article ID", article_id
         url = self.RTV_ARTICLE_URL + str(article_id)
         response = requests.get(url)
         return response.text

scrapers/scraping.py

 from rtv_scraper import RTVScraper
+from scrapers.delo_scraper import DeloScraper
 from scrapers.zurnal_scraper import ZurnalScraper
 
 def scrape_news():
     news.extend(rtv_scraper.get_news())
     zurnal_scraper = ZurnalScraper()
     news.extend(zurnal_scraper.get_news())
+    delo_scraper = DeloScraper()
+    news.extend(delo_scraper.get_news())
     return news

scrapers/zurnal_scraper.py

         return news
 
     def get_article(self, article_id):
+        print "[Zurnal] Grabbing article ID", article_id
         url = self.ZURNAL_PRINT_URL + str(article_id)
         response = requests.get(url)
         return response.text