Frederic De Groef avatar Frederic De Groef committed d7df1f5

[lesoir2] nothing is better than a CMS switch.

Comments (0)

Files changed (3)

csxj/datasources/lesoir2.py

+"""
+Frontpage and article scraping module for the new version of www.lesoir.be (as of Oct 2, 2012)
+"""
+
+import itertools
+from urllib2 import urlparse
+
+from scrapy.selector import HtmlXPathSelector
+from csxj.common.tagging import tag_URL, classify_and_tag, make_tagged_url, TaggedURL
+from csxj.db.article import ArticleData
+from common.utils import fetch_html_content
+from common.utils import setup_locales
+
+
+
+# NOTE(review): runs at import time; what it configures is defined in common.utils -- confirm there
+setup_locales()
+
+# Title and short name of this datasource
+# (presumably read by the crawler/queue code -- TODO confirm against callers)
+SOURCE_TITLE = u"Le Soir"
+SOURCE_NAME = u"lesoir2"
+
+# Host name used by reconstruct_full_url() to turn site-relative paths into absolute URLs
+LESOIR2_NETLOC = 'www.lesoir.be'
+
+
+
+
+def extract_title_and_url(link_hxs):
+    """
+    Return a (title, url) tuple for one link selector.
+
+    link_hxs: a selector exposing select(xpath).extract(); in this module it is
+    one of the <a> selectors collected by get_frontpage_toc().
+
+    The title is the concatenation of the link's own text nodes, or the
+    placeholder u"__NO_TITLE__" when that concatenation is empty.
+    The url is the first extracted value of the 'href' attribute, returned as is
+    (it may be site-relative).
+
+    NOTE(review): the [0] lookup assumes the link has an href; a link without
+    one would presumably fail with an IndexError -- confirm.
+    """
+    # text() only matches the direct text children of the link, not text inside nested tags
+    title = u"".join(link_hxs.select("text()").extract())
+    url = link_hxs.select('@href').extract()[0]
+    if not title:
+        # keep the entry, but make the missing title explicit
+        title = u"__NO_TITLE__"
+    return title, url
+
+
+
+def separate_news_and_blogposts(titles_and_urls):
+    """
+    Split (title, url) pairs into on-site news items and external links.
+
+    titles_and_urls: iterable of (title, url) pairs.
+
+    Returns a (toc, blogposts) tuple of lists. A pair goes to 'toc' when its url
+    starts with '/', and to 'blogposts' otherwise. The input order is preserved
+    within each list.
+    """
+    def is_external_blog(url):
+        # anything that is not a site-relative path is considered external
+        return not url.startswith('/')
+
+    toc, blogposts = list(), list()
+    for t, u in titles_and_urls:
+        if is_external_blog(u):
+            blogposts.append((t, u))
+        else:
+            toc.append((t, u))
+    return toc, blogposts
+
+
+
+def reconstruct_full_url(url):
+    """
+    Join 'url' onto the site root "http://<LESOIR2_NETLOC>" and return the result.
+
+    In this module it is only called with the site-relative urls (those starting
+    with '/') selected by separate_news_and_blogposts().
+    """
+    return urlparse.urljoin("http://{0}".format(LESOIR2_NETLOC), url)
+
+
+
+def get_frontpage_toc():
+    """
+    Fetch the www.lesoir.be frontpage and collect the links it advertises.
+
+    Returns a tuple (articles, blogposts):
+      - articles: list of (title, url) pairs for links whose href starts with '/',
+        with the url made absolute by reconstruct_full_url()
+      - blogposts: list of (title, url) pairs for all other links, with the url
+        left exactly as found in the page
+    """
+    html_data = fetch_html_content('http://www.lesoir.be')
+    hxs = HtmlXPathSelector(text=html_data)
+
+    # main stories: h2/h3 headline links of the list items inside #main-content
+    list_items = hxs.select("//div [@id='main-content']//ul/li")
+    headlines_links = list_items.select("./h2/a | ./h3/a")
+
+    # just for the blog count statistics
+    blog_block = hxs.select("//div [@class='bottom-content']//div [@class='block-blog box']//h5/a")
+
+    # mainly soccer: the block headline(s) plus the links listed in its <aside>
+    sport_block = hxs.select("//div [@class='bottom-content']//div [@class='block-sport']")
+    sports_links = sport_block.select(".//h2/a | .//aside//li/a")
+
+    # bottom sections: every link inside the 'block-articles' boxes
+    bottom_news_links = hxs.select("//div [@class='bottom-content']//div [@class='block-articles']//a")
+
+
+    # process all link groups in one pass, in the order listed here
+    all_links_hxs = itertools.chain(headlines_links, blog_block, sports_links, bottom_news_links)
+    titles_and_urls = [extract_title_and_url(link) for link in all_links_hxs]
+
+    # only the on-site (relative) urls need to be completed with the site root
+    articles_toc, blogpost_toc = separate_news_and_blogposts(titles_and_urls)
+    return [(title, reconstruct_full_url(url)) for (title, url) in articles_toc], blogpost_toc
+
+
+# Manual smoke test: fetch the live frontpage, print every article as "title (url)",
+# then print the number of articles and the number of blogposts.
+if __name__=="__main__":
+    toc, blogposts = get_frontpage_toc()
+    for t, u in toc:
+        print u"{0} ({1})".format(t, u)
+
+    # article count, blogpost count
+    print len(toc), len(blogposts)

scripts/csxj_download_queued_articles.py

         print("no such database directory: {0}".format(db_root))
     else:
         ArticleQueueDownloader.setup_logging()
-        all_sources = lesoir, dhnet, lalibre, rtlinfo, sudinfo
+        all_sources = dhnet, lalibre, rtlinfo, sudinfo
         for source in all_sources:
             try:
                 csxj.crawler.download_queued_articles(source, db_root)

scripts/csxj_update_all_queues.py

 import os
 import argparse
 import csxj.crawler
-from csxj.datasources import lesoir, lalibre, dhnet, sudinfo, rtlinfo, lavenir, rtbfinfo, levif, septsursept
+from csxj.datasources import lesoir, lalibre, dhnet, sudinfo, rtlinfo, lavenir, rtbfinfo, levif, septsursept, lesoir2
 from csxj.articlequeue import ArticleQueueFiller
 
 
         os.mkdir(db_root)
 
     ArticleQueueFiller.setup_logging()
-    all_providers = lesoir, dhnet, lalibre, sudinfo, rtlinfo, lavenir, rtbfinfo, levif, septsursept
+    all_providers = dhnet, lalibre, sudinfo, rtlinfo, lavenir, rtbfinfo, levif, septsursept, lesoir2
     for provider in all_providers:
         csxj.crawler.put_articles_in_queue(provider, db_root)
 
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.