Commits

Frederic De Groef committed b563f86

[parsers] get_frontpage_toc() now returns 3 lists: links to actual headlines, links to blogposts, and links to paywalled articles

although only lesoir_new takes advantage of it
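
The change standardizes the return value across all datasources. A minimal usage sketch of the new contract follows; the caller code is illustrative only and not part of this commit, and per the message above only lesoir_new actually populates the third list for now:

    # get_frontpage_toc() now returns three lists of (title, url) tuples:
    # regular frontpage headlines, blogposts, and paywalled articles.
    from csxj.datasources import lesoir_new

    headlines, blogposts, paywalled = lesoir_new.get_frontpage_toc()

    print "%d headlines, %d blogposts, %d paywalled" % (len(headlines), len(blogposts), len(paywalled))
    for title, url in paywalled:
        print title, url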

  • Parent commits 4828361

Files changed (12)

File csxj/datasources/dhnet.py

             titles_and_urls = extract_title_and_link_from_anounce_group(announce_group)
             all_titles_and_urls.extend(titles_and_urls)
 
-        return [(title, 'http://www.dhnet.be%s' % url) for (title, url) in all_titles_and_urls], []
+        return [(title, 'http://www.dhnet.be%s' % url) for (title, url) in all_titles_and_urls], [], []
     else:
-        return [], []
+        return [], [], []
 
 
 if __name__ == "__main__":

File csxj/datasources/lalibre.py

 from parser_tools import ipm_utils
 from parser_tools import twitter_utils
 
+import os
 from helpers.unittest_generator import generate_unittest
 
 LALIBRE_ASSOCIATED_SITES = {
 
     updated_tagged_urls = update_tagged_urls(all_links, ipm_utils.LALIBRE_SAME_OWNER)
 
-    # print generate_test_func('embedded_tweet', 'lalibre', dict(tagged_urls=updated_tagged_urls))
-    # save_sample_data_file(html_content, source_url, 'embedded_tweet', '/Users/judemaey/code/csxj-crawler/tests/datasources/test_data/lalibre')
-
-
+    # generate_unittest("links_intext_overload", 'lalibre', dict(updated_tagged_urls=updated_tagged_urls),
+    #                   html_content, source_url,
+    #                   os.path.join(os.path.dirname(__file__), "../../tests/datasources/test_data/lalibre"),
+    #                   save_file=True)
 
     new_article = ArticleData(source_url, title,
                               pub_date, pub_time, fetched_datetime,
         title, url = announce.h1.a.contents[0], announce.h1.a.get('href')
         return title, '{0}{1}'.format(hostname_url, url)
 
-    return [extract_title_and_link(announce) for announce in announces], []
+    return [extract_title_and_link(announce) for announce in announces], [], []
 
 
 def test_sample_data():
             "http://www.lalibre.be/economie/actualite/article/789261/le-fmi-s-est-trompe-et-fait-son-mea-culpa.html",
             "http://www.lalibre.be/societe/general/article/779522/la-pornographie-une-affaire-d-hommes-pas-seulement.html",
             "http://www.lalibre.be/societe/insolite/article/787359/des-chocolats-aux-insectes.html",
-            "http://www.lalibre.be/societe/insolite/article/786611/le-tweet-sarcastique-de-johnny-a-gege.html"]
+            "http://www.lalibre.be/societe/insolite/article/786611/le-tweet-sarcastique-de-johnny-a-gege.html",
+            "http://www.lalibre.be/economie/actualite/article/755845/les-bourses-avancent-timidement-vers-le-web.html"]
 
     from pprint import pprint
     import os

File csxj/datasources/lavenir.py

     return datetime_published.date(), datetime_published.time()
 
 
-
 def extract_links_from_article_body(article_body_hxs):
     links = list()
     # intext urls
         tags = tags.union(['in text', 'plaintext'])
         links.append(make_tagged_url(url, url, tags))
 
-
     #embedded objects
     iframe_sources = article_body_hxs.select(".//iframe/@src").extract()
     for url in iframe_sources:
 
     return links
 
+
 def select_title_and_url(selector, tag_name):
     url = selector.select("./@href").extract()[0]
     title = selector.select(".//text()").extract()
     if title:
-        title =  remove_text_formatting_markup_from_fragments(title[0])
+        title = remove_text_formatting_markup_from_fragments(title[0])
         tags = classify_and_tag(url, LAVENIR_NETLOC, LAVENIR_INTERNAL_BLOGS)
         tags = tags.union([tag_name])
     else:
         title = '__GHOST_LINK__'
     return make_tagged_url(url, title, tags)
 
+
 def extract_sidebar_links(sidebar_links):
     tagged_urls = [select_title_and_url(sidebar_link, 'sidebar box') for sidebar_link in sidebar_links]
     return tagged_urls
 
-def extract_bottom_links(bottom_links):    
+
+def extract_bottom_links(bottom_links):
     tagged_urls = [select_title_and_url(bottom_link, 'bottom box') for bottom_link in bottom_links]
     return tagged_urls
 
     else:
         html_content = fetch_html_content(source)
 
-
     hxs = HtmlXPathSelector(text=html_content)
 
     # extract breadcrumbs for category info
     pub_date, pub_time = extract_publication_date(raw_date)
     fetched_datetime = datetime.today()
 
-
     #author(s)
     raw_author = article_detail_hxs.select("./div/ul/li[@class='author']/text()").extract()
     author = None
     if raw_intro:
         intro = ''.join([fragment.strip() for fragment in raw_intro])
 
-
     #detect photoset
     full_class = article_detail_hxs.select("./@class").extract()[0]
     if 'article-with-photoset' in full_class.split(" "):
     article_body = article_detail_hxs.select("./div/div[@class='article-body ']")
     content = article_body.select(".//p//text()").extract()
 
-
     all_links.extend(extract_links_from_article_body(article_body))
 
-
     # associated sidebar links
     sidebar_links = article_detail_hxs.select("./div/div[@class='article-side']/div[@class='article-related']//li/a")
     all_links.extend(extract_sidebar_links(sidebar_links))
     #print generate_test_func('bottom_box_and_sidebar_and_intext_links', 'lavenir', dict(tagged_urls=updated_tagged_urls))
     #save_sample_data_file(html_content, source, 'bottom_box_and_sidebar_and_intext_links', '/Users/judemaey/code/csxj-crawler/tests/datasources/test_data/lavenir')
 
-
     # wrapping up
     article_data = ArticleData(source, title, pub_date, pub_time, fetched_datetime,
                                updated_tagged_urls,
         return local_url
 
 
-
 def extract_title_and_url(link_hxs):
     url = link_hxs.select("./@href").extract()[0]
     title = link_hxs.select("./text()").extract()[0].strip()
     return title, url
 
 
-
 def separate_blogposts(all_items):
     blogpost_items = set([(title, url)for title, url in all_items if not is_internal_url(url)])
     news_items = set(all_items) - blogpost_items
 
     all_links = chain(story_links, more_story_links, local_sport_links, nopic_story_list)
 
-
     all_items = [extract_title_and_url(link_hxs) for link_hxs in all_links]
     news_items, blogpost_items = separate_blogposts(all_items)
 
-    return  [(title, expand_full_url(url)) for (title, url) in news_items if url not in BLACKLIST], list(blogpost_items)
-
+    return  [(title, expand_full_url(url)) for (title, url) in news_items if url not in BLACKLIST], list(blogpost_items), []
 
 
 def show_sample_articles():
     intro_url = "http://www.lavenir.net/article/detail.aspx?articleid=DMF20120226_002"
     photoset_with_links = "http://www.lavenir.net/article/detail.aspx?articleid=DMF20120222_00121489"
 
-
     norma_url = "http://www.lavenir.net/article/detail.aspx?articleid=DMF20120330_00139582"
     for url in [normal_url, photoset_url, intro_url, photoset_with_links]:
     #for url in [normal_url]:
 
 
 def show_sample_articles():
-    urls = [  "http://www.lavenir.net/article/detail.aspx?articleid=DMF20120326_023",
+    urls = ["http://www.lavenir.net/article/detail.aspx?articleid=DMF20120326_023",
             "http://www.lavenir.net/article/detail.aspx?articleid=DMF20120330_00139582",
             "http://www.lavenir.net/article/detail.aspx?articleid=DMF20120331_00140331",
             "http://www.lavenir.net/article/detail.aspx?articleid=DMF20120902_00199571",
     #     article.print_summary()
     #     for tagged_link in article.links:
     #         print tagged_link.URL, tagged_link.title, tagged_link.tags
-    
+
     article, html = extract_article_data(urls[-1])
 
 
-
 def show_frontpage():
     toc, blogposts = get_frontpage_toc()
     print "Articles:"

File csxj/datasources/lesoir.py

 import sys
 import locale
 from datetime import datetime
-from BeautifulSoup import  BeautifulStoneSoup,  Tag
+from BeautifulSoup import BeautifulStoneSoup, Tag
 import urlparse
 import codecs
 from csxj.common.tagging import tag_URL, classify_and_tag, make_tagged_url, TaggedURL, update_tagged_urls
 LESOIR_INTERNAL_BLOGS = {
 
     'archives.lesoir.be': ['archives', 'internal'],
-   
+
     'belandroid.lesoir.be': ['internal', 'jblog'],
     'geeko.lesoir.be': ['internal', 'jblog'],
     'blog.lesoir.be': ['internal', 'jblog'],
     if link.get('href'):
         url = link.get('href')
 
-    else :
+    else:
         url = "__GHOST_LINK__"
         base_tags.append("ghost link")
 
     Finds the story's body, cleans up the text to remove all html formatting.
     Returns a list of strings, one per found paragraph, and all the plaintext urls, as TaggedURLs
     """
-    story = story.find('div', {'id':'story_body'})
+    story = story.find('div', {'id': 'story_body'})
     paragraphs = story.findAll('p', recursive=False)
 
     tagged_urls = list()
 
 
 def extract_to_read_links_from_sidebar(sidebar):
-    to_read_links_container = sidebar.find('div', {'id':'lire_aussi'})
+    to_read_links_container = sidebar.find('div', {'id': 'lire_aussi'})
     #sometimes, it does not exist at all
     if to_read_links_container:
         urls_and_titles = [(link.get('href'), link.get('title'))
-                            for link in to_read_links_container.findAll('a')]
+                           for link in to_read_links_container.findAll('a')]
         return classify_and_make_tagged_url(urls_and_titles, additional_tags=set(['sidebar box', 'to read']))
     else:
         return []
 
 
 def extract_external_links_from_sidebar(sidebar):
-    external_links_container = sidebar.find('div', {'id':'external'})
+    external_links_container = sidebar.find('div', {'id': 'external'})
 
     if external_links_container:
         urls_and_titles = [(link.get('href'), link.get('title'))
-                            for link in external_links_container.findAll('a')]
+                           for link in external_links_container.findAll('a')]
         return classify_and_make_tagged_url(urls_and_titles, additional_tags=set(['sidebar box', 'web']))
     else:
         return []
         return url, title
 
     #todo : check if those links are actually associated to the article
-    recent_links_container = soup.find('div', {'id':'les_plus_recents'})
+    recent_links_container = soup.find('div', {'id': 'les_plus_recents'})
     if recent_links_container:
         urls_and_titles = [extract_url_and_title(item)
                            for item in recent_links_container.findAll('a')]
     Get the link lists for one news item, from the parsed html content.
     'Le Soir' has 3 kinds of links, but they're not all always there.
     """
-    sidebar = soup.find('div', {'id':'st_top_center'})
+    sidebar = soup.find('div', {'id': 'st_top_center'})
 
     all_tagged_urls = extract_external_links_from_sidebar(sidebar)
     all_tagged_urls.extend(extract_to_read_links_from_sidebar(sidebar))
 
 
 def extract_title(story):
-    header = story.find('div', {'id':'story_head'})
+    header = story.find('div', {'id': 'story_head'})
     title = header.h1.contents[0]
     if title:
         return unicode(title)
 
 
 def extract_author_name(story):
-    header = story.find('div', {'id':'story_head'})
-    author_name = header.find('p', {'class':'info st_signature'})
+    header = story.find('div', {'id': 'story_head'})
+    author_name = header.find('p', {'class': 'info st_signature'})
 
     if author_name and author_name.contents:
         return author_name.contents[0]
 
 
 def extract_date(story):
-    header = story.find('div', {'id':'story_head'})
-    publication_date = header.find('p', {'class':'info st_date'})
+    header = story.find('div', {'id': 'story_head'})
+    publication_date = header.find('p', {'class': 'info st_date'})
 
     if publication_date:
         date_string = publication_date.contents[0]
 
 
 def extract_intro(story):
-    header = story.find('div', {'id':'story_head'})
-    intro = header.find('h4', {'class':'chapeau'})
+    header = story.find('div', {'id': 'story_head'})
+    intro = header.find('h4', {'class': 'chapeau'})
     # so yeah, sometimes the intro paragraph contains some <span> tags with things
     # we don't really care about. Filtering that out.
     text_fragments = [fragment for fragment in intro.contents if not isinstance(fragment, Tag)]
 
 
 def extract_category(story):
-    breadcrumbs = story.find('div', {'id':'fil_ariane'})
-    category_stages = [a.contents[0] for a in breadcrumbs.findAll('a') ]
+    breadcrumbs = story.find('div', {'id': 'fil_ariane'})
+    category_stages = [a.contents[0] for a in breadcrumbs.findAll('a')]
     return category_stages
 
 
 
     # extract embedded storify
     scripts = story.findAll('script', recursive=True)
-    for script in scripts :
+    for script in scripts:
         url = script.get('src')
-        if url :
+        if url:
             scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)
             if netloc == "storify.com":
                 url = url.rstrip(".js")
                 tagged_urls.append(make_tagged_url(url, url, all_tags | set(['embedded'])))
 
     # TO DO NEXT : reconstruc kplayer URL
-    kplayer = story.find('div', {'class':'containerKplayer'})
+    kplayer = story.find('div', {'class': 'containerKplayer'})
     if kplayer:
         kplayer_flash = kplayer.find('div', {'class': 'flash_kplayer'})
         url_part1 = kplayer_flash.object['data']
-        url_part2 = kplayer_flash.object.find('param', {'name' : 'flashVars'})['value']
+        url_part2 = kplayer_flash.object.find('param', {'name': 'flashVars'})['value']
         if url_part1 is not None and url_part2 is not None:
             url = "%s?%s" % (url_part1, url_part2)
             all_tags = classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_BLOGS)
         html_content = fetch_html_content(source)
 
     soup = make_soup_from_html_content(html_content)
-    story = soup.find('div', {'id':'story'})
+    story = soup.find('div', {'id': 'story'})
 
     category = extract_category(story)
     title = extract_title(story)
 
     updated_tagged_urls = update_tagged_urls(all_links, rossel_utils.LESOIR_SAME_OWNER)
 
-
     #print generate_test_func('same_owner_tagging', 'lesoir', dict(tagged_urls=updated_tagged_urls))
     #save_sample_data_file(html_content, source.name, 'same_owner_tagging', '/Users/judemaey/code/csxj-crawler/tests/datasources/test_data/lesoir')
 
     return ArticleData(source, title, pub_date, pub_time, fetched_datetime,
-                              updated_tagged_urls,
-                              category, author,
-                              intro, content), html_content
+                       updated_tagged_urls,
+                       category, author,
+                       intro, content), html_content
 
 
 def extract_main_content_links(source):
     soup = make_soup_from_html_content(html_content)
 
     # get maincontent div
-    story = soup.find('div', {'id':'story'})
+    story = soup.find('div', {'id': 'story'})
 
     all_links = soup.findAll('a', recursive=True)
 
     This function assumes we already checked that the element actually has
     two sub stories
     """
-    two_columns_stories_list = element.findAll('ul', {'class':'two_cols'}, recursive=False)[0]
+    two_columns_stories_list = element.findAll('ul', {'class': 'two_cols'}, recursive=False)[0]
     return two_columns_stories_list.findAll('li', recursive=False)
 
 
     Checks whether or not a frontpage entry is a stand alone news item, or a container
     for two 'two columns' items.
     """
-    return len(element.findAll('ul', {'class':'two_cols'}, recursive=False)) == 1
+    return len(element.findAll('ul', {'class': 'two_cols'}, recursive=False)) == 1
 
 
 def get_frontpage_toc():
 
     # Here we have interlaced <ul>'s with a bunch of random other shit that
     # need some filtering
-    stories_containers = soup.findAll('ul', {'class':'stories_list grid_6'})
-
+    stories_containers = soup.findAll('ul', {'class': 'stories_list grid_6'})
 
     articles_toc, blogpost_toc = [], []
 
                 else:
                     articles_toc.append((title, url))
 
-    return [(title, 'http://www.lesoir.be{0}'.format(url)) for (title, url) in articles_toc], blogpost_toc
+    return [(title, 'http://www.lesoir.be{0}'.format(url)) for (title, url) in articles_toc], blogpost_toc, []
 
 
 def get_rss_toc():
     return titles_in_rss
 
 
-
 def parse_sample_data():
     import sys
     data_directory = '../../sample_data'
         # print article_data.content
 
 if __name__ == '__main__':
-    test_sample_data()
+    test_sample_data()

File csxj/datasources/lesoir2.py

     titles_and_urls = [extract_title_and_url(link) for link in all_links_hxs]
 
     articles_toc, blogpost_toc = separate_news_and_blogposts(titles_and_urls)
-    return [(title, reconstruct_full_url(url)) for (title, url) in articles_toc], blogpost_toc
+    return [(title, reconstruct_full_url(url)) for (title, url) in articles_toc], blogpost_toc, []
 
 
 if __name__ == "__main__":

File csxj/datasources/lesoir_new.py

 LESOIR_NETLOC = "www.lesoir.be"
 LESOIR_INTERNAL_SITES = {
 
-    'archives.lesoir.be':['archives', 'internal'],
-   
-    'belandroid.lesoir.be':['internal', 'jblog'],
-    'geeko.lesoir.be':['internal', 'jblog'],
-    'blog.lesoir.be':['internal', 'jblog'],
+    'archives.lesoir.be': ['archives', 'internal'],
 
-    'pdf.lesoir.be' : ['internal', 'pdf newspaper']
+    'belandroid.lesoir.be': ['internal', 'jblog'],
+    'geeko.lesoir.be': ['internal', 'jblog'],
+    'blog.lesoir.be': ['internal', 'jblog'],
+
+    'pdf.lesoir.be': ['internal', 'pdf newspaper']
 }
 
+
 def extract_title_and_url(link_hxs):
     title = u"".join(link_hxs.select("text()").extract())
     url = link_hxs.select('@href').extract()[0]
         title = u"__NO_TITLE__"
     return title, url
 
+
 def separate_news_and_blogposts(titles_and_urls):
     def is_external_blog(url):
         return not url.startswith('/')
 
 def reconstruct_full_url(url):
     return urlparse.urljoin("http://{0}".format(LESOIR_NETLOC), url)
+
+
+
+def separate_paywalled_articles(all_link_hxs):
+    regular, paywalled = list(), list()
+    for link_hxs in all_link_hxs:
+        if link_hxs.select("../span [@class='ir locked']"):
+            paywalled.append(link_hxs)
+        else:
+            regular.append(link_hxs)
+    return regular, paywalled
+
+
+
 def get_frontpage_toc():
     html_data = fetch_html_content('http://www.lesoir.be')
     hxs = HtmlXPathSelector(text=html_data)
     # bottom sections
     bottom_news_links = hxs.select("//div [@class='bottom-content']//div [@class='block-articles']//a")
 
+    all_links_hxs = itertools.chain(headlines_links, blog_block, sports_links, bottom_news_links)
+    regular_articles_hxs, all_paywalled_hxs = separate_paywalled_articles(all_links_hxs)
 
-    all_links_hxs = itertools.chain(headlines_links, blog_block, sports_links, bottom_news_links)
-
-
-    titles_and_urls = [extract_title_and_url(link) for link in all_links_hxs]
+    titles_and_urls = [extract_title_and_url(link) for link in regular_articles_hxs]
+    paywalled_titles_and_urls = [extract_title_and_url(link) for link in all_paywalled_hxs]
 
     articles_toc, blogpost_toc = separate_news_and_blogposts(titles_and_urls)
-    return [(title, reconstruct_full_url(url)) for (title, url) in articles_toc], blogpost_toc
+    return [(title, reconstruct_full_url(url)) for (title, url) in articles_toc], blogpost_toc, [(title, reconstruct_full_url(url)) for (title, url) in paywalled_titles_and_urls]
 
 
 def extract_title(soup):
     # trouver le titre
-    main_content = soup.find(attrs = {"id" : "main-content"})
+    main_content = soup.find(attrs={"id": "main-content"})
     title = main_content.find("h1").contents[0]
     return title
 
+
 def extract_author_name(soup):
     authors = []
-    meta_box = soup.find(attrs = {"class" : "meta"})
+    meta_box = soup.find(attrs={"class": "meta"})
+    #sometimes there's an author mentioned in bold at the end of the article
     author_name = meta_box.find("strong").contents[0]
     authors.append(author_name)
 
-    #sometimes there's an author mentioned in bold at the end of the article
-
     return authors
 
+
 def extract_date_and_time(soup):
-    meta_box = soup.find(attrs = {"class" : "meta"})
-    date = meta_box.find(attrs = {"class" : "prettydate"})
+    meta_box = soup.find(attrs={"class": "meta"})
+    date = meta_box.find(attrs={"class": "prettydate"})
     date_part1 = date.contents[0]
     date_part2 = date.contents[-1]
     full_date_and_time_string = "%sh%s" % (date_part1, date_part2)
 
 
 def extract_intro(soup):
-    intro_box = soup.find(attrs = {"class" : "article-content"})
+    intro_box = soup.find(attrs={"class": "article-content"})
     intro = intro_box.find("h3").contents[0]
     return intro
 
+
 def extract_title_and_url_from_bslink(link):
     base_tags = []
     if link.get('href'):
         url = link.get('href')
-    else :
+    else:
         url = "__GHOST_LINK__"
         base_tags.append("ghost link")
-        
+
     if link.find('h3'):
         title = link.find('h3').contents[0].strip()
     else:
     return title, url, base_tags
 
 
-def extract_text_content_and_links(soup) :
+def extract_text_content_and_links(soup):
     article_text = []
     inline_links = []
     plaintext_urls = []
 
-    text = soup.find(attrs = {"class" : "article-body"})
+    text = soup.find(attrs={"class": "article-body"})
     paragraphs = text.find_all("p")
-    for p in paragraphs :
-        clean_text = utils.remove_text_formatting_markup_from_fragments(p, strip_chars = "\n")
+    for p in paragraphs:
+        clean_text = utils.remove_text_formatting_markup_from_fragments(p, strip_chars="\n")
         article_text.append(clean_text)
         link = p.find_all("a")
         inline_links.extend(link)
         for fragment in p:
             if type(fragment) is bs4.element.Tag:
                 if not fragment.name == "a":
-                    clean_fragment = utils.remove_text_formatting_markup_from_fragments(fragment, strip_chars = "\n")
+                    clean_fragment = utils.remove_text_formatting_markup_from_fragments(fragment, strip_chars="\n")
                     plaintext_links = utils.extract_plaintext_urls_from_text(clean_fragment)
                     plaintext_urls.extend(plaintext_links)
             if type(fragment) is bs4.NavigableString:
-                clean_fragment = utils.remove_text_formatting_markup_from_fragments(fragment, strip_chars = "\n")   
+                clean_fragment = utils.remove_text_formatting_markup_from_fragments(fragment, strip_chars="\n")
                 plaintext_links = utils.extract_plaintext_urls_from_text(clean_fragment)
                 plaintext_urls.extend(plaintext_links)
 
-
     titles_and_urls = [extract_title_and_url_from_bslink(i) for i in inline_links]
 
     tagged_urls = list()
 
     return article_text, tagged_urls
 
+
 def extract_article_tags(soup):
     tagged_urls = list()
-    meta_box = soup.find(attrs = {"class" : "meta"})
-    if meta_box.find(attrs = {'class': 'tags'}):
-        tags = meta_box.find(attrs = {'class': 'tags'})
+    meta_box = soup.find(attrs={"class": "meta"})
+    if meta_box.find(attrs={'class': 'tags'}):
+        tags = meta_box.find(attrs={'class': 'tags'})
         links = tags.find_all("a")
         titles_and_urls = [extract_title_and_url_from_bslink(link) for link in links]
         for title, url, base_tags in titles_and_urls:
 
 
 def extract_category(soup):
-    breadcrumbs = soup.find('div', {'class':'breadcrumbs'})
-    category_stages = [a.contents[0] for a in breadcrumbs.findAll('a') ]
+    breadcrumbs = soup.find('div', {'class': 'breadcrumbs'})
+    category_stages = [a.contents[0] for a in breadcrumbs.findAll('a')]
     return category_stages
 
+
 def extract_links_from_sidebar_box(soup):
     tagged_urls = list()
     sidebar_boxes = soup.find_all('div', {'class': 'box alt'})
                 tagged_urls.append(tagging.make_tagged_url(url, title, tags))
     return tagged_urls
 
+
 def extract_embedded_media_from_top_box(soup):
     tagged_urls = list()
-    top_box = soup.find(attrs = {'class': 'block-slidepic media'})
+    top_box = soup.find(attrs={'class': 'block-slidepic media'})
     if top_box.find("embed"):
         url = top_box.find("embed").get("src")
-        if url :
+        if url:
             tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_SITES)
             tags.add('embedded')
             tags.add('top box')
             tagged_urls.append(tagging.make_tagged_url(url, url, tags))
-        else :
+        else:
             raise ValueError("There to be an embedded object but we could not find an link. Update the parser.")
-    
+
     # sometimes it's a kewego player
-    kplayer = top_box.find(attrs = {'class': 'emvideo emvideo-video emvideo-kewego'})
+    kplayer = top_box.find(attrs={'class': 'emvideo emvideo-video emvideo-kewego'})
     if kplayer:
         url_part1 = kplayer.object['data']
-        url_part2 = kplayer.object.find('param', {'name' : 'flashVars'})['value']
+        url_part2 = kplayer.object.find('param', {'name': 'flashVars'})['value']
         if url_part1 is not None and url_part2 is not None:
             url = "%s?%s" % (url_part1, url_part2)
             tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_SITES)
             if kplayer.next_sibling.name == "figcaption":
-                if len(kplayer.next_sibling) > 0 :
+                if len(kplayer.next_sibling) > 0:
                     title = kplayer.next_sibling.contents[0]
                     tagged_urls.append(tagging.make_tagged_url(url, title, tags | set(['embedded', 'top box'])))
-                else :
+                else:
                     title = "__NO_TITLE__"
                     tagged_urls.append(tagging.make_tagged_url(url, title, tags | set(['embedded', 'top box'])))
             else:
                 tagged_urls.append(tagging.make_tagged_url(url, title, tags | set(['embedded', 'top box'])))
         else:
             raise ValueError("We couldn't find an URL in the flash player. Update the parser.")
-    
+
     # sometimes it's a youtube player
-    youtube_player = top_box.find(attrs = {'class': 'emvideo emvideo-video emvideo-youtube'})
+    youtube_player = top_box.find(attrs={'class': 'emvideo emvideo-video emvideo-youtube'})
     if youtube_player:
         url = youtube_player.find("a").get("href")
-        if url :
+        if url:
             tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_SITES)
             tags.add('embedded')
             tags.add('top box')
             raise ValueError("There seems to be a Youtube player but we couldn't find an URL. Update the parser.")
     return tagged_urls
 
+
 def extract_embedded_media_from_bottom(soup):
     tagged_urls = list()
-    article_body = soup.find(attrs = {'class': 'article-body'})
-    bottom_box = article_body.find(attrs = {'class' : 'related-media'})
-    if bottom_box :
+    article_body = soup.find(attrs={'class': 'article-body'})
+    bottom_box = article_body.find(attrs={'class': 'related-media'})
+    if bottom_box:
         embedded_media = bottom_box.find("iframe")
         if embedded_media:
             url = embedded_media.get("src")
 
     return tagged_urls
 
+
 def extract_embedded_media_in_article(soup):
     tagged_urls = list()
     story = soup.find(attrs = {'class': 'article-body'})
         article, raw = extract_article_data(f)
         # from csxj.common.tagging import print_taggedURLs
         # print_taggedURLs(article.links)
- 
+
 
 if __name__ == '__main__':
+    _, _, paywalled = get_frontpage_toc()
+    for p in paywalled:
+        print p
 
+    import sys
+    sys.exit(0)
     url = "http://www.lesoir.be/142224/article/culture/medias-tele/2012-12-21/audrey-pulvar-quitte-inrocks"
     url = "http://www.lesoir.be/142193/article/debats/cartes-blanches/2012-12-21/g%C3%A9rard-depardieu-l%E2%80%99arbre-qui-cache-for%C3%AAt"
     url = "http://www.lesoir.be/142176/article/actualite/belgique/2012-12-21/van-rompuy-%C2%AB-etre-premier-en-belgique-c%E2%80%99est-frustrant-%C2%BB"
     url = "http://www.lesoir.be/159937/article/actualite/regions/bruxelles/2013-01-12/didier-reynders-%C2%ABbruxelles-doit-travailler-avec-wallonie-et-flandre%C2%BB"
     url = "http://www.lesoir.be/159937/article/actualite/regions/bruxelles/2013-01-12/didier-reynders-%C2%ABbruxelles-doit-travailler-avec-wallonie-et-flandre%C2%BB"
     url = "http://www.lesoir.be/138219/article/styles/air-du-temps/2012-12-14/votre-week-end-en-15-clics"
-    
+
     article, html = extract_article_data(url)
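
For context on the new separate_paywalled_articles() helper above: a frontpage anchor is treated as paywalled when it has a sibling <span class="ir locked"> (presumably the padlock marker), which is what the "../span [@class='ir locked']" XPath tests. Below is a self-contained sketch of that check; the HTML snippet and variable names are made-up assumptions for illustration, not markup taken from lesoir.be:

    from scrapy.selector import HtmlXPathSelector

    # Illustrative markup: one locked headline, one free headline.
    sample_html = """
    <div class="block-articles">
      <h2><span class="ir locked"></span><a href="/123/article/foo">Paywalled story</a></h2>
      <h2><a href="/456/article/bar">Free story</a></h2>
    </div>
    """

    hxs = HtmlXPathSelector(text=sample_html)
    for link_hxs in hxs.select("//a"):
        # same sibling test as separate_paywalled_articles()
        locked = bool(link_hxs.select("../span [@class='ir locked']"))
        print link_hxs.select("@href").extract()[0], "locked" if locked else "regular"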
 
 

File csxj/datasources/levif.py

 
     frontpage_items, blogposts = split_news_and_blogposts(titles_and_urls)
 
-    return frontpage_items, blogposts
+    return frontpage_items, blogposts, []
 
 
 

File csxj/datasources/rtbfinfo.py

 
     titles_and_urls = [extract_title_and_url(link_hxs) for link_hxs in chain(main_story, featured_stories, anchored_stories)] + chronic_stories
 
-    return titles_and_urls, []
+    return titles_and_urls, [], []
 
 
 

File csxj/datasources/rtlinfo.py

 
     news_items, blogposts = separate_news_and_blogposts(all_articles)
 
-    return [make_full_url(title_and_url) for title_and_url in news_items], list(blogposts)
+    return [make_full_url(title_and_url) for title_and_url in news_items], list(blogposts), []
 
 
 

File csxj/datasources/septsursept.py

 
     frontpage_items = left_items + right_items
     article_links, photoalbum_links = separate_articles_and_photoalbums(frontpage_items)
-    return [make_full_url(item) for item in article_links], [make_full_url(item) for item in photoalbum_links]
+    return [make_full_url(item) for item in article_links], [make_full_url(item) for item in photoalbum_links], []
 
 
 def get_frontpage_toc():

File csxj/datasources/sudinfo.py

     headlines.extend(regional_headlines)
 
     news, blogposts = separate_blogposts_and_news(headlines)
-    return make_full_url(BASE_URL, news), blogposts
+    return [(title, convert_utf8_url_to_ascii(url)) for title, url in make_full_url(BASE_URL, news)], blogposts, []
 
 
 def show_article():
         u"http://www.sudinfo.be/306989/article/sports/foot-belge/standard/2012-01-08/standard-chattez-en-exclusivite-avec-sebastien-pocognoli-ce-lundi-des-13h30",
 
         # embeddes scribble
-        u"http://www.sudinfo.be/655859/article/sports/foot-belge/anderlecht/2013-02-03/suivez-le-super-sunday-en-live-genk-ecrase-bruges-4-1-le-standard-en-visi"
+        u"http://www.sudinfo.be/655859/article/sports/foot-belge/anderlecht/2013-02-03/suivez-le-super-sunday-en-live-genk-ecrase-bruges-4-1-le-standard-en-visi",
+
+        u"http://www.sudinfo.be/648601/article/regions/tournai/actualite/2013-01-23/le-papa-se-fait-operer-et-devient%E2%80%A6-maman"
     ]
 
     article, html = extract_article_data(urls[-1])

File csxj/datasources/sudpresse.py

                 plaintext_links = extract_plaintext_urls_from_text(clean_fragment)
                 plaintext_urls.extend(plaintext_links)
         if type(fragment) is NavigableString:
-            clean_fragment = remove_text_formatting_markup_from_fragments(fragment, strip_chars = "\n")   
+            clean_fragment = remove_text_formatting_markup_from_fragments(fragment, strip_chars = "\n")
             plaintext_links = extract_plaintext_urls_from_text(clean_fragment)
             plaintext_urls.extend(plaintext_links)
 
             if  link_type in LINK_TYPE_TO_TAG:
                 tags.update(LINK_TYPE_TO_TAG[link_type])
 
-            tags.add("sidebar box")  
+            tags.add("sidebar box")
 
             all_tagged_urls.append(make_tagged_url(url, title, tags))
 
 
         #print generate_test_func('intext_links_tagging', 'sudpresse', dict(tagged_urls=updated_tagged_urls))
         #save_sample_data_file(html_content, source.name, 'intext_links_tagging', '/Users/judemaey/code/csxj-crawler/tests/datasources/test_data/sudpresse')
-        
+
         return ArticleData(source, title, pub_date, pub_time, fetched_datetime,
                            updated_tagged_urls,
                            category, author,
     regional_headlines = make_full_url(url, get_regional_toc())
     headlines.extend(regional_headlines)
 
-    return make_full_url(url, headlines), []
+    return make_full_url(url, headlines), [], []
 
 
 def show_frontpage_articles():