Commits

Frederic De Groef committed d38f2c6

[lalibre] pep8 stuff

Files changed (1)

csxj/datasources/lalibre.py

 SOURCE_TITLE = u"La Libre"
 SOURCE_NAME = u"lalibre"
 
+
 def is_on_same_domain(url):
     """
     Until we get all the internal blogs/sites, we can still detect
         tags = classify_and_tag(url, LALIBRE_NETLOC, LALIBRE_ASSOCIATED_SITES)
         if is_on_same_domain(url):
             tags.update(['internal site'])
-        tagged_urls.append(make_tagged_url(url, title, tags|additional_tags))
+        tagged_urls.append(make_tagged_url(url, title, tags | additional_tags))
     return tagged_urls
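
Note: the body of is_on_same_domain is elided by this hunk. A minimal sketch of the check it presumably performs, assuming any *.lalibre.be host counts as the same domain (the exact rule may differ):

    from urlparse import urlparse

    def is_on_same_domain(url):
        # hypothetical reconstruction: internal blogs/sites live on
        # subdomains of lalibre.be
        return urlparse(url).netloc.endswith('lalibre.be')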
 
 
-
 def was_story_updated(date_string):
     return not date_string.startswith('Mis en ligne le')
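
Note: 'Mis en ligne le' is La Libre's "published on" prefix; any other prefix is taken to mean the story was updated. A quick check, with hypothetical date strings:

    was_story_updated(u'Mis en ligne le 29/10/2012')  # -> False, never updated
    was_story_updated(u'Mis a jour le 29/10/2012')    # -> True (prefix assumed)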
 
 
-
 def extract_date(main_content):
-    publication_date = main_content.find('p', {'id':'publicationDate'}).contents[0]
+    publication_date = main_content.find('p', {'id': 'publicationDate'}).contents[0]
     publication_date = publication_date.rstrip().lstrip()
 
     if was_story_updated(publication_date):
     return pub_date.date(), pub_time
 
 
-
 def sanitize_fragment(fragment):
     if isinstance(fragment, Tag):
         # sometimes, we just get <p></p>
     other_links = list(set(links) - set(no_target_links))
     return [('', title) for (target, title) in no_target_links], other_links
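
Note: only the tail of separate_no_target_links survives this hunk. A sketch of the whole function, assuming the elided filter keeps the (target, title) pairs whose target is empty or None:

    def separate_no_target_links(links):
        # split off links without a usable target, blanking their target,
        # and keep the remaining links untouched
        no_target_links = [l for l in links if not l[0]]
        other_links = list(set(links) - set(no_target_links))
        return [('', title) for (target, title) in no_target_links], other_links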
 
-    
+
 def separate_keyword_links(all_links):
     keyword_links = [l for l in all_links if l[0].startswith('/sujet')]
     other_links = list(set(all_links) - set(keyword_links))
     return keyword_links, other_links
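
Note: '/sujet/...' targets are La Libre's keyword pages. For illustration, with made-up (target, title) pairs:

    links = [('/sujet/456/elections.html', u'Elections'),
             ('http://example.com/story.html', u'Elsewhere')]
    keyword_links, other_links = separate_keyword_links(links)
    # keyword_links -> [('/sujet/456/elections.html', u'Elections')]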
 
 
-
 def extract_and_tag_in_text_links(article_text):
     """
     Finds the link tags in the HTML text content.
     Returns a list of TaggedURL objects.
     """
     def extract_link_and_title(link):
-            return link.get('href'),  sanitize_fragment(link.contents[0])
+            return link.get('href'), sanitize_fragment(link.contents[0])
     links = [extract_link_and_title(link)
              for link in article_text.findAll('a', recursive=True)]
 
-
-
     no_target_links, target_links = separate_no_target_links(links)
     keyword_links, other_links = separate_keyword_links(target_links)
 
     return tagged_urls
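
Note: the step that turns keyword_links and other_links into tagged_urls is elided. The extraction half can be shown standalone with BeautifulSoup 3 (which csxj wraps in make_soup_from_html_content); the markup here is made up:

    from BeautifulSoup import BeautifulSoup

    article_text = BeautifulSoup('<div>See <a href="/sujet/1/foo.html">foo</a> '
                                 'and <a href="http://example.com/">bar</a></div>')
    links = [(a.get('href'), a.contents[0])
             for a in article_text.findAll('a', recursive=True)]
    # -> [(u'/sujet/1/foo.html', u'foo'), (u'http://example.com/', u'bar')]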
 
 
-
 def sanitize_paragraph(paragraph):
     """Returns plain text article"""
     sanitized_paragraph = [sanitize_fragment(fragment) for fragment in paragraph.contents]
     return ''.join(sanitized_paragraph)
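
Note: assuming sanitize_fragment reduces each child fragment to its text (its body is elided above), usage would look like:

    from BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup('<p>Hello <em>world</em></p>')
    sanitize_paragraph(soup.p)  # -> u'Hello world'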
 
 
-
 def extract_text_content_and_links(main_content):
-    article_text = main_content.find('div', {'id':'articleText'})
-
+    article_text = main_content.find('div', {'id': 'articleText'})
 
     in_text_tagged_urls = extract_and_tag_in_text_links(article_text)
 
-
     all_fragments = []
     all_plaintext_urls = []
     paragraphs = article_text.findAll('p', recursive=False)
         all_plaintext_urls.extend(classify_and_make_tagged_url(urls_and_titles, additional_tags=set(['plaintext'])))
 
     text_content = all_fragments
-    return text_content, in_text_tagged_urls+all_plaintext_urls
-
+    return text_content, in_text_tagged_urls + all_plaintext_urls
 
 
 def extract_category(main_content):
-    breadcrumbs = main_content.find('p', {'id':'breadCrumbs'})
+    breadcrumbs = main_content.find('p', {'id': 'breadCrumbs'})
     links = breadcrumbs.findAll('a', recursive=False)
 
     return [link.contents[0].rstrip().lstrip() for link in links]
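
Note: with hypothetical breadcrumb markup, the category comes out as a list of section names:

    from BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup('<p id="breadCrumbs"><a href="/">Actu</a> > '
                         '<a href="/actu/int">International</a></p>')
    extract_category(soup)  # -> [u'Actu', u'International']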
 
 
-
 icon_type_to_tags = {
-    'pictoType0':['internal', 'full url'],
-    'pictoType1':['internal', 'local url'],
-    'pictoType2':['images', 'gallery'],
-    'pictoType3':['video'],
-    'pictoType4':['animation'],
-    'pictoType5':['audio'],
-    'pictoType6':['images', 'gallery'],
-    'pictoType9':['internal blog'],
-    'pictoType12':['external']
+    'pictoType0': ['internal', 'full url'],
+    'pictoType1': ['internal', 'local url'],
+    'pictoType2': ['images', 'gallery'],
+    'pictoType3': ['video'],
+    'pictoType4': ['animation'],
+    'pictoType5': ['audio'],
+    'pictoType6': ['images', 'gallery'],
+    'pictoType9': ['internal blog'],
+    'pictoType12': ['external']
 }
 
 
     return tag_URL((url, title), tags)
 
 
-
 def extract_and_tag_associated_links(main_content):
     """
     Extract the associated links. Uses the icon type to tag them.
-    
+
     """
-    link_list = main_content.find('ul', {'class':'articleLinks'})
+    link_list = main_content.find('ul', {'class': 'articleLinks'})
 
     # sometimes there are no links, and thus no placeholder
     if link_list:
         return links
     else:
         return []
-        
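
Note: the loop that fills links is elided. A hedged sketch of its likely shape, assuming each item is an <li class="pictoTypeN"> wrapping one <a>, with tag_URL and icon_type_to_tags as above:

    links = []
    for item in link_list.findAll('li', recursive=False):
        url, title = item.a.get('href'), item.a.contents[0]
        tags = set(icon_type_to_tags.get(item.get('class'), []))
        links.append(tag_URL((url, title), tags))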
 
 
 def extract_author_name(main_content):
-    writer = main_content.find('p', {'id':'writer'})
+    writer = main_content.find('p', {'id': 'writer'})
     if writer:
         return writer.contents[0].rstrip().lstrip()
     else:
         return constants.NO_AUTHOR_NAME
 
 
-
 def extract_intro(main_content):
-    hat = main_content.find('div', {'id':'articleHat'})
+    hat = main_content.find('div', {'id': 'articleHat'})
 
     if hat:
         return hat.contents[0].rstrip().lstrip()
         return ''
 
 
-
 def extract_article_data_from_file(source_url, source_file):
 
     if not hasattr(source_file, 'read'):
     return extract_article_data_from_html(html_content, source_url)
 
 
-
 def extract_article_data_from_html(html_content, source_url):
     soup = make_soup_from_html_content(html_content)
 
-    main_content = soup.find('div', {'id':'mainContent'})
-
+    main_content = soup.find('div', {'id': 'mainContent'})
 
     if main_content.h1:
         title = main_content.h1.contents[0].rstrip().lstrip()
     else:
         return None, html_content
-       
 
     category = extract_category(main_content)
     author = extract_author_name(main_content)
 
-
     pub_date, pub_time = extract_date(main_content)
 
     intro = extract_intro(main_content)
 
-    text_content, in_text_urls  = extract_text_content_and_links(main_content)
+    text_content, in_text_urls = extract_text_content_and_links(main_content)
     associated_tagged_urls = extract_and_tag_associated_links(main_content)
 
     fetched_datetime = datetime.today()
     return new_article, html_content
 
 
-
 def extract_article_data(source):
     """
     """
     return extract_article_data_from_html(html_content, source)
 
 
-
 def get_frontpage_toc():
     hostname_url = 'http://www.lalibre.be'
     html_content = fetch_html_content(hostname_url)
 
     soup = make_soup_from_html_content(html_content)
 
-    article_list_container = soup.find('div', {'id':'mainContent'})
-    announces = article_list_container.findAll('div', {'class':'announce'}, recursive=False)
+    article_list_container = soup.find('div', {'id': 'mainContent'})
+    announces = article_list_container.findAll('div', {'class': 'announce'}, recursive=False)
 
     def extract_title_and_link(announce):
         title, url = announce.h1.a.contents[0], announce.h1.a.get('href')
         return title, '{0}{1}'.format(hostname_url, url)
-    
+
     return [extract_title_and_link(announce) for announce in announces], []
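
Note: extract_title_and_link assumes frontpage hrefs are site-relative, hence the hostname prefix:

    '{0}{1}'.format('http://www.lalibre.be',
                    '/actu/international/article/774524/sandy-le-calme-avant-la-tempete.html')
    # -> the full article URL, as used in test_sample_data below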
 
 
-
-
 def test_sample_data():
-    urls = [    "http://www.lalibre.be/economie/actualite/article/704138/troisieme-belgian-day-a-wall-street.html",
-                "http://www.lalibre.be/culture/selection-culturelle/article/707244/ou-sortir-ce-week-end.html",
-                "http://www.lalibre.be/actu/usa-2012/article/773294/obama-raille-les-chevaux-et-baionnettes-de-romney.html",
-                "http://www.lalibre.be/actu/international/article/774524/sandy-le-calme-avant-la-tempete.html",
-                "http://www.lalibre.be/sports/football/article/778966/suivez-anderlecht-milan-ac-en-live-des-20h30.html"
-            ]
+    urls = ["http://www.lalibre.be/economie/actualite/article/704138/troisieme-belgian-day-a-wall-street.html",
+            "http://www.lalibre.be/culture/selection-culturelle/article/707244/ou-sortir-ce-week-end.html",
+            "http://www.lalibre.be/actu/usa-2012/article/773294/obama-raille-les-chevaux-et-baionnettes-de-romney.html",
+            "http://www.lalibre.be/actu/international/article/774524/sandy-le-calme-avant-la-tempete.html",
+            "http://www.lalibre.be/sports/football/article/778966/suivez-anderlecht-milan-ac-en-live-des-20h30.html"]
 
     for url in urls[-1:]:
         article, html = extract_article_data(url)
             for tagged_url in article.links:
                 print(u"{0:100} ({1:100}) \t {2}".format(tagged_url.title, tagged_url.URL, tagged_url.tags))
 
-        print("\n"*4)
+        print("\n" * 4)
 
 
-        
 def list_frontpage_articles():
     frontpage_items = get_frontpage_toc()
     print len(frontpage_items)
 
     for (title, url) in frontpage_items:
-        print 'fetching data for article :',  title
+        print 'fetching data for article :', title
 
         article, html_content = extract_article_data(url)
         article.print_summary()
 
-
         for (url, title, tags) in article.internal_links:
             print u'{0} -> {1} {2}'.format(url, title, tags)
 
         for (url, title, tags) in article.external_links:
             print u'{0} -> {1} {2}'.format(url, title, tags)
-            
+
         print '-' * 80