Commits

Juliette De Maeyer committed 55997e6 Merge

Files changed (4)

 syntax: glob
+*.orig
 *.pyc
 build/
 out*/
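
The "syntax: glob" header marks this hunk as Mercurial ignore syntax, so the file is presumably the repository's .hgignore. Ignoring *.orig keeps the backup copies Mercurial leaves next to conflicted files during merges (e.g. lalibre.py.orig) out of hg status, which fits a merge commit. The visible portion of the file now reads:

    syntax: glob
    *.orig
    *.pyc
    build/
    out*/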

csxj/datasources/lalibre.py

 
 
 def test_sample_data():
-    url = "http://www.lalibre.be/economie/actualite/article/704138/troisieme-belgian-day-a-wall-street.html"
-    url = "http://www.lalibre.be/culture/selection-culturelle/article/707244/ou-sortir-ce-week-end.html"
-    article_data, html_content = extract_article_data(url)
+    urls = [    "http://www.lalibre.be/economie/actualite/article/704138/troisieme-belgian-day-a-wall-street.html",
+                "http://www.lalibre.be/culture/selection-culturelle/article/707244/ou-sortir-ce-week-end.html",
+                "http://www.lalibre.be/actu/usa-2012/article/773294/obama-raille-les-chevaux-et-baionnettes-de-romney.html",
+                "http://www.lalibre.be/actu/international/article/774524/sandy-le-calme-avant-la-tempete.html",
+                "http://www.lalibre.be/sports/football/article/778966/suivez-anderlecht-milan-ac-en-live-des-20h30.html"
+            ]
 
-    if article_data:
-        article_data.print_summary()
+    for url in urls[-1:]:
+        article, html = extract_article_data(url)
 
-        print article_data.to_json()
-        for url in article_data.external_links:
-            print url
-    else:
-        print "article was removed"
+        if article:
+            article.print_summary()
+            print article.title
+            for tagged_url in article.links:
+                print(u"{0:100} ({1:100}) \t {2}".format(tagged_url.title, tagged_url.URL, tagged_url.tags))
+
+        print("\n"*4)
 
 
         
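In lalibre.py, test_sample_data now walks a list of sample articles instead of one hard-coded URL and prints each extracted link with its tags; note that the urls[-1:] slice restricts the loop to the newest sample, presumably for quick manual testing. A minimal, runnable sketch of the link shape the loop assumes (the real TaggedURL lives elsewhere in csxj; this namedtuple is only an illustration):

    from collections import namedtuple

    # Illustrative stand-in for csxj's TaggedURL: a link plus the tags
    # the datasource attached to it.
    TaggedURL = namedtuple('TaggedURL', 'URL title tags')

    link = TaggedURL(URL=u"http://www.lalibre.be/...", title=u"Sample link",
                     tags=set(['external']))
    print(u"{0:100} ({1:100}) \t {2}".format(link.title, link.URL, link.tags))
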

csxj/datasources/lavenir.py

             print tagged_link.URL, tagged_link.title, tagged_link.tags
 
 
-def show_article():
+def show_sample_articles():
     urls = [  "http://www.lavenir.net/article/detail.aspx?articleid=DMF20120326_023",
             "http://www.lavenir.net/article/detail.aspx?articleid=DMF20120330_00139582",
             "http://www.lavenir.net/article/detail.aspx?articleid=DMF20120331_00140331",
         print "********\n\n"
 
 
-
 if __name__ == "__main__":
-    show_article()
+    show_sample_articles()
     #show_frontpage_articles()
     #show_frontpage()

csxj/datasources/sudinfo.py

     if media_links:
         for i, item in enumerate(media_links):
             item_id = item.select("./@href").extract()
-            url = u"{0}{1}".format(source_url, item_id)
+            # extract() returns a list of matching hrefs; keep the first one
+            url = item_id[0] if item_id else u""
             title = u"EMBEDDED MEDIA {0}".format(i)
             tags = set(['media', 'embedded'])
             all_tagged_urls.append(make_tagged_url(url, title, tags))
     if hasattr(source, 'read'):
         html_content = source.read()
     else:
-        source_url = convert_utf8_url_to_ascii(source)
+        source = convert_utf8_url_to_ascii(source)
         try:
-            html_content = fetch_html_content(source_url)
+            html_content = fetch_html_content(source)
         except urllib2.HTTPError as err:
             if err.code == 404:
                 return None, "<html><head><title>404</title></head><body></body></html>"
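
The 404 branch here degrades gracefully: rather than letting the error propagate, the fetch returns a minimal stub page so callers can treat a removed article like an empty one. The same pattern in isolation, as a runnable sketch (fetch_or_stub is a hypothetical name; the real module goes through fetch_html_content):

    import urllib2

    def fetch_or_stub(url):
        # Removed articles come back as 404; return an empty but well-formed
        # stub document so downstream parsing does not have to special-case it.
        try:
            return urllib2.urlopen(url).read()
        except urllib2.HTTPError as err:
            if err.code == 404:
                return "<html><head><title>404</title></head><body></body></html>"
            raise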
 
         content, content_links = extract_content_and_links(hxs)
 
-        associated_links = extract_associated_links(hxs, source_url)
+        associated_links = extract_associated_links(hxs)
 
         all_links = intro_links + content_links + associated_links
 
 
-        return (ArticleData(source_url, title, pub_date, pub_time, fetched_datetime,
+        return (ArticleData(source, title, pub_date, pub_time, fetched_datetime,
                             all_links,
                             category, author,
                             intro, content),
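
Across these sudinfo.py hunks the source_url variable disappears: the fetched page is addressed as source throughout, extract_associated_links no longer takes the page URL, and the embedded-media URL is now the raw @href value rather than a string concatenation with the page URL. If those hrefs can be site-relative, joining them against the page URL is the usual safe alternative to concatenation; a self-contained sketch (Python 2, matching the module's urllib2 usage; resolve_media_url is a hypothetical helper, not part of this commit):

    from urlparse import urljoin

    def resolve_media_url(page_url, href):
        # urljoin copes with relative paths, absolute paths and full URLs,
        # which plain string concatenation does not.
        return urljoin(page_url, href)

    print resolve_media_url("http://www.sudinfo.be/sports/article.html", "/media/123")
    # -> http://www.sudinfo.be/media/123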