Commits

Juliette De Maeyer committed 5f2188f

[test] [sudpresse] added test for 'sidebar box' tagging

  • Participants
  • Parent commits 686cf80

Comments (0)

Files changed (4)

File csxj/datasources/lesoir_new.py

     updated_tagged_urls = tagging.update_tagged_urls(all_links, rossel_utils.LESOIR_SAME_OWNER)
 
     #print generate_test_func('same_owner_tagging', 'lesoir_new', dict(tagged_urls=updated_tagged_urls))
-    save_sample_data_file(html_data, source, 'same_owner_tagging', '/Users/judemaey/code/csxj-crawler/tests/datasources/test_data/lesoir_new')
+    #save_sample_data_file(html_data, source, 'same_owner_tagging', '/Users/judemaey/code/csxj-crawler/tests/datasources/test_data/lesoir_new')
 
     return (ArticleData(source, title, pub_date, pub_time, fetched_datetime,
                 updated_tagged_urls,

File csxj/datasources/sudpresse.py

 
         updated_tagged_urls = update_tagged_urls(all_links, rossel_utils.SUDINFO_SAME_OWNER)
 
-        #print generate_test_func('same_owner_tagging', 'sudpresse', dict(tagged_urls=updated_tagged_urls))
-        #save_sample_data_file(html_content, source.name, 'same_owner_tagging', '/Users/judemaey/code/csxj-crawler/tests/datasources/test_data/sudpresse')
+        #print generate_test_func('sidebar_box_tagging', 'sudpresse', dict(tagged_urls=updated_tagged_urls))
+        #save_sample_data_file(html_content, source.name, 'sidebar_box_tagging', '/Users/judemaey/code/csxj-crawler/tests/datasources/test_data/sudpresse')
         
         return ArticleData(source, title, pub_date, pub_time, fetched_datetime,
                            updated_tagged_urls,
     with open(filepath) as f:
         article_data, raw = extract_article_data(f)
 
-        for link in article_data.links:
-            print link.URL
-            print link.title
-            print link.tags
-            print "**********************"
+        # for link in article_data.links:
+        #     print link.URL
+        #     print link.title
+        #     print link.tags
+        #     print "**********************"
 
 
 def download_one_article():

File tests/datasources/test_data/sudpresse/index.json

     [
       "../../sample_data/sudpresse/sudpresse_same_owner.html", 
       "same_owner_tagging.html"
+    ], 
+    [
+      "../../sample_data/sudpresse/sudpresse_associated_link_error.html", 
+      "sidebar_box_tagging.html"
     ]
   ]
 }

File tests/datasources/test_sudpresse.py

 DATA_ROOT = os.path.join(os.path.dirname(__file__), 'test_data', 'sudpresse')
 
 class TestSudpresseLinkExtraction(object):
+    def test_sidebar_box_tagging(self):
+        """ Sudpresse parser correctly tags 'sidebar box' links """
+        with open(os.path.join(DATA_ROOT, "sidebar_box_tagging.html")) as f:
+            article, raw_html = sudpresse.extract_article_data(f)
+            extracted_links = article.links
+            tagged_urls = [
+                make_tagged_url("/actualite/fil_info/2011-05-10/wouter-weylandt-un-registre-de-condoleances-ouvert-au-centre-du-tour-des-flandres-872031.shtml", u"""Wouter Weylandt : un registre de condoléances ouvert au centre du Tour des Flandres""", set(['internal', 'sidebar box'])),
+                make_tagged_url("/sports/cyclisme/2011-05-10/deces-de-weylandt-la-4e-etape-du-giro-neutralisee-872000.shtml", u"""e étape du Giro neutralisée" >Décès de Weylandt: la 4e étape du Giro neutralisée""", set(['internal', 'sidebar box'])),
+                make_tagged_url("/sports/cyclisme/2011-05-10/giro-neutralisee-la-4e-etape-prendra-la-forme-d-un-defile-871999.shtml", u"""Giro: neutralisée, la 4e étape prendra la forme d’un défilé""", set(['internal', 'sidebar box'])),
+                make_tagged_url("/sports/cyclisme/2011-05-10/andy-schleck-pour-wouter-weylandt-repose-en-paix-mon-ami-871991.shtml", u"""Andy Schleck pour Wouter Weylandt: “Repose en paix, mon ami ”""", set(['internal', 'sidebar box'])),
+                make_tagged_url("/sports/cyclisme/2011-05-10/deces-du-coureur-wouter-weylandt-leopard-trek-reste-sur-le-giro-871934.shtml", u"""Décès de Wouter Weylandt: Leopard-Trek reste sur le Giro""", set(['internal', 'sidebar box'])),
+                make_tagged_url("/sports/cyclisme/2011-05-09/giro-terrible-chute-de-wouter-weylandt-dans-une-descente-son-etat-est-inquietant-871804.shtml", u"""Décès de Wouter Weylandt: "Un cas désespéré", explique le médecin de la course""", set(['internal', 'sidebar box'])),
+            ]
+            expected_links = tagged_urls
+            assert_taggedURLs_equals(expected_links, extracted_links)
+
     def test_same_owner_tagging(self):
         """ Sudpresse parser correctly tags 'same owner' links """
         with open(os.path.join(DATA_ROOT, "same_owner_tagging.html")) as f:
             extracted_links = article.links
             tagged_urls = [
                 make_tagged_url("http://www.nordeclair.fr/Actualite/Depeches/2012/02/13/dujardin-a-lille.shtml", u"""Nos confrères de Nord Eclair France """, set(['same owner', 'external', 'in text'])),
-                make_tagged_url("http://www.nordeclair.fr/Actualite/Depeches/2012/02/13/dujardin-a-lille.shtml", u"""Voir sur le site nordeclair.fr""", set(['external', 'same owner'])),
+                make_tagged_url("http://www.nordeclair.fr/Actualite/Depeches/2012/02/13/dujardin-a-lille.shtml", u"""Voir sur le site nordeclair.fr""", set(['external', 'same owner', 'sidebar box'])),
             ]
             expected_links = tagged_urls
             assert_taggedURLs_equals(expected_links, extracted_links)