Commits

Juliette De Maeyer committed d1d8621

[tests] [sudinfo] added test for in-text link extraction

Comments (0)

Files changed (4)

csxj/datasources/sudinfo.py

         updated_tagged_urls = update_tagged_urls(all_links, rossel_utils.SUDINFO_SAME_OWNER)
 
 
-        #print generate_test_func('embedded_video_extraction', 'sudinfo', dict(tagged_urls=updated_tagged_urls))
-        #save_sample_data_file(html_content, source.name, 'embedded_video_extraction', '/Users/judemaey/code/csxj-crawler/tests/datasources/test_data/sudinfo')
+        #print generate_test_func('in_text_link_extraction', 'sudinfo', dict(tagged_urls=updated_tagged_urls))
+        #save_sample_data_file(html_content, source.name, 'in_text_link_extraction', '/Users/judemaey/code/csxj-crawler/tests/datasources/test_data/sudinfo')
         
         return (ArticleData(source, title, pub_date, pub_time, fetched_datetime,
                             updated_tagged_urls,

tests/datasources/test_data/sudinfo/index.json

     [
       "../../sample_data/sudinfo/sudinfo_video.html", 
       "embedded_video_extraction.html"
+    ], 
+    [
+      "../../sample_data/sudinfo/embedded_photos.html", 
+      "in_text_link_extraction.html"
     ]
   ]
 }

tests/datasources/test_sudinfo.py

 
 
 class TestSudinfoLinkExtraction(object):
+
     def test_sidebar_box_tagging(self):
         """ Sudinfo parser can extract and tag sidebar links from an article. """
         with open(os.path.join(DATA_ROOT, "sidebar_box_tagging.html")) as f:
             ]
             expected_links = tagged_urls
             assert_taggedURLs_equals(expected_links, extracted_links)
+
+
+    def test_in_text_link_extraction(self):
+        """ Sudinfo parser can extract and tag in-text links """
+        with open(os.path.join(DATA_ROOT, "in_text_link_extraction.html")) as f:
+            article, raw_html = sudinfo.extract_article_data(f)
+            extracted_links = article.links
+            tagged_urls = [
+                make_tagged_url("http://www.sporza.be/cm/sporza/videozone/MG_programmas/MG_Extra_Time_GNMA/1.1450385?utm_medium=twitter&utm_source=dlvr.it", u"""Cliquez ici pour consulter la vidéo capturée par nos confrères de Sporza.""", set(['external', 'in text'])),
+            ]
+            expected_links = tagged_urls
+            assert_taggedURLs_equals(expected_links, extracted_links)
+