1. Frederic De Groef
  2. csxj-crawler

Commits

Frederic De Groef  committed 11f91a8

[tests/lalibre] new test for plaintext links extraction

  • Participants
  • Parent commits 9066696
  • Branches default

Comments (0)

Files changed (3)

File tests/datasources/test_data/lalibre/index.json

View file
  • Ignore whitespace
 {
   "test_data": [
     [
-      "http://www.lalibre.be/actu/politique-belge/article/656130/van-quick-nous-ne-lacherons-pas-la-n-va.html", 
+      "http://www.lalibre.be/actu/politique-belge/article/656130/van-quick-nous-ne-lacherons-pas-la-n-va.html",
       "links_single_bottom_sidebox_link.html"
-    ], 
+    ],
     [
-      "http://www.lalibre.be/economie/actualite/article/704138/troisieme-belgian-day-a-wall-street.html", 
+      "http://www.lalibre.be/economie/actualite/article/704138/troisieme-belgian-day-a-wall-street.html",
       "links_removed_article.html"
-    ], 
+    ],
     [
-      "http://www.lalibre.be/culture/selection-culturelle/article/707244/ou-sortir-ce-week-end.html", 
+      "http://www.lalibre.be/culture/selection-culturelle/article/707244/ou-sortir-ce-week-end.html",
       "links_intext.html"
-    ], 
+    ],
     [
-      "http://www.lalibre.be/actu/usa-2012/article/773294/obama-raille-les-chevaux-et-baionnettes-de-romney.html", 
+      "http://www.lalibre.be/actu/usa-2012/article/773294/obama-raille-les-chevaux-et-baionnettes-de-romney.html",
       "links_storify_sidebox_bottom_links.html"
-    ], 
+    ],
     [
-      "http://www.lalibre.be/actu/international/article/774524/sandy-le-calme-avant-la-tempete.html", 
+      "http://www.lalibre.be/actu/international/article/774524/sandy-le-calme-avant-la-tempete.html",
       "links_storify_video_links.html"
-    ], 
+    ],
     [
-      "http://www.lalibre.be/sports/football/article/778966/suivez-anderlecht-milan-ac-en-live-des-20h30.html", 
+      "http://www.lalibre.be/sports/football/article/778966/suivez-anderlecht-milan-ac-en-live-des-20h30.html",
       "links_embedded_videos.html"
-    ], 
+    ],
     [
-      "http://www.lalibre.be/societe/general/article/779522/la-pornographie-une-affaire-d-hommes-pas-seulement.html", 
+      "http://www.lalibre.be/societe/general/article/779522/la-pornographie-une-affaire-d-hommes-pas-seulement.html",
       "links_same_owner_tagging.html"
-    ], 
+    ],
     [
-      "http://www.lalibre.be/culture/mediastele/article/748553/veronique-genest-mon-coeur-est-en-berne.html", 
+      "http://www.lalibre.be/culture/mediastele/article/748553/veronique-genest-mon-coeur-est-en-berne.html",
       "content_no_paragraphs.html"
-    ], 
+    ],
     [
-      "http://www.lalibre.be/actu/politique-belge/article/778553/accord-sur-le-budget-2013.html", 
+      "http://www.lalibre.be/actu/politique-belge/article/778553/accord-sur-le-budget-2013.html",
       "content_paragraphs_overload.html"
-    ], 
+    ],
     [
-      "http://www.lalibre.be/actu/belgique/article/788978/incendie-d-un-car-belge-en-suisse-plus-de-peur-que-de-mal.html", 
+      "http://www.lalibre.be/actu/belgique/article/788978/incendie-d-un-car-belge-en-suisse-plus-de-peur-que-de-mal.html",
       "content_intro_articleHat.html"
-    ], 
+    ],
     [
-      "http://www.lalibre.be/actu/belgique/article/788978/incendie-d-un-car-belge-en-suisse-plus-de-peur-que-de-mal.html", 
+      "http://www.lalibre.be/actu/belgique/article/788978/incendie-d-un-car-belge-en-suisse-plus-de-peur-que-de-mal.html",
       "links_extract_embedded_tweets.html"
     ],
-   [
-      "http://www.lalibre.be/societe/insolite/article/787359/des-chocolats-aux-insectes.html", 
+    [
+      "http://www.lalibre.be/societe/insolite/article/787359/des-chocolats-aux-insectes.html",
       "plaintext_links.html"
-    ], 
+    ],
     [
-      "http://www.lalibre.be/societe/insolite/article/786611/le-tweet-sarcastique-de-johnny-a-gege.html", 
+      "http://www.lalibre.be/societe/insolite/article/786611/le-tweet-sarcastique-de-johnny-a-gege.html",
       "embedded_tweet.html"
+    ],
+    [
+      "http://www.lalibre.be/economie/actualite/article/755845/les-bourses-avancent-timidement-vers-le-web.html",
+      "links_intext_overload.html"
     ]
   ]
-}
+}

File tests/datasources/test_lalibre.py

View file
  • Ignore whitespace
             expected_links = embedded_content_links + bottom_links + associated_tagged_urls + embedded_audio_links + in_text_links
             assert_taggedURLs_equals(expected_links, extracted_links)
 
+    def test_links_intext_overload(self):
+        """ lalibre parser is very good with plaintext links"""
+        with open(os.path.join(DATA_ROOT, "links_intext_overload.html")) as f:
+            article, raw_html = lalibre.extract_article_data(f)
+            extracted_links = article.links
+            updated_tagged_urls = [
+                make_tagged_url("www.nyx.com", u"""www.nyx.com""", set(['plaintext', 'external', 'in text'])),
+                make_tagged_url("europeanequities.nyx.com", u"""europeanequities.nyx.com""", set(['plaintext', 'external', 'in text'])),
+                make_tagged_url("www.bourse.be", u"""www.bourse.be""", set(['plaintext', 'external', 'in text'])),
+                make_tagged_url("www.beurs.be", u"""www.beurs.be""", set(['plaintext', 'external', 'in text'])),
+                make_tagged_url("bourse.be", u"""bourse.be""", set(['plaintext', 'external', 'in text'])),
+                make_tagged_url("http://www.londonstockexchange.com", u"""http://www.londonstockexchange.com""", set(['plaintext', 'external', 'in text'])),
+                make_tagged_url("http://www.six-swiss-exchange.com/", u"""http://www.six-swiss-exchange.com/""", set(['plaintext', 'external', 'in text'])),
+                make_tagged_url("http://deutsche-boerse.com", u"""http://deutsche-boerse.com""", set(['plaintext', 'external', 'in text'])),
+                make_tagged_url("/economie/actualite/article/754828/le-jeu-video-sans-console-via-belgacom.html", u"""Le jeu vidéo sans console via Belgacom""", set(['internal', 'sidebar box'])),
+                make_tagged_url("/economie/actualite/article/753635/suivre-les-cours-de-bourse-a-la-plage-gare-aux-plongeons.html", u"""Suivre les cours de Bourse à la plage ? Gare aux plongeons !""", set(['internal', 'sidebar box'])),
+                make_tagged_url("/economie/actualite/article/752413/travailler-en-vacances-une-autre-facon-de-garder-la-ligne.html", u"""Travailler en vacances : une autre façon de garder la ligne !""", set(['internal', 'sidebar box'])),
+                make_tagged_url("/economie/actualite/article/754828/le-jeu-video-sans-console-via-belgacom.html", u"""Le jeu vidéo sans console via Belgacom""", set(['bottom box', 'internal'])),
+                make_tagged_url("/economie/actualite/article/753635/suivre-les-cours-de-bourse-a-la-plage-gare-aux-plongeons.html", u"""Suivre les cours de Bourse à la plage ? Gare aux plongeons !""", set(['bottom box', 'internal'])),
+                make_tagged_url("/economie/actualite/article/752413/travailler-en-vacances-une-autre-facon-de-garder-la-ligne.html", u"""Travailler en vacances : une autre façon de garder la ligne !""", set(['bottom box', 'internal'])),
+                make_tagged_url("/economie/actualite/article/755981/la-grece-lance-une-bataille-diplomatique.html", u"""La Grèce lance une bataille diplomatique""", set(['bottom box', 'internal'])),
+                make_tagged_url("/economie/actualite/article/755996/apple-roi-de-la-bourse-us.html", u"""Apple, roi de la bourse US""", set(['bottom box', 'internal'])),
+            ]
+            expected_links = updated_tagged_urls
+            assert_taggedURLs_equals(expected_links, extracted_links)
+
 
 class TestLalibreContentExtraction(object):
     def test_clean_paragraph_extraction(self):