Frederic De Groef committed 95a067c

[sudinfo] minor cleanups

Comments (0)

Files changed (2)

csxj/datasources/sudinfo.py

 
 import codecs
 from datetime import datetime
-import locale
-from itertools import chain
 import urllib
 import urllib2
 import itertools as it
 from urlparse import urlparse
-from datetime import datetime
 
 from scrapy.selector import HtmlXPathSelector
 

tests/datasources/test_sudinfo.py

 # -*- coding: utf-8 -*-
 """
-Link extraction test suite for sudinfo.py
+Test suites for sudinfo.py
 """
 
 import os
-from nose.tools import eq_
 
-
-from csxj.datasources.parser_tools.utils import convert_utf8_url_to_ascii
 from csxj.common.tagging import make_tagged_url
 from csxj.datasources import sudinfo
 
 
 
 class TestSudinfoLinkExtraction(object):
-
     def test_no_links(self):
-        """ Sudinfo parser returns an empty link list if the article has no link. """
+        """ sudinfo parser returns an empty link list if the article has no link. """
         with open(os.path.join(DATA_ROOT, "no_links.html")) as f:
             article, raw_html = sudinfo.extract_article_data(f)
             extracted_links = article.links
             assert_taggedURLs_equals(expected_links, extracted_links)
 
     def test_sidebar_box_tagging(self):
-        """ Sudinfo parser can extract and tag sidebar links from an article. """
+        """ sudinfo parser can extract and tag sidebar links from an article. """
         with open(os.path.join(DATA_ROOT, "sidebar_box_tagging.html")) as f:
             article, raw_html = sudinfo.extract_article_data(f)
             extracted_links = article.links
             assert_taggedURLs_equals(expected_links, extracted_links)
 
     def test_in_text_same_owner(self):
-        """ Sudinfo parser can extract and tag in text and sidebar links to same owner sites."""
+        """ sudinfo parser can extract and tag in text and sidebar links to same owner sites."""
         with open(os.path.join(DATA_ROOT, "in_text_same_owner.html")) as f:
             article, raw_html = sudinfo.extract_article_data(f)
             extracted_links = article.links
             assert_taggedURLs_equals(expected_links, extracted_links)
 
     def test_embedded_video_extraction(self):
-        """ Sudinfo parser can extract and tag embedded video from the bottom of an article. """
+        """ sudinfo parser can extract and tag embedded video from the bottom of an article. """
         with open(os.path.join(DATA_ROOT, "embedded_video_extraction.html")) as f:
             article, raw_html = sudinfo.extract_article_data(f)
             extracted_links = article.links
             assert_taggedURLs_equals(expected_links, extracted_links)
 
     def test_in_text_link_extraction(self):
-        """ Sudinfo parser can extract and tag in-text links """
+        """ sudinfo parser can extract and tag in-text links """
         with open(os.path.join(DATA_ROOT, "in_text_link_extraction.html")) as f:
             article, raw_html = sudinfo.extract_article_data(f)
             extracted_links = article.links
             assert_taggedURLs_equals(expected_links, extracted_links)
 
     def test_links_iframe_in_text(self):
-        """ Sudinfo parser can extract iframes within text block"""
+        """ sudinfo parser extracts iframes within text block, does not consider iframes as text content"""
         with open(os.path.join(DATA_ROOT, "links_iframe_in_text.html")) as f:
             article, raw_html = sudinfo.extract_article_data(f)
             extracted_links = article.links
             assert_taggedURLs_equals(expected_links, extracted_links)
 
     def test_links_intext_not_plaintext(self):
-        """ Sudinfo extracts intext urls only once (and not as plaintext URLs)"""
+        """ sudinfo parser extracts in-text urls only once (and not as plaintext URLs)"""
         with open(os.path.join(DATA_ROOT, "links_intext_not_plaintext.html")) as f:
             article, raw_html = sudinfo.extract_article_data(f)
             extracted_links = article.links
 
 class TestSudinfoContentExtracttion(object):
     def test_intext_link(self):
-        """ Sudinfo parser correctly extract text content, even when there is a link inside"""
+        """ sudinfo parser correctly extract text content, even when there is a link inside"""
         with open(os.path.join(DATA_ROOT, "content_intext_link.html")) as f:
             article, _ = sudinfo.extract_article_data(f)
 
-            expected_intro = u"""Le flash mob proposé par Stéphane Thiry, officier pompier au SRI de Saint-Hubert a obtenu un succès tel qu'ils étaient plus de 300 à danser et se regarder lors de la journée portes-ouvertes des pompiers de ce dimanche 7 octobre."""
+            #expected_intro = u"""Le flash mob proposé par Stéphane Thiry, officier pompier au SRI de Saint-Hubert a obtenu un succès tel qu'ils étaient plus de 300 à danser et se regarder lors de la journée portes-ouvertes des pompiers de ce dimanche 7 octobre."""
             expected_content = [u"""Grosse foule et succès mérité pour les pompiers borquins qui ont réalisé multiples exrecices face au public.Une jounée sous un ciel clément et ensoleillé. Et pour cause, Mr Météo avait rangé ses grenouilles et les pompiers ont imploré Sainte-Claire en lui portant des oeufs.""",
                                 u"""Plus de détails et un album photo sur  http://secourslux.blogs.sudinfo.be"""]
             assert_content_equals(expected_content, article.content)
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.