1. Frederic De Groef
  2. csxj-crawler

Commits

Juliette De Maeyer committed b12d5d3

[lalibre] when processing paragraphs and fragments, we now ignore embedded tweets

  • Participants
  • Parent commits f9a7c0f
  • Branches default

Comments (0)

Files changed (2)

File csxj/datasources/lalibre.py

View file
  • Ignore whitespace
 from datetime import datetime, time
 import urlparse
 
+import BeautifulSoup
+
 from csxj.common.tagging import classify_and_tag, make_tagged_url
 from csxj.db.article import ArticleData
 from parser_tools.utils import fetch_html_content, make_soup_from_html_content, extract_plaintext_urls_from_text
     Returns a list of TaggedURL objects.
     """
     def extract_link_and_title(link):
-            return link.get('href'), remove_text_formatting_markup_from_fragments(link.contents)
+        return link.get('href'), remove_text_formatting_markup_from_fragments(link.contents)
     links = [extract_link_and_title(link)
              for link in article_text.findAll('a', recursive=True)]
 
 
 def sanitize_paragraph(paragraph):
     """Returns plain text article"""
-    sanitized_paragraph = [remove_text_formatting_markup_from_fragments(fragment) for fragment in paragraph.contents]
+    
+    sanitized_paragraph = [remove_text_formatting_markup_from_fragments(fragment) for fragment in paragraph.contents if not isinstance(fragment, BeautifulSoup.Comment)]
+
     return ''.join(sanitized_paragraph)
 
 
     paragraphs = article_text.findAll('p', recursive=False)
 
     for paragraph in paragraphs:
-        fragments = sanitize_paragraph(paragraph)
-        all_fragments.append(fragments)
-        all_fragments.append('\n')
-        plaintext_links = extract_plaintext_urls_from_text(fragments)
-        urls_and_titles = zip(plaintext_links, plaintext_links)
-        all_plaintext_urls.extend(classify_and_make_tagged_url(urls_and_titles, additional_tags=set(['plaintext'])))
+        if not paragraph.find('blockquote', {'class': 'twitter-tweet'}):
+            fragments = sanitize_paragraph(paragraph)
+            print fragments
+            all_fragments.append(fragments)
+            all_fragments.append('\n')
+            plaintext_links = extract_plaintext_urls_from_text(fragments)
+            urls_and_titles = zip(plaintext_links, plaintext_links)
+            all_plaintext_urls.extend(classify_and_make_tagged_url(urls_and_titles, additional_tags=set(['plaintext'])))
 
     text_content = all_fragments
     return text_content, in_text_tagged_urls + all_plaintext_urls
             "http://www.lalibre.be/actu/usa-2012/article/773294/obama-raille-les-chevaux-et-baionnettes-de-romney.html",
             "http://www.lalibre.be/actu/international/article/774524/sandy-le-calme-avant-la-tempete.html",
             "http://www.lalibre.be/sports/football/article/778966/suivez-anderlecht-milan-ac-en-live-des-20h30.html",
+            "http://www.lalibre.be/societe/insolite/article/786611/le-tweet-sarcastique-de-johnny-a-gege.html"
             ]
 
     for url in urls[-1:]:

File csxj/datasources/parser_tools/utils.py

View file
  • Ignore whitespace
 import urlparse
 import re
 import random
-from BeautifulSoup import BeautifulSoup, Tag
+from BeautifulSoup import BeautifulSoup, Tag, Comment, NavigableString
 from useragents import USER_AGENT_STRINGS
 from datetime import datetime
 import bs4
     return urls
 
 
-TEXT_MARKUP_TAGS = ['a', 'b', 'i', 'u', 'em', 'strong', 'tt', 'h1',  'h2',  'h3',  'h4',  'h5', 'span', 'sub', 'sup', 'p' ]
+TEXT_MARKUP_TAGS = ['a', 'b', 'i', 'u', 'em', 'strong', 'tt', 'h1',  'h2',  'h3',  'h4',  'h5', 'span', 'sub', 'sup', 'p', 'img' ]
 
 def remove_text_formatting_markup(formatted_text_fragment, strip_chars):
     """
 
     # A text fragment is either an HTML tag (with its own child text fragments)
     # or just a plain string.
+    
+    
+
+
     if isinstance(formatted_text_fragment, Tag) or isinstance(formatted_text_fragment, bs4.Tag):
         # If it's the former, we remove the tag and clean up all its children
         if formatted_text_fragment.name in TEXT_MARKUP_TAGS: