Commits

Juliette De Maeyer committed d0f22ff

[lalibre] added extraction of embedded tweets (rendered tweets)

Comments (0)

Files changed (2)

csxj/datasources/lalibre.py

 from parser_tools.utils import remove_text_formatting_markup_from_fragments
 from parser_tools import constants
 from parser_tools import ipm_utils
+from parser_tools import twitter_utils
 
 LALIBRE_ASSOCIATED_SITES = {
 
 def extract_text_content_and_links(main_content):  # Returns a pair: the accumulated fragment list and the combined list of in-text, plaintext and embedded-tweet links.
     article_text = main_content.find('div', {'id': 'articleText'})
 
-    in_text_tagged_urls = extract_and_tag_in_text_links(article_text)
-
+    in_text_tagged_urls = []
     all_fragments = []
     all_plaintext_urls = []
+    embedded_tweets = []
+
     paragraphs = article_text.findAll('p', recursive=False)  # recursive=False: only <p> elements that are direct children of the article div
 
     for paragraph in paragraphs:
         if not paragraph.find('blockquote', {'class': 'twitter-tweet'}):  # ordinary paragraph: contains no twitter-tweet blockquote
+
+            in_text_links = extract_and_tag_in_text_links(paragraph)  # in-text links are now gathered per paragraph, so tweet paragraphs are left out of them
+            in_text_tagged_urls.extend(in_text_links)
+
             fragments = sanitize_paragraph(paragraph)
-            print fragments
             all_fragments.append(fragments)
             all_fragments.append('\n')
             plaintext_links = extract_plaintext_urls_from_text(fragments)
             urls_and_titles = zip(plaintext_links, plaintext_links)  # each plaintext URL is paired with itself (presumably as its own title -- confirm)
             all_plaintext_urls.extend(classify_and_make_tagged_url(urls_and_titles, additional_tags=set(['plaintext'])))
+        else:
+            embedded_tweets.extend(twitter_utils.extract_rendered_tweet(paragraph, LALIBRE_NETLOC, LALIBRE_ASSOCIATED_SITES))  # paragraph wraps a rendered tweet: tag its links instead of extracting text
 
     text_content = all_fragments
-    return text_content, in_text_tagged_urls + all_plaintext_urls
+
+    return text_content, in_text_tagged_urls + all_plaintext_urls + embedded_tweets
 
 
 def extract_category(main_content):
             "http://www.lalibre.be/societe/insolite/article/786611/le-tweet-sarcastique-de-johnny-a-gege.html"
             ]
 
+    from pprint import pprint
+
     for url in urls[-1:]:
         article, html = extract_article_data(url)
+        pprint(article.links)
+            
 
 
 if __name__ == '__main__':

csxj/datasources/parser_tools/twitter_utils.py

 import urlparse
 import urllib
+from csxj.common import tagging
+
 TWITTER_WIDGET_NETLOC="widgets.twimg.com"
 TWITTER_WIDGET_SCRIPT_URL="http://widgets.twimg.com/j/2/widget.js"
 
             raise ValueError("No type line was found in the TWTR.Widget script")
     else:
         raise ValueError("Detected script is not TWTR.Widget")
+
+
+
+def extract_rendered_tweet(paragraph, netloc, internal_site):  # Returns a list of tagged URLs for the rendered tweets found inside `paragraph`.
+    tagged_urls = []
+    tweets = paragraph.findAll(attrs = {"class" : "twitter-tweet"})  # every element carrying the "twitter-tweet" class, whatever its tag name
+    if tweets:
+        for tweet in tweets:
+            links = tweet.findAll("a")
+            for link in links :
+                if link.get("data-datetime"):  # keep only anchors with a non-empty data-datetime attribute (presumably the tweet permalink -- TODO confirm)
+                    url = link.get("href")
+                    tags = tagging.classify_and_tag(url, netloc, internal_site)
+                    tags.add('embedded media')
+                    tags.add('tweet')
+                    tagged_urls.append(tagging.make_tagged_url(url, url, tags))  # url is passed as both the first and second argument (presumably URL and title -- confirm)
+
+    return tagged_urls  # stays an empty list when no qualifying anchor was found
+
+
+
+