Commits

Juliette De Maeyer  committed 5d94036

[sudpresse] added embedded media (iframe) extraction

  • Participants
  • Parent commits 5f2188f

Comments (0)

Files changed (1)

File csxj/datasources/sudpresse.py

     else:
         return []
 
+def extract_embedded_media(article):
+    tagged_urls = list()
+    # extract any iframe from maincontent
+    iframes = article.findAll("iframe")
+    for media in iframes:
+        url = media.get('src')
+        tags = classify_and_tag(url, SUDPRESSE_OWN_NETLOC, SUDPRESSE_INTERNAL_SITES)
+        tags.add("embedded media")
+        tagged_url = make_tagged_url(url, url, tags)
+        tagged_urls.append(tagged_url)
+
+
+    return tagged_urls
 
 def is_page_error_404(soup):
 
         content, content_links = extract_content_and_links(article)
 
         associated_links = extract_associated_links(article)
+        embedded_media = extract_embedded_media(article)
 
-        all_links = intro_links + content_links + associated_links
+        all_links = intro_links + content_links + associated_links + embedded_media
 
         updated_tagged_urls = update_tagged_urls(all_links, rossel_utils.SUDINFO_SAME_OWNER)
 
     filepath = "../../sample_data/sudpresse/sudpresse_erreur1.html"
     filepath = "../../sample_data/sudpresse/sudpresse_same_owner.html"
     filepath = "../../sample_data/sudpresse/sudpresse_associated_link_error.html"
+    filepath = "../../sample_data/sudpresse/sudpresse_live_article.html"
+    filepath = "../../sample_data/sudpresse/sudpresse_erreur1.html"
     with open(filepath) as f:
         article_data, raw = extract_article_data(f)
 
-        # for link in article_data.links:
-        #     print link.URL
-        #     print link.title
-        #     print link.tags
-        #     print "**********************"
+        for link in article_data.links:
+            print link.URL
+            print link.title
+            print link.tags
+            print "**********************"
 
 
 def download_one_article():