Commits

Juliette De Maeyer  committed abb3708

[septsursept, sudpresse, sudinfo] tagging conventions : "embedded" instead of "embedded media"

  • Participants
  • Parent commits 5d94036

Comments (0)

Files changed (3)

File csxj/datasources/septsursept.py

             url = section.find('a').get('href')
             title = section.find('a').contents
             tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
-            tags.add('embedded media')
+            tags.add('embedded')
             tagged_urls.append(tagging.make_tagged_url(url, title, tags))
 
         elif 'video' in section.attrs['class']:
                 url = iframe.get("src")
                 if url :
                     tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
-                    tags.add('embedded media')
+                    tags.add('embedded')
                     tagged_urls.append(tagging.make_tagged_url(url, url, tags))
                 else:
                     raise ValueError("There seems to be an iframe but we could not find a link. Please update parser.")
                 url = embedded_stuff.get("src")
                 if url :
                     tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
-                    tags.add('embedded media')
+                    tags.add('embedded')
                     tagged_urls.append(tagging.make_tagged_url(url, url, tags))
                 else:
                     raise ValueError("There seems to be an embedded video but we could not find a link. Please update parser.")
                         if link.get("data-datetime"):
                             url = link.get("href")
                             tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
-                            tags.add('embedded media')
+                            tags.add('embedded')
                             tags.add('tweet')
                             tagged_urls.append(tagging.make_tagged_url(url, url, tags))
 
             for x in embedded_container:
                 url = x.get("src")
                 tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
-                tags.add('embedded media')
+                tags.add('embedded')
                 tags.add ('in text')
                 tagged_urls.append(tagging.make_tagged_url(url, url, tags))
 

File csxj/datasources/sudinfo.py

         if embedded_frame:
             target_url = embedded_frame.select("./@src").extract()[0]
             tags = classify_and_tag(target_url, SUDINFO_OWN_NETLOC, SUDINFO_INTERNAL_SITES)
-            tags.update(['embedded document', 'iframe'])
+            tags.update(['embedded', 'iframe'])
             return make_tagged_url(target_url, title, tags)
         else:
             return None
         
         # liens 'same owner'
         u"http://www.sudinfo.be/551998/article/fun/buzz/2012-10-04/schocking-in-brussles-des-hommes-nus-miment-l-acte-sexuel-au-palais-de-justice-a",
-        u"http://www.sudinfo.be/535396/article/culture/musique/2012-09-27/mylene-farmer-donnera-deux-concerts-en-belgique-l’an-prochain"
+        u"http://www.sudinfo.be/535396/article/culture/musique/2012-09-27/mylene-farmer-donnera-deux-concerts-en-belgique-l’an-prochain",
+
+        # embedded coveritlive + standard widget
+        u"http://www.sudinfo.be/306989/article/sports/foot-belge/standard/2012-01-08/standard-chattez-en-exclusivite-avec-sebastien-pocognoli-ce-lundi-des-13h30",
         
+        # embedded scribble
+        u"http://www.sudinfo.be/655859/article/sports/foot-belge/anderlecht/2013-02-03/suivez-le-super-sunday-en-live-genk-ecrase-bruges-4-1-le-standard-en-visi"
     ]
 
     article, html = extract_article_data(urls[-1])

File csxj/datasources/sudpresse.py

     for media in iframes:
         url = media.get('src')
         tags = classify_and_tag(url, SUDPRESSE_OWN_NETLOC, SUDPRESSE_INTERNAL_SITES)
-        tags.add("embedded media")
+        tags.add("embedded")
         tagged_url = make_tagged_url(url, url, tags)
         tagged_urls.append(tagged_url)
 
-
     return tagged_urls
 
 def is_page_error_404(soup):