Juliette De Maeyer committed 76a57d5

[sudinfo] trying to figure out if the "Medias" box contains nothing but images

Comments (0)

Files changed (1)

csxj/datasources/sudinfo.py

 
             all_tagged_urls.append(make_tagged_url(url, title, tags))
 
-    media_links = hxs.select("//div[@id='picture']/descendant::div[@class='bloc-01 pf_article']//a")
+    media_links = hxs.select("//div[@id='picture']/descendant::div[@class='wrappAllMedia']/div")
 
-    if media_links:
-        for i, item in enumerate(media_links):
-            item_id = item.select("./@href").extract()
-            url = item_id
-            title = u"EMBEDDED MEDIA {0}".format(i)
-            tags = set(['media', 'embedded'])
-            all_tagged_urls.append(make_tagged_url(url, title, tags))
+    
+    for i, item in enumerate(media_links):
+        if item.select('./img'):
+            pass
+        else:
+            raise ValueError("The media box contains something other than an image. Update your parser")
+            # item_id = item.select("./@class").extract()
+            # url = item_id[0]
+            # title = u"EMBEDDED MEDIA {0}".format(i)
+            # tags = set(['media', 'embedded'])
+            # all_tagged_urls.append(make_tagged_url(url, title, tags))
 
     return all_tagged_urls
 
 def test_sample_data():
     filepath = '../../sample_data/sudinfo/sudinfo_internal_links_in_sidebar_box.html'
     filepath = '../../sample_data/sudinfo/sudinfo_video.html'
+    filepath = '../../sample_data/sudinfo/embedded_photos.html'
+
     with open(filepath) as f:
         article_data, raw = extract_article_data(f)
         # article_data.print_summary()
 
-        # for link in article_data.links:
-        #     print link.title
-        #     print link.tags
+        for link in article_data.links:
+            print link
+            
 
         # print article_data.intro
         # print article_data.content
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.