Commits

Frederic De Groef committed d317618

[lavenir] ignore animated gifs, support for ooyala videos

  • Participants
  • Parent commits 75dbd21

Comments (0)

Files changed (5)

File csxj/datasources/lavenir.py

                     url = constants.EMBEDDED_VIDEO_URL
                     tags = set(['external', 'embedded', 'video', 'jwplayer', constants.UNFINISHED_TAG])
                     tagged_urls.append(make_tagged_url(url, title, tags))
+                elif 'ooyala' in script_src[0]:
+                    title = constants.EMBEDDED_VIDEO_TITLE
+                    url = constants.EMBEDDED_VIDEO_URL
+                    tags = set(['external', 'embedded', 'video', 'ooyala', constants.UNFINISHED_TAG])
+                    tagged_urls.append(make_tagged_url(url, title, tags))
                 else:
                     raise ValueError("Found a <script> for an embedded video, for an unknown type")
 
     else:
         url = constants.NO_URL
 
-    title = link_hxs.select("./text()").extract()
+    title = link_hxs.select(".//text()").extract()
     if title:
         title = title[0].strip()
     else:
         "http://www.lavenir.net/sports/cnt/DMF20121007_007",  # jwplayer
         "http://www.lavenir.net/sports/cnt/DMF20121213_026",  # <embed> with eitb.com video
         "http://www.lavenir.net/sports/cnt/DMF20130103_025",  # picture instead of video
+        "http://www.lavenir.net/sports/cnt/DMF20130116_00256248",  # animated gif
+        "http://www.lavenir.net/sports/cnt/DMF20130123_044",  # ooyala videos
     ]
 
     urls_before_june = [
     ]
 
 
-
     for url in urls_new_style[-1:]:
         article, html_content = extract_article_data(url)
         if article:
             print("°" * 80)
 
             import os
-            #generate_unittest("links_new_ignore_images_in_video_div", "lavenir", dict(urls=article.links), html_content, url, os.path.join(os.path.dirname(__file__), "../../tests/datasources/test_data/lavenir"), True)
+            #generate_unittest("links_new_ooyala_videos", "lavenir", dict(urls=article.links), html_content, url, os.path.join(os.path.dirname(__file__), "../../tests/datasources/test_data/lavenir"), True)
 
         else:
             print('page was not recognized as an article')

File tests/datasources/test_data/lavenir/index.json

     [
       "http://www.lavenir.net/sports/cnt/DMF20130103_025", 
       "links_new_ignore_images_in_video_div.html"
+    ], 
+    [
+      "http://www.lavenir.net/sports/cnt/DMF20130116_00256248", 
+      "links_new_ignore_animated_gifs_in_video_div.html"
+    ], 
+    [
+      "http://www.lavenir.net/sports/cnt/DMF20130123_044", 
+      "links_new_ooyala_videos.html"
     ]
   ]
 }