1. Frederic De Groef
  2. csxj-crawler

Commits

Frederic De Groef  committed 68aa021

[lavenir] Add support for flowplayer videos (for both the old and the current generated HTML)

  • Participants
  • Parent commits 8f13329
  • Branches default

Comments (0)

Files changed (6)

File csxj/datasources/lavenir.py

View file
             tags |= set(['embedded', 'video'])
             tagged_urls.append(make_tagged_url(url, title, tags))
 
+    scripts = video_div_hxs.select('.//script')
+    for script_hxs in scripts:
+        script_src = script_hxs.select('./@src').extract()
+        if script_src and 'flowplayer' in script_src[0]:
+            title = constants.EMBEDDED_VIDEO_TITLE
+            url = constants.EMBEDDED_VIDEO_URL
+            tags = set(['external', 'embedded', 'video', 'flowplayer', constants.UNFINISHED_TAG])
+            tagged_urls.append(make_tagged_url(url, title, tags))
+
     if tagged_urls:
         return tagged_urls
     else:
         "http://www.lavenir.net/article/detail.aspx?articleid=DMF20130224_00273104",
         "http://www.lavenir.net/article/detail.aspx?articleid=DMF20130224_005",
         "http://www.lavenir.net/article/detail.aspx?articleid=DMF20120831_00198968",  # highlighted videos + ghost links
+        "http://www.lavenir.net/article/detail.aspx?articleid=DMF20120226_00122978"
         ]
 
     urls_new_style = [
         "http://www.lavenir.net/sports/cnt/DMF20120719_00183602"  # weird storify (no <noscript>)
     ]
 
-    for url in urls_new_style[-1:]:
+    urls_before_june = [
+        "/Volumes/Curst/csxj/tasks/lavenir_backwards_compat/jsondb/lavenir/2012-02-27/01.13.09/raw_data/39.html",
+        "/Volumes/Curst/csxj/tasks/lavenir_backwards_compat/jsondb/lavenir/2012-02-27/13.05.12/raw_data/7.html",
+    ]
+
+
+
+    for url in urls[-1:]:
         article, html_content = extract_article_data(url)
         if article:
             print(article.title)
             print("°" * 80)
 
             import os
-            #generate_unittest("new_links_yet_another_storify", "lavenir", dict(urls=article.links), html_content, url, os.path.join(os.path.dirname(__file__), "../../tests/datasources/test_data/lavenir"), True)
+            #generate_unittest("links_flowplayer", "lavenir", dict(urls=article.links), html_content, url, os.path.join(os.path.dirname(__file__), "../../tests/datasources/test_data/lavenir"), True)
 
         else:
             print('page was not recognized as an article')
 
 
+
 if __name__ == "__main__":
     import sys
     if "--test" in sys.argv:

File csxj/datasources/parser_tools/constants.py

View file
 PAYWALLED_CONTENT = u"__PAYWALLED__"
 RENDERED_STORIFY_TITLE = u"__RENDERED_STORIFY__"
 RENDERED_TWEET_TITLE = u"__RENDERED_TWEET__"
+EMBEDDED_VIDEO_TITLE = u"__EMBEDDED_VIDEO_TITLE__"
+EMBEDDED_VIDEO_URL = u"__EMBEDDED_VIDEO_URL__"

File tests/datasources/test_data/lavenir/index.json

View file
     [
       "http://www.lavenir.net/sports/cnt/DMF20120719_00183602", 
       "new_links_yet_another_storify.html"
+    ], 
+    [
+      "csxjdb://lavenir/2012-02-27/13.05.12/raw_data/7.html", 
+      "links_old_flowplayer.html"
+    ], 
+    [
+      "http://www.lavenir.net/article/detail.aspx?articleid=DMF20120226_00122978", 
+      "links_flowplayer.html"
     ]
   ]
 }