1. Frederic De Groef
  2. csxj-crawler

Commits

Frederic De Groef  committed 7e98998

[sudinfo] extract kewego links in the article media gallery

  • Participants
  • Parent commits 658d7f9
  • Branches default

Comments (0)

Files changed (7)

File csxj/datasources/parser_tools/hxs_media_utils.py

View file
  • Ignore whitespace
         else:
             raise ValueError("The data attribute of that youtube object does not look like a url. Update the parser.")
     else:
-        raise ValueError("Could not find a source url for this youtube object" )
+        raise ValueError("Could not find a source url for this youtube object" )
+
+
+def extract_url_from_kplayer_object(hxs_kplayer_object):  # returns "<data>?<flashVars value>" or raises ValueError
+    """ Extracts the source url from an embedded kplayer object (parsed By scrapy HtmlXpathSelector)"""
+    if hxs_kplayer_object.select('./@data'):  # proceed only when the object carries a 'data' attribute
+        data = hxs_kplayer_object.select('./@data').extract()[0]  # first extracted 'data' value
+        if data.startswith('http'):  # only values starting with 'http' are accepted as a url
+            params = hxs_kplayer_object.select("./param [@name='flashVars']/@value").extract()  # 'value' of the flashVars <param> child, if any
+            if params:
+                url = "{0}?{1}".format(data, params[0])  # flashVars value is appended to the data url after a '?'
+                return url
+            else:
+                raise ValueError("Params could not be extracted from that kplayer object.")
+        else:
+            raise ValueError("The data attribute of that kplayer object does not look like a url. Update the parser.")
+    else:
+        raise ValueError("This kplayer object has no 'data' attribute.")

File csxj/datasources/sudinfo.py

View file
  • Ignore whitespace
             tags |= set(['youtube', 'embedded', 'video'])
             title = parser_constants.NO_TITLE
             all_tagged_urls.append(make_tagged_url(url, title, tags))
+        elif item.select(".//div[contains(@class, 'emvideo-kewego')]"):
+            kplayer_div = item.select(".//div[contains(@class, 'emvideo-kewego')]")
+            kplayer_object = kplayer_div.select("./object")
+            url = hxs_media_utils.extract_url_from_kplayer_object(kplayer_object)
+            tags = classify_and_tag(url, SUDINFO_OWN_NETLOC, SUDINFO_INTERNAL_SITES)
+            tags |= set(['kewego', 'embedded', 'video'])
+            title = parser_constants.NO_TITLE
+            all_tagged_urls.append(make_tagged_url(url, title, tags))
+        elif not item.select("./div/text()"):
+            pass # empty divs are lame
         else:
+
             raise ValueError("The media box contains something other than an image or a youtube video. Update your parser")
 
     return all_tagged_urls
         all_links = intro_links + content_links + associated_links
         updated_tagged_urls = update_tagged_urls(all_links, rossel_utils.SUDINFO_SAME_OWNER)
 
-        import os
-        generate_unittest("links_embedded_youtube", "sudinfo", dict(urls=updated_tagged_urls), html_content, "csxjdb://sudinfo/2012-06-22/17.05.07/raw_data/1.html", os.path.join(os.path.dirname(__file__), "../../tests/datasources/test_data/sudinfo"), True)
+        # import os
+        # generate_unittest("links_embedded_kewego_gallery", "sudinfo", dict(urls=updated_tagged_urls), html_content, "csxjdb://sudinfo/2012-03-26/13.05.07/raw_data/7.html", os.path.join(os.path.dirname(__file__), "../../tests/datasources/test_data/sudinfo"), True)
 
         return (ArticleData(source, title, pub_date, pub_time, fetched_datetime,
                             updated_tagged_urls,
 
     fpaths = [
         "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-06-22/17.05.07/raw_data/1.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-06-26/16.05.08/raw_data/2.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-06-26/14.05.07/raw_data/2.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-06-26/15.05.09/raw_data/4.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-06-26/13.05.09/raw_data/1.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-06-26/11.05.06/raw_data/6.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-03-02/14.05.11/raw_data/1.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-03-02/14.05.11/raw_data/3.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-03-02/12.05.08/raw_data/0.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-03-09/14.05.07/raw_data/3.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-12-20/16.05.06/raw_data/3.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-03-24/16.05.06/raw_data/0.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-03-24/16.05.06/raw_data/1.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-03-25/19.05.12/raw_data/2.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-03-26/13.05.07/raw_data/7.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-03-26/01.05.07/raw_data/4.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-03-26/11.05.07/raw_data/7.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-03-01/17.05.07/raw_data/6.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-03-20/15.05.07/raw_data/9.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-06-12/10.05.08/raw_data/5.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-03-23/13.05.25/raw_data/0.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-03-23/15.05.07/raw_data/0.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-03-23/20.05.06/raw_data/13.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-03-23/12.05.08/raw_data/3.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-03-23/12.05.08/raw_data/4.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-03-23/11.05.06/raw_data/3.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-04-06/04.05.07/raw_data/0.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-04-06/10.05.07/raw_data/3.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-04-05/22.05.07/raw_data/0.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-08-13/13.05.07/raw_data/2.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-08-13/06.05.07/raw_data/1.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-04-03/21.05.09/raw_data/7.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-04-28/14.05.09/raw_data/1.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-04-28/19.05.10/raw_data/32.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-04-28/15.05.08/raw_data/1.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-08-02/17.05.07/raw_data/0.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-04-25/20.05.06/raw_data/1.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-03-06/20.05.07/raw_data/1.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-03-06/13.05.10/raw_data/1.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-04-27/18.05.09/raw_data/5.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-04-27/17.05.07/raw_data/2.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-04-27/16.05.07/raw_data/2.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-05-25/13.05.07/raw_data/5.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-05-25/19.05.08/raw_data/0.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-05-25/20.05.10/raw_data/0.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-05-25/17.05.08/raw_data/0.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-05-25/18.05.07/raw_data/4.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-09-06/14.05.06/raw_data/2.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-09-06/18.05.07/raw_data/0.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-09-07/18.05.07/raw_data/6.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-06-01/10.05.07/raw_data/5.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-09-27/17.05.13/raw_data/10.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-09-27/16.05.07/raw_data/2.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-09-27/11.05.07/raw_data/3.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-05-29/11.05.06/raw_data/44.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-09-22/14.05.07/raw_data/3.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-05-21/20.05.07/raw_data/2.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-06-07/20.05.06/raw_data/0.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-06-07/23.05.06/raw_data/0.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-06-07/13.05.08/raw_data/2.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-05-04/12.05.07/raw_data/1.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-06-08/19.05.06/raw_data/72.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-03-05/10.05.08/raw_data/2.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-12-21/15.05.13/raw_data/3.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-11-22/12.05.07/raw_data/7.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-11-22/18.05.05/raw_data/3.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-05-02/23.05.08/raw_data/1.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-05-03/19.05.07/raw_data/0.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-08-03/18.05.07/raw_data/0.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-11-29/19.05.06/raw_data/3.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-11-27/16.05.06/raw_data/4.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-04-24/15.05.10/raw_data/5.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-04-24/17.05.07/raw_data/0.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-04-24/16.05.08/raw_data/5.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-03-19/12.05.20/raw_data/5.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-03-19/17.05.07/raw_data/10.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-03-19/16.05.07/raw_data/2.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-03-19/16.05.07/raw_data/13.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-03-19/19.05.06/raw_data/5.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-08-21/17.05.07/raw_data/0.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-08-21/16.05.07/raw_data/0.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-03-30/17.05.09/raw_data/6.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-06-10/11.05.06/raw_data/0.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-03-13/15.05.07/raw_data/3.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-03-13/17.05.09/raw_data/14.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-03-13/18.05.07/raw_data/1.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-10-01/15.05.11/raw_data/3.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-03-15/07.05.09/raw_data/2.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-03-15/10.05.06/raw_data/3.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-03-16/15.05.07/raw_data/1.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-03-16/15.05.07/raw_data/10.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-03-16/11.05.11/raw_data/1.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-04-11/16.05.07/raw_data/2.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-04-13/15.05.13/raw_data/1.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-12-13/17.05.06/raw_data/0.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-12-13/17.05.06/raw_data/5.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-12-13/18.05.06/raw_data/4.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-04-15/12.05.08/raw_data/1.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-11-24/10.05.06/raw_data/3.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-04-17/13.05.20/raw_data/7.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-04-19/19.05.07/raw_data/0.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-08-28/22.05.12/raw_data/4.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-08-28/08.05.08/raw_data/1.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-05-13/15.05.07/raw_data/1.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-05-13/12.05.08/raw_data/1.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-06-13/09.05.07/raw_data/0.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-06-13/10.05.07/raw_data/0.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-05-11/09.05.07/raw_data/3.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-05-11/09.05.07/raw_data/7.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-06-11/15.05.06/raw_data/6.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-08-07/16.05.08/raw_data/5.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-07-03/12.05.07/raw_data/1.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2013-01-07/01.05.06/raw_data/0.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2013-01-07/07.05.06/raw_data/0.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2013-01-07/08.05.10/raw_data/3.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2013-01-07/14.05.06/raw_data/4.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2013-01-07/10.05.06/raw_data/0.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2013-01-07/10.05.06/raw_data/2.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-02-28/15.05.10/raw_data/5.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-02-28/10.05.10/raw_data/13.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-02-28/14.05.07/raw_data/1.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-02-28/12.05.08/raw_data/1.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-02-28/18.05.07/raw_data/2.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-02-27/00.34.42/raw_data/54.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-02-27/11.05.07/raw_data/1.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-08-23/10.05.06/raw_data/3.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-05-18/11.05.08/raw_data/2.html",
+"/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-07-09/16.05.07/raw_data/0.html",
     ]
 
-    for fpath in fpaths:
+    # fpaths = [
+    # "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-06-26/16.05.08/raw_data/2.html",
+    # "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudinfo/2012-03-26/13.05.07/raw_data/7.html",]
+
+    for i, fpath in enumerate(fpaths):
+        print "*" * 20, i, fpath
         with open(fpath) as f:
             article, html = extract_article_data(f)
-            print article.content
-            for link in article.links:
-                print link
+            print_taggedURLs(article.links, 50)
 
 
 if __name__ == '__main__':

File tests/datasources/parser_tools/test_hxs_media_utils.py

View file
  • Ignore whitespace
 
         hxs = HtmlXPathSelector(text=raw_html)
         youtube_object = hxs.select("//div [@id='youtube-media']/object")
-        hxs_media_utils.extract_url_from_youtube_object(youtube_object)
+        hxs_media_utils.extract_url_from_youtube_object(youtube_object)
+
+
+
+class TestKewegoURLExtraction(object):  # test cases for hxs_media_utils.extract_url_from_kplayer_object()
+    def test_from_flash_object(self):  # happy path: 'data' attribute and flashVars param both present
+        """hxs_media_utils.extract_url_from_kplayer_object() can extract the source url from the object parameters"""
+        raw_html = """
+        <html>
+        <body>
+        <div id="kewego-media">
+        <object width="300" height="200" type="application/x-shockwave-flash" id="054c411daa8s" data="http://sll.kewego.com/swf/p3/epix.swf">
+          <param name="flashVars" value="language_code=fr&amp;playerKey=7b7e2d7a9682&amp;skinKey=a07930e183e6&amp;sig=054c411daa8s&amp;autostart=0&amp;advertise=true">
+          <param name="movie" value="http://sll.kewego.com/swf/p3/epix.swf">
+          <param name="allowFullScreen" value="true">
+          <param name="allowscriptaccess" value="always">
+          <param name="wmode" value="opaque">
+
+          <video width="300" height="200" preload="none" poster="http://api.kewego.com/video/getHTML5Thumbnail/?playerKey=7b7e2d7a9682&amp;sig=054c411daa8s" controls="controls">&nbsp;</video>
+          <script src="//sll.kewego.com/embed/assets/kplayer-standalone.js"></script>
+          <script defer="defer">kitd.html5loader("flash_epix_054c411daa8s");</script>
+        </object>
+        </div>
+        </body>
+        </html>
+        """
+        hxs = HtmlXPathSelector(text=raw_html)  # build a selector over the fixture markup
+        kewego_object = hxs.select("//div [@id='kewego-media']/object")  # the <object> inside the 'kewego-media' div
+        expected_url = "http://sll.kewego.com/swf/p3/epix.swf?language_code=fr&playerKey=7b7e2d7a9682&skinKey=a07930e183e6&sig=054c411daa8s&autostart=0&advertise=true"  # data url + '?' + flashVars value, with plain '&' where the fixture has '&amp;'
+        url = hxs_media_utils.extract_url_from_kplayer_object(kewego_object)
+        eq_(expected_url, url)
+
+    @raises(ValueError)  # presumably nose.tools.raises (import not visible here): passes only if ValueError is raised
+    def test_no_params(self):  # fixture has no flashVars <param>
+        """hxs_media_utils.extract_url_from_kplayer_object() raises ValueError when the 'flashVars' child parameter is missing """
+        raw_html = """
+        <html>
+        <body>
+        <div id="kewego-media">
+        <object width="300" height="200" type="application/x-shockwave-flash" id="054c411daa8s" data="http://sll.kewego.com/swf/p3/epix.swf">
+          <param name="movie" value="http://sll.kewego.com/swf/p3/epix.swf">
+          <param name="allowFullScreen" value="true">
+          <param name="allowscriptaccess" value="always">
+          <param name="wmode" value="opaque">
+
+          <video width="300" height="200" preload="none" poster="http://api.kewego.com/video/getHTML5Thumbnail/?playerKey=7b7e2d7a9682&amp;sig=054c411daa8s" controls="controls">&nbsp;</video>
+          <script src="//sll.kewego.com/embed/assets/kplayer-standalone.js"></script>
+          <script defer="defer">kitd.html5loader("flash_epix_054c411daa8s");</script>
+        </object>
+        </div>
+        </body>
+        </html>
+        """
+        hxs = HtmlXPathSelector(text=raw_html)
+        kewego_object = hxs.select("//div [@id='kewego-media']/object")
+        hxs_media_utils.extract_url_from_kplayer_object(kewego_object)  # expected to raise
+
+    @raises(ValueError)
+    def test_missing_data_attr(self):  # fixture <object> has no 'data' attribute
+        """hxs_media_utils.extract_url_from_kplayer_object() raises ValueError when the 'data' attribute is missing"""
+        raw_html = """
+        <html>
+        <body>
+        <div id="kewego-media">
+        <object width="300" height="200" type="application/x-shockwave-flash" id="054c411daa8s" >
+          <param name="flashVars" value="language_code=fr&amp;playerKey=7b7e2d7a9682&amp;skinKey=a07930e183e6&amp;sig=054c411daa8s&amp;autostart=0&amp;advertise=true">
+          <param name="movie" value="http://sll.kewego.com/swf/p3/epix.swf">
+          <param name="allowFullScreen" value="true">
+          <param name="allowscriptaccess" value="always">
+          <param name="wmode" value="opaque">
+
+          <video width="300" height="200" preload="none" poster="http://api.kewego.com/video/getHTML5Thumbnail/?playerKey=7b7e2d7a9682&amp;sig=054c411daa8s" controls="controls">&nbsp;</video>
+          <script src="//sll.kewego.com/embed/assets/kplayer-standalone.js"></script>
+          <script defer="defer">kitd.html5loader("flash_epix_054c411daa8s");</script>
+        </object>
+        </div>
+        </body>
+        </html>
+        """
+        hxs = HtmlXPathSelector(text=raw_html)
+        kewego_object = hxs.select("//div [@id='kewego-media']/object")
+        hxs_media_utils.extract_url_from_kplayer_object(kewego_object)  # expected to raise
+
+    @raises(ValueError)
+    def test_data_attr_not_url(self):  # fixture 'data' attribute is "HELLO", which does not start with 'http'
+        """hxs_media_utils.extract_url_from_kplayer_object() raises ValueError when the 'data' attribute does not look like a URL"""
+        raw_html = """
+        <html>
+        <body>
+        <div id="kewego-media">
+        <object width="300" height="200" type="application/x-shockwave-flash" id="054c411daa8s" data="HELLO">
+          <param name="flashVars" value="language_code=fr&amp;playerKey=7b7e2d7a9682&amp;skinKey=a07930e183e6&amp;sig=054c411daa8s&amp;autostart=0&amp;advertise=true">
+          <param name="movie" value="http://sll.kewego.com/swf/p3/epix.swf">
+          <param name="allowFullScreen" value="true">
+          <param name="allowscriptaccess" value="always">
+          <param name="wmode" value="opaque">
+
+          <video width="300" height="200" preload="none" poster="http://api.kewego.com/video/getHTML5Thumbnail/?playerKey=7b7e2d7a9682&amp;sig=054c411daa8s" controls="controls">&nbsp;</video>
+          <script src="//sll.kewego.com/embed/assets/kplayer-standalone.js"></script>
+          <script defer="defer">kitd.html5loader("flash_epix_054c411daa8s");</script>
+        </object>
+        </div>
+        </body>
+        </html>
+        """
+        hxs = HtmlXPathSelector(text=raw_html)
+        kewego_object = hxs.select("//div [@id='kewego-media']/object")
+        url = hxs_media_utils.extract_url_from_kplayer_object(kewego_object)  # return value unused; the call is expected to raise

File tests/datasources/test_data/sudinfo/index.json

View file
  • Ignore whitespace
     [
       "csxjdb://sudinfo/2012-06-22/17.05.07/raw_data/1.html", 
       "links_embedded_youtube.html"
+    ], 
+    [
+      "csxjdb://sudinfo/2012-06-26/16.05.08/raw_data/2.html", 
+      "links_empty_gallery_div.html"
+    ], 
+    [
+      "csxjdb://sudinfo/2012-03-26/13.05.07/raw_data/7.html", 
+      "links_embedded_kewego_gallery.html"
     ]
   ]
 }