Commits

Juliette De Maeyer committed e1640b9

fixed first error from reprocessing: url in embedded video (kewego) player

Comments (0)

Files changed (1)

csxj/datasources/dhnet.py

                     title = u"__NO_TITLE__"
 
                 kplayer = div.find('div', {'class':'containerKplayer'})
-                kplayer_infos = kplayer.find('video')
-                url = kplayer_infos.get('data-src')
 
-                all_tags = classify_and_tag(url, DHNET_NETLOC, DHNET_INTERNAL_SITES)
-                tagged_urls.append(make_tagged_url(url, title, all_tags | set(['video', 'embedded', 'kplayer'])))
+                # method 1
+                kplayer_flash = kplayer.find('div', {'class': 'flash_kplayer'})
+                url_part1 = kplayer_flash.object['data']
+                url_part2 = kplayer_flash.object.find('param', {'name' : 'flashVars'})['value']
+                if url_part1 is not None and url_part2 is not None:
+                    url = "%s?%s" % (url_part1, url_part2)
+                    all_tags = classify_and_tag(url, DHNET_NETLOC, DHNET_INTERNAL_SITES)
+                    tagged_urls.append(make_tagged_url(url, title, all_tags | set(['video', 'embedded', 'kplayer'])))
+                else:
+                    raise ValueError("We couldn't find an URL in the flahs player, please check")
 
             elif div.find('script'):
                 # try to detect a twitter widget
 
 
 if __name__ == "__main__":
-    import json
+    # import json
 
-    urls = [
-        "http://www.dhnet.be/infos/faits-divers/article/381082/le-fondateur-des-protheses-pip-admet-la-tromperie-devant-la-police.html",
-        "http://www.dhnet.be/sports/formule-1/article/377150/ecclestone-bientot-l-europe-n-aura-plus-que-cinq-grands-prix.html",
-        "http://www.dhnet.be/infos/belgique/article/378150/la-n-va-menera-l-opposition-a-un-gouvernement-francophone-et-taxateur.html",
-        "http://www.dhnet.be/cine-tele/divers/article/378363/sois-belge-et-poile-toi.html",
-        "http://www.dhnet.be/infos/societe/article/379508/contribuez-au-journal-des-bonnes-nouvelles.html",
-        "http://www.dhnet.be/infos/belgique/article/386721/budget-l-effort-de-2-milliards-confirme.html",
-        "http://www.dhnet.be/infos/monde/article/413062/sandy-paralyse-le-nord-est-des-etats-unis.html",
-        "http://www.dhnet.be/infos/economie/article/387149/belfius-fait-deja-le-buzz.html",
-        "http://www.dhnet.be/infos/faits-divers/article/388710/tragedie-de-sierre-toutes-nos-videos-reactions-temoignages-condoleances.html"
+    # urls = [
+    #     "http://www.dhnet.be/infos/faits-divers/article/381082/le-fondateur-des-protheses-pip-admet-la-tromperie-devant-la-police.html",
+    #     "http://www.dhnet.be/sports/formule-1/article/377150/ecclestone-bientot-l-europe-n-aura-plus-que-cinq-grands-prix.html",
+    #     "http://www.dhnet.be/infos/belgique/article/378150/la-n-va-menera-l-opposition-a-un-gouvernement-francophone-et-taxateur.html",
+    #     "http://www.dhnet.be/cine-tele/divers/article/378363/sois-belge-et-poile-toi.html",
+    #     "http://www.dhnet.be/infos/societe/article/379508/contribuez-au-journal-des-bonnes-nouvelles.html",
+    #     "http://www.dhnet.be/infos/belgique/article/386721/budget-l-effort-de-2-milliards-confirme.html",
+    #     "http://www.dhnet.be/infos/monde/article/413062/sandy-paralyse-le-nord-est-des-etats-unis.html",
+    #     "http://www.dhnet.be/infos/economie/article/387149/belfius-fait-deja-le-buzz.html",
+    #     "http://www.dhnet.be/infos/faits-divers/article/388710/tragedie-de-sierre-toutes-nos-videos-reactions-temoignages-condoleances.html"
 
-    ]
+    # ]
 
-    for url in urls[-2:-1]:
-        article, html = extract_article_data(url)
+    # for url in urls[-2:-1]:
+    #     article, html = extract_article_data(url)
 
-        if article:
-            article.print_summary()
-            print article.title
-            for tagged_url in article.links:
-                print(u"{0:100} ({1:100}) \t {2}".format(tagged_url.title, tagged_url.URL, tagged_url.tags))
+    #     if article:
+    #         article.print_summary()
+    #         print article.title
+    #         for tagged_url in article.links:
+    #             print(u"{0:100} ({1:100}) \t {2}".format(tagged_url.title, tagged_url.URL, tagged_url.tags))
 
-        print("\n"*4)
+    #     print("\n"*4)
+    url = "/Volumes/Curst/json_db_0_5/dhnet/2011-12-19/15.05.05/raw_data/1.html"
+    f = open(url,"r")
+
+    extract_article_data(f)
+
+