Petar Marić committed 722de5c

Programmatically blacklisted all talk URLs with externally hosted downloads

Comments (0)

Files changed (3)

metaTED/crawler/get_downloadable_talks.py

 import logging
-from metaTED.crawler.get_talk_info import get_talk_info, NoDownloadsFound
+from metaTED.crawler.get_talk_info import get_talk_info, ExternallyHostedDownloads, NoDownloadsFound
 from metaTED.crawler.get_talks_urls import get_talks_urls
 
 
                     num_urls
                 )
             downloadable_talks.append(get_talk_info(talk_url))
+        except ExternallyHostedDownloads, e:
+            logging.info(
+                "Downloads for '%s' are not hosted by TED, skipping",
+                talk_url
+            )
         except NoDownloadsFound, e:
             logging.error("No downloads for '%s', skipping", talk_url)
         except Exception, e:

metaTED/crawler/get_talk_info.py

     pass
 
 
+class ExternallyHostedDownloads(Exception):  # Raised with the talk URL when the page has an 'external_player' div, i.e. its downloads are not hosted by TED.
+    pass
+
+
 def _clean_up_file_name(file_name, replace_first_colon_with_dash=False):
     if replace_first_colon_with_dash:
         # Turns 'Barry Schuler: Genomics' into 'Barry Schuler - Genomics'
         True
     )
     
+    if soup.find('div', 'external_player'): # Downloads not hosted by TED!
+        raise ExternallyHostedDownloads(talk_url)
+    
     # Try to find download URLs for all qualities
     qualities_found = []
     qualities_missing = []

metaTED/crawler/get_talks_urls.py

     # No downloads
     'http://www.ted.com/talks/rokia_traore_sings_m_bifo.html',
     'http://www.ted.com/talks/rokia_traore_sings_kounandi.html',
-    'http://www.ted.com/talks/cat_laine_engineering_a_better_life_for_all.html',
-    'http://www.ted.com/talks/michael_sandel_what_s_the_right_thing_to_do.html',
-    'http://www.ted.com/talks/steve_jobs_how_to_live_before_you_die.html',
-    'http://www.ted.com/talks/edwidge_danticat_stories_of_haiti.html',
-    'http://www.ted.com/talks/matt_weinstein_what_bernie_madoff_couldn_t_steal_from_me.html',
-    'http://www.ted.com/talks/robert_sapolsky_the_uniqueness_of_humans.html',
-    'http://www.ted.com/talks/randy_pausch_really_achieving_your_childhood_dreams.html',
-    
-    # Internal server error + invalid HTML
-    'http://www.ted.com/talks/ex_moonie_diane_benscoter_how_cults_think.html',
-    'http://www.ted.com/talks/yann_arthus_bertrand_captures_fragile_earth_in_wide_angle.html',
-    'http://www.ted.com/talks/frank_gehry_as_a_young_rebel.html',
 ]
 
 
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.