Commits

Petar Marić committed ddc88cb

Added metaTED.crawler.get_downloadable_talks module

  • Participants
  • Parent commits 85e0aaf

Comments (0)

Files changed (2)

metaTED/crawler/get_downloadable_talks.py

+import logging
+from metaTED.crawler.get_talk_info import get_talk_info, NoDownloadsFound
+
+
+_PAGINATE_BY = 20
+
+
+def get_downloadable_talks(talks_urls):
+    num_urls = len(talks_urls)
+    downloadable_talks = []
+    for index, talk_url in enumerate(talks_urls):
+        try:
+            if index % _PAGINATE_BY == 0:
+                logging.info(
+                    "Getting download information on %d of %d talks...",
+                    index+1,
+                    num_urls
+                )
+            downloadable_talks.append(get_talk_info(talk_url))
+        except NoDownloadsFound, e:
+            logging.error("No downloads for '%s', skipping", talk_url)
+    logging.info(
+        "Found %d downloadable talks in total",
+        len(downloadable_talks)
+    )
+    return downloadable_talks

metaTED/metalink.py

 from metaTED.crawler.get_talk_info import AVAILABLE_VIDEO_QUALITIES
 
 
-def _get_downloads(talk_infos, quality, group_by=None):
+def _get_downloads(downloadable_talks, quality, group_by=None):
     downloads = []
-    for talk_info in talk_infos:
+    for talk_info in downloadable_talks:
         quality_info = talk_info['qualities'][quality]
         
         # Calculate full file path
     return "TED-talks%s-in-%s-quality.metalink" % (group_part, quality)
 
 
-def _get_group_downloads_by(talk_infos):
+def _get_group_downloads_by(downloadable_talks):
     # Also generate metalinks with no grouped downloads
     groups = [None]
     
     # Guess possible groupings from talk_info metadata
-    groups.extend(talk_infos[0].keys())
+    groups.extend(downloadable_talks[0].keys())
     groups.remove('qualities')
     
     logging.debug("Downloads can be grouped by '%s'", groups)
     return groups
 
 
-def generate_metalinks(talk_infos):
+def generate_metalinks(downloadable_talks):
     refresh_date = formatdate()
     first_published_on = cached_storage.get('first_published_on')
     if first_published_on is None:
     env = Environment(loader=PackageLoader('metaTED'))
     template = env.get_template('template.metalink')
 
-    for group_by in _get_group_downloads_by(talk_infos):
+    for group_by in _get_group_downloads_by(downloadable_talks):
         for quality in AVAILABLE_VIDEO_QUALITIES.keys():
             metalink_file_name = _get_metalink_file_name(quality, group_by)
             logging.debug("Generating '%s' metalink...", metalink_file_name)
                 'refresh_date': refresh_date,
                 'quality': quality,
                 'group_by': group_by,
-                'talks': _get_downloads(talk_infos, quality, group_by)
+                'talks': _get_downloads(downloadable_talks, quality, group_by)
             }).dump(metalink_file_name, encoding='utf-8')
             logging.info("Generated '%s' metalink", metalink_file_name)