Commits

Petar Marić committed e3582bf

Fixed issue #4 - Added talk subtitles support, as per popular request.
Thanks to Randall Mason for the initial implementation.

Comments (0)

Files changed (6)

+Randall Mason <randall@mason.ch> for the initial talk subtitles support.

metaTED/crawler/get_downloadable_talks.py

 def get_downloadable_talks(num_workers=None):
     talks_urls = get_talks_urls()
     
-    talks_info = cached_storage.get('talks_infos', {})
-    downloadable_talks = []
-    new_talks_urls = []
-    for talk_url in talks_urls:
-        if talk_url in talks_info:
-            downloadable_talks.append(talks_info[talk_url])
-        else:
-            new_talks_urls.append(talk_url)
+    downloadable_talks = cached_storage.get('talks_infos', {})
+    new_talks_urls = [url for url in talks_urls if url not in downloadable_talks]
     
     if not new_talks_urls:
         logging.info('No new talk urls found')
                     else:
                         logging.error("Skipping '%s', reason: %s", talk_url, e)
                 else:
-                    talk_info = future.result()
-                    downloadable_talks.append(talk_info)
-                    talks_info[talk_url] = talk_info
-                    cached_storage['talks_infos'] = talks_info
+                    downloadable_talks[talk_url] = future.result()
+                    cached_storage['talks_infos'] = downloadable_talks
     
     if not downloadable_talks:
         raise NoDownloadableTalksFound('No downloadable talks found')

metaTED/crawler/get_supported_subtitle_languages.py

+import logging
+from lxml import html
+from lxml.cssselect import CSSSelector
+import re
+
+
+LANGUAGES_LIST_URL = 'http://www.ted.com/translate/languages'
+_LANGUAGES_SELECTOR = CSSSelector('div#maincontent div ul li a')
+_LANGUAGE_CODE_RE = re.compile('/translate/languages/([\w\-]+)')
+
+
+def get_supported_subtitle_languages():
+    logging.debug('Looking for supported subtitle languages...')
+    document = html.parse(LANGUAGES_LIST_URL)
+    
+    languages = {}
+    for a in _LANGUAGES_SELECTOR(document):
+        language_name = a.get('title')
+        match = _LANGUAGE_CODE_RE.search(a.get('href'))
+        if match:
+            languages[match.group(1)] = language_name
+        else:
+            logging.warning("'%s' doesn't seem to be a language", language_name)
+    
+    logging.info("Found %d supported subtitle language(s)", len(languages))
+    logging.debug("Supported subtitle languages are: %s", languages)
+    return languages

metaTED/crawler/get_talk_info.py

 
 _THEME_SELECTOR = CSSSelector('ul.relatedThemes li a')
 
+_TRANSCRIPT_LANGUAGES_SELECTOR = CSSSelector('div#transcript select option')
+
 AVAILABLE_VIDEO_QUALITIES = {
     'low': 'Low',
     'standard': 'Regular',
     logging.warning("Failed to guess the theme of '%s'", talk_url)
     return 'Unknown'
 
+def _get_subtitle_languages_codes(talk_url, document):
+    """
+    Returns a list of all subtitle language codes for a given talk URL. 
+    """
+    language_codes = [
+        opt.get('value')
+        for opt in _TRANSCRIPT_LANGUAGES_SELECTOR(document)
+    ]
+    
+    if not language_codes:
+        logging.warning("Failed to find any subtitles for '%s'", talk_url)
+    
+    return language_codes
+
 def _get_download_urls_dict(talk_url):
     """
     Returns a dictionary of all download URLs for a given talk URL, mapping 
     talk_info = {
         'author': _guess_author(talk_url, document),
         'theme': _guess_theme(talk_url, document),
+        'language-codes': _get_subtitle_languages_codes(talk_url, document),
         'qualities': qualities,
     }
     talk_info.update(
 from email.utils import formatdate
 from jinja2 import Environment, PackageLoader
 import logging
+from multiprocessing import Pool
 import os
 from . import __version__
 from .cache import cached_storage
 from .crawler.get_downloadable_talks import get_downloadable_talks
+from .crawler.get_supported_subtitle_languages import get_supported_subtitle_languages
 from .crawler.get_talk_info import AVAILABLE_VIDEO_QUALITIES
 
 
 _METALINK_BASE_URL = "http://metated.petarmaric.com/metalinks/%s"
+_SUBTITLE_URL_FMT = "http://tedsubtitles.appspot.com/getsubtitles?langcode=%s&amp;tedurl=%s"
 
 
-def _get_downloads(downloadable_talks, quality, group_by=None):
+def _get_downloads(downloadable_talks, language_code, quality, group_by):
     downloads = []
-    for talk_info in downloadable_talks:
+    for talk_url, talk_info in downloadable_talks.iteritems():
         quality_info = talk_info['qualities'][quality]
         
-        # Calculate full file path
-        file_name = quality_info['file_name']
+        # Calculate full talk file path
+        talk_file_path = quality_info['file_name']
         if group_by:
-            full_file_path = "%s/%s" % (talk_info[group_by], file_name)
-        else:
-            full_file_path = file_name
+            talk_file_path = "%s/%s" % (talk_info[group_by], talk_file_path)
         
-        downloads.append({
+        download_info = dict(talk={
             'download_url': quality_info['download_url'],
-            'full_file_path': full_file_path
+            'full_file_path': talk_file_path,
         })
+        
+        # Check if there's a subtitle for this talk and language
+        if language_code in talk_info['language-codes']:
+            download_info['subtitle'] = {
+                'download_url': _SUBTITLE_URL_FMT % (
+                    language_code,
+                    talk_url,
+                ),
+                'full_file_path': "%s.%s.srt" % (
+                    os.path.splitext(talk_file_path)[0],
+                    language_code,
+                ),
+            }
+        
+        downloads.append(download_info)
     
     return downloads
 
-def _get_metalink_file_name(quality, group_by):
-    return "TED-talks%s-in-%s-quality.metalink" % (
+def _get_metalink_file_name(language_code, quality, group_by):
+    return "TED-talks%s-in-%s-quality.%s.metalink" % (
         "-grouped-by-%s" % group_by if group_by else '',
-        quality
+        quality,
+        language_code
     )
 
-def _get_metalink_description(quality, group_by):
-    return "Download TED talks%s encoded in %s quality" % (
+def _get_metalink_description(language_name, quality, group_by):
+    return "Download TED talks with %s subtitles%s encoded in %s quality" % (
+        language_name,
         " grouped by %s" % group_by.replace('-', ' ') if group_by else '',
         quality
     )
 
 def _get_group_downloads_by(downloadable_talks):
-    # Also generate metalinks with no grouped downloads
-    groups = [None]
-    # Extract talk_info metadata
-    metadata = downloadable_talks[0].keys()
-    # Guess possible groupings from talk_info metadata
-    groups.extend(metadata)
-    # Can't group by qualities metadata
-    groups.remove('qualities')
+    groups = [None] # Also generate metalinks with no grouped downloads
+    
+    # Extract talk_info metadata and guess possible groupings from it
+    groups.extend(downloadable_talks.itervalues().next().keys())
+    
+    groups.remove('qualities') # Can't group by qualities metadata
+    groups.remove('language-codes') # Can't group by subtitle languages metadata
     
     logging.debug("Downloads can be grouped by '%s'", groups)
     return groups
 
+_metalink_worker_immutable_data_cache = {}
+def _init_metalink_worker_immutable_data_cache(*data):
+    global _metalink_worker_immutable_data_cache
+    
+    data_keys = 'output_dir, downloadable_talks, first_published_on, refresh_date'.split(', ')
+    _metalink_worker_immutable_data_cache = dict(zip(data_keys, data))
+    
+    # Prepare the template upfront, because it can be reused by the same worker
+    # process for multiple metalinks
+    env = Environment(loader=PackageLoader('metaTED'))
+    _metalink_worker_immutable_data_cache['template'] = env.get_template(
+        'template.metalink'
+    )
+
+def _generate_metalink(args):
+    language_code, language_name, group_by, quality = args
+    c = _metalink_worker_immutable_data_cache
+    
+    metalink_file_name = _get_metalink_file_name(language_code, quality, group_by)
+    metalink_url = _METALINK_BASE_URL % metalink_file_name
+    metalink_description = _get_metalink_description(language_name, quality, group_by)
+    logging.debug("Generating '%s' metalink...", metalink_file_name)
+    c['template'].stream({
+        'metalink_url': metalink_url,
+        'metaTED_version': __version__,
+        'first_published_on': c['first_published_on'],
+        'refresh_date': c['refresh_date'],
+        'description': metalink_description,
+        'downloads': _get_downloads(
+            c['downloadable_talks'], language_code, quality, group_by
+        ),
+    }).dump(
+        os.path.join(c['output_dir'], metalink_file_name),
+        encoding='utf-8'
+    )
+    logging.info("Generated '%s' metalink", metalink_file_name)
+    return {
+        'language_code': language_code,
+        'language_name': language_name,
+        'download_url': metalink_url,
+        'description': metalink_description,
+    }
+
 def generate_metalinks(output_dir=None):
     output_dir = os.path.abspath(output_dir or '')
     if not os.path.exists(output_dir):
     # Make sure downloadable_talks can be calculated
     downloadable_talks = get_downloadable_talks()
     
-    # Prepare the template upfront, because it can be reused between metalinks
-    env = Environment(loader=PackageLoader('metaTED'))
-    template = env.get_template('template.metalink')
-    
     # Use the same dates/times for all metalinks because they should, in my
     # opinion, point out when the metalinks were being generated and not when
     # they were physically written to disk
         cached_storage['first_published_on'] = first_published_on = refresh_date
     
     # Generate all metalink variants
-    metalinks = []
-    for group_by in _get_group_downloads_by(downloadable_talks):
-        for quality in AVAILABLE_VIDEO_QUALITIES.keys():
-            metalink_file_name = _get_metalink_file_name(quality, group_by)
-            metalink_url = _METALINK_BASE_URL % metalink_file_name
-            metalink_description = _get_metalink_description(quality, group_by)
-            logging.debug("Generating '%s' metalink...", metalink_file_name)
-            template.stream({
-                'metalink_url': metalink_url,
-                'metaTED_version': __version__,
-                'first_published_on': first_published_on,
-                'refresh_date': refresh_date,
-                'description': metalink_description,
-                'talks': _get_downloads(downloadable_talks, quality, group_by)
-            }).dump(
-                os.path.join(output_dir, metalink_file_name),
-                encoding='utf-8'
-            )
-            metalinks.append({
-                'download_url': metalink_url,
-                'description': metalink_description
-            })
-            logging.info("Generated '%s' metalink", metalink_file_name)
+    group_by_list = _get_group_downloads_by(downloadable_talks)
+    variants = [
+        (language_code, language_name, group_by, quality)
+        for language_code, language_name in get_supported_subtitle_languages().items()
+            for group_by in group_by_list
+                for quality in AVAILABLE_VIDEO_QUALITIES.keys()
+    ]
+    metalinks = Pool(
+        initializer=_init_metalink_worker_immutable_data_cache,
+        initargs=(output_dir, downloadable_talks, first_published_on, refresh_date)
+    ).map(
+        func=_generate_metalink,
+        iterable=variants,
+    )
     
     return {
         'metaTED_version': __version__,