Petar Marić committed 944ebf4

Updated the video download markers and the download-URL detection code, as TED updated their HTML layout

Comments (0)

Files changed (1)

metaTED/crawler/get_talk_info.py

 import logging
 from lxml.cssselect import CSSSelector
 from lxml import html
-from urlparse import urljoin
+from os.path import splitext
+from urlparse import urljoin, urlsplit
 from metaTED import SITE_URL
 from metaTED.cache import cached_storage
+from metaTED.crawler.get_talks_urls import TALKS_LIST_URL
 
 
 AVAILABLE_VIDEO_QUALITIES = {
-    'low': {
-        'marker': 'Low-res video (MP4)',
-        'file_extension': 'mp4',
-    },
-    'standard': {
-        'marker': 'Download to desktop (MP4)',
-        'file_extension': 'mp4',
-    },
-    'high': {
-        'marker': 'High-res video (MP4)',
-        'file_extension': 'mp4',
-    },
+    'low': 'Low',
+    'standard': 'Regular',
+    'high': 'High',
 }
 
 
 
 _THEME_SELECTOR = CSSSelector('ul.relatedThemes li a')
 
+_QUALITIES_XPATH_FMT = "//a[@href='%s']/ancestor::node()[name()='tr']/td[5]/a"
+
 
 class NoDownloadsFound(Exception):
     pass
     return file_name
 
 
+_talk_list_document_cache = None
+def _get_talk_list_document():
+    global _talk_list_document_cache
+    
+    if _talk_list_document_cache is None:
+        _talk_list_document_cache = html.parse(TALKS_LIST_URL)
+    
+    return _talk_list_document_cache
+
+
 def _guess_year(talk_url, document):
     """
     Tries to guess the filming year, or if it's not available - the publishing
     return 'Unknown'
 
 
-def _find_download_url(document, quality_marker):
+def _get_download_urls_dict(talk_url):
     """
-    Returns download URL of a talk in requested video quality, or None if the
-    talk can't be downloaded in that quality.
+    Returns a dictionary of all download URLs for a given talk URL, mapping 
+    quality marker to the download URL.
     """
-    elements = document.xpath("//a[text()='%s']" % quality_marker)
-    if elements:
-        return urljoin(SITE_URL, elements[0].get('href'))
+    return dict(
+        (a.text.strip(), urljoin(SITE_URL, a.get('href')))
+        for a in _get_talk_list_document().xpath(
+            _QUALITIES_XPATH_FMT % urlsplit(talk_url).path
+        )
+    )
 
 
 def _get_talk_info(talk_url):
     qualities_found = []
     qualities_missing = []
     qualities = {}
-    for name, info in AVAILABLE_VIDEO_QUALITIES.items():
-        download_url = _find_download_url(document, info['marker'])
+    quality_marker_to_download_url = _get_download_urls_dict(talk_url)
+    for name, marker in AVAILABLE_VIDEO_QUALITIES.items():
+        download_url = quality_marker_to_download_url.get(marker)
         if download_url:
             qualities_found.append(name)
             qualities[name] = {
                 'download_url': download_url,
-                'file_name': "%s.%s" % (file_base_name, info['file_extension'])
+                'file_name': "%s%s" % (file_base_name, splitext(download_url)[1])
             }
         else:
             logging.error(
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.