Commits

Petar Marić committed 64d4f3f

Updated filming year, publishing year, event name markers and the detection code as TED updated their HTML layout

  • Participants
  • Parent commits 7c6075e

Comments (0)

Files changed (1)

File metaTED/crawler/get_talk_info.py

 
 _EXTERNALLY_HOSTED_DOWNLOADS_SELECTOR = CSSSelector('div#external_player')
 
-_VIDEO_PLAYER_SELECTOR = CSSSelector('body script:last-child')
-_VIDEO_PLAYER_METADATA = {
-    'event': re.compile('en:\"(.+)\",'),
-    'filming-year': re.compile('fd:\"\w+ (\d+)\",'),
-    'publishing-year': re.compile('pd:\"\w+ (\d+)\",'),
-}
+_AUTHOR_BIO_XPATH = XPath(u'//a[text()="Full bio »"]')
 
-_AUTHOR_BIO_XPATH = XPath(u'//a[text()="Full bio »"]')
+_EVENT_SELECTOR = CSSSelector('div.talk-meta span.event-name')
 
 _THEME_SELECTOR = CSSSelector('ul.relatedThemes li a')
 
-_TRANSCRIPT_LANGUAGES_SELECTOR = CSSSelector('div#transcript select option')
+_TRANSCRIPT_LANGUAGES_SELECTOR = CSSSelector('select#languageCode option')
 
 AVAILABLE_VIDEO_QUALITIES = {
     'low': 'Low',
     '//a[@href=$relative_talk_url]/ancestor::node()[name()="tr"]/td[5]/a'
 )
 
+_YEARS_SELECTOR = CSSSelector('div.talk-meta')
+_YEARS_RE_DICT = {
+    'filming-year': re.compile('Filmed \w+ (\d+)'),
+    'publishing-year': re.compile('Posted \w+ (\d+)'),
+}
+
 
 class NoDownloadsFound(Exception):
     pass
     
     return _talk_list_document_cache
 
-def _guess_video_player_metadata(name, regexp, talk_url, document):
-    elements = _VIDEO_PLAYER_SELECTOR(document)
-    if elements:
-        match = regexp.search(elements[0].text)
-        if match:
-            return _clean_up_file_name(match.group(1))
-    
-    logging.warning("Failed to guess the %s of '%s'", name, talk_url)
-    return 'Unknown'
-
 def _guess_author(talk_url, document):
     """
     Tries to guess the author, or returns 'Unknown' if no author was found.
     logging.warning("Failed to guess the author of '%s'", talk_url)
     return 'Unknown'
 
+def _guess_event(talk_url, document):
+    """
+    Tries to guess the talks event, or returns 'Unknown' if no event was found.
+    """
+    elements = _EVENT_SELECTOR(document)
+    if elements:
+        return _clean_up_file_name(elements[0].text)
+    
+    logging.warning("Failed to guess the event of '%s'", talk_url)
+    return 'Unknown'
+
 def _guess_theme(talk_url, document):
     """
     Tries to guess the talks theme, or returns 'Unknown' if no theme was found.
     language_codes = [
         opt.get('value')
         for opt in _TRANSCRIPT_LANGUAGES_SELECTOR(document)
+        if opt.get('value') != ''
     ]
     
     if not language_codes:
         )
     )
 
+def _guess_year(name, regexp, talk_url, document):
+    elements = _YEARS_SELECTOR(document)
+    if elements:
+        match = regexp.search(elements[0].text_content())
+        if match:
+            return _clean_up_file_name(match.group(1))
+    
+    logging.warning("Failed to guess the %s of '%s'", name, talk_url)
+    return 'Unknown'
+
 def get_talk_info(talk_url):
     document = html.parse(talk_url)
     file_base_name = _clean_up_file_name(
         'qualities': qualities,
     }
     talk_info.update(
-        (name, _guess_video_player_metadata(name, regexp, talk_url, document))
-        for name, regexp in _VIDEO_PLAYER_METADATA.items()
+        (name, _guess_year(name, regexp, talk_url, document))
+        for name, regexp in _YEARS_RE_DICT.items()
     )
     return talk_info