Petar Marić committed 1d2a163

Updated author markers and detection code as TED updated their HTML layout

Comments (0)

Files changed (1)

metaTED/crawler/get_talk_info.py

 import logging
 from lxml import html
 from lxml.cssselect import CSSSelector
+from lxml.etree import XPath
 from os.path import splitext
 import re
 from urlparse import urljoin, urlsplit
     'publishing-year': re.compile('pd:\"\w+ (\d+)\",'),
 }
 
-_AUTHOR_SELECTOR = CSSSelector('div#accordion div p strong')
+_AUTHOR_BIO_XPATH = XPath('//a[text()="Full bio and more links"]')
 
 _THEME_SELECTOR = CSSSelector('ul.relatedThemes li a')
 
     """
     Tries to guess the author, or returns 'Unknown' if no author was found.
     """
-    elements = _AUTHOR_SELECTOR(document)
+    elements = _AUTHOR_BIO_XPATH(document)
     if elements:
-        return _clean_up_file_name(elements[0].text)
+        author_bio_url = urljoin(SITE_URL, elements[0].get('href'))
+        author_bio_document = html.parse(author_bio_url)
+        return _clean_up_file_name(
+            author_bio_document.find('/head/title').text.split('|')[0].strip()
+        )
     
     logging.warning("Failed to guess the author of '%s'", talk_url)
     return 'Unknown'
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.