Petar Marić avatar Petar Marić committed f0ed6f6

Switched from `BeautifulSoup` to `lxml`, removed custom crawler code in favor of `lxml.html.parse`.

Comments (0)

Files changed (4)

metaTED/crawler/__init__.py

-import logging
-import metaTED
-import requests
-
-
# Number of attempts `urlread` makes before giving up on a URL.
_DEFAULT_RETRY_TIMES = 5
-
-
def urlread(fullurl, max_retries=_DEFAULT_RETRY_TIMES):
    """
    Read and return the response body of `fullurl`.

    Retries up to `max_retries` times on request failures, logging each
    failed attempt. Raises the last `requests.RequestException` when every
    attempt fails, and `ValueError` when `max_retries` is less than 1.
    """
    if max_retries < 1:
        # The original fell through the loop and executed `raise None`
        # (a TypeError) in this case; fail loudly and clearly instead.
        raise ValueError('max_retries must be at least 1')
    for try_num in range(1, max_retries + 1):
        try:
            logging.debug(
                "Requesting '%s' (try %d of %d)...",
                fullurl,
                try_num,
                max_retries
            )
            r = requests.get(
                url=fullurl,
                headers={
                    'User-Agent': "metaTED/%s" % metaTED.__version__,
                }
            )
            # Treat HTTP error statuses (4xx/5xx) as request failures too.
            r.raise_for_status()
            logging.debug("Successfully read data from '%s'", fullurl)
            return r.content
        except requests.RequestException as e:
            if try_num == max_retries:
                logging.critical(
                    "Giving up! Could not read data from '%s': %s", fullurl, e
                )
                # Bare `raise` re-raises with the original traceback; the
                # old save-and-re-raise pattern discarded it.
                raise
            logging.warning(
                "Problem while trying to read data from '%s': %s", fullurl, e
            )

metaTED/crawler/get_talk_info.py

 import re
 import logging
+from lxml.cssselect import CSSSelector
+from lxml import html
 from urlparse import urljoin
-from BeautifulSoup import BeautifulSoup
 from metaTED import SITE_URL
 from metaTED.cache import cached_storage
-from metaTED.crawler import urlread
 
 
 AVAILABLE_VIDEO_QUALITIES = {
 
 _HTML_ENTITY_RE = re.compile(r'&(#?[xX]?[0-9a-fA-F]+|\w{1,8});')
 _INVALID_FILE_NAME_CHARS_RE = re.compile('[^\w\.\- ]+')
+
# Talks whose downloads are hosted off-site embed an external player div.
_EXTERNALLY_HOSTED_DOWNLOADS_SELECTOR = CSSSelector('div#external_player')

# The <script> element right after the video player; its text carries the
# fd:"..." (filming date) and pd:"..." (publishing date) metadata parsed below.
_VIDEO_PLAYER_SELECTOR = CSSSelector('div#videoPlayerSWF + script')
# Raw strings keep \w and \d as regex escapes without string-escape warnings;
# the patterns are byte-identical to the originals.
_FILMING_YEAR_RE = re.compile(r'fd:"\w+ (\d+)",')
_PUBLISHING_YEAR_RE = re.compile(r'pd:"\w+ (\d+)",')

_AUTHOR_SELECTOR = CSSSelector('div#accordion div p strong')

_THEME_SELECTOR = CSSSelector('ul.relatedThemes li a')
+
 
 class NoDownloadsFound(Exception):
     pass
     return file_name
 
 
def _guess_year(talk_url, document):
    """
    Tries to guess the filming year, or if it's not available - the publishing
    year.
    
    Returns year as string, or 'Unknown' if no date was found.
    """
    elements = _VIDEO_PLAYER_SELECTOR(document)
    if elements:
        # A matched <script> element can have no direct text (lxml returns
        # None); fall back to '' so the regex searches don't crash.
        year_txt = elements[0].text or ''
        match = _FILMING_YEAR_RE.search(year_txt)
        if match is None:
            logging.debug("Failed to guess the filming year of '%s'", talk_url)
            match = _PUBLISHING_YEAR_RE.search(year_txt)
        if match:
            return match.group(1)
    
    logging.warning(
        "Failed to guess both the publishing and filming year of '%s'",
        talk_url
    )
    return 'Unknown'
 
 
def _guess_author(talk_url, document):
    """
    Tries to guess the author, or returns 'Unknown' if no author was found.
    """
    elements = _AUTHOR_SELECTOR(document)
    # Guard against a matched element with no text (lxml returns None),
    # which would crash `_clean_up_file_name`.
    if elements and elements[0].text:
        return _clean_up_file_name(elements[0].text)
    
    logging.warning(
        "Failed to guess the author of '%s'",
        talk_url
    )
    return 'Unknown'
 
 
def _guess_theme(talk_url, document):
    """
    Tries to guess the talks theme, or returns 'Unknown' if no theme was found.
    """
    elements = _THEME_SELECTOR(document)
    # Guard against a matched element with no text (lxml returns None),
    # which would crash `_clean_up_file_name`.
    if elements and elements[0].text:
        return _clean_up_file_name(elements[0].text)
    
    logging.warning(
        "Failed to guess the theme of '%s'",
        talk_url
    )
    return 'Unknown'
 
 
def _find_download_url(document, quality_marker):
    """
    Returns download URL of a talk in requested video quality, or None if the
    talk can't be downloaded in that quality.
    """
    # Look for an anchor whose text is exactly the quality marker.
    matches = document.xpath("//a[text()='%s']" % quality_marker)
    if not matches:
        return None
    return urljoin(SITE_URL, matches[0].get('href'))
 
 
 def _get_talk_info(talk_url):
-    soup = BeautifulSoup(urlread(talk_url))
+    document = html.parse(talk_url)
     file_base_name = _clean_up_file_name(
-        soup.html.head.title.string.split('|')[0].strip(),
+        document.find('/head/title').text.split('|')[0].strip(),
         True
     )
     
-    if soup.find('div', 'external_player'): # Downloads not hosted by TED!
+    # Downloads not hosted by TED!
+    if _EXTERNALLY_HOSTED_DOWNLOADS_SELECTOR(document):
         raise ExternallyHostedDownloads(talk_url)
     
     # Try to find download URLs for all qualities
     qualities_missing = []
     qualities = {}
     for name, info in AVAILABLE_VIDEO_QUALITIES.items():
-        download_url = _find_download_url(soup, info['marker'])
+        download_url = _find_download_url(document, info['marker'])
         if download_url:
             qualities_found.append(name)
             qualities[name] = {
             )
     
     return {
-        'year': _guess_year(talk_url, soup),
-        'author': _guess_author(talk_url, soup),
-        'theme': _guess_theme(talk_url, soup),
+        'year': _guess_year(talk_url, document),
+        'author': _guess_author(talk_url, document),
+        'theme': _guess_theme(talk_url, document),
         'qualities': qualities,
     }
 

metaTED/crawler/get_talks_urls.py

 import logging
-import re
+from lxml.cssselect import CSSSelector
+from lxml import html
+from metaTED import SITE_URL
 from urlparse import urljoin
-from BeautifulSoup import BeautifulSoup
-from metaTED import SITE_URL
-from metaTED.crawler import urlread
 
 
# Quick-list page used as the single source of talk URLs.
TALKS_LIST_URL = "http://www.ted.com/talks/quick-list"
-TOTAL_TALKS_RE = re.compile("Showing \d+ - \d+ of\s+(\d+)")
# Anchor in the third cell of each downloads-table row links to a talk page.
_TALKS_URLS_SELECTOR = CSSSelector('table.downloads tr td:nth-child(3) a')
 
 
 TALKS_URLS_BLACKLIST = [
 
 def get_talks_urls():
     logging.debug('Looking for talk urls...')
-    soup = BeautifulSoup(urlread(TALKS_LIST_URL))
-    talks_table = soup.find('table', 'downloads')
+    document = html.parse(TALKS_LIST_URL)
     talks_urls = [
-        urljoin(SITE_URL, tr.findAll('td')[2].a['href'])
-        for tr in talks_table.findAll('tr')[1:] # Skip 1st 'tr', used as header
+        urljoin(SITE_URL, a.get('href'))
+        for a in _TALKS_URLS_SELECTOR(document)
     ]
     
     # Remove the well-known problematic talk URLs (i.e. no downloads available)
-BeautifulSoup>=3.1
 Jinja2>=2.1
-requests>=0.7
+lxml>=2.2
 shove>=0.2.2
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.