Commits

Petar Marić committed 0494a6b

Updated requirements and added cache and crawler code

Files changed (6)

metaTED/__init__.py

+SITE_URL = 'http://www.ted.com/'
+
+__version__ = '1.0.0'

metaTED/cache.py

+import os
+from shove import Shove
+from shove.store.file import FileStore
+
+
+cached_storage = Shove(
+    store=FileStore(os.path.expanduser('~/.metaTED/cache')),
+    cache='simplelru://'
+)
+
+cache = cached_storage._cache
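
For context, a minimal sketch of how this two-level cache is meant to be used by the rest of the package: dictionary-style access goes through the Shove front end, which keeps recently used entries in the in-memory 'simplelru' cache and persists them under ~/.metaTED/cache. The key and value below are made up for illustration, and the sync() call assumes shove's standard Shove API.

    from metaTED.cache import cached_storage

    # Writes go through Shove's dict-like front end
    cached_storage['talks_urls'] = ['http://www.ted.com/talks/example.html']

    # Reads hit the in-memory 'simplelru' cache before the file store
    urls = cached_storage.get('talks_urls', [])
    print "Cached %d talk url(s)" % len(urls)

    # Assumption: shove's Shove exposes sync() to flush cached entries to disk
    cached_storage.sync()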

metaTED/crawler/__init__.py

+import logging
+import urllib2
+import metaTED
+from metaTED.cache import cache
+
+
+CRAWLER_RETRY_TIMES = 5
+
+
+_opener = urllib2.build_opener()
+_opener.addheaders = [('User-agent', 'metaTED/%s' % metaTED.__version__)]
+
+
+def urlread(fullurl):
+    # Check in-memory cache before requesting url
+    logging.debug("Searching cache for '%s' contents...", fullurl)
+    if fullurl in cache:
+        logging.debug("Found the cached version of '%s' contents", fullurl)
+        return cache[fullurl]
+    logging.debug("Failed to find the cached version of '%s' contents", fullurl)
+
+    saved_exception = None
+    for try_num in xrange(1, CRAWLER_RETRY_TIMES+1):
+        try:
+            logging.debug(
+                "Requesting '%s' (try %d of %d)...",
+                fullurl,
+                try_num,
+                CRAWLER_RETRY_TIMES
+            )
+            data = _opener.open(fullurl).read()
+            logging.debug("Successfully read data from '%s'", fullurl)
+            cache[fullurl] = data
+            return data
+        except urllib2.URLError, e:
+            if try_num == CRAWLER_RETRY_TIMES:
+                log_func = logging.fatal
+                message = "Giving up! Could not read data from '%s': %s"
+                saved_exception = e
+            else:
+                log_func = logging.warning
+                message = "Problem while trying to read data from '%s': %s"
+            log_func(message, fullurl, e)
+    
+    # Re-raise the last exception because the crawler used up all retries
+    raise saved_exception
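
A rough sketch of how urlread is called by the crawler modules (the URL here is only an illustration): a cache hit returns immediately, a cache miss is fetched with up to CRAWLER_RETRY_TIMES attempts, and the last URLError is re-raised once all retries are exhausted.

    import urllib2
    from metaTED.crawler import urlread

    try:
        html = urlread('http://www.ted.com/')
        print "Read %d byte(s)" % len(html)
    except urllib2.URLError, e:
        # urlread gave up after CRAWLER_RETRY_TIMES failed attempts
        print "Could not read the page: %s" % e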

metaTED/crawler/get_talk_info.py

+import re
+import logging
+from urlparse import urljoin
+from BeautifulSoup import BeautifulSoup
+from metaTED import SITE_URL
+from metaTED.cache import cached_storage
+from metaTED.crawler import urlread
+
+
+AVAILABLE_VIDEO_QUALITIES = {
+    'low': {
+        'marker': 'Video to desktop (Zipped MP4)',
+        'file_extension': 'zip',
+    },
+    'high': {
+        'marker': 'Watch this talk as high-res video',
+        'file_extension': 'mp4',
+    },
+}
+
+
+_HTML_ENTITY_RE = re.compile(r'&(#?[xX]?[0-9a-fA-F]+|\w{1,8});')
+_INVALID_FILE_NAME_CHARS_RE = re.compile('[^\w\.\- ]+')
+_FILMING_YEAR_RE = re.compile('so\.addVariable\(\"fd\",\"\w+ (\d+)\"\);')
+_PUBLISHING_YEAR_RE = re.compile('so\.addVariable\(\"pd\",\"\w+ (\d+)\"\);')
+
+
+class NoDownloadsFound(Exception):
+    pass
+
+
+def _guess_year(talk_url, soup):
+    """
+    Tries to guess the filming year or, if that's not available, the
+    publishing year.
+    
+    Returns year as string, or 'Unknown' if no date was found.
+    """
+    year_txt = soup.find(
+        id='videoPlayerSwf'
+    ).findNextSibling('script').string
+    match = _FILMING_YEAR_RE.search(year_txt)
+    if match is None:
+        logging.debug("Failed to guess the filming year of '%s'", talk_url)
+        match = _PUBLISHING_YEAR_RE.search(year_txt)
+    if match:
+        return match.group(1)
+    else:
+        logging.debug("Failed to guess the publishing year of '%s'", talk_url)
+        return 'Unknown'
+
+
+def _guess_file_base_name(soup):
+    """
+    Returns a user-friendly file base name, guessed from the talk title.
+    """
+    # Guess talk title from <title> tag
+    title = soup.html.head.title.string.split('|')[0].strip()
+    # Turns 'Barry Schuler: Genomics' into 'Barry Schuler - Genomics'
+    file_base_name = title.replace(': ', ' - ', 1)
+    # Remove HTML entities
+    file_base_name = _HTML_ENTITY_RE.sub('', file_base_name)
+    # Remove invalid file name characters
+    file_base_name = _INVALID_FILE_NAME_CHARS_RE.sub('', file_base_name)
+    # Should be clean now
+    return file_base_name
+    
+
+def _find_download_url(soup, quality_marker):
+    """
+    Returns the download URL of a talk in the requested video quality, or None
+    if the talk can't be downloaded in that quality.
+    """
+    element = soup.find(text=quality_marker)
+    return element and urljoin(SITE_URL, element.parent['href'])
+
+
+def _get_talk_info(talk_url):
+    soup = BeautifulSoup(urlread(talk_url))
+    file_base_name = _guess_file_base_name(soup)
+    
+    # Try to find download URLs for all qualities
+    qualities_found = []
+    qualities_missing = []
+    qualities = {}
+    for name, info in AVAILABLE_VIDEO_QUALITIES.items():
+        download_url = _find_download_url(soup, info['marker'])
+        if download_url:
+            qualities_found.append(name)
+            qualities[name] = {
+                'download_url': download_url,
+                'file_name': "%s.%s" % (file_base_name, info['file_extension'])
+            }
+        else:
+            logging.error(
+                "Failed to find the %s quality download URL for '%s'",
+                name,
+                talk_url
+            )
+            qualities_missing.append(name)
+
+    if len(qualities_found) == 0: # No downloads found!
+        raise NoDownloadsFound(talk_url)
+
+    if len(qualities_missing) > 0: # Some found, but not all
+        # Use what you got, emulate the rest with the first discovered quality
+        emulator_name = qualities_found[0]
+        emulator = qualities[emulator_name]
+        for name in qualities_missing:
+            qualities[name] = emulator
+            logging.warn(
+                "Emulating %s quality with %s quality for '%s'",
+                name,
+                emulator_name,
+                talk_url
+            )
+    
+    return {
+        'year': _guess_year(talk_url, soup),
+        'qualities': qualities,
+    }
+
+
+def get_talk_info(talk_url):
+    talks_info = cached_storage.get('talks_infos', {})
+    logging.debug("Searching cache for talk info on '%s'...", talk_url)
+    if talk_url in talks_info:
+        logging.debug("Found the cached version of '%s' talk info", talk_url)
+        return talks_info[talk_url]
+    
+    # Cache miss
+    logging.debug(
+        "Failed to find the cached version of '%s' talk info, calculating.",
+        talk_url
+    )
+    info = _get_talk_info(talk_url)
+    talks_info[talk_url] = info
+    cached_storage['talks_infos'] = talks_info
+    return info
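
To make the return value easier to follow, here is a sketch of a call to get_talk_info. The talk URL and the values in the comment are made up, but the shape of the dictionary matches what _get_talk_info builds above.

    from metaTED.crawler.get_talk_info import get_talk_info

    # Hypothetical talk URL, for illustration only
    info = get_talk_info('http://www.ted.com/talks/example_talk.html')
    # Expected shape of the result:
    # {
    #     'year': '2008',
    #     'qualities': {
    #         'low':  {'download_url': '...', 'file_name': 'Speaker - Title.zip'},
    #         'high': {'download_url': '...', 'file_name': 'Speaker - Title.mp4'},
    #     },
    # }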

metaTED/crawler/get_talks_urls.py

+import logging
+import re
+from urlparse import urljoin
+from BeautifulSoup import BeautifulSoup
+from metaTED import SITE_URL
+from metaTED.cache import cached_storage
+from metaTED.crawler import urlread
+
+
+TALKS_LIST_URLS = "http://www.ted.com/index.php/talks/list/page/%d"
+TOTAL_PAGES_RE = re.compile("Showing page \d+ of (\d+)")
+
+
+def _read_page(page_num):
+    return urlread(TALKS_LIST_URLS % page_num)
+
+
+def _get_num_pages():
+    logging.debug('Trying to find out the number of talk list pages...')
+    soup = BeautifulSoup(_read_page(1))
+    num_pages = int(
+        TOTAL_PAGES_RE.match(
+            soup.find('p', text=TOTAL_PAGES_RE)
+        ).group(1)
+    )
+    logging.info("Found %d talk list page(s)", num_pages)
+    return num_pages
+
+
+def _get_talks_urls_from_page(page_num):
+    logging.debug("Looking for talk urls on page #%d", page_num)
+    soup = BeautifulSoup(_read_page(page_num))
+    urls = [urljoin(SITE_URL, h.a['href']) for h in soup.findAll('h4')]
+    logging.info("Found %d talk url(s) on page #%d", len(urls), page_num)
+    return urls
+
+
+def _get_talks_urls():
+    urls = []
+    for page in xrange(1, _get_num_pages()+1): # Talk list pages are 1-indexed
+        urls.extend(_get_talks_urls_from_page(page))
+    logging.info("Found %d talk url(s) in total", len(urls))
+    return urls
+
+
+def _check_talks_urls_cache():
+    logging.info('Looking for a cached version of talk urls...')
+    if 'talks_urls' in cached_storage:
+        # Cached version of talk urls is considered valid if:
+        # 1. Real number of talk list pages is equal to the cached number
+        # 2. Real number of talk urls on the last list page is equal to the
+        #    cached number
+        logging.info('Found a cached version of talk urls. Validating...')
+        num_pages = cached_storage.get('num_of_talk_list_pages')
+        if num_pages and num_pages == _get_num_pages():
+            num_talks = cached_storage.get('num_of_talks_urls_on_last_page')
+            if num_talks and \
+            num_talks == len(_get_talks_urls_from_page(num_pages)):
+                logging.info('Found a valid cached version of talk urls')
+                return True
+        logging.warning('Cached version of talk urls is invalid')
+        return False
+    logging.info('Failed to find the cached version of talk url(s)')
+    return False
+
+
+def get_talks_urls():
+    if not _check_talks_urls_cache():
+        cached_storage['num_of_talk_list_pages'] = _get_num_pages()
+        cached_storage['num_of_talks_urls_on_last_page'] = len(
+            _get_talks_urls_from_page(cached_storage['num_of_talk_list_pages'])
+        )
+        cached_storage['talks_urls'] = _get_talks_urls()
+    return cached_storage['talks_urls']
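
A short usage sketch: get_talks_urls() returns the cached URL list when the number of talk list pages and the number of talk URLs on the last page still match the cached values, and recrawls every list page otherwise.

    from metaTED.crawler.get_talks_urls import get_talks_urls

    urls = get_talks_urls()  # cached in cached_storage['talks_urls'] between runs
    print "Found %d talk url(s)" % len(urls)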

requirements.txt

+BeautifulSoup
+shove
+Jinja2