# metaTED/metaTED/crawler/get_talks_urls.py

import logging
from lxml import html
from lxml.cssselect import CSSSelector
from urlparse import urljoin
from .. import SITE_URL


# URL of the page that get_talks_urls() parses to collect talk links
# (presumably TED's "quick list" of all talks -- confirm it is still served).
TALKS_LIST_URL = "http://www.ted.com/talks/quick-list"
# Matches every <a> nested in a <td> that is the third child of its parent,
# inside a row of a <table class="downloads">.
_TALKS_URLS_SELECTOR = CSSSelector('table.downloads tr td:nth-child(3) a')


# Absolute talk URLs that get_talks_urls() drops from its result. The filter is
# an exact string comparison, so each entry must match the joined URL verbatim.
TALKS_URLS_BLACKLIST = [
    # No downloads
    'http://www.ted.com/talks/rokia_traore_sings_m_bifo.html',
    'http://www.ted.com/talks/rokia_traore_sings_kounandi.html',
    'http://www.ted.com/talks/andrew_stanton_the_clues_to_a_great_story.html',
]


def get_talks_urls():
    """Return the absolute URLs of the talks linked from TALKS_LIST_URL.

    The page at TALKS_LIST_URL is parsed with lxml, every anchor matched by
    _TALKS_URLS_SELECTOR has its ``href`` joined onto SITE_URL, and any
    resulting URL listed in TALKS_URLS_BLACKLIST is left out. The order of
    the remaining URLs follows the order of the matched anchors.

    Returns:
        A list of talk URL strings.
    """
    logging.debug('Looking for talk urls...')
    quick_list_page = html.parse(TALKS_LIST_URL)

    found_urls = []
    for anchor in _TALKS_URLS_SELECTOR(quick_list_page):
        absolute_url = urljoin(SITE_URL, anchor.get('href'))
        # Well-known problematic talks (i.e. no downloads available) are
        # skipped rather than reported.
        if absolute_url in TALKS_URLS_BLACKLIST:
            continue
        found_urls.append(absolute_url)

    logging.info("Found %d talk url(s) in total", len(found_urls))
    return found_urls
# NOTE: the text that followed here was repository-browser UI residue
# (search/navigation tips), not part of this module.