Source

djangocon-talks / fetchtalks.py

Chris Wesseling 89ec66d 

































































































#!/usr/bin/env python
from BeautifulSoup import BeautifulSoup
import os
import sys
import urllib
import re
import feedparser

# Base URL of the conference site; the talk and slide links found in the
# pages are appended to it (see Day.getslides).
CON_URL = 'http://2011.djangocon.eu'
# Location of the schedule page.
SCHED_URL = CON_URL + '/schedule/'
# Parsed schedule page.  NOTE: the page is downloaded at import time.
SCHED = BeautifulSoup(urllib.urlopen(SCHED_URL))
# Location of the RSS feed whose entries carry the video enclosures
# (see videos()).
FEED_URL = 'http://blip.tv/djangocon-europe-2011/rss'
# Parsed feed.  NOTE: feedparser.parse() is also called at import time.
FEED = feedparser.parse(FEED_URL)


def schedule():
    """Return a dict that maps each conference day name to its Day.

    Every Day wraps the element of the schedule page whose id is
    'schedule_' followed by the first three letters of the day name.
    """
    day_names = ('Monday', 'Tuesday', 'Wednesday')
    return dict(
        (name, Day(SCHED.find(id='schedule_' + name[:3])))
        for name in day_names)


class Day(object):
    """One day of the conference schedule, backed by its parsed HTML."""

    def __init__(self, html):
        """Keep *html*, the parsed schedule fragment of this day."""
        self.soup = html

    def talk(self, time):
        """Look up the talk that starts at *time* on this day.

        time -- start time as text; its first two characters are taken as
                the hour and its last two as the minutes, so both '0930'
                and '09:30' are accepted.

        Returns a dict with the keys 'speakers' (the text of the row's
        first paragraph split on ', '), 'title' and 'url' (text and href
        of the row's first link) and 'slides' (see getslides).

        Raises IndexError when no schedule header starts with that time.
        """
        hour, minutes = time[0:2], time[-2:]
        # The time comes from a feed title, so escape it: stray regex
        # characters must be matched literally, not widen the search.
        sieve = re.compile('^' + re.escape(hour) + ':' + re.escape(minutes))
        matches = self.soup('th', text=sieve)
        if not matches:
            # Same exception type a bare [0] would raise, but it names the
            # time slot that could not be found.
            raise IndexError('no talk found at %s:%s' % (hour, minutes))
        # Two levels above the match sits the node that holds the speakers
        # paragraph and the talk link (presumably the table row).
        row = matches[0].parent.parent
        talk = {}
        talk['speakers'] = row.p.text.split(', ')
        talk['title'] = row.a.text
        talk['url'] = row.a['href']
        talk['slides'] = self.getslides(talk['url'])
        return talk

    def getslides(self, talkurl):
        """Return the slide URLs linked from the talk page at *talkurl*.

        talkurl -- location of the talk page; it is appended to CON_URL.

        Each URL is CON_URL plus the href belonging to a
        'Download slides' match.  The list is empty when there is none.
        """
        page = urllib.urlopen(CON_URL + talkurl)
        try:
            soup = BeautifulSoup(page)
        finally:
            # Release the connection even when parsing fails.
            page.close()
        return [CON_URL + a.parent['href'] for a in
                soup.findAll('a', text='Download slides')]


def videos(feed=None):
    """Describe the feed entries that are recordings of scheduled talks.

    feed -- parsed feed to read; defaults to the module-level FEED.  Only
            its 'entries' are used.

    An entry is used when its title starts with '<day> <time>', where
    <day> is Monday, Tuesday or Wednesday.  All other entries are skipped.

    Returns a list of dicts with the keys 'day', 'time', 'url' (href of
    the entry's first enclosure) and 'size' (the length of that enclosure
    as an int).
    """
    if feed is None:
        feed = FEED
    days = ('Monday', 'Tuesday', 'Wednesday')
    found = []
    for item in feed['entries']:
        words = item['title'].split()
        # Testing only the first letter against 'MTW' let titles such as
        # 'Welcome' through (a KeyError once main() looks the day up in
        # the schedule) and raised IndexError on empty or one-word titles.
        if len(words) < 2 or words[0] not in days:
            continue
        enclosure = item['enclosures'][0]
        found.append({
            'day': words[0],
            'time': words[1],
            'url': enclosure['href'],
            'size': int(enclosure['length']),
        })
    return found


def reporthook(a, b, c):
    """Show download progress on stdout; passed to urllib.urlretrieve.

    a -- number of blocks transferred so far
    b -- size of one block in bytes
    c -- total size in bytes; zero or negative when it is not known

    The line ends in a carriage return, so successive calls overwrite
    each other on a terminal.
    """
    done = a * b
    if c > 0:
        # The last block is usually partial, hence the cap at 100.
        line = "% 3.1f%% of %d bytes\r" % (min(100, float(done) / c * 100), c)
    else:
        # Without a usable total a percentage would divide by zero (empty
        # file) or come out negative (size reported as -1), so only show
        # the bytes received so far.
        line = "%d bytes\r" % done
    sys.stdout.write(line)
    sys.stdout.flush()


def leech(url, target_dir, size=0):
    target = os.path.join(target_dir, os.path.basename(url))
    if need_to_leech(target, size):
        print 'Downloading ' + target
        urllib.urlretrieve(url, target, reporthook)


def need_to_leech(target, size):
    """Tell whether the file at *target* still has to be downloaded.

    target -- path of the local file
    size   -- expected size in bytes; a false value skips the size check

    Returns True when the file is missing or its size differs from *size*;
    in the latter case the file is removed first.  Returns False otherwise.
    """
    if os.path.exists(target):
        if not size or os.path.getsize(target) == size:
            return False
        # Wrong size: get rid of the copy before it is fetched again.
        os.remove(target)
    return True


def main():
    queue = []
    sched = schedule()
    for video in videos():
        talk = sched[video['day']].talk(video['time'])
        print "Do you want to download:"
        print '"%s"' % talk['title']
        print 'with ' + ', '.join(talk['speakers'])
        answer = raw_input('[y/N]')
        if answer.lower() == 'y':
            queue.append((video, talk))

    for video, talk in queue:
        target_dir = talk['title']
        if not os.path.exists(target_dir):
            os.mkdir(target_dir)
        leech(video['url'], target_dir, video['size'])
        for url in talk['slides']:
            leech(url, target_dir)

# Start the interactive downloader only when this file is run as a script.
if __name__ == '__main__':
    main()