# Source: iviewautodownload / python-iview / daily_scrape.py (full commit)

'''
To view a list of all shows, run:
    H:\iViewNapper\python-iview\iview-cli.py -i
Then add the desired show (as an fnmatch pattern, one per line) to shows.txt.
Lines in shows.txt that start with # are ignored.
'''

# Text file listing the shows to fetch; read by showlist().
SHOW_LIST_FILE = 'shows.txt'
# Marker file whose modification time records the last completed scrape.
LAST_SCRAPE_FILE = 'lastscrape.dat'
# Root folder for downloaded episodes (one sub-folder per show).
BASE_OUTPUT_PATH = r'h:\iview'
# Root folder for the transcoded copies made after each download.
TRANSCODED_OUTPUT_PATH = r'n:\media\iphone'
RUN_FREQ = 8 * 60 *60  # Seconds between runs (8 hours). At startup the first run is delayed until this long after the last recorded scrape.
RUN_ONCE = False    # If True, the script is meant to do a single scrape and then exit

# --------------- There is no config required below this line ---------------

import iview.config
import iview.fetch
import iview.comm
import fnmatch
import os
import sys
import time
import handbrake
import re
import urllib
import shutil

# Characters that validfilename() replaces with '_'.
INVALID_FILENAME_CHARACTERS = r'\/:?*"<>|'
# Character width used by progressbar() when drawing the download progress line.
SCREEN_WIDTH = 79

# log() only prints messages whose level is less than or equal to this value.
LOGLEVEL = 9

# Paths of the files downloaded successfully so far; download() appends to it.
DOWNLOADED = []

def log(msg, level=5):
    '''Prints msg (converted with str()) unless level is above LOGLEVEL.
    '''
    if level > LOGLEVEL:
        return
    print(str(msg))

def showlist():
    '''Returns the names of the shows to look for.

    Reads SHOW_LIST_FILE line by line. Surrounding whitespace is stripped,
    and blank lines and lines starting with '#' are skipped. The remaining
    lines are returned as a list of strings, in file order.
    '''
    # The with-block closes the file even if reading fails part-way.
    with open(SHOW_LIST_FILE) as f:
        stripped = (line.strip() for line in f)
        return [name for name in stripped
                if name and not name.startswith('#')]

def fnmatchpatterns(name, patterns):
    '''Returns True if name matches at least one of the fnmatch patterns,
    otherwise False.
    '''
    return any(fnmatch.fnmatch(name, candidate) for candidate in patterns)

def progressbar(pcg):
    '''Draws a one-line text progress bar on stdout.

    pcg is the completed fraction (0.0 to 1.0); values outside that range
    are clamped. The visible line is SCREEN_WIDTH characters wide and ends
    with a carriage return instead of a newline, so the next call
    overwrites it.
    '''
    pcg = min(max(pcg, 0.0), 1.0)
    txtpcg = ' %3.1f%%' % (pcg*100.0)
    # Three characters are fixed: the two '|' delimiters and the '>' head.
    # Sizing the bar from what is left keeps the line at SCREEN_WIDTH even
    # at 100%, where the percentage text is one character longer.
    barwidth = max(SCREEN_WIDTH - 3 - len(txtpcg), 0)
    done = int(barwidth*pcg)
    todo = barwidth - done
    sys.stdout.write('|%s>%s|%s\r' % ('='*done, ' '*todo, txtpcg))
    # No newline is written, so flush explicitly; otherwise a line-buffered
    # stdout would not show the update.
    sys.stdout.flush()

def validfilename(fn):
    '''Returns fn with every character listed in INVALID_FILENAME_CHARACTERS
    (which includes both the forward slash and the backslash) replaced by
    an underscore.
    '''
    cleaned = fn
    for bad in INVALID_FILENAME_CHARACTERS:
        cleaned = cleaned.replace(bad, '_')
    return cleaned

def showfilename(show):
    '''
    Gives the file name (less extension) that a given show should have.

    When show['title'] contains "Episode <n>" and show['url'] carries
    two-digit series and episode numbers (..._NN_<series>_<episode>.<ext>),
    the name is
        "<showfolder(show)> - s<series>e<episode>[ - <text after Episode n>]".
    If either pattern is missing, the plain title is used instead.
    The result is always passed through validfilename().
    '''
    title = show['title']
    print('*************************')
    print('show dict: %s' % (show,))

    # Old naming method; kept whenever the series/episode lookup fails.
    fn = title
    titlematch = re.match(r'(.*)Episode (\d+)(.*)', title)
    if not titlematch:
        print('Unknown title pattern: %r' % (title,))
    else:
        urlmatch = re.match(r'.*_\d{2}_(?P<series>\d{2})_(?P<episode>\d{2}(_.+)?)\..*', show['url'])
        if not urlmatch:
            # Report the unexpected url and fall back to the title instead
            # of failing on a missing match object.
            print('')
            print(show)
            print('')
            print('NO MATCH for url: %s' % (show['url'],))
        else:
            # Text following "Episode <n>" in the title, if any.
            episode = titlematch.group(3).strip()
            if episode:
                episode = ' - %s' % (episode,)
            fn = '%s - s%se%s%s' % (showfolder(show),
                    urlmatch.group('series'),
                    urlmatch.group('episode'), episode)
    return validfilename(fn)

def getoutputpath(show):
    '''Given a show dict, return the full path the file should be saved to.

    The file name is showfilename(show) plus the extension of show['url'].
    The folder is BASE_OUTPUT_PATH/<showfolder(show)>; it is created, along
    with any missing parent folders, if it does not exist yet.
    '''
    extension = os.path.splitext(show['url'])[1]
    fn = showfilename(show) + extension
    outputdir = os.path.join(BASE_OUTPUT_PATH, showfolder(show))
    if not os.path.isdir(outputdir):
        # makedirs rather than mkdir: BASE_OUTPUT_PATH itself may not exist yet.
        os.makedirs(outputdir)
    ret = os.path.join(outputdir, fn)
    print("Using show name: %s" % (ret,))
    return ret

def getshowart(show):
    '''
    Gets the thumbnail JPEG if it does not already exist and puts it into
    the metadata folder in the show folder.

    Nothing is done when show['thumb'] is empty or '(none)', or when the
    image file is already present under BASE_OUTPUT_PATH. A newly fetched
    image is also copied to the matching metadata folder under
    TRANSCODED_OUTPUT_PATH. Missing folders are created as needed.
    '''
    thumb = show['thumb']
    if not thumb or thumb == '(none)':
        # Don't bother doing anything if there is no thumbnail
        return
    artname = showfilename(show) + '.jpg'
    outputdir = os.path.join(BASE_OUTPUT_PATH, showfolder(show), 'metadata')
    artfn = os.path.join(outputdir, artname)
    print("Art path: %r" % (artfn,))
    if os.path.isfile(artfn):
        # Already got it, return
        return
    print("Getting show art: %r" % (artfn,))

    # Fetch the whole image before creating the file, so a failed download
    # does not leave an empty .jpg that would later count as already fetched.
    urlf = urllib.urlopen(thumb)
    try:
        data = urlf.read()
    finally:
        urlf.close()

    # Make sure the metadata folder is there
    if not os.path.isdir(outputdir):
        os.makedirs(outputdir)
    with open(artfn, 'wb') as outf:
        outf.write(data)

    # Copy to the TRANSCODED_OUTPUT_PATH. The show folder there may not
    # exist yet, so create the whole chain.
    outputdir2 = os.path.join(TRANSCODED_OUTPUT_PATH, showfolder(show), 'metadata')
    if not os.path.isdir(outputdir2):
        os.makedirs(outputdir2)
    shutil.copy(artfn, os.path.join(outputdir2, artname))

def showfolder(show):
    '''
    Returns the name of the folder a show should be put into.

    If show['seriestitle'] contains "Series <digits>", only the text before
    it (stripped of surrounding whitespace) is kept. The result is then
    passed through validfilename().
    '''
    foldername = show['seriestitle']
    found = re.match(r'(.*)Series \d+', foldername)
    if found is not None:
        foldername = found.group(1).strip()
    return validfilename(foldername)

def postdownload(show):
    '''Called when a show download is complete.

    Converts show['filename'] with handbrake.iphoneconvert(). The output
    goes to TRANSCODED_OUTPUT_PATH/<showfolder(show)>/ under the sanitised
    base name of the source file. The output folder is created, along with
    any missing parent folders, if it does not exist yet.
    '''
    srcpath = show['filename']

    outputdir = os.path.join(TRANSCODED_OUTPUT_PATH, showfolder(show))
    outputpath = os.path.join(
        outputdir,
        validfilename(os.path.basename(srcpath))
        )

    log('Output path: %s' % (outputpath,))
    if not os.path.isdir(outputdir):
        # makedirs rather than mkdir: TRANSCODED_OUTPUT_PATH may not exist yet.
        os.makedirs(outputdir)

    handbrake.iphoneconvert(srcpath, outputpath)

def _readchunks(stream):
    '''Yields the pieces of text between carriage returns read from stream,
    one character at a time, until end of file. A trailing piece without a
    carriage return is yielded too if it is not empty.
    '''
    pending = []
    while True:
        c = stream.read(1)
        if not c:
            # End of file: hand back whatever followed the last '\r'.
            if pending:
                yield ''.join(pending)
            return
        if c == '\r':
            yield ''.join(pending)
            pending = []
        else:
            pending.append(c)


def download(show):
    '''Downloads one episode unless its output file already exists.

    The destination comes from getoutputpath(show). While the fetch process
    runs, its output is split on carriage returns and written to a file
    named 'sample' in the current directory. Three-word lines following
    "INFO: Metadata:" are stored in the show dict, and later lines
    containing "(<number>%)" drive progressbar().

    On success (exit code 0) the path is appended to DOWNLOADED and
    postdownload(show) is called; otherwise any partial file is removed.
    '''
    #outputfn = show['title'] + os.path.splitext(show['url'])[1]
    outputfn = getoutputpath(show)
    print("Getting: %s" % (outputfn,))

    # This assumes if we have the file at all - it is a complete download
    if os.path.isfile(outputfn):
        print("Already downloaded: %s" % (outputfn,))
        return

    print("Downloading: %s from %s" % (outputfn, show['url']))
    # Note: fetch.py has been modified to return a subprocess object
    p = iview.fetch.fetch_program(show['url'], dest_file=outputfn)
    show['filename'] = outputfn
    # 0 = waiting for the metadata header, 1 = reading metadata lines,
    # 2 = reading progress lines.
    loopstate = 0
    with open('sample', 'w') as f:
        for line in _readchunks(p.stdout):
            if not line:
                # Two carriage returns in a row: an empty piece, not the
                # end of the output, so keep reading.
                continue
            f.write(line)
            f.write('\n---------------\n')
            line = line.strip()
            if line == 'INFO: Metadata:':
                loopstate = 1
            elif loopstate == 1:
                parts = line.split()
                if len(parts) != 3:
                    # First line that is not "<tag> <key> <value>" ends
                    # the metadata section.
                    loopstate = 2
                    continue
                show[parts[1]] = parts[2]
            elif loopstate == 2:
                match = re.search(r'\((\d+\.\d+)%\)', line)
                if match:
                    progressbar(float(match.group(1))/100)
    p.wait()
    if p.returncode == 0:
        DOWNLOADED.append(show['filename'])
        print("Successfully downloaded %s" % (show['title'],))
        postdownload(show)
    else:
        print("Error downloading.")
        if os.path.isfile(outputfn):
            print("Removing incomplete download")
            os.remove(outputfn)

def getlastrun():
    '''Returns the modification time of LAST_SCRAPE_FILE as unix epoch
    seconds, or 0 when that file does not exist.
    '''
    if not os.path.isfile(LAST_SCRAPE_FILE):
        return 0
    return os.stat(LAST_SCRAPE_FILE).st_mtime

def savelastrun():
    '''Saves the last time of a complete run.

    Rewrites LAST_SCRAPE_FILE with a fixed marker text. The time itself is
    carried by the file's modification time, which getlastrun() reads.
    '''
    # The with-block closes the file as soon as the marker is written.
    with open(LAST_SCRAPE_FILE, 'w') as marker:
        marker.write('last iView scrape')
    
def main():
    '''Runs the scrape loop.

    Each pass sleeps, reads the wanted show patterns, looks them up in the
    iView index, downloads every episode of each matching series, fetches
    its artwork, records the run time and lists what has been downloaded
    so far. The first sleep is 5 seconds, or however long remains until
    RUN_FREQ seconds have passed since the last recorded scrape; later
    sleeps are RUN_FREQ seconds. When RUN_ONCE is true the function returns
    after the first pass; otherwise it loops forever.
    '''
    # Relative file names used by this script resolve against the script's
    # own directory.
    os.chdir(sys.path[0])

    # Sleep time for first run, check last run time
    sleeptime = 5 # Sleep for 5 seconds after startup to allow the PC to get a network connection
    lastscrape = getlastrun()
    now = time.time()
    if (now - lastscrape) < RUN_FREQ:
        sleeptime = lastscrape + RUN_FREQ - now
    print('%s %s %s' % (now, sleeptime, lastscrape))
    while 1:
        nextruntime = time.asctime(time.localtime(time.time() + sleeptime))
        print("Sleeping %d seconds. Will run at: %s" % (sleeptime, nextruntime))
        time.sleep(sleeptime)

        # Get the show list we want to watch...
        srclist = showlist()

        # Init the lib
        iview.comm.get_config()

        # See what we can find in the index...
        index = iview.comm.get_index()

        # Get the ID's of the shows we are after
        # Then update the list of episodes
        todownload = []
        for series in index:
            if fnmatchpatterns(series['title'], srclist):
                print('Found series: %s' % (series['title'],))
                seriesid = series['id']
                for showinfo in iview.comm.get_series_items(seriesid):
                    # Tag each episode with its series so the naming
                    # helpers can use it.
                    showinfo['seriesid'] = seriesid
                    showinfo['seriestitle'] = series['title']
                    todownload.append(showinfo)

        total = len(todownload)
        for position, show in enumerate(todownload):
            print("Processing show %d of %d" % (position + 1, total))
            download(show)
            getshowart(show)

        savelastrun()
        sleeptime = RUN_FREQ
        print('')
        if DOWNLOADED:
            print("This session have downloaded:")
            for show in DOWNLOADED:
                print("    %s" % (show,))

        if RUN_ONCE:
            # Honour the RUN_ONCE setting: stop after one complete pass.
            break

def test():
    '''Runs postdownload() on a canned show dict, for manual testing.
    '''
    canned = {
        'aacaot': '2.00',
        'audiochannels': '2.00',
        'audiocodecid': 'mp4a',
        'audiosamplerate': '44100.00',
        'avclevel': '30.00',
        'avcprofile': '77.00',
        'date': '2010-03-01 00:00:00',
        'description': "(Preview) Award-winning actor Catherine Tate returns as the Doctor's new companion, returning to her role as Donna Noble who featured in the 2006 Christmas special The Runaway Bride. Now reunited, the Doctor and Donna travel back to Pompeii in AD 79 on the eve of the infamous eruption where people areslowly turning to stone.\n",
        'duration': '139.18',
        'filename': 'Doctor Who Series 4.mp4',
        'height': '360.00',
        'home': 'http://shop.abc.net.au/browse/product.asp?productid=996433',
        'id': '9998147',
        'livestream': '',
        'moovposition': '9993357.00',
        'seriesid': '9998147',
        'seriestitle': 'Doctor Who Series 4',
        'thumb': 'http://shop.abc.net.au/multimediaitems/images/iview/drwho_4_04.jpg',
        'title': 'Doctor Who Series 4',
        'url': 'abcshop/drwho_4_04.mp4',
        'videocodecid': 'avc1',
        'videoframerate': '25.00',
        'width': '640.00',
    }
    postdownload(canned)
 

# Script entry point: start the scrape loop when run directly.
if __name__ == '__main__':
    main()
    # Swap the call above for the one below to run test() instead.
    #test()