Source

ferrybox / wsf.py

Full commit
import re
import urllib
import datetime
import logging

import elementtree.ElementTree as etree
import ElementSoup
ElementSoup.ET = etree

try:
    import json
except ImportError:
    try:
        import simplejson as json
    except ImportError:
        from django.utils import simplejson as json

# Root of the WSDOT ferry schedule pages; the endpoint URLs below extend it.
base_url = 'http://www.wsdot.wa.gov/Ferries/Schedule/'
# Schedule page requested by fetch_times().
sched_url = '%sScheduleDetail.aspx' % base_url
# Route alert RSS feed requested by fetch_alerts().
alerts_url = '%sRSSFeeds/RouteAlerts.aspx' % base_url

# id of a <div> on the schedule page, used as the anchor for DIV_PATTERN.
_PANEL_ID = "ctl00_cphPageTemplate_sSelectedSailing_pnlLeaveTerm"
# Selects every <div> child of the parent of the anchor <div> (its siblings
# and itself).
DIV_PATTERN = ".//div[@id='%s']/../div" % _PANEL_ID

def alltext(elem):
    """Return the text of *elem* and all of its descendants as one string.

    The pieces yielded by ``elem.itertext()`` are concatenated with no
    separator.
    """
    pieces = elem.itertext()
    return ''.join(pieces)

def fetch_times(src, dest, date):
    """Fetch the sailing times for one route on one day.

    Requests the schedule page at ``sched_url`` and parses its timetable.

    Args:
        src: Departing terminal id; sent as the ``departingterm`` query
            parameter.
        dest: Arriving terminal id; sent as the ``arrivingterm`` query
            parameter.
        date: Trip date as a ``'YYYYMMDD'`` string; sent as ``tripdate`` and
            used as the base date of the returned times.

    Returns:
        A list of naive ``datetime.datetime`` objects in page order.  AM
        sailings that appear after the PM sailings are dated the following
        day.

    Raises:
        ValueError: If ``date`` cannot be split into integer year, month and
            day, or if those do not form a valid date.
        IndexError: If the page does not contain the expected second
            schedule div.
    """
    # Get the base date of this schedule from the date passed.  Done first
    # so a malformed date fails before any network request is made.
    year = int(date[:4])
    month = int(date[4:6])
    day = int(date[6:])

    query = urllib.urlencode({
        'tripdate':      date,
        'departingterm': src,
        'arrivingterm':  dest,
    })

    # Fetch the schedule HTML; always release the connection, even when
    # parsing fails.
    url = sched_url + '?' + query
    logging.info('fetching url: %s', url)
    response = urllib.urlopen(url)
    try:
        sched_root = ElementSoup.parse(response)
    finally:
        response.close()

    sched_divs = sched_root.findall(DIV_PATTERN)
    sched_rows = list(sched_divs)[1].findall('.//tr')

    return _parse_sailings(sched_rows, year, month, day)


def _parse_sailings(rows, year, month, day):
    """Convert timetable <tr> rows into datetimes based on the given date."""
    times = []
    am = True
    wee_hours = False
    last_hour = 0
    for row in rows:
        cells = list(row.findall("./td"))

        if len(cells) == 1:
            # AM/PM header.  Strip surrounding whitespace so padded markup
            # is not misread as "PM", and skip rows with no text at all
            # instead of failing on the empty string.
            ampmstr = alltext(cells[0]).strip()
            if not ampmstr:
                logging.debug('empty single-cell row; skipping')
                continue
            if ampmstr[0] == 'A':
                am = True
            else:
                am = False
                # Next time we're in the AM, it's "wee-hours" sailings.
                wee_hours = True

        elif len(cells) == 3:
            # Sailing
            timestr = alltext(cells[0])
            try:
                hour, minute = timestr.split(':', 1)
                hour, minute = int(hour), int(minute)
            except ValueError:
                logging.debug('illegible time string: %s', repr(timestr))
                continue

            # 12-hour to 24-hour clock.
            if am and hour == 12:
                hour = 0
            elif (not am) and hour != 12:
                hour += 12

            # Occasionally, the schedule is missing the "PM" header: a
            # morning hour that goes backwards means we crossed noon.
            if am and hour < last_hour and last_hour < 12:
                am = False
                wee_hours = True
                hour += 12
            last_hour = hour

            dt = datetime.datetime(year, month, day, hour, minute)

            # Wee-hours sailings are the following day.
            if am and wee_hours:
                dt = dt + datetime.timedelta(days=1)

            times.append(dt)

    return times

# Matches the bracketed posting stamp inside an alert title and captures
# month, day, year, hour, minute and the A/P of AM/PM, in that order.
alert_pat = re.compile(r'\[.+?(\d+)/(\d+)/(\d+).+?(\d+):(\d+).+?([AP])M.*?\]')


def fetch_alerts(src, dest):
    """Fetch the current route alerts for one route.

    Requests the RSS feed at ``alerts_url`` and reads every item title.

    Args:
        src: Departing terminal id; sent as the ``departingterm`` query
            parameter.
        dest: Arriving terminal id; sent as the ``arrivingterm`` query
            parameter.

    Returns:
        A list of ``(posted, text)`` tuples in feed order, where ``posted``
        is a naive ``datetime.datetime`` taken from the bracketed stamp in
        the title and ``text`` is the title with that stamp removed.  Titles
        without a recognisable stamp are logged and left out.
    """
    query = urllib.urlencode({
        'departingterm': src,
        'arrivingterm':  dest,
    })

    # Always release the connection, even when the feed fails to parse.
    response = urllib.urlopen(alerts_url + '?' + query)
    try:
        alert_root = etree.parse(response).getroot()
    finally:
        response.close()

    # A <title> with no text has e.text of None; drop those up front.
    alerts = [e.text for e in alert_root.findall('.//item/title') if e.text]

    # Remove default "no alerts" message.
    if len(alerts) == 1 and alerts[0].startswith('Currently, there are no'):
        alerts = []

    # Parse post date and time.
    cleaned_alerts = []
    for alert in alerts:
        parsed = _parse_alert(alert)
        if parsed is None:
            logging.warning('illegible alert title: %s', repr(alert))
            continue
        cleaned_alerts.append(parsed)

    return cleaned_alerts


def _parse_alert(alert):
    """Split one alert title into ``(posted, text)``, or None if unstamped."""
    match = alert_pat.search(alert)
    if match is None:
        return None

    month, day, year, hour, minute, ampm = match.groups()

    # 12-hour to 24-hour clock: 12 AM is hour 0 and 12 PM stays 12.  Simply
    # adding 12 for PM would turn 12 PM into the invalid hour 24.
    hour = int(hour) % 12
    if ampm == 'P':
        hour += 12

    dt = datetime.datetime(int(year), int(month), int(day),
                           hour, int(minute))
    text = alert_pat.sub('', alert).strip()
    return (dt, text)

if __name__ == '__main__':
    # Manual smoke run: print today's sailings and the current alerts for
    # the route from terminal id 7 to terminal id 3.
    print(fetch_times(7, 3, datetime.datetime.now().strftime('%Y%m%d')))
    print(fetch_alerts(7, 3))