# Source
#
# ml4hackers / ufo.py

#!/usr/bin/env python

import pandas as pd
from pandas.lib import Timestamp
import numpy as np

# Names given, in order, to the columns of ufo_fixed.tsv when it is loaded.
columns = [
    "sighted_at", "reported_at", "location",
    "shape", "duration", "description",
]

def asdate(v):
    """Parse a raw date field into a pandas Timestamp.

    Used as the ``read_csv`` converter for the two date columns.

    Args:
        v: Raw field text (a string).

    Returns:
        A ``pd.Timestamp`` when the text parses; ``np.nan`` when the text is
        blank or ``pd.Timestamp`` rejects it with ``ValueError``.
    """
    if not v.strip():
        return np.nan

    try:
        # Use the public pd.Timestamp rather than the private pandas.lib
        # module, which is not available in later pandas releases.
        return pd.Timestamp(v)
    except ValueError:
        return np.nan


# Load the tab-separated sightings file, naming the columns from `columns`
# and parsing both date columns with `asdate`. The file is opened in a
# `with` block so the handle is closed once pandas has finished reading.
with open('ufo_fixed.tsv') as tsv_file:
    df = pd.read_csv(
        tsv_file,
        sep='\t',
        names=columns,
        converters={
            'sighted_at': asdate,
            'reported_at': asdate,
        },
    )

def nth(i):
    """Build an extractor for one part of a "city, state" style string.

    Args:
        i: Index into the two comma-separated parts (0 for the first,
            1 for the second).

    Returns:
        A one-argument function. Given a string containing exactly one
        comma, it returns part ``i`` with surrounding whitespace stripped.
        For non-string values, or strings that do not split into exactly
        two parts, it returns ``None``.
    """
    def f(v):
        # isinstance rather than an exact type comparison, so str
        # subclasses are accepted; non-strings (e.g. NaN) are rejected.
        if not isinstance(v, str):
            return None

        fields = v.split(',')
        if len(fields) != 2:
            return None

        return fields[i].strip()
    return f

# Split 'location' into its two comma-separated parts: the first becomes
# 'city', the second 'state'. Rows nth() cannot split get None in both.
df['city'] = df['location'].apply(nth(0))
df['state'] = df['location'].apply(nth(1))

# Two-letter codes used to keep only rows whose 'state' is one of these.
us_states = {
    'AK','AL','AR','AZ','CA','CO','CT','DE','FL','GA','HI','IA','ID','IL',
    'IN','KS','KY','LA','MA','MD','ME','MI','MN','MO','MS','MT','NC','ND','NE','NH',
    'NJ','NM','NV','NY','OH','OK','OR','PA','RI','SC','SD','TN','TX','UT','VA','VT',
    'WA','WI','WV','WY'
}
# Element-wise membership test. otypes is given explicitly because
# np.vectorize otherwise infers the output type from the first element and
# raises on a size-0 input, which would crash on an empty frame.
isus = np.vectorize(us_states.__contains__, otypes=[bool])
df = df[isus(df.state)]

# Keep only rows where both date columns hold a value. dropna replaces the
# original "-pd.isnull(...)" masks: unary minus is not a supported way to
# invert a boolean mask (NumPy rejects it on boolean arrays).
df = df.dropna(subset=['reported_at', 'sighted_at'])

# Bucket every sighting into its month by moving the date to day 1.
df['ym'] = df.sighted_at.map(lambda v: v.replace(day=1))

# Build the month range from the bucketed values, not the raw sighting
# dates: with freq='MS' the range starts at the first month start on or
# after `start`, so a mid-month minimum would leave out the earliest month.
ym_range = pd.date_range(df['ym'].min(), df['ym'].max(), freq='MS')
ym_list = ym_range.tolist()

# Every (state, month) pair, states in sorted order with all months listed
# under each state. us_states is a set, so it is sorted here rather than
# multiplied.
idx = [(state, month) for state in sorted(us_states) for month in ym_list]
full_idx = pd.MultiIndex.from_tuples(idx, names=['states', 'ym'])

# Sightings per (state, month); pairs with no sightings are filled with 0.
# reindex returns a new Series, so the result has to be assigned back.
sight_count = df.groupby(['state', 'ym'])['ym'].count()
sight_count = sight_count.reindex(full_idx, fill_value=0)