Commits

Miki Tebeka committed d962b6d

stacking

  • Participants
  • Parent commits 7dba5e8

Comments (0)

Files changed (1)

 #!/usr/bin/env python
 
 import pandas as pd
-from datetime import date
+from pandas.lib import Timestamp
+import numpy as np
 
 columns = [
     "sighted_at",
     "description",
 ]
 
-def asdate(n):
-    if int(n) == 0:
-        return None
+def asdate(v):
+    # Convert a raw string field to a pandas Timestamp.  Blank values and
+    # values Timestamp() rejects both come back as NaN, which the
+    # pd.isnull() filters further down treat as missing.
+    if not v.strip():
+        return np.nan
 
-    year, month, day = int(n[:4]), int(n[4:6]), int(n[6:])
-    month = month or 1
-    day = day or 1
-
-    return date(year, month, day)
+    try:
+        return Timestamp(v)
+    except ValueError:
+        # Unparseable text is treated the same as an empty field.
+        return np.nan
 
 
 df = pd.read_csv(
         'reported_at': asdate,
     },
 )
+
+def nth(i):
+    """Build an extractor for field *i* of a "city, state" style string.
+
+    The returned function splits its argument on commas and returns the
+    stripped i-th field.  It returns None when the argument is not a
+    string (presumably how missing locations arrive -- confirm against
+    the CSV) or when it does not contain exactly two fields.
+    """
+    def f(v):
+        # isinstance, unlike an exact type comparison, also accepts
+        # str subclasses.
+        if not isinstance(v, str):
+            return None
+
+        fields = v.split(',')
+        if len(fields) != 2:
+            return None
+
+        return fields[i].strip()
+    return f
+
+# Split each location into city and state columns (None when the location
+# is not a two-field string).
+df['city'] = df.location.apply(nth(0))
+df['state'] = df.location.apply(nth(1))
+
+# Postal codes of the 50 states; DC and the territories are not included.
+us_states = {
+    'AK','AL','AR','AZ','CA','CO','CT','DE','FL','GA','HI','IA','ID','IL',
+    'IN','KS','KY','LA','MA','MD','ME','MI','MN','MO','MS','MT','NC','ND','NE','NH',
+    'NJ','NM','NV','NY','OH','OK','OR','PA','RI','SC','SD','TN','TX','UT','VA','VT',
+    'WA','WI','WV','WY'
+}
+# otypes is given explicitly: without it np.vectorize has to infer the
+# result type from the first element and raises ValueError on an empty
+# column.
+isus = np.vectorize(us_states.__contains__, otypes=[bool])
+df = df[isus(df.state)]
+
+# Keep only rows where both dates are present.  `~` is the boolean NOT for
+# a mask; unary minus on a boolean array is rejected by current
+# numpy/pandas.
+df = df[~pd.isnull(df.reported_at)]
+df = df[~pd.isnull(df.sighted_at)]
+
+# Bucket every sighting by the first day of its month.
+df['ym'] = df.sighted_at.map(lambda v: v.replace(day=1))
+
+# Build the month range from the bucketed column: starting at the raw
+# minimum date would skip the first month whenever that date is not the 1st.
+ym_range = pd.date_range(df.ym.min(), df.ym.max(), freq='MS')
+ym_list = ym_range.tolist()
+# Every (state, month) pair, so that months without sightings appear as 0
+# instead of being absent from the result.
+idx = [(state, ym) for state in sorted(us_states) for ym in ym_list]
+full_idx = pd.MultiIndex.from_tuples(idx, names=['state', 'ym'])
+sight_count = df.groupby(['state', 'ym'])['ym'].count()
+# reindex returns a new Series rather than modifying in place; keep it.
+sight_count = sight_count.reindex(full_idx, fill_value=0)
+
+