Commits

Miki Tebeka committed 9b842a9

Start of pandas validation

  • Participants
  • Parent commits c281eec

Comments (0)

Files changed (6)

 dynamic-static/lighttpd.inc
 dynamic-static/lighttpd.pid
 macapp/HumbleCalc.*
+pandas-validation/points.h5

pandas-validation/bootstrap.sh

+#!/bin/bash
+# Create the points_pre and points_post databases, load schema.sql into each
+# of them, then populate both with dummy data.
+# Note: You might need to set PGUSER environment variable (probably to postgres)
+
+# Stop at the first failing command instead of populating a half-built setup
+set -e
+
+# schema.sql and populate.py are referenced relative to this script
+cd "$(dirname "$0")"
+
+for step in pre post; do
+    db="points_${step}"
+    createdb "$db"
+    # By default psql keeps going (and exits 0) after an SQL error;
+    # ON_ERROR_STOP makes a broken schema load abort the bootstrap.
+    psql -v ON_ERROR_STOP=1 -f schema.sql "$db"
+done
+./populate.py

pandas-validation/populate.py

+#!/usr/bin/env python
+'''Populate with dummy data.'''
+
+from pull import cursor
+
+from random import randint, random
+from collections import namedtuple
+
+# Parameterized INSERT used for both databases; the four %s placeholders are
+# filled by cursor.execute() with (x, y, z, value).
+sql = '''
+INSERT INTO points (
+    x
+  , y
+  , z
+  , value
+) VALUES (
+    %s
+  , %s
+  , %s
+  , %s
+)'''
+
+# Hashable 3D coordinate record, so generated points can be collected in a set
+Point = namedtuple('Point', ['x', 'y', 'z'])
+
+
+def rand_coord():
+    return randint(0, 1000)
+
+
+def rand_point():
+    return Point(
+        rand_coord(),
+        rand_coord(),
+        rand_coord(),
+    )
+
+
+def gen_points(count):
+    coords = set()
+    while len(coords) < count:
+        coords.add(rand_point())
+    return coords
+
+
+
+if __name__ == '__main__':
+    pre_cur = cursor('pre')
+    post_cur = cursor('post')
+
+    points = gen_points(10000)
+    for point in points:
+        pre_value = random() * 1000
+        post_value = pre_value
+
+        # In 1% probability, generate different value
+        if random() <= 0.01:
+            # Up to 10% difference
+            diff = (random() * 0.1) * pre_value
+            if random() > 0.5:
+                post_value += diff
+            else:
+                post_value -= diff
+
+        pre_cur.execute(sql, (point.x, point.y, point.z, pre_value))
+        post_cur.execute(sql, (point.x, point.y, point.z, post_value))
+
+    pre_cur.connection.commit()
+    post_cur.connection.commit()

pandas-validation/pull.py

+#!/usr/bin/env python
+'''Pull data from database to .h5 storage.'''
+
+# Assuming our points tables have the following schema
+# CREATE TABLE points (
+#     x INTEGER
+#   , y INTEGER
+#   , z INTEGER
+#   , value FLOAT
+# );
+# We have two databases: points_pre and points_post
+
+import psycopg2
+from psycopg2.extras import DictCursor
+from pandas import DataFrame, HDFStore
+from threading import Thread
+
+
+def cursor(step):
+    '''Return a DictCursor connected to step=pre/post database.'''
+    conn = psycopg2.connect(database='points_{}'.format(step))
+    return conn.cursor(cursor_factory=DictCursor)
+
+
+if __name__ == '__main__':
+    pre_cursor = cursor('pre')
+    post_cursor = cursor('post')
+
+    sql = 'SELECT x, y, z, value FROM points'''
+
+    # Get data in two threads to speed things up
+    pre_t = Thread(target=pre_cursor.execute, args=(sql,))
+    pre_t.start()
+    post_t = Thread(target=post_cursor.execute, args=(sql,))
+    post_t.start()
+    pre_t.join()
+    post_t.join()
+
+
+    # Create data frames
+    pre = DataFrame.from_records([dict(row) for row in pre_cursor])
+    post = DataFrame.from_records([dict(row) for row in post_cursor])
+
+    # Store data frame in HDF5 data store
+    store_file = 'points.h5'
+    store = HDFStore(store_file)
+    store['pre'] = pre
+    store['post'] = post
+    store.flush()
+
+    print('Data stored at {}'.format(store_file))

pandas-validation/schema.sql

+-- One row per 3D point: integer coordinates (x, y, z) plus a measured value.
+-- bootstrap.sh loads this schema into both points_pre and points_post.
+CREATE TABLE points (
+    x INTEGER
+  , y INTEGER
+  , z INTEGER
+  , value FLOAT
+);

pandas-validation/stats.py

+#!/usr/bin/env python
+'''Display some validation statistics on points_pre vs points_post data.'''
+
+from pandas import HDFStore
+
+def main(argv=None):
+    import sys
+    from argparse import ArgumentParser
+
+    argv = argv or sys.argv
+
+    parser = ArgumentParser(description='show some statistics')
+    parser.parse_args(argv[1:])
+
+    store = HDFStore('points.h5')
+    pre, post = store['pre'], store['post']
+
+    # Calculate diff in %
+    diff = (pre - post)/pre * 100
+
+    # Initial statistics
+    print(diff.describe())
+
+
+if __name__ == '__main__':
+    main()
+