1. Roger Pate
  2. nonsense

Commits

Roger Pate  committed 10b83af

praxis: add benford.py

  • Participants
  • Parent commits 0f5c730
  • Branches default

Comments (0)

Files changed (2)

File praxis/benford.py

View file
+#!/usr/bin/env python2.6
+"""Benford's law
+
+- http://programmingpraxis.com/2010/10/26/benfords-law/
+- http://en.wikipedia.org/wiki/Benford's_law
+"""
+
+import math
+import sys
+
+
+def leading_counts(iterable, base=10):
+  """Count leading digits.
+
+  Returns a list with index corresponding to leading digit.
+
+  Zero is counted for any items equal to zero; absolute value is used for negative numbers.
+  """
+  assert base >= 1
+  digits = [0] * base
+  for x in iterable:
+    x = abs(x)
+    while x >= base:
+      x //= base
+    digits[x] += 1
+  return digits
+
+def distrib(iterable, base=10, drop_zero=True):
+  """Return distribution of each leading digit.
+
+  If drop_zero, all zero values are discarded.  This has no affect if the data has no zeros.
+  """
+  counts = leading_counts(iterable, base)
+  if drop_zero:
+    counts[0] = 0
+  total = sum(counts)
+  return [1.0 * x / total for x in counts]
+
+
+def expected(digit, base=10):
+  """Return probability of given digit according to Benford's law."""
+  assert base >= 1
+  assert 0 <= digit < base
+  return math.log(1 + 1.0 / digit, base)
+
+
+def main(argv):
+  args = argv[1:]
+  if len(args) > 1:
+    return "unexpected arguments"
+  if not args or args[0] == "-":
+    f = sys.stdin
+  else:
+    f = open(args[0])
+  d = distrib(int(x) for x in f)
+  h = "n  actual  expected  diff  diff%"
+  widths = [len(x) for x in h.split("  ")]
+  print h
+  print "  ".join("-" * x for x in widths)
+  fmt = "%%%dd  %%%d.2f  %%%d.2f  %%%d.2f  %%%d.2f" % tuple(widths)
+  for n, x in zip(xrange(1, 10), d[1:]):
+    e = expected(n)
+    diff = abs(e - x)
+    print fmt % (n, 100 * x, 100 * e, 100 * diff, 100 * diff / e)
+
+if __name__ == "__main__":
+  sys.exit(main(sys.argv))

File praxis/test_benford.py

View file
+import random
+import unittest
+
+import benford
+
+
+random_data = []
+# 10 of every leading digit (no zeros)
+for x in xrange(1, 10):
+  for _ in xrange(10):
+    random_data.append(int(str(x) + str(random.randint(1, 1000000))))
+
+
+class Test_leading_counts(unittest.TestCase):
+  def test(self):
+    self.assertEqual([0, 2] + [0] * 8, benford.leading_counts([1, 100]))
+
+  def test_random_input(self):
+    self.assertEqual([0] + [10] * 9, benford.leading_counts(random_data))
+    self.assertEqual([1] + [10] * 9, benford.leading_counts([0] + random_data))
+
+class Test_distrib(unittest.TestCase):
+  def test(self):
+    p = benford.distrib(random_data)
+    self.assertEqual(0, p[0])
+    for n, x in enumerate(p[1:]):
+      self.assertAlmostEqual(1.0/9, x, places=7, msg="10 ~= %r; n = %r" % (x, n))