Commits

jba...@Finklestein.local  committed 74c6b9a

initial import

Files changed (18)

+Copyright (c) <2009> <Jay Baird and Bob Ippolito>
+
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the "Software"), to deal in the Software without
+restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.

File README.txt

+pybloom is a module that includes a Bloom Filter data structure along with
+an implementation of Scalable Bloom Filters, as discussed in:
+
+P. Almeida, C. Baquero, N. Preguiça, D. Hutchison, Scalable Bloom Filters,
+(GLOBECOM 2007), IEEE, 2007.
+
+Bloom filters are great if you know in advance how many bits you need to set
+aside to store your entire set. Scalable Bloom Filters let the underlying bit
+array grow as a function of the false positive probability and the set size.
+
+A filter is "full" once it holds its capacity: M * ((ln 2) ** 2) / abs(ln p)
+elements, where M is the number of bits and p is the false positive
+probability. When capacity is reached, a new filter is created that is
+exponentially larger than the last, with a tighter false positive
+probability and a larger number of hash functions.
+
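+A quick back-of-the-envelope check of that sizing (a sketch that mirrors the
+arithmetic in pybloom.py, including its doubling of capacity for a 1/2 fill
+rate; not an API of the module):
+
+>>> import math
+>>> capacity, p = 1000, 0.001
+>>> num_slices = int(math.ceil(math.log(1 / p, 2)))
+>>> bits_per_slice = int(math.ceil((2 * capacity * abs(math.log(p)))
+...                                / (num_slices * (math.log(2) ** 2))))
+>>> num_slices * bits_per_slice
+28760
+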
+>>> from pybloom import BloomFilter
+>>> f = BloomFilter(capacity=1000, error_rate=0.001)
+>>> [f.add(x) for x in range(10)]
+[False, False, False, False, False, False, False, False, False, False]
+>>> all([(x in f) for x in range(10)])
+True
+>>> 10 in f
+False
+>>> 5 in f
+True
+>>> f = BloomFilter(capacity=1000, error_rate=0.001)
+>>> for i in xrange(0, f.capacity):
+...     _ = f.add(i)
+>>> abs((len(f) / float(f.capacity)) - 1.0) <= f.error_rate
+True
+
+>>> from pybloom import ScalableBloomFilter
+>>> sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
+>>> count = 10000
+>>> for i in xrange(0, count):
+...     _ = sbf.add(i)
+...
+>>> abs((len(sbf) / float(count)) - 1.0) <= sbf.error_rate
+True
+
+# len(sbf) may be slightly less than the number of inputs, because adds that
+# collide with earlier false positives are skipped; a 0.006% shortfall is
+# well below the default 0.1% error threshold.
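+
+# Growth schedule with the defaults above (initial_capacity=100,
+# error_rate=0.001, ratio=0.9, SMALL_SET_GROWTH): the i-th internal filter
+# is sized for 100 * 2**i items at error rate 0.001 * 0.9**i, so per the
+# analysis in the Almeida et al. paper the compounded false positive
+# probability stays below roughly 0.001 / (1 - 0.9) = 0.01.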

File build/lib/pybloom/__init__.py

+"""pybloom
+ 
+"""
+from pybloom import BloomFilter, ScalableBloomFilter, __version__, __author__
+    

File build/lib/pybloom/pybloom.py

+"""This module implements a bloom filter probabilistic data structure and
+an a Scalable Bloom Filter that grows in size as your add more items to it
+without increasing the false positive error_rate.
+
+Requires the bitarray library: http://pypi.python.org/pypi/bitarray/
+
+    >>> from pybloom import BloomFilter
+    >>> f = BloomFilter(capacity=10000, error_rate=0.001)
+    >>> for i in xrange(0, f.capacity):
+    ...     _ = f.add(i)
+    ...
+    >>> 0 in f
+    True
+    >>> f.capacity in f
+    False
+    >>> len(f) <= f.capacity
+    True
+    >>> abs((len(f) / float(f.capacity)) - 1.0) <= f.error_rate
+    True
+
+    >>> from pybloom import ScalableBloomFilter
+    >>> sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
+    >>> count = 10000
+    >>> for i in xrange(0, count):
+    ...     _ = sbf.add(i)
+    ...
+    >>> sbf.capacity > count
+    True
+    >>> len(sbf) <= count
+    True
+    >>> abs((len(sbf) / float(count)) - 1.0) <= sbf.error_rate
+    True
+
+"""
+import math
+import hashlib
+from struct import unpack, pack
+
+try:
+    import bitarray
+except ImportError:
+    raise ImportError('pybloom requires bitarray >= 0.3.4')
+
+__version__ = '1.0.2'
+__author__  = "Jay Baird <jay@mochimedia.com>, Bob Ippolito <bob@redivi.com>"
+
+def make_hashfuncs(num_slices, num_bits):
+    if num_bits >= (1 << 31):
+        fmt_code, chunk_size = 'Q', 8
+    elif num_bits >= (1 << 15):
+        fmt_code, chunk_size = 'I', 4
+    else:
+        fmt_code, chunk_size = 'H', 2
+    total_hash_bits = 8 * num_slices * chunk_size
+    if total_hash_bits > 384:
+        hashfn = hashlib.sha512
+    elif total_hash_bits > 256:
+        hashfn = hashlib.sha384
+    elif total_hash_bits > 160:
+        hashfn = hashlib.sha256
+    elif total_hash_bits > 128:
+        hashfn = hashlib.sha1
+    else:
+        hashfn = hashlib.md5
+    fmt = fmt_code * (hashfn().digest_size // chunk_size)
+    num_salts, extra = divmod(num_slices, len(fmt))
+    if extra:
+        num_salts += 1
+    salts = [hashfn(hashfn(pack('I', i)).digest()) for i in xrange(num_salts)]
+    def _make_hashfuncs(key):
+        if isinstance(key, unicode):
+            key = key.encode('utf-8')
+        else:
+            key = str(key)
+        rval = []
+        for salt in salts:
+            h = salt.copy()
+            h.update(key)
+            rval.extend(uint % num_bits for uint in unpack(fmt, h.digest()))
+        del rval[num_slices:]
+        return rval
+    return _make_hashfuncs
+
+
+class BloomFilter(object):
+    def __init__(self, capacity, error_rate=0.001):
+        """Implements a space-efficient probabilistic data structure
+
+        capacity
+            this BloomFilter must be able to store at least *capacity* elements
+            while maintaining no more than *error_rate* chance of false
+            positives
+        error_rate
+            the rate at which this filter may return false positives. Together
+            with *capacity*, this determines the size of the underlying bit
+            array. Inserting more than *capacity* elements greatly increases
+            the chance of false positives.
+
+        >>> b = BloomFilter(capacity=100000, error_rate=0.001)
+        >>> b.add("test")
+        False
+        >>> "test" in b
+        True
+
+        """
+        if not (0 < error_rate < 1):
+            raise ValueError("Error_Rate must be between 0 and 1.")
+        if not capacity > 0:
+            raise ValueError("Capacity must be > 0")
+        # given M = num_bits, k = num_slices, p = error_rate, n = capacity
+        # solving for m = bits_per_slice
+        # n ~= M * ((ln(2) ** 2) / abs(ln(P)))
+        # n ~= (k * m) * ((ln(2) ** 2) / abs(ln(P)))
+        # m ~= n * abs(ln(P)) / (k * (ln(2) ** 2))
+        num_slices = int(math.ceil(math.log(1 / error_rate, 2)))
+        # the error_rate constraint assumes a fill rate of 1/2
+        # so we double the capacity to simplify the API
+        bits_per_slice = int(math.ceil(
+            (2 * capacity * abs(math.log(error_rate))) /
+            (num_slices * (math.log(2) ** 2))))
+        self.error_rate = error_rate
+        self.num_slices = num_slices
+        self.bits_per_slice = bits_per_slice
+        self.capacity = capacity
+        self.num_bits = num_slices * bits_per_slice
+        self.count = 0
+        #print '\n'.join('%s = %s' % tpl for tpl in sorted(self.__dict__.items()))
+        self.bitarray = bitarray.bitarray(self.num_bits)
+        self.bitarray.setall(False)
+        self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice)
+
+    def __contains__(self, key):
+        """Tests a key's membership in this bloom filter.
+
+        >>> b = BloomFilter(capacity=100)
+        >>> b.add("hello")
+        False
+        >>> "hello" in b
+        True
+
+        """
+        bits_per_slice = self.bits_per_slice
+        bitarray = self.bitarray
+        if not isinstance(key, list):
+            hashes = self.make_hashes(key)
+        else:
+            hashes = key
+        offset = 0
+        for k in hashes:
+            if not bitarray[offset + k]:
+                return False
+            offset += bits_per_slice
+        return True
+
+    def __len__(self):
+        """Return the number of keys stored by this bloom filter."""
+        return self.count
+
+    def add(self, key, skip_check=False):
+        """ Adds a key to this bloom filter. If the key already exists in this
+        filter it will return True. Otherwise False.
+
+        >>> b = BloomFilter(capacity=100)
+        >>> b.add("hello")
+        False
+        >>> b.add("hello")
+        True
+
+        """
+        bitarray = self.bitarray
+        bits_per_slice = self.bits_per_slice
+        hashes = self.make_hashes(key)
+        if not skip_check and hashes in self:
+            return True
+        if self.count >= self.capacity:
+            raise IndexError("BloomFilter is at capacity")
+        offset = 0
+        for k in hashes:
+            self.bitarray[offset + k] = True
+            offset += bits_per_slice
+        self.count += 1
+        return False
+
+
+class ScalableBloomFilter(object):
+    SMALL_SET_GROWTH = 2 # slower, but takes up less memory
+    LARGE_SET_GROWTH = 4 # faster, but consumes memory more quickly
+
+    def __init__(self, initial_capacity=100, error_rate=0.001,
+                 mode=SMALL_SET_GROWTH):
+        """Implements a space-efficient probabilistic data structure that
+        grows as more items are added while maintaining a steady false
+        positive rate
+
+        initial_capacity
+            the initial capacity of the filter
+        error_rate
+            the rate at which this filter may return false positives. This
+            determines how each internal filter is sized. Going over capacity
+            greatly increases the chance of false positives.
+        mode
+            can be either ScalableBloomFilter.SMALL_SET_GROWTH or
+            ScalableBloomFilter.LARGE_SET_GROWTH. SMALL_SET_GROWTH is slower
+            but uses less memory. LARGE_SET_GROWTH is faster but consumes
+            memory faster.
+
+        >>> b = ScalableBloomFilter(initial_capacity=512, error_rate=0.001, \
+                                    mode=ScalableBloomFilter.SMALL_SET_GROWTH)
+        >>> b.add("test")
+        False
+        >>> "test" in b
+        True
+
+        """
+        if not (0 < error_rate < 1):
+            raise ValueError("Error_Rate must be a decimal between 0 and 1.")
+        self.scale = mode
+        self.ratio = 0.9
+        self.initial_capacity = initial_capacity
+        self.error_rate = error_rate
+        self.filters = []
+
+    def __contains__(self, key):
+        """Tests a key's membership in this bloom filter.
+
+        >>> b = ScalableBloomFilter(initial_capacity=100, error_rate=0.001, \
+                                    mode=ScalableBloomFilter.SMALL_SET_GROWTH)
+        >>> b.add("hello")
+        False
+        >>> "hello" in b
+        True
+
+        """
+        for f in reversed(self.filters):
+            if key in f:
+                return True
+        return False
+
+    def add(self, key):
+        """Adds a key to this bloom filter.
+        If the key already exists in this filter it will return True.
+        Otherwise False.
+
+        >>> b = ScalableBloomFilter(initial_capacity=100, error_rate=0.001, \
+                                    mode=ScalableBloomFilter.SMALL_SET_GROWTH)
+        >>> b.add("hello")
+        False
+        >>> b.add("hello")
+        True
+
+        """
+        if key in self:
+            return True
+        filter = self.filters[-1] if self.filters else None
+        if filter is None or filter.count >= filter.capacity:
+            num_filters = len(self.filters)
+            filter = BloomFilter(
+                capacity=self.initial_capacity * (self.scale ** num_filters),
+                error_rate=self.error_rate * (self.ratio ** num_filters))
+            self.filters.append(filter)
+        filter.add(key, skip_check=True)
+        return False
+
+    @property
+    def capacity(self):
+        """Returns the total capacity for all filters in this SBF"""
+        return sum([f.capacity for f in self.filters])
+
+    @property
+    def count(self):
+        return len(self)
+
+    def __len__(self):
+        """Returns the total number of elements stored in this SBF"""
+        return sum([f.count for f in self.filters])
+
+
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()

File build/lib/pybloom/tests.py

+import os
+import doctest
+from unittest import TestSuite
+ 
+def additional_tests():
+    proj_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    readme_fn = os.path.join(proj_dir, 'README.txt')
+    suite = TestSuite([doctest.DocTestSuite('pybloom.pybloom')])
+    if os.path.exists(readme_fn):
+        suite.addTest(doctest.DocFileSuite(readme_fn, module_relative=False))
+    return suite

File dist/pybloom-1.0.2-py2.5.egg

Binary file added.

File dist/pybloom-1.0.2.tar.gz

Binary file added.

File ez_setup.py

+#!python
+"""Bootstrap setuptools installation
+
+If you want to use setuptools in your package's setup.py, just include this
+file in the same directory with it, and add this to the top of your setup.py::
+
+    from ez_setup import use_setuptools
+    use_setuptools()
+
+If you want to require a specific version of setuptools, set a download
+mirror, or use an alternate download directory, you can do so by supplying
+the appropriate options to ``use_setuptools()``.
+
+This file can also be run as a script to install or upgrade setuptools.
+"""
+import sys
+DEFAULT_VERSION = "0.6c9"
+DEFAULT_URL     = "http://pypi.python.org/packages/%s/s/setuptools/" % sys.version[:3]
+
+md5_data = {
+    'setuptools-0.6b1-py2.3.egg': '8822caf901250d848b996b7f25c6e6ca',
+    'setuptools-0.6b1-py2.4.egg': 'b79a8a403e4502fbb85ee3f1941735cb',
+    'setuptools-0.6b2-py2.3.egg': '5657759d8a6d8fc44070a9d07272d99b',
+    'setuptools-0.6b2-py2.4.egg': '4996a8d169d2be661fa32a6e52e4f82a',
+    'setuptools-0.6b3-py2.3.egg': 'bb31c0fc7399a63579975cad9f5a0618',
+    'setuptools-0.6b3-py2.4.egg': '38a8c6b3d6ecd22247f179f7da669fac',
+    'setuptools-0.6b4-py2.3.egg': '62045a24ed4e1ebc77fe039aa4e6f7e5',
+    'setuptools-0.6b4-py2.4.egg': '4cb2a185d228dacffb2d17f103b3b1c4',
+    'setuptools-0.6c1-py2.3.egg': 'b3f2b5539d65cb7f74ad79127f1a908c',
+    'setuptools-0.6c1-py2.4.egg': 'b45adeda0667d2d2ffe14009364f2a4b',
+    'setuptools-0.6c2-py2.3.egg': 'f0064bf6aa2b7d0f3ba0b43f20817c27',
+    'setuptools-0.6c2-py2.4.egg': '616192eec35f47e8ea16cd6a122b7277',
+    'setuptools-0.6c3-py2.3.egg': 'f181fa125dfe85a259c9cd6f1d7b78fa',
+    'setuptools-0.6c3-py2.4.egg': 'e0ed74682c998bfb73bf803a50e7b71e',
+    'setuptools-0.6c3-py2.5.egg': 'abef16fdd61955514841c7c6bd98965e',
+    'setuptools-0.6c4-py2.3.egg': 'b0b9131acab32022bfac7f44c5d7971f',
+    'setuptools-0.6c4-py2.4.egg': '2a1f9656d4fbf3c97bf946c0a124e6e2',
+    'setuptools-0.6c4-py2.5.egg': '8f5a052e32cdb9c72bcf4b5526f28afc',
+    'setuptools-0.6c5-py2.3.egg': 'ee9fd80965da04f2f3e6b3576e9d8167',
+    'setuptools-0.6c5-py2.4.egg': 'afe2adf1c01701ee841761f5bcd8aa64',
+    'setuptools-0.6c5-py2.5.egg': 'a8d3f61494ccaa8714dfed37bccd3d5d',
+    'setuptools-0.6c6-py2.3.egg': '35686b78116a668847237b69d549ec20',
+    'setuptools-0.6c6-py2.4.egg': '3c56af57be3225019260a644430065ab',
+    'setuptools-0.6c6-py2.5.egg': 'b2f8a7520709a5b34f80946de5f02f53',
+    'setuptools-0.6c7-py2.3.egg': '209fdf9adc3a615e5115b725658e13e2',
+    'setuptools-0.6c7-py2.4.egg': '5a8f954807d46a0fb67cf1f26c55a82e',
+    'setuptools-0.6c7-py2.5.egg': '45d2ad28f9750e7434111fde831e8372',
+    'setuptools-0.6c8-py2.3.egg': '50759d29b349db8cfd807ba8303f1902',
+    'setuptools-0.6c8-py2.4.egg': 'cba38d74f7d483c06e9daa6070cce6de',
+    'setuptools-0.6c8-py2.5.egg': '1721747ee329dc150590a58b3e1ac95b',
+    'setuptools-0.6c9-py2.3.egg': 'a83c4020414807b496e4cfbe08507c03',
+    'setuptools-0.6c9-py2.4.egg': '260a2be2e5388d66bdaee06abec6342a',
+    'setuptools-0.6c9-py2.5.egg': 'fe67c3e5a17b12c0e7c541b7ea43a8e6',
+    'setuptools-0.6c9-py2.6.egg': 'ca37b1ff16fa2ede6e19383e7b59245a',
+}
+
+import sys, os
+try: from hashlib import md5
+except ImportError: from md5 import md5
+
+def _validate_md5(egg_name, data):
+    if egg_name in md5_data:
+        digest = md5(data).hexdigest()
+        if digest != md5_data[egg_name]:
+            print >>sys.stderr, (
+                "md5 validation of %s failed!  (Possible download problem?)"
+                % egg_name
+            )
+            sys.exit(2)
+    return data
+
+def use_setuptools(
+    version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir,
+    download_delay=15
+):
+    """Automatically find/download setuptools and make it available on sys.path
+
+    `version` should be a valid setuptools version number that is available
+    as an egg for download under the `download_base` URL (which should end with
+    a '/').  `to_dir` is the directory where setuptools will be downloaded, if
+    it is not already available.  If `download_delay` is specified, it should
+    be the number of seconds that will be paused before initiating a download,
+    should one be required.  If an older version of setuptools is installed,
+    this routine will print a message to ``sys.stderr`` and raise SystemExit in
+    an attempt to abort the calling script.
+    """
+    was_imported = 'pkg_resources' in sys.modules or 'setuptools' in sys.modules
+    def do_download():
+        egg = download_setuptools(version, download_base, to_dir, download_delay)
+        sys.path.insert(0, egg)
+        import setuptools; setuptools.bootstrap_install_from = egg
+    try:
+        import pkg_resources
+    except ImportError:
+        return do_download()       
+    try:
+        pkg_resources.require("setuptools>="+version); return
+    except pkg_resources.VersionConflict, e:
+        if was_imported:
+            print >>sys.stderr, (
+            "The required version of setuptools (>=%s) is not available, and\n"
+            "can't be installed while this script is running. Please install\n"
+            " a more recent version first, using 'easy_install -U setuptools'."
+            "\n\n(Currently using %r)"
+            ) % (version, e.args[0])
+            sys.exit(2)
+        else:
+            del pkg_resources, sys.modules['pkg_resources']    # reload ok
+            return do_download()
+    except pkg_resources.DistributionNotFound:
+        return do_download()
+
+def download_setuptools(
+    version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir,
+    delay = 15
+):
+    """Download setuptools from a specified location and return its filename
+
+    `version` should be a valid setuptools version number that is available
+    as an egg for download under the `download_base` URL (which should end
+    with a '/'). `to_dir` is the directory where the egg will be downloaded.
+    `delay` is the number of seconds to pause before an actual download attempt.
+    """
+    import urllib2, shutil
+    egg_name = "setuptools-%s-py%s.egg" % (version,sys.version[:3])
+    url = download_base + egg_name
+    saveto = os.path.join(to_dir, egg_name)
+    src = dst = None
+    if not os.path.exists(saveto):  # Avoid repeated downloads
+        try:
+            from distutils import log
+            if delay:
+                log.warn("""
+---------------------------------------------------------------------------
+This script requires setuptools version %s to run (even to display
+help).  I will attempt to download it for you (from
+%s), but
+you may need to enable firewall access for this script first.
+I will start the download in %d seconds.
+
+(Note: if this machine does not have network access, please obtain the file
+
+   %s
+
+and place it in this directory before rerunning this script.)
+---------------------------------------------------------------------------""",
+                    version, download_base, delay, url
+                ); from time import sleep; sleep(delay)
+            log.warn("Downloading %s", url)
+            src = urllib2.urlopen(url)
+            # Read/write all in one block, so we don't create a corrupt file
+            # if the download is interrupted.
+            data = _validate_md5(egg_name, src.read())
+            dst = open(saveto,"wb"); dst.write(data)
+        finally:
+            if src: src.close()
+            if dst: dst.close()
+    return os.path.realpath(saveto)
+
+
+def main(argv, version=DEFAULT_VERSION):
+    """Install or upgrade setuptools and EasyInstall"""
+    try:
+        import setuptools
+    except ImportError:
+        egg = None
+        try:
+            egg = download_setuptools(version, delay=0)
+            sys.path.insert(0,egg)
+            from setuptools.command.easy_install import main
+            return main(list(argv)+[egg])   # we're done here
+        finally:
+            if egg and os.path.exists(egg):
+                os.unlink(egg)
+    else:
+        if setuptools.__version__ == '0.0.1':
+            print >>sys.stderr, (
+            "You have an obsolete version of setuptools installed.  Please\n"
+            "remove it from your system entirely before rerunning this script."
+            )
+            sys.exit(2)
+
+    req = "setuptools>="+version
+    import pkg_resources
+    try:
+        pkg_resources.require(req)
+    except pkg_resources.VersionConflict:
+        try:
+            from setuptools.command.easy_install import main
+        except ImportError:
+            from easy_install import main
+        main(list(argv)+[download_setuptools(delay=0)])
+        sys.exit(0) # try to force an exit
+    else:
+        if argv:
+            from setuptools.command.easy_install import main
+            main(argv)
+        else:
+            print "Setuptools version",version,"or greater has been installed."
+            print '(Run "ez_setup.py -U setuptools" to reinstall or upgrade.)'
+
+def update_md5(filenames):
+    """Update our built-in md5 registry"""
+
+    import re
+
+    for name in filenames:
+        base = os.path.basename(name)
+        f = open(name,'rb')
+        md5_data[base] = md5(f.read()).hexdigest()
+        f.close()
+
+    data = ["    %r: %r,\n" % it for it in md5_data.items()]
+    data.sort()
+    repl = "".join(data)
+
+    import inspect
+    srcfile = inspect.getsourcefile(sys.modules[__name__])
+    f = open(srcfile, 'rb'); src = f.read(); f.close()
+
+    match = re.search("\nmd5_data = {\n([^}]+)}", src)
+    if not match:
+        print >>sys.stderr, "Internal error!"
+        sys.exit(2)
+
+    src = src[:match.start(1)] + repl + src[match.end(1):]
+    f = open(srcfile,'w')
+    f.write(src)
+    f.close()
+
+
+if __name__=='__main__':
+    if len(sys.argv)>2 and sys.argv[1]=='--md5update':
+        update_md5(sys.argv[2:])
+    else:
+        main(sys.argv[1:])

File pybloom.egg-info/PKG-INFO

+Metadata-Version: 1.0
+Name: pybloom
+Version: 1.0.2
+Summary: PyBloom: A Probabilistic data structure
+Home-page: http://github.com/jaybaird/python-bloomfilter/
+Author: Jay Baird
+Author-email: jay.baird@me.com
+License: MIT License
+Description: 
+        pybloom is a Python implementation of the bloom filter probabilistic data
+        structure. The module also provides a Scalable Bloom Filter that allows a
+        bloom filter to grow without knowing the original set size.
+        
+Keywords: data structures,bloom filter,bloom,filter,probabilistic,set
+Platform: any
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python
+Classifier: Operating System :: OS Independent
+Classifier: Topic :: Utilities
+Classifier: Topic :: Database :: Database Engines/Servers
+Classifier: Topic :: Software Development :: Libraries :: Python Modules

File pybloom.egg-info/SOURCES.txt

+README.txt
+setup.py
+pybloom/__init__.py
+pybloom/pybloom.py
+pybloom/tests.py
+pybloom.egg-info/PKG-INFO
+pybloom.egg-info/SOURCES.txt
+pybloom.egg-info/dependency_links.txt
+pybloom.egg-info/requires.txt
+pybloom.egg-info/top_level.txt
+pybloom.egg-info/zip-safe

File pybloom.egg-info/dependency_links.txt

+

File pybloom.egg-info/requires.txt

+bitarray>=0.3.4

File pybloom.egg-info/top_level.txt

+pybloom

File pybloom.egg-info/zip-safe

+

File pybloom/__init__.py

+"""pybloom
+ 
+"""
+from pybloom import BloomFilter, ScalableBloomFilter, __version__, __author__
+    

File pybloom/pybloom.py

+"""This module implements a bloom filter probabilistic data structure and
+an a Scalable Bloom Filter that grows in size as your add more items to it
+without increasing the false positive error_rate.
+
+Requires the bitarray library: http://pypi.python.org/pypi/bitarray/
+
+    >>> from pybloom import BloomFilter
+    >>> f = BloomFilter(capacity=10000, error_rate=0.001)
+    >>> for i in xrange(0, f.capacity):
+    ...     _ = f.add(i)
+    ...
+    >>> 0 in f
+    True
+    >>> f.capacity in f
+    False
+    >>> len(f) <= f.capacity
+    True
+    >>> abs((len(f) / float(f.capacity)) - 1.0) <= f.error_rate
+    True
+
+    >>> from pybloom import ScalableBloomFilter
+    >>> sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
+    >>> count = 10000
+    >>> for i in xrange(0, count):
+    ...     _ = sbf.add(i)
+    ...
+    >>> sbf.capacity > count
+    True
+    >>> len(sbf) <= count
+    True
+    >>> abs((len(sbf) / float(count)) - 1.0) <= sbf.error_rate
+    True
+
+"""
+import math
+import hashlib
+from struct import unpack, pack
+
+try:
+    import bitarray
+except ImportError:
+    raise ImportError('pybloom requires bitarray >= 0.3.4')
+
+__version__ = '1.0.2'
+__author__  = "Jay Baird <jay@mochimedia.com>, Bob Ippolito <bob@redivi.com>"
+
+def make_hashfuncs(num_slices, num_bits):
+    if num_bits >= (1 << 31):
+        fmt_code, chunk_size = 'Q', 8
+    elif num_bits >= (1 << 15):
+        fmt_code, chunk_size = 'I', 4
+    else:
+        fmt_code, chunk_size = 'H', 2
+    total_hash_bits = 8 * num_slices * chunk_size
+    if total_hash_bits > 384:
+        hashfn = hashlib.sha512
+    elif total_hash_bits > 256:
+        hashfn = hashlib.sha384
+    elif total_hash_bits > 160:
+        hashfn = hashlib.sha256
+    elif total_hash_bits > 128:
+        hashfn = hashlib.sha1
+    else:
+        hashfn = hashlib.md5
+    fmt = fmt_code * (hashfn().digest_size // chunk_size)
+    num_salts, extra = divmod(num_slices, len(fmt))
+    if extra:
+        num_salts += 1
+    salts = [hashfn(hashfn(pack('I', i)).digest()) for i in xrange(num_salts)]
+    def _make_hashfuncs(key):
+        if isinstance(key, unicode):
+            key = key.encode('utf-8')
+        else:
+            key = str(key)
+        rval = []
+        for salt in salts:
+            h = salt.copy()
+            h.update(key)
+            rval.extend(uint % num_bits for uint in unpack(fmt, h.digest()))
+        del rval[num_slices:]
+        return rval
+    return _make_hashfuncs
+
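+# Worked illustration of the scheme above (descriptive only, not used at
+# runtime): make_hashfuncs(10, 2876) picks 'H' (16-bit chunks) since
+# num_bits < 2**15; the 10 slices then need 8 * 10 * 2 = 160 hash bits, so
+# sha1 (20-byte digest = 10 chunks) covers them with a single salt, and each
+# key yields exactly 10 indexes, one per 2876-bit slice.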
+
+class BloomFilter(object):
+    def __init__(self, capacity, error_rate=0.001):
+        """Implements a space-efficient probabilistic data structure
+
+        capacity
+            this BloomFilter must be able to store at least *capacity* elements
+            while maintaining no more than *error_rate* chance of false
+            positives
+        error_rate
+            the rate at which this filter may return false positives. Together
+            with *capacity*, this determines the size of the underlying bit
+            array. Inserting more than *capacity* elements greatly increases
+            the chance of false positives.
+
+        >>> b = BloomFilter(capacity=100000, error_rate=0.001)
+        >>> b.add("test")
+        False
+        >>> "test" in b
+        True
+
+        """
+        if not (0 < error_rate < 1):
+            raise ValueError("Error_Rate must be between 0 and 1.")
+        if not capacity > 0:
+            raise ValueError("Capacity must be > 0")
+        # given M = num_bits, k = num_slices, p = error_rate, n = capacity
+        # solving for m = bits_per_slice
+        # n ~= M * ((ln(2) ** 2) / abs(ln(P)))
+        # n ~= (k * m) * ((ln(2) ** 2) / abs(ln(P)))
+        # m ~= n * abs(ln(P)) / (k * (ln(2) ** 2))
+        num_slices = int(math.ceil(math.log(1 / error_rate, 2)))
+        # the error_rate constraint assumes a fill rate of 1/2
+        # so we double the capacity to simplify the API
+        bits_per_slice = int(math.ceil(
+            (2 * capacity * abs(math.log(error_rate))) /
+            (num_slices * (math.log(2) ** 2))))
+        self.error_rate = error_rate
+        self.num_slices = num_slices
+        self.bits_per_slice = bits_per_slice
+        self.capacity = capacity
+        self.num_bits = num_slices * bits_per_slice
+        self.count = 0
+        #print '\n'.join('%s = %s' % tpl for tpl in sorted(self.__dict__.items()))
+        self.bitarray = bitarray.bitarray(self.num_bits)
+        self.bitarray.setall(False)
+        self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice)
+
+    def __contains__(self, key):
+        """Tests a key's membership in this bloom filter.
+
+        >>> b = BloomFilter(capacity=100)
+        >>> b.add("hello")
+        False
+        >>> "hello" in b
+        True
+
+        """
+        bits_per_slice = self.bits_per_slice
+        bitarray = self.bitarray
+        if not isinstance(key, list):
+            hashes = self.make_hashes(key)
+        else:
+            hashes = key
+        offset = 0
+        for k in hashes:
+            if not bitarray[offset + k]:
+                return False
+            offset += bits_per_slice
+        return True
+
+    def __len__(self):
+        """Return the number of keys stored by this bloom filter."""
+        return self.count
+
+    def add(self, key, skip_check=False):
+        """ Adds a key to this bloom filter. If the key already exists in this
+        filter it will return True. Otherwise False.
+
+        >>> b = BloomFilter(capacity=100)
+        >>> b.add("hello")
+        False
+        >>> b.add("hello")
+        True
+
+        """
+        bitarray = self.bitarray
+        bits_per_slice = self.bits_per_slice
+        hashes = self.make_hashes(key)
+        if not skip_check and hashes in self:
+            return True
+        if self.count >= self.capacity:
+            raise IndexError("BloomFilter is at capacity")
+        offset = 0
+        for k in hashes:
+            self.bitarray[offset + k] = True
+            offset += bits_per_slice
+        self.count += 1
+        return False
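+
+    # Layout note (descriptive only): the bit array is partitioned into
+    # num_slices slices of bits_per_slice bits each, and every one of a
+    # key's hashes addresses its own slice; this is the partitioned scheme
+    # analyzed in the Almeida et al. paper.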
+
+
+class ScalableBloomFilter(object):
+    SMALL_SET_GROWTH = 2 # slower, but takes up less memory
+    LARGE_SET_GROWTH = 4 # faster, but consumes memory more quickly
+
+    def __init__(self, initial_capacity=100, error_rate=0.001,
+                 mode=SMALL_SET_GROWTH):
+        """Implements a space-efficient probabilistic data structure that
+        grows as more items are added while maintaining a steady false
+        positive rate
+
+        initial_capacity
+            the initial capacity of the filter
+        error_rate
+            the rate at which this filter may return false positives. This
+            determines how each internal filter is sized. Going over capacity
+            greatly increases the chance of false positives.
+        mode
+            can be either ScalableBloomFilter.SMALL_SET_GROWTH or
+            ScalableBloomFilter.LARGE_SET_GROWTH. SMALL_SET_GROWTH is slower
+            but uses less memory. LARGE_SET_GROWTH is faster but consumes
+            memory faster.
+
+        >>> b = ScalableBloomFilter(initial_capacity=512, error_rate=0.001, \
+                                    mode=ScalableBloomFilter.SMALL_SET_GROWTH)
+        >>> b.add("test")
+        False
+        >>> "test" in b
+        True
+
+        """
+        if not (0 < error_rate < 1):
+            raise ValueError("Error_Rate must be a decimal between 0 and 1.")
+        self.scale = mode
+        self.ratio = 0.9
+        self.initial_capacity = initial_capacity
+        self.error_rate = error_rate
+        self.filters = []
+
+    def __contains__(self, key):
+        """Tests a key's membership in this bloom filter.
+
+        >>> b = ScalableBloomFilter(initial_capacity=100, error_rate=0.001, \
+                                    mode=ScalableBloomFilter.SMALL_SET_GROWTH)
+        >>> b.add("hello")
+        False
+        >>> "hello" in b
+        True
+
+        """
+        for f in reversed(self.filters):
+            if key in f:
+                return True
+        return False
+
+    def add(self, key):
+        """Adds a key to this bloom filter.
+        If the key already exists in this filter it will return True.
+        Otherwise False.
+
+        >>> b = ScalableBloomFilter(initial_capacity=100, error_rate=0.001, \
+                                    mode=ScalableBloomFilter.SMALL_SET_GROWTH)
+        >>> b.add("hello")
+        False
+        >>> b.add("hello")
+        True
+
+        """
+        if key in self:
+            return True
+        filter = self.filters[-1] if self.filters else None
+        if filter is None or filter.count >= filter.capacity:
+            num_filters = len(self.filters)
+            filter = BloomFilter(
+                capacity=self.initial_capacity * (self.scale ** num_filters),
+                error_rate=self.error_rate * (self.ratio ** num_filters))
+            self.filters.append(filter)
+        filter.add(key, skip_check=True)
+        return False
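+
+    # Growth mechanics (descriptive only): with SMALL_SET_GROWTH the i-th
+    # internal BloomFilter gets capacity initial_capacity * 2**i and error
+    # rate error_rate * 0.9**i, and __contains__ probes the newest filters
+    # first, so recently added keys are matched quickly.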
+
+    @property
+    def capacity(self):
+        """Returns the total capacity for all filters in this SBF"""
+        return sum([f.capacity for f in self.filters])
+
+    @property
+    def count(self):
+        return len(self)
+
+    def __len__(self):
+        """Returns the total number of elements stored in this SBF"""
+        return sum([f.count for f in self.filters])
+
+
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()

File pybloom/tests.py

+import os
+import doctest
+from unittest import TestSuite
+ 
+def additional_tests():
+    proj_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    readme_fn = os.path.join(proj_dir, 'README.txt')
+    suite = TestSuite([doctest.DocTestSuite('pybloom.pybloom')])
+    if os.path.exists(readme_fn):
+        suite.addTest(doctest.DocFileSuite(readme_fn, module_relative=False))
+    return suite
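+
+# Usage note (an assumption about setuptools' test loader, not part of the
+# original file): with test_suite="pybloom.tests" in setup.py, running
+# "python setup.py test" should discover additional_tests() and fold the
+# module and README doctests into the suite.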

File setup.py

+#!/usr/bin/env python
+from ez_setup import use_setuptools
+use_setuptools()
+
+import os
+
+from setuptools import setup, find_packages, Extension
+
+VERSION = '1.0.2'
+DESCRIPTION = "PyBloom: A Probabilistic data structure"
+LONG_DESCRIPTION = """
+pybloom is a Python implementation of the bloom filter probabilistic data
+structure. The module also provides a Scalable Bloom Filter that allows a 
+bloom filter to grow without knowing the original set size.
+"""
+
+CLASSIFIERS = filter(None, map(str.strip,
+"""                 
+Intended Audience :: Developers
+License :: OSI Approved :: MIT License
+Programming Language :: Python
+Operating System :: OS Independent
+Topic :: Utilities
+Topic :: Database :: Database Engines/Servers
+Topic :: Software Development :: Libraries :: Python Modules
+""".splitlines()))
+
+setup(
+    name="pybloom",
+    version=VERSION,
+    description=DESCRIPTION,
+    long_description=LONG_DESCRIPTION,
+    classifiers=CLASSIFIERS,
+    keywords=('data structures', 'bloom filter', 'bloom', 'filter',
+              'probabilistic', 'set'),
+    author="Jay Baird",
+    author_email="jay.baird@me.com",
+    url="http://github.com/jaybaird/python-bloomfilter/",
+    license="MIT License",
+    packages=find_packages(exclude=['ez_setup']),
+    platforms=['any'],
+    test_suite="pybloom.tests",
+    zip_safe=True,
+    install_requires=['bitarray>=0.3.4']
+)
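+
+# Install sketch (era-appropriate workflow, not part of the original file):
+# "python setup.py install" bootstraps setuptools through ez_setup when it
+# is missing and fetches bitarray>=0.3.4 from PyPI via install_requires.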