Commits

Bryan O'Sullivan committed b395d30

Split mutable and immutable into separate modules

Comments (0)

Files changed (4)

Data/BloomFilter.hs

     , insert
     , insertList
 
-    -- * Mutable Bloom filters
-    -- ** Immutability wrappers
-    , create
-    , modify
-
-    -- ** Creation
-    , newMB
-    , unsafeFreezeMB
-    , thawMB
-
-    -- ** Accessors
-    , lengthMB
-    , elemMB
-
-    -- ** Mutation
-    , insertMB
-
     -- * The underlying representation
     -- | If you serialize the raw bit arrays below to disk, do not
     -- expect them to be portable to systems with different
 
     -- | The raw bit array used by the immutable 'Bloom' type.
     , bitArray
-
-    -- | The raw bit array used by the immutable 'MBloom' type.
-    , bitArrayMB
     ) where
 
 #include "MachDeps.h"
 import Control.Monad (liftM, forM_)
 import Control.Monad.ST (ST, runST)
 import Control.DeepSeq (NFData(..))
-import Data.Array.Base (unsafeAt, unsafeRead, unsafeWrite)
-import Data.Array.ST (STUArray, thaw, unsafeFreeze)
+import Data.Array.Base (unsafeAt)
+import qualified Data.Array.ST as ST
 import Data.Array.Unboxed (UArray)
-import Data.Bits ((.&.), (.|.))
-import Data.BloomFilter.Array (newArray)
-import Data.BloomFilter.Util (FastShift(..), (:*)(..), nextPowerOfTwo)
+import Data.Bits ((.&.))
+import Data.BloomFilter.Util (FastShift(..), (:*)(..))
+import qualified Data.BloomFilter.Mutable as MB
+import qualified Data.BloomFilter.Mutable.Internal as MB
+import Data.BloomFilter.Mutable.Internal (Hash, MBloom)
 import Data.Word (Word32)
 
 import Prelude hiding (elem, length, notElem,
                        (/), (*), div, divMod, mod, rem)
 
-{-
-import Debug.Trace
-traceM :: (Show a, Monad m) => a -> m ()
-traceM v = show v `trace` return ()
-traces :: Show a => a -> b -> b
-traces s = trace (show s)
--}
-
--- | A hash value is 32 bits wide.  This limits the maximum size of a
--- filter to about four billion elements, or 512 megabytes of memory.
-type Hash = Word32
-
--- | A mutable Bloom filter, for use within the 'ST' monad.
-data MBloom s a = MB {
-      hashMB :: !(a -> [Hash])
-    , shiftMB :: {-# UNPACK #-} !Int
-    , maskMB :: {-# UNPACK #-} !Int
-    , bitArrayMB :: {-# UNPACK #-} !(STUArray s Int Hash)
-    }
 
 -- | An immutable Bloom filter, suitable for querying from pure code.
 data Bloom a = B {
     , bitArray :: {-# UNPACK #-} !(UArray Int Hash)
     }
 
-instance Show (MBloom s a) where
-    show mb = "MBloom { " ++ show (lengthMB mb) ++ " bits } "
-
 instance Show (Bloom a) where
-    show ub = "Bloom { " ++ show (length ub) ++ " bits } "
+    show ub = "Bloom { " ++ show ((1::Int) `shiftL` shift ub) ++ " bits } "
 
 instance NFData (Bloom a) where
     rnf !_ = ()
 
--- | Create a new mutable Bloom filter.  For efficiency, the number of
--- bits used may be larger than the number requested.  It is always
--- rounded up to the nearest higher power of two, but clamped at a
--- maximum of 4 gigabits, since hashes are 32 bits in size.
---
--- For a safer creation interface, use 'create'.  To convert a
--- mutable filter to an immutable filter for use in pure code, use
--- 'unsafeFreezeMB'.
-newMB :: (a -> [Hash])          -- ^ family of hash functions to use
-      -> Int                    -- ^ number of bits in filter
-      -> ST s (MBloom s a)
-newMB hash numBits = MB hash shift mask `liftM` newArray numElems numBytes
-  where twoBits | numBits < 1 = 1
-                | numBits > maxHash = maxHash
-                | isPowerOfTwo numBits = numBits
-                | otherwise = nextPowerOfTwo numBits
-        numElems = max 2 (twoBits `shiftR` logBitsInHash)
-        numBytes = numElems `shiftL` logBytesInHash
-        trueBits = numElems `shiftL` logBitsInHash
-        shift = logPower2 trueBits
-        mask = trueBits - 1
-        isPowerOfTwo n = n .&. (n - 1) == 0
-
-maxHash :: Int
-#if WORD_SIZE_IN_BITS == 64
-maxHash = 4294967296
-#else
-maxHash = 1073741824
-#endif
-
 logBitsInHash :: Int
 logBitsInHash = 5 -- logPower2 bitsInHash
 
-logBytesInHash :: Int
-logBytesInHash = 2 -- logPower2 (sizeOf (undefined :: Hash))
-
 -- | Create an immutable Bloom filter, using the given setup function
 -- which executes in the 'ST' monad.
 --
 -- Note that the result of the setup function is not used.
 create :: (a -> [Hash])        -- ^ family of hash functions to use
         -> Int                  -- ^ number of bits in filter
-        -> (forall s. (MBloom s a -> ST s z))  -- ^ setup function (result is discarded)
+        -> (forall s. (MBloom s a -> ST s ()))  -- ^ setup function
         -> Bloom a
 {-# INLINE create #-}
 create hash numBits body = runST $ do
-  mb <- newMB hash numBits
-  _ <- body mb
-  unsafeFreezeMB mb
+  mb <- MB.new hash numBits
+  body mb
+  unsafeFreeze mb
+
+-- | Create an immutable Bloom filter from a mutable one.  The mutable
+-- filter /must not/ be modified afterwards, or a runtime crash may
+-- occur.  For a safer creation interface, use 'create'.
+unsafeFreeze :: MBloom s a -> ST s (Bloom a)
+unsafeFreeze mb = B (MB.hashes mb) (MB.shift mb) (MB.mask mb) `liftM`
+                    ST.unsafeFreeze (MB.bitArray mb)
+
+-- | Copy an immutable Bloom filter to create a mutable one.  There is
+-- no non-copying equivalent.
+thaw :: Bloom a -> ST s (MBloom s a)
+thaw ub = MB.MB (hashes ub) (shift ub) (mask ub) `liftM` ST.thaw (bitArray ub)
 
 -- | Create an empty Bloom filter.
 --
            -> a                 -- ^ element to insert
            -> Bloom a
 {-# INLINE [1] singleton #-}
-singleton hash numBits elt = create hash numBits (\mb -> insertMB mb elt)
+singleton hash numBits elt = create hash numBits (\mb -> MB.insert mb elt)
 
 -- | Given a filter's mask and a hash value, compute an offset into
 -- a word array and a bit offset within that word.
 -- | Hash the given value, returning a list of (word offset, bit
 -- offset) pairs, one per hash value.
 hashesM :: MBloom s a -> a -> [Int :* Int]
-hashesM mb elt = hashIdx (maskMB mb) `map` hashMB mb elt
+hashesM mb elt = hashIdx (MB.mask mb) `map` MB.hashes mb elt
 
 -- | Hash the given value, returning a list of (word offset, bit
 -- offset) pairs, one per hash value.
 hashesU :: Bloom a -> a -> [Int :* Int]
 hashesU ub elt = hashIdx (mask ub) `map` hashes ub elt
 
--- | Insert a value into a mutable Bloom filter.  Afterwards, a
--- membership query for the same value is guaranteed to return @True@.
-insertMB :: MBloom s a -> a -> ST s ()
-insertMB mb elt = do
-  let mu = bitArrayMB mb
-  forM_ (hashesM mb elt) $ \(word :* bit) -> do
-      old <- unsafeRead mu word
-      unsafeWrite mu word (old .|. (1 `shiftL` bit))
-
--- | Query a mutable Bloom filter for membership.  If the value is
--- present, return @True@.  If the value is not present, there is
--- /still/ some possibility that @True@ will be returned.
-elemMB :: a -> MBloom s a -> ST s Bool
-elemMB elt mb = loop (hashesM mb elt)
-  where mu = bitArrayMB mb
-        loop ((word :* bit):wbs) = do
-          i <- unsafeRead mu word
-          if i .&. (1 `shiftL` bit) == 0
-            then return False
-            else loop wbs
-        loop _ = return True
-
 -- | Query an immutable Bloom filter for membership.  If the value is
 -- present, return @True@.  If the value is not present, there is
 -- /still/ some possibility that @True@ will be returned.
         -> Bloom a
 {-# INLINE modify #-}
 modify body ub = runST $ do
-  mb <- thawMB ub
+  mb <- thaw ub
   _ <- body mb
-  unsafeFreezeMB mb
+  unsafeFreeze mb
 
 -- | Create a new Bloom filter from an existing one, with the given
 -- member added.
 -- fusion.
 insert :: a -> Bloom a -> Bloom a
 {-# NOINLINE insert #-}
-insert elt = modify (flip insertMB elt)
+insert elt = modify (flip MB.insert elt)
 
 -- | Create a new Bloom filter from an existing one, with the given
 -- members added.
 -- fusion.
 insertList :: [a] -> Bloom a -> Bloom a
 {-# NOINLINE insertList #-}
-insertList elts = modify $ \mb -> mapM_ (insertMB mb) elts
+insertList elts = modify $ \mb -> mapM_ (MB.insert mb) elts
 
 {-# RULES "Bloom insert . insert" forall a b u.
     insert b (insert a u) = insertList [a,b] u
 notElem elt ub = any test (hashesU ub elt)
   where test (off :* bit) = (bitArray ub `unsafeAt` off) .&. (1 `shiftL` bit) == 0
 
--- | Create an immutable Bloom filter from a mutable one.  The mutable
--- filter /must not/ be modified afterwards, or a runtime crash may
--- occur.  For a safer creation interface, use 'create'.
-unsafeFreezeMB :: MBloom s a -> ST s (Bloom a)
-unsafeFreezeMB mb = B (hashMB mb) (shiftMB mb) (maskMB mb) `liftM`
-                    unsafeFreeze (bitArrayMB mb)
-
--- | Copy an immutable Bloom filter to create a mutable one.  There is
--- no non-copying equivalent.
-thawMB :: Bloom a -> ST s (MBloom s a)
-thawMB ub = MB (hashes ub) (shift ub) (mask ub) `liftM` thaw (bitArray ub)
-
--- bitsInHash :: Int
--- bitsInHash = sizeOf (undefined :: Hash) `shiftL` 3
-
--- | Return the size of a mutable Bloom filter, in bits.
-lengthMB :: MBloom s a -> Int
-lengthMB = shiftL 1 . shiftMB
-
 -- | Return the size of an immutable Bloom filter, in bits.
 length :: Bloom a -> Int
 length = shiftL 1 . shift
 unfold hashes numBits f k = create hashes numBits (loop k)
   where loop :: forall s. b -> MBloom s a -> ST s ()
         loop j mb = case f j of
-                      Just (a, j') -> insertMB mb a >> loop j' mb
+                      Just (a, j') -> MB.insert mb a >> loop j' mb
                       _ -> return ()
 
 -- | Create an immutable Bloom filter, populating it from a list of
           -> [a]                -- ^ values to populate with
           -> Bloom a
 {-# INLINE [1] fromList #-}
-fromList hashes numBits list = create hashes numBits $ forM_ list . insertMB
+fromList hashes numBits list = create hashes numBits $ forM_ list . MB.insert
 
 {-# RULES "Bloom insertList . fromList" forall h n xs ys.
     insertList xs (fromList h n ys) = fromList h n (xs ++ ys)

Data/BloomFilter/Mutable.hs

+{-# LANGUAGE BangPatterns, CPP, Rank2Types, ScopedTypeVariables,
+    TypeOperators #-}
+
+-- |
+-- Module: Data.BloomFilter.Mutable
+-- Copyright: Bryan O'Sullivan
+-- License: BSD3
+--
+-- Maintainer: Bryan O'Sullivan <bos@serpentine.com>
+-- Stability: unstable
+-- Portability: portable
+--
+-- A fast, space efficient Bloom filter implementation.  A Bloom
+-- filter is a set-like data structure that provides a probabilistic
+-- membership test.
+--
+-- * Queries do not give false negatives.  When an element is added to
+--   a filter, a subsequent membership test will definitely return
+--   'True'.
+--
+-- * False positives /are/ possible.  If an element has not been added
+--   to a filter, a membership test /may/ nevertheless indicate that
+--   the element is present.
+--
+-- This module provides low-level control.  For an easier to use
+-- interface, see the "Data.BloomFilter.Easy" module.
+
+module Data.BloomFilter.Mutable
+    (
+    -- * Overview
+    -- $overview
+
+    -- ** Ease of use
+    -- $ease
+
+    -- ** Performance
+    -- $performance
+
+    -- * Types
+      Hash
+    , MBloom
+    -- * Mutable Bloom filters
+
+    -- ** Creation
+    , new
+
+    -- ** Accessors
+    , length
+    , elem
+
+    -- ** Mutation
+    , insert
+
+    -- * The underlying representation
+    -- | If you serialize the raw bit arrays below to disk, do not
+    -- expect them to be portable to systems with different
+    -- conventions for endianness or word size.
+
+    -- | The raw bit array used by the immutable 'MBloom' type.
+    , bitArray
+    ) where
+
+#include "MachDeps.h"
+
+import Control.Monad (liftM, forM_)
+import Control.Monad.ST (ST)
+import Data.Array.Base (unsafeRead, unsafeWrite)
+import Data.Array.ST (thaw, unsafeFreeze)
+import Data.Bits ((.&.), (.|.))
+import Data.BloomFilter.Array (newArray)
+import Data.BloomFilter.Util (FastShift(..), (:*)(..), nextPowerOfTwo)
+import Data.Word (Word32)
+import Data.BloomFilter.Mutable.Internal
+
+import Prelude hiding (elem, length, notElem,
+                       (/), (*), div, divMod, mod, rem)
+
+-- | Create a new mutable Bloom filter.  For efficiency, the number of
+-- bits used may be larger than the number requested.  It is always
+-- rounded up to the nearest higher power of two, but clamped at a
+-- maximum of 4 gigabits, since hashes are 32 bits in size.
+--
+-- For a safer creation interface, use 'create'.  To convert a
+-- mutable filter to an immutable filter for use in pure code, use
+-- 'unsafeFreeze'.
+new :: (a -> [Hash])          -- ^ family of hash functions to use
+      -> Int                    -- ^ number of bits in filter
+      -> ST s (MBloom s a)
+new hash numBits = MB hash shift mask `liftM` newArray numElems numBytes
+  where twoBits | numBits < 1 = 1
+                | numBits > maxHash = maxHash
+                | isPowerOfTwo numBits = numBits
+                | otherwise = nextPowerOfTwo numBits
+        numElems = max 2 (twoBits `shiftR` logBitsInHash)
+        numBytes = numElems `shiftL` logBytesInHash
+        trueBits = numElems `shiftL` logBitsInHash
+        shift = logPower2 trueBits
+        mask = trueBits - 1
+        isPowerOfTwo n = n .&. (n - 1) == 0
+
+maxHash :: Int
+#if WORD_SIZE_IN_BITS == 64
+maxHash = 4294967296
+#else
+maxHash = 1073741824
+#endif
+
+logBitsInHash :: Int
+logBitsInHash = 5 -- logPower2 bitsInHash
+
+logBytesInHash :: Int
+logBytesInHash = 2 -- logPower2 (sizeOf (undefined :: Hash))
+
+-- | Given a filter's mask and a hash value, compute an offset into
+-- a word array and a bit offset within that word.
+hashIdx :: Int -> Word32 -> (Int :* Int)
+hashIdx mask x = (y `shiftR` logBitsInHash) :* (y .&. hashMask)
+  where hashMask = 31 -- bitsInHash - 1
+        y = fromIntegral x .&. mask
+
+-- | Hash the given value, returning a list of (word offset, bit
+-- offset) pairs, one per hash value.
+hashesM :: MBloom s a -> a -> [Int :* Int]
+hashesM mb elt = hashIdx (mask mb) `map` hashes mb elt
+
+-- | Insert a value into a mutable Bloom filter.  Afterwards, a
+-- membership query for the same value is guaranteed to return @True@.
+insert :: MBloom s a -> a -> ST s ()
+insert mb elt = do
+  let mu = bitArray mb
+  forM_ (hashesM mb elt) $ \(word :* bit) -> do
+      old <- unsafeRead mu word
+      unsafeWrite mu word (old .|. (1 `shiftL` bit))
+
+-- | Query a mutable Bloom filter for membership.  If the value is
+-- present, return @True@.  If the value is not present, there is
+-- /still/ some possibility that @True@ will be returned.
+elem :: a -> MBloom s a -> ST s Bool
+elem elt mb = loop (hashesM mb elt)
+  where mu = bitArray mb
+        loop ((word :* bit):wbs) = do
+          i <- unsafeRead mu word
+          if i .&. (1 `shiftL` bit) == 0
+            then return False
+            else loop wbs
+        loop _ = return True
+
+-- bitsInHash :: Int
+-- bitsInHash = sizeOf (undefined :: Hash) `shiftL` 3
+
+-- | Return the size of a mutable Bloom filter, in bits.
+length :: MBloom s a -> Int
+length = shiftL 1 . shift
+
+
+-- | Slow, crummy way of computing the integer log of an integer known
+-- to be a power of two.
+logPower2 :: Int -> Int
+logPower2 k = go 0 k
+    where go j 1 = j
+          go j n = go (j+1) (n `shiftR` 1)
+
+-- $overview
+--
+-- Each of the functions for creating Bloom filters accepts two parameters:
+--
+-- * The number of bits that should be used for the filter.  Note that
+--   a filter is fixed in size; it cannot be resized after creation.
+--
+-- * A function that accepts a value, and should return a fixed-size
+--   list of hashes of that value.  To keep the false positive rate
+--   low, the hashes computes should, as far as possible, be
+--   independent.
+--
+-- By choosing these parameters with care, it is possible to tune for
+-- a particular false positive rate.  The @suggestSizing@ function in
+-- the "Data.BloomFilter.Easy" module calculates useful estimates for
+-- these parameters.
+
+-- $ease
+--
+-- This module provides both mutable and immutable interfaces for
+-- creating and querying a Bloom filter.  It is most useful as a
+-- low-level way to create a Bloom filter with a custom set of
+-- characteristics, perhaps in combination with the hashing functions
+-- in 'Data.BloomFilter.Hash'.
+--
+-- For a higher-level interface that is easy to use, see the
+-- 'Data.BloomFilter.Easy' module.
+
+-- $performance
+--
+-- The implementation has been carefully tuned for high performance
+-- and low space consumption.
+--
+-- For efficiency, the number of bits requested when creating a Bloom
+-- filter is rounded up to the nearest power of two.  This lets the
+-- implementation use bitwise operations internally, instead of much
+-- more expensive multiplication, division, and modulus operations.

Data/BloomFilter/Mutable/Internal.hs

+{-# LANGUAGE BangPatterns, CPP, Rank2Types, ScopedTypeVariables,
+    TypeOperators #-}
+
+-- |
+-- Module: Data.BloomFilter.Mutable.Internal
+-- Copyright: Bryan O'Sullivan
+-- License: BSD3
+--
+-- Maintainer: Bryan O'Sullivan <bos@serpentine.com>
+-- Stability: unstable
+-- Portability: portable
+
+module Data.BloomFilter.Mutable.Internal
+    (
+    -- * Types
+      Hash
+    , MBloom(..)
+    ) where
+
+import Data.Array.Base (STUArray)
+import Data.Bits (shiftL)
+import Data.Word (Word32)
+
+import Prelude hiding (elem, length, notElem,
+                       (/), (*), div, divMod, mod, rem)
+
+-- | A hash value is 32 bits wide.  This limits the maximum size of a
+-- filter to about four billion elements, or 512 megabytes of memory.
+type Hash = Word32
+
+-- | A mutable Bloom filter, for use within the 'ST' monad.
+data MBloom s a = MB {
+      hashes :: !(a -> [Hash])
+    , shift :: {-# UNPACK #-} !Int
+    , mask :: {-# UNPACK #-} !Int
+    , bitArray :: {-# UNPACK #-} !(STUArray s Int Hash)
+    }
+
+instance Show (MBloom s a) where
+    show mb = "MBloom { " ++ show ((1::Int) `shiftL` shift mb) ++ " bits } "

bloomfilter.cabal

       base >= 4
   exposed-modules:  Data.BloomFilter
                     Data.BloomFilter.Easy
+                    Data.BloomFilter.Mutable
                     Data.BloomFilter.Hash
   other-modules:    Data.BloomFilter.Array
+                    Data.BloomFilter.Mutable.Internal
                     Data.BloomFilter.Util
   c-sources:        cbits/lookup3.c
   ghc-options:      -O2 -Wall
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.