Commits

Sergey Astanin committed 4d55d54

actually usable groupby() with unique groups

Comments (0)

Files changed (1)

 """Functions to split or partition sequences."""
 
 import collections
-import platform
+import itertools
+import operator
+from platform import python_version_tuple
 
-__all__ = [ "chop", "partition", "split" ]
+__all__ = [ "chop", "groupby", "partition", "split" ]
 __author__ = "Sergey Astanin"
 __license__ = "MIT"
 __version__ = "0.3"
 
-if platform.python_version_tuple()[0] == "2":
+if python_version_tuple()[0] == "2":
     _range = xrange
+    _imap = itertools.imap
 else:
     _range = range
+    _imap = map
 
 class _SubSequencer:
     """
                 if pv == predicate_value:
                     return n
                 else:
-                    self.subappend(pv, n)
+                    self._subappend(pv, n)
+
+def groupby(predicate, sequence, predicate_values=()):
+    """
+    Split a sequence into subsequences with the same predicate value
+    on all elements. Return an iterator over pairs of
+    (predicate_value, subsequence_iterator).
+
+    Arguments:
+
+    predicate         a function on sequence elements
+    sequence          original sequence
+    predicate_values  if given, build only subsequences for these
+                      values of the predicate function
+
+    Unlike itertools.groupby, return only one iterator per predicate
+    value; argument order is reversed to match other functions in this
+    module.
+
+    >>> [(k, list(i)) for k,i in groupby(lambda x: x%3, range(7))]
+    [(0, [0, 3, 6]), (1, [1, 4]), (2, [2, 5])]
+
+    This function is lazy and consumes sequence only on demand, but to
+    build the comlete list of predicate values it needs to scan the
+    entire sequence. To avoid such an eager behaviour, the function can
+    take a list of possible predicate_values in advance:
+
+    >>> if python_version_tuple()[0] > '2': xrange=range
+    >>> gs = groupby(lambda x: x%3, xrange(int(1e9)), predicate_values=(0,1,2))
+    >>> d = dict(gs)
+    >>> list(itertools.takewhile(lambda x: x < 20, d[1]))
+    [1, 4, 7, 10, 13, 16, 19]
+
+    """
+    queues = collections.defaultdict(collections.deque)
+    kvs = _imap(lambda v: (predicate(v), v), sequence)
+    kvs, kvs2 = itertools.tee(kvs, 2)
+    def uniqkeys():
+        keys = set()
+        for k, v in kvs:
+            if not (k in keys):
+                keys.add(k)
+                yield k
+    def subsequence(k):
+        while True:
+            queued = queues[k]
+            if queued:
+                yield queued.popleft()
+            else:
+                k2, v2 = next(kvs2)
+                queues[k2].append(v2)
+    pvals =  uniqkeys() if not predicate_values else predicate_values
+    return _imap(lambda k: (k, subsequence(k)), pvals)
 
 def partition(condition, sequence):
     """
     As the function works in single pass, it leads to build-up of both
     subsequences even if only one of them is consumed.
 
-    It is similar to Data.List.partition in Haskell, partition-by in
+    It is similar to Data.List.partition in Haskell, group-by in
     Clojure, or running two complementary filters:
 
        from itertools import ifilter, ifilterfalse
 
     This function is lazy and produces new chunks only on demand:
 
-    >>> if platform.python_version_tuple()[0] > '2': xrange=range
+    >>> if python_version_tuple()[0] > '2': xrange=range
     >>> chunks = chop(3, xrange(int(1e9)))
     >>> next(chunks)
     [0, 1, 2]
 
     This function is lazy and produces new chunks only on demand:
 
-    >>> if platform.python_version_tuple()[0] > '2': xrange=range
+    >>> if python_version_tuple()[0] > '2': xrange=range
     >>> chunks = split(9, xrange(int(1e9)))
     >>> next(chunks)
     [0, 1, 2, 3, 4, 5, 6, 7, 8]