Commits

Sergey Astanin committed 30299d4

split(): break into chunks on given separator

Comments (0)

Files changed (1)

 """Functions to split or partition sequences."""
 
-from collections import deque
-from platform import python_version_tuple
+import collections
+import platform
 
-__all__ = [ "partition", "chop" ]
+__all__ = [ "chop", "partition", "split" ]
 __author__ = "Sergey Astanin"
 __license__ = "MIT"
 __version__ = "0.2"
 
-if python_version_tuple()[0] == "2":
+if platform.python_version_tuple()[0] == "2":
     _range = xrange
 else:
     _range = range
 
-class _SplitSeq:
+class _SubSequencer:
     """
     Lazily process a sequence in single pass and split into two.
-
     Computes both output sequences even if only one of them is consumed.
     """
     def __init__(self, condition, sequence):
         self.cond = condition
-        self.goods = deque([])
-        self.bads = deque([])
+        self.goods = collections.deque([])
+        self.bads = collections.deque([])
         self.seq = iter(sequence)
     def getNext(self, getGood=True):
         if getGood:
 
     """
     cond = condition if condition else bool  # eval as bool if condition is None
-    ss = _SplitSeq(cond, sequence)
+    ss = _SubSequencer(cond, sequence)
     def condition_holds():
         while 1:
             yield ss.getNext(getGood=True)
 
 def chop(n, sequence, truncate=False):
     """
-    Split a sequence into chunks of size n. Return an iterator over chunks.
+    Split a sequence into chunks of size n.
+    Return an iterator over chunks.
 
     Arguments:
 
     >>> list(chop(3, range(6)))
     [[0, 1, 2], [3, 4, 5]]
 
+    This function is lazy and produces new chunks only on demand:
+
+    >>> if platform.python_version_tuple()[0] > '2': xrange=range
+    >>> chunks = chop(3, xrange(int(1e9)))
+    >>> next(chunks)
+    [0, 1, 2]
+
     """
     assert n > 1, "chunk size is not positive"
     def chopper():
                 yield head
     return chopper()
 
+def _nextByDelim(delimiter, seq):
+    "Next chunk from from the sequence seq, and sequence tail."
+    iseq = iter(seq)
+    chunk = []
+    try:
+        while True:
+            x = next(iseq)
+            if x != delimiter:
+                chunk.append(x)
+            else:
+                break
+        return chunk, iseq
+    except StopIteration:
+        return chunk, ()
+
+def split(delimiter, sequence, maxsplit=None):
+    """
+    Break a sequence on elements equal to delimiter.
+    Return an iterator over chunks (delimiters excluded).
+
+    If maxsplit is given, at most maxsplit splits are done.
+
+    >>> list(split(0, [1,2,3,0,4,5,0,0,6]))
+    [[1, 2, 3], [4, 5], [], [6]]
+
+    >>> list(map(list, split(0, [1,2,3,0,4,5,0,0,6], maxsplit=2)))
+    [[1, 2, 3], [4, 5], [0, 6]]
+
+    This function is lazy and produces new chunks only on demand:
+
+    >>> if platform.python_version_tuple()[0] > '2': xrange=range
+    >>> chunks = split(9, xrange(int(1e9)))
+    >>> next(chunks)
+    [0, 1, 2, 3, 4, 5, 6, 7, 8]
+
+    """
+    def splitter():
+        tail = sequence
+        splits = 0
+        while tail:
+            if maxsplit and splits >= maxsplit:
+                yield tail
+                tail = None
+            else:
+                chunk, tail = _nextByDelim(delimiter, tail)
+                splits += 1
+                yield chunk
+    return splitter()
+
 # When executed as a script, run the doctest examples embedded in this
 # module's docstrings.
 if __name__ == "__main__":
     import doctest
     doctest.testmod()