Commits

Marko Tasic committed 5c8882d

In filewriting.py, added the MERGE_SMALL_LIMITED merge function, which is based on MERGE_SMALL but additionally limits the size of the merged segment.

Comments (0)

Files changed (1)

src/whoosh/filedb/filewriting.py

 
 from __future__ import with_statement
 from bisect import bisect_right
+from itertools import islice
 
 from whoosh.fields import UnknownFieldError
 from whoosh.store import LockError
     return newsegments
 
 
+def MERGE_SMALL_LIMITED(writer, segments, max_count_factor=10, max_total_size=128 * 1024 ** 2, max_segments=10):
+    """
+    This policy merges small segments, where "small" is defined using a
+    heuristic based on the fibonacci sequence AND segment size!
+    """
+    total_docs = 0
+    total_size = 0
+    newsegments = list(islice(segments, max_segments, None))
+    sorted_segment_list = sorted(islice(segments, 0, max_segments), key=lambda s: s.doc_count_all())
+    
+    for i, seg in enumerate(sorted_segment_list):
+        count = seg.doc_count_all()
+        
+        if count <= 0:
+            continue
+        
+        total_docs += count
+        total_size += writer.storage.file_length(seg.make_filename(seg.COMPOUND_EXT))
+        
+        if total_docs < fib(i + 5) * max_count_factor and total_size < max_total_size:
+            reader = SegmentReader(writer.storage, writer.schema, seg)
+            writer.add_reader(reader)
+            reader.close()
+        else:
+            newsegments.append(seg)
+    
+    return newsegments
+
+
 def OPTIMIZE(writer, segments):
     """This policy merges all existing segments.
     """