Commits

Virgil Dupras committed 88bfb63

Optimized layout.group_textboxes() to fix a problem where many text elements would cause the function to stall for eons.

Comments (0)

Files changed (1)

pdfminer/layout.py

 from itertools import combinations
+from collections import defaultdict
 
 from .utils import (INF, get_bound, uniq, fsplit, drange, bbox2str, matrix2str, apply_matrix_pt,
     trailiter)
             objs = set(plane.find((x0,y0,x1,y1)))
             return objs.difference((obj1,obj2))
         # XXX this still takes O(n^2)  :(
-        dists = []
+        # `dists` used to be a list. It was converted to the defaultdict below. It's a bit faster,
+        # but it's still O(n^2).
+        dists = defaultdict(list)
         for obj1, obj2 in combinations(boxes, 2):
-            dists.append((0, dist(obj1, obj2), obj1, obj2))
-        # we sort by dist and our tuple is (c,dist,obj1,obj2)
-        sortkey = lambda tup: tup[:2]
-        dists.sort(key=sortkey)
+            dists[obj1].append((0, dist(obj1, obj2), obj2))
+        # we sort by dist
+        sortkey = lambda tup: tup[:1]
+        for subdists in dists.values():
+            subdists.sort(key=sortkey)
         plane = Plane(boxes)
-        while dists:
-            (c,d,obj1,obj2) = dists.pop(0)
+        while True:
+            for key, subdists in list(dists.items()):
+                if not subdists:
+                    del dists[key]
+            if not dists:
+                break
+            obj1, subdists = min(dists.items(), key=lambda x: x[1][0][1])
+            c, d, obj2 = subdists.pop(0)
+            if obj1 not in plane or obj2 not in plane:
+                continue
             if c == 0 and isany(obj1, obj2):
-                dists.append((1,d,obj1,obj2))
+                subdists.append((1, d, obj2))
                 continue
             if (isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or
                 isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL))):
                 group = LTTextGroupLRTB([obj1,obj2])
             plane.remove(obj1)
             plane.remove(obj2)
-            dists = [(c,d,o1,o2) for (c,d,o1,o2) in dists if o1 in plane and o2 in plane]
+            del dists[obj1]
+            if obj2 in dists:
+                del dists[obj2]
+            subdists = dists[group]
             for other in plane:
-                dists.append((0, dist(group,other), group, other))
-            dists.sort(key=sortkey)
+                subdists.append((0, dist(group, other), other))
+            subdists.sort(key=sortkey)
             plane.add(group)
         assert len(plane) in {0, 1}
         return list(plane)