Commits

Bob Harris committed d4b947a

_actually_ faster binary search version (rev 390 was in fact slower than rev 389)

  • Participants
  • Parent commits 5196de0

Comments (0)

Files changed (1)

File lib/bx/intervals/random.py

     """
 
     # Implementation:
-    #   We keep a list of the gaps, sorted from smallest to largest (so that
-    #   we can use insort to insert split gaps without having to reverse the
-    #   list).  We then place each length by following steps:
+    #   We keep a list of the gaps, sorted from largest to smallest.  We then
+    #   place each length by following steps:
     #     (1) construct a candidate counts array (cc array)
     #     (2) choose a candidate at random
     #     (3) find gap containing that candidate
     #
     #   The cc array is only constructed if there's a change (different length
     #   to place, or the gap list has changed).  It contains, for each gap, the
-    #   total number of number of candidate positions in gaps *following* it in
+    #   total number of number of candidate positions in gaps *preceding* it in
     #   the gap list:
-    #     cc[i] = sum over k in (i+1)..(N-1) of length[i] - L + 1
+    #     cc[i] = sum over k in 0..(i-1) of length[i] - L + 1
     #   where N is the number of gaps and L is the length being thrown.
     #   At the same time, we determine the total number of candidates (the total
     #   number of places the current length can be placed) and the index range
     #   example:
     #     for L = 20
     #     i =           0   1   2   3   4   5   6   7   8   9
-    #     length[i] =   8  11  17  29  40  48  50  56  66  96
-    #     cc[i] =       X   X   X 242 221 192 161 124  77   0
+    #     length[i] =  96  66  56  50  48  40  29  17  11   8
+    #     cc[i] =       0  77 124 161 192 221 242   X   X   X
     #     candidates = 252
-    #     lo_gap = 3
-    #     hi_gap = 9
+    #     lo_gap = 0
+    #     hi_gap = 6
     #
     #   The candidate is chosen in (0..candidates-1).  The candidate counts
     #   array allows us to do a binary search to locate the gap that holds that
     #   candidate.  Continuing the example above, we choose a random candidate
     #   s in (0..251).  If s happens to be in (124..160), it will be mapped to
-    #   gap 7 at start position s-124.
+    #   gap 2 at start position s-124.
     #
-    #   During the binary search, if we are looking at gap 5, if s < cc[5]
-    #   then the desired gap is gap 6 or higher.  Otherwise iit is gap 5 or
-    #   lower.
+    #   During the binary search, if we are looking at gap 3, if s < cc[3]
+    #   then the desired gap is gap 2 or lower.  Otherwise it is gap 3 or
+    #   higher.
 
     # Use mask to find the gaps;  gaps is a list of (length,start,end)
     lengths = [length for length in lengths if length > 0]
         end = mask.next_set( start )
         if end-start >= min_length:
             gaps.append( ( end-start, start, end ) )
-    # Sort (long regions last)    
+    # Sort (long regions first)    
     gaps.sort()
+    gaps.reverse()
     # And throw
     prev_length = None # (force initial cc array construction)
     cc = [0] * (len( gaps ) + len(lengths) - 1)
             prev_length = length
             assert len( cc ) >= len( gaps )
             candidates = 0
-            lo_gap = len(gaps) - 1
-            while lo_gap >= 0:
-                gap_len = gaps[lo_gap][0]
+            hi_gap = 0
+            for gap in gaps:
+                gap_len = gap[0]
                 if gap_len < length:
                     break
-                cc[lo_gap] = candidates
+                cc[hi_gap] = candidates
                 candidates += gap_len - length + 1
-                lo_gap -= 1
+                hi_gap += 1
             if candidates == 0:
                 raise MaxtriesException( "No gap can fit region of length %d" % length )
-            lo_gap += 1
+            hi_gap -= 1
         # Select a candidate
         s = random.randrange( candidates )
         #..
         #..for ix in range( len( gaps ) ):
         #..    gap = gaps[ix]
-        #..    if ix < lo_gap: print "%2s: %5s %5s %5s" % ( ix, gap[1], gap[0], "X" )
-        #..    else:           print "%2s: %5s %5s %5s" % ( ix, gap[1], gap[0], cc[ix] )
+        #..    if ix <= hi_gap: print "%2s: %5s %5s %5s" % ( ix, gap[1], gap[0], cc[ix] )
+        #..    else:            print "%2s: %5s %5s %5s" % ( ix, gap[1], gap[0], "X" )
         #..print "s = %s (of %s candidates)" % ( s, candidates )
-        # Locate gap containing that candidate, by binary search
-        lo = lo_gap
-        hi = len(gaps) - 1
+        # .Locate gap containing that candidate, by binary search
+        lo = 0
+        hi = hi_gap
         while hi > lo:
-            mid = (lo + hi) / 2
-            if s < cc[mid]: lo = mid+1  # (s <  num candidates from mid+1..N-1)
-            else:           hi = mid    # (s >= num candidates from mid+1..N-1)
+            mid = (lo + hi + 1) / 2     # (we round up to prevent infinite loop)
+            if s < cc[mid]: hi = mid-1  # (s <  num candidates from 0..mid-1)
+            else:           lo = mid    # (s >= num candidates from 0..mid-1)
         s -= cc[lo]
         # If we are not allowing overlaps we will remove the placed interval
         # from the gap list
             gap_length, gap_start, gap_end = gaps.pop( lo )
             assert s >= 0
             assert gap_start + s + length <= gap_end, "Expected: %d + %d + %d == %d <= %d" % ( gap_start, s, length, gap_start + s + length, gap_end )
+            gaps.reverse()
             if s >= min_length:
                 bisect.insort( gaps, ( s, gap_start, gap_start + s ) )
             if s + length <= gap_length - min_length:
                 bisect.insort( gaps, ( gap_length - ( s + length ), gap_start + s + length, gap_end ) )
+            gaps.reverse()
             prev_length = None # (force cc array construction)
         # Save the new interval
         save_interval_func( gap_start + s, gap_start + s + length )