Commits

James Taylor committed e974cb1

Updated to deal with multiple datasets, both sampling methods now work correctly,
but the gap list based one turns out to be slower. Might be a data structure issue
though, needs optimization.

Comments (0)

Files changed (1)

bed_rand_intersect.py

         for gap in gaps:
             if gap[0] >= length:
                 max_candidate += 1
-                candidate_bases += ( gap[0] - length )
+                candidate_bases += ( gap[0] - length + 1 )
             else: 
                 break
         if max_candidate == 0:
         # Map back to region
         chosen_index = 0
         for gap in gaps:
-            if s > ( gap[0] - length ):
-                s -= ( gap[0] - length )
+            gap_length, gap_start, gap_end = gap
+            if s > ( gap_length - length + 1 ):
+                s -= ( gap_length - length + 1 )
                 chosen_index += 1
             else:
                 break
         # Remove the chosen gap and split
+        assert ( gap_length, gap_start, gap_end ) == gaps.pop( chosen_index )
+        # gap_length, gap_start, gap_end =  gaps.pop( chosen_index )
+        assert s >= 0
+        assert gap_start + s + length <= gap_end
         gaps.reverse()
-        gap_length, gap_start, gap_end =  gaps.pop( chosen_index )
         if s > 0:
             bisect.insort( gaps, ( s, gap_start, gap_start + s ) )
-            print "gap before inserted"
         if s + length < gap_length:
             bisect.insort( gaps, ( gap_length - ( s + length ), gap_start + s + length, gap_end) )
-            print "gap after inserted"
         gaps.reverse()
         # And finally set the bits
         assert bits[gap_start + s] == 0 
         assert bits.next_set( gap_start + s, gap_start+s+length ) == gap_start+s+length 
-        assert( gap_start + s > 0 and gap_start + s + length < bits.size ), "Bad interval %d %d %d %d" % ( gap_start, s, length, bits.size )
-        bits.set_range( gap_start + s, length )
-        print gaps    
+        assert( gap_start + s >= 0 and gap_start + s + length <= bits.size ), "Bad interval %d %d %d %d %d" % ( gap_start, s, length, gap_start + s + length, bits.size )
+        bits.set_range( gap_start + s, length )   
     assert bits.count_range( 0, bits.size ) == sum( lengths )
     return bits
             
     bits = BitSet( total_length )
     bits |= mask
     lengths = lengths[:]
-    lengths.shuffle()
+    random.shuffle( lengths )
     for length in lengths:
         for i in range( maxtries ):
             # Pick a random start position
         if chr == r_chr and start < r_stop and stop >= r_start:
             rval.append( ( chr, max( start, r_start ), min( stop, r_stop ) ) )
     return rval        
-            
-def random_count_overlap( mask, lengths1, lengths2 ):
-    # Build randomly covered bitmasks
-    bits1 = throw_random( lengths1, mask )
-    bits2 = throw_random( lengths2, mask )
-    # Find intersection
-    bits1 &= bits2
-    # Print amount intersecting
-    return bits1.count_range( 0, bits1.size )
-
 
 def main():
     region_fname = sys.argv[1]
-    intervals1_fname = sys.argv[2]       
-    intervals2_fname = sys.argv[3]       
-    mask_fname = sys.argv[4]       
-    nsamples = int( sys.argv[5] )
-    total_actual = 0
-    total_lengths1 = 0
-    total_lengths2 = 0
-    total_samples = zeros( nsamples )
+    mask_fname = sys.argv[2]       
+    nsamples = int( sys.argv[3] )
+    intervals1_fname = sys.argv[4]       
+    intervals2_fnames = sys.argv[5:]       
+    nfeatures = len( intervals2_fnames )
+    total_actual = zeros( nfeatures )
+    # total_lengths1 = 0
+    total_lengths2 = zeros( nfeatures )
+    total_samples = zeros( ( nsamples, nfeatures ) )
     for line in open( region_fname ):
         # Load lengths for all intervals overlapping region
         fields = line.split()
         r_length = r_stop - r_start
         intervals1 = overlapping_in_bed( intervals1_fname, r_chr, r_start, r_stop )
         bits1 = as_bits( r_start, r_length, intervals1 )
-        intervals2 = overlapping_in_bed( intervals2_fname, r_chr, r_start, r_stop )
-        bits2 = as_bits( r_start, r_length, intervals2 )
         mask = overlapping_in_bed( mask_fname, r_chr, r_start, r_stop )
         bits_mask = as_bits( r_start, r_length, mask )
         # Sanity checks
         assert count_overlap( bits1, bits_mask ) == 0
-        assert count_overlap( bits2, bits_mask ) == 0
-        # Observed values
-        actual_overlap = count_overlap( bits1, bits2 )
-        total_actual += actual_overlap
-        # Sample 
-        lengths1 = [ stop - start for chr, start, stop in intervals1 ]
-        lengths2 = [ stop - start for chr, start, stop in intervals2 ]
-        total_lengths1 += sum( lengths1 )
-        total_lengths2 += sum( lengths2 )
-        for i in range( nsamples ):
-            total_samples[i] += random_count_overlap( bits_mask, lengths1, lengths2 )
-    print "total covered by first: %d, second: %d, overlap: %d" % ( total_lengths1, total_lengths2, total_actual )
-    print "observed overlap: %d, sample mean: %d, sample stdev: %d" % ( total_actual, stats.amean( total_samples ), stats.asamplestdev( total_samples ) )
-    print "z-score:", ( total_actual - stats.amean( total_samples ) ) / stats.asamplestdev( total_samples )
-    print "percentile:", sum( total_actual > total_samples ) / nsamples
+        # For each data set
+        for featnum, intervals2_fname in enumerate( intervals2_fnames ):
+            intervals2 = overlapping_in_bed( intervals2_fname, r_chr, r_start, r_stop )
+            bits2 = as_bits( r_start, r_length, intervals2 )
+            assert count_overlap( bits2, bits_mask ) == 0
+            # Observed values
+            actual_overlap = count_overlap( bits1, bits2 )
+            total_actual[featnum] += actual_overlap
+            # Sample 
+            lengths2 = [ stop - start for chr, start, stop in intervals2 ]
+            total_lengths2[ featnum ] += sum( lengths2 )
+            for i in range( nsamples ):
+                # Build randomly covered bitmask for second set
+                random2 = throw_random( lengths2, bits_mask )
+                # Find intersection
+                random2 &= bits1
+                # Print amount intersecting
+                total_samples[ i, featnum ] += random2.count_range( 0, random2.size )
+    fraction_overlap = total_samples / total_lengths2
+    print "\t".join( intervals2_fnames )
+    print "\t".join( map( str, total_actual/total_lengths2 ) )
+    for row in fraction_overlap:
+        print "\t".join( map( str, row ) )
+    #print "total covered by first: %d, second: %d, overlap: %d" % ( total_lengths1, total_lengths2, total_actual )
+    #print "observed overlap: %d, sample mean: %d, sample stdev: %d" % ( total_actual, stats.amean( total_samples ), stats.asamplestdev( total_samples ) )
+    #print "z-score:", ( total_actual - stats.amean( total_samples ) ) / stats.asamplestdev( total_samples )
+    #print "percentile:", sum( total_actual > total_samples ) / nsamples
     
 if __name__ == "__main__": main()
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.