Commits

James Taylor committed ef8be30

Various small changes to the bag of bed tools.

  • Participants
  • Parent commits f27d6ae

Comments (0)

Files changed (7)

 import psyco_full
 import sys
 from bx.bitset import BinnedBitSet
+from bx.bitset_builders import *
 from itertools import *
 
 import pkg_resources
 else:
     input = sys.stdin
 
-bitsets = binned_bitsets_from_file( open( in_fname ) )
+bitsets = binned_bitsets_from_file( input )
 
 total = 0
 for chrom in bitsets:

bed_merge_overlapping.py

 else:
     input = sys.stdin
 
-bitsets = binned_bitsets_from_file( input )
+bitsets = binned_bitsets_from_bed_file( input )
 
 for chrom in bitsets:
     bits = bitsets[chrom]

bed_rand_intersect.py

         for gap in gaps:
             if gap[0] >= length:
                 max_candidate += 1
-                candidate_bases += ( gap[0] - length + 1 )
+                candidate_bases += ( gap[0] - length )
             else: 
                 break
         if max_candidate == 0:
         chosen_index = 0
         for gap in gaps:
             gap_length, gap_start, gap_end = gap
-            if s > ( gap_length - length + 1 ):
-                s -= ( gap_length - length + 1 )
+            if s > ( gap_length - length ):
+                s -= ( gap_length - length )
                 chosen_index += 1
             else:
                 break

bed_subtract_basewise.py

 import pkg_resources
 pkg_resources.require( "bx-python" )
 
-def read_len( f ):
-    """Read a 'LEN' file and return a mapping from chromosome to length"""
-    mapping = dict()
-    for line in f:
-        fields = line.split()
-        mapping[ fields[0] ] = int( fields[1] )
-    return mapping
-
 def print_bits_as_bed( bits ):
     end = 0
     while 1:
 
 options, args = cookbook.doc_optparse.parse( __doc__ )
 try:
-    in_fname, in2_fname, len_fname = args
+    in_fname, in2_fname = args
 except:
     cookbook.doc_optparse.exit()
 
-lens = read_len( open( len_fname ) )
-
 # Read first bed into some bitsets
 
-bitsets1 = binned_bitsets_from_file( open( in_fname ), lens=lens )
-bitsets2 = binned_bitsets_from_file( open( in2_fname ), lens=lens )
+bitsets1 = binned_bitsets_from_file( open( in_fname ) )
+bitsets2 = binned_bitsets_from_file( open( in2_fname ) )
 
 for chrom in bitsets1:  
     if chrom not in bitsets1:
 
     def clear( self, index ):
         bitClearOne( self.bits, index )
+        
+    def clone( self ):
+        other = BitSet( self.bitCount )
+        other.ior( self )
+        return other
 
     def set_range( self, start, count ):   
         bitSetRange( self.bits, start, count )

bx/bitset_builders.py

 from warnings import warn
 from bx.bitset import *
+import re
 
 def binned_bitsets_from_file( f, chrom_col=0, start_col=1, end_col=2, strand_col=5, upstream_pad=0, downstream_pad=0, lens={} ):
     """
     last_bitset = None
     bitsets = dict() 
     for line in f:
-        if line.startswith("#"):
+        if line.startswith("#") or line.isspace():
             continue
         fields = line.split()
         strand = "+"
         last_bitset.set_range( start, end-start )
     return bitsets
 
+def binned_bitsets_from_bed_file( f, chrom_col=0, start_col=1, end_col=2, strand_col=5, upstream_pad=0, downstream_pad=0, lens={} ):
+    """
+    Read a file into a dictionary of bitsets. The defaults arguments 
+    
+    - 'f' should be a file like object (or any iterable containing strings)
+    - 'chrom_col', 'start_col', and 'end_col' must exist in each line. 
+    - 'strand_col' is optional, any line without it will be assumed to be '+'
+    - if 'lens' is provided bitset sizes will be looked up from it, otherwise
+      chromosomes will be assumed to be the maximum size
+    """
+    last_chrom = None
+    last_bitset = None
+    bitsets = dict() 
+    offset = 0
+    for line in f:
+        if line.startswith("#") or line.isspace():
+            continue
+        # Ignore browser lines completely
+        if line.startswith( "browser" ):
+            continue
+        # Need to check track lines due to the offset 
+        if line.startswith( "track" ):
+            m = re.search( "offset=(\d+)", line )
+            if m and m.group( 1 ):
+                offset = int( m.group(1) )
+            continue
+        fields = line.split()
+        strand = "+"
+        if len(fields) > strand_col:
+            if fields[strand_col] == "-": strand = "-"
+        chrom = fields[chrom_col]
+        if chrom != last_chrom:
+            if chrom not in bitsets:
+                if chrom in lens:
+                    size = lens[chrom]
+                else:
+                    size = MAX
+                bitsets[chrom] = BinnedBitSet( size ) 
+            last_chrom = chrom
+            last_bitset = bitsets[chrom]
+        start, end = int( fields[start_col] ) + offset, int( fields[end_col] ) + offset
+        # Switch to '+' strand coordinates if not already
+        if strand == '-':
+            start = size - end
+            end = size - start
+        if upstream_pad: start = max( 0, start - upstream_pad )
+        if downstream_pad: end = min( size, end + downstream_pad )
+        if start > end: warn( "Interval start after end!" )
+        last_bitset.set_range( start, end-start )
+    return bitsets
+
 def binned_bitsets_proximity( f, chrom_col=0, start_col=1, end_col=2, strand_col=5, upstream=0, downstream=0 ):
     """Read a file into a dictionary of bitsets"""
     last_chrom = None
 bed_merge_overlapping.py
 bed_coverage.py
 bed_complement.py
+bed_subtract_basewise.py
 find_in_sorted_file.py
 int_seqs_to_char_strings.py
 interval_count_intersections.py