Commits

Greg Von Kuster committed 30d3106

Fix bug in binned_bitsets - now handling errors in incoming lens dictionary. Added a new subclass of NiceReaderWrapper to handle ValueError, IndexError and OverflowError exceptions thorwn when creating / using bitsets.

Comments (0)

Files changed (7)

lib/bx/intervals/io.py

                                 self.strand_col, self.default_strand, fix_strand=self.fix_strand )
 
     def binned_bitsets( self , upstream_pad=0, downstream_pad=0, lens={} ):
+        # The incoming lens dictionary is a dictionary of chromosome lengths
+        # which are used to initialize the bitsets.
         last_chrom = None
         last_bitset = None
         bitsets = dict()
                 chrom = interval[self.chrom_col]
                 if chrom != last_chrom:
                     if chrom not in bitsets:
-                        if chrom in lens:
-                            size = lens[chrom]
-                        else:
-                            size = MAX
+                        size = lens.get( chrom, MAX )
                         try:
                             bbs = BinnedBitSet( size )
                         except ValueError, e:
-                            continue
+                            # We will only reach here when constructing this bitset from the lens dict
+                            # since the value of MAX is always safe.
+                            raise Exception( "Invalid chrom length %s in 'lens' dictionary. %s" % ( str( size ), str( e ) ) )
                         bitsets[chrom] = bbs
                     last_chrom = chrom
                     last_bitset = bitsets[chrom]
-                start = max(int( interval[self.start_col]), 0 )
-                end = min(int( interval[self.end_col]), size)
-                try:
-                    last_bitset.set_range( start, end-start )
-                except OverflowError, e:
-                    continue
+                start = max( int( interval[self.start_col] ), 0 )
+                end = min( int( interval[self.end_col] ), size)
+                last_bitset.set_range( start, end-start )
         return bitsets
 
 class NiceReaderWrapper( GenomicIntervalReader ):
         while 1:
             self.current_line = self.input_wrapper.next()
             yield self.current_line
+
+class BitsetSafeReaderWrapper( NiceReaderWrapper ):
+    def __init__( self, reader, lens={} ):
+        # This class handles any ValueError, IndexError and OverflowError exceptions that may be thrown when
+        # the bitsets are being created by skipping the problem lines.
+        # The incoming lens dictionary is a dictionary of chromosome lengths
+        # which are used to initialize the bitsets.
+        NiceReaderWrapper.__init__( self, reader.input )
+        self.lens = lens
+    def next( self ):
+        rval = NiceReaderWrapper.next( self )
+        if type( rval ) == GenomicInterval:
+            if rval.end > self.lens.get( rval.chrom, MAX ): # MAX_INT is defined in bx.bitset
+                try:
+                    # This will only work if reader is a NiceReaderWrapper
+                    self.skipped += 1
+                    # no reason to stuff an entire bad file into memmory
+                    if self.skipped < 10:
+                        self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) )
+                except:
+                    pass
+                return self.next()
+        return rval

lib/bx/intervals/operations/base_coverage.py

 from bx.intervals.io import *
 from bx.intervals.operations import *
 
-def base_coverage(reader):
-    bitsets = reader.binned_bitsets()
+def base_coverage( reader ):
+    # Handle any ValueError, IndexError and OverflowError exceptions that may be thrown when
+    # the bitsets are being created by skipping the problem lines
+    base_reader = BitsetSafeReaderWrapper( reader, lens={} )
+    bitsets = base_reader.binned_bitsets()
     coverage = 0
     for chrom in bitsets:
         try:
             coverage += bitsets[chrom].count_range(0, MAX_END)
         except IndexError, e:
-            try:
-                # This will work only if reader is a NiceReaderWrapper
-                reader.skipped += 1
-                # no reason to stuff an entire bad file into memmory
-                if reader.skipped < 10:
-                    reader.skipped_lines.append( ( reader.linenum, reader.current_line, str( e ) ) )
-            except:
-                pass
+            base_reader.skipped += 1
+            # no reason to stuff an entire bad file into memmory
+            if base_reader.skipped < 10:
+                base_reader.skipped_lines.append( ( base_reader.linenum, base_reader.current_line, str( e ) ) )
             continue
     return coverage

lib/bx/intervals/operations/complement.py

 
 from bx.intervals.io import *
 from bx.intervals.operations import *
+from bx.bitset import MAX
 
-def complement(reader, lens):
-    # load into bitsets
-    bitsets = reader.binned_bitsets(upstream_pad = 0, downstream_pad = 0, lens = lens)
-
+def complement( reader, lens ):
+    # Handle any ValueError, IndexError and OverflowError exceptions that may be thrown when
+    # the bitsets are being created by skipping the problem lines
+    complement_reader = BitsetSafeReaderWrapper( reader, lens=lens )
+    bitsets = complement_reader.binned_bitsets( upstream_pad=0, downstream_pad=0, lens=lens )
     # NOT them all
     for key, value in bitsets.items():
         value.invert()
-
     # Read remaining intervals and subtract
     for chrom in bitsets:
         bitset = bitsets[chrom]
-        out_intervals = bits_set_in_range( bitset, 0, lens.get(chrom, 512*1024*1024) )
+        out_intervals = bits_set_in_range( bitset, 0, lens.get( chrom, MAX ) )
         try:
             # Write the intervals
             for start, end in out_intervals:
-                fields = ["."  for x in range(max(reader.chrom_col, reader.start_col, reader.end_col)+1)]
+                fields = ["."  for x in range(max(complement_reader.chrom_col, complement_reader.start_col, complement_reader.end_col)+1)]
                 # default the column to a + if it exists
-                if reader.strand_col < len( fields ) and reader.strand_col >= 0:
-                    fields[reader.strand_col] = "+"
-                fields[reader.chrom_col] = chrom
-                fields[reader.start_col] = start
-                fields[reader.end_col] = end
-                new_interval = GenomicInterval(reader, fields, reader.chrom_col, reader.start_col, reader.end_col, reader.strand_col, "+")
+                if complement_reader.strand_col < len( fields ) and complement_reader.strand_col >= 0:
+                    fields[complement_reader.strand_col] = "+"
+                fields[complement_reader.chrom_col] = chrom
+                fields[complement_reader.start_col] = start
+                fields[complement_reader.end_col] = end
+                new_interval = GenomicInterval(complement_reader, fields, complement_reader.chrom_col, complement_reader.start_col, complement_reader.end_col, complement_reader.strand_col, "+")
                 yield new_interval
         except IndexError, e:
-            try:
-                # This will work only if reader is a NiceReaderWrapper
-                reader.skipped += 1
-                # no reason to stuff an entire bad file into memmory
-                if reader.skipped < 10:
-                    reader.skipped_lines.append( ( reader.linenum, reader.current_line, str( e ) ) )
-            except:
-                pass
+            complement_reader.skipped += 1
+            # no reason to stuff an entire bad file into memmory
+            if complement_reader.skipped < 10:
+                complement_reader.skipped_lines.append( ( complement_reader.linenum, complement_reader.current_line, str( e ) ) )
             continue
 
 

lib/bx/intervals/operations/coverage.py

 from bx.intervals.operations import *
 
 def coverage(readers, comments=True):
-    # Read all but first into bitsets and union to one
+    # The incoming lens dictionary is a dictionary of chromosome lengths which are used to initialize the bitsets.
     primary = readers[0]
     intersect = readers[1:]
+    # Handle any ValueError, IndexError and OverflowError exceptions that may be thrown when
+    # the bitsets are being created by skipping the problem lines
+    intersect[0] = BitsetSafeReaderWrapper( intersect[0], lens={} )
     bitsets = intersect[0].binned_bitsets()
     intersect = intersect[1:]
     for andset in intersect:

lib/bx/intervals/operations/intersect.py

 from bx.intervals.operations import *
 
 def intersect(readers, mincols=1, upstream_pad=0, downstream_pad=0, pieces=True, lens={}, comments=True):
-
+    # The incoming lens dictionary is a dictionary of chromosome lengths which are used to initialize the bitsets.
     # Read all but first into bitsets and intersect to one
     primary = readers[0]
     intersect = readers[1:]
-    bitsets = intersect[0].binned_bitsets(upstream_pad = upstream_pad, downstream_pad = downstream_pad, lens = lens)
+    # Handle any ValueError, IndexError and OverflowError exceptions that may be thrown when
+    # the bitsets are being created by skipping the problem lines
+    intersect[0] = BitsetSafeReaderWrapper( intersect[0], lens=lens )
+    bitsets = intersect[0].binned_bitsets( upstream_pad=upstream_pad, downstream_pad=downstream_pad, lens=lens )
     intersect = intersect[1:]
     for andset in intersect:
         bitset2 = andset.binned_bitsets(upstream_pad = upstream_pad, downstream_pad = downstream_pad, lens = lens)
                         out_intervals = bits_set_in_range( bitsets[chrom], start, end )
                     else:
                         out_intervals = [ ( start, end ) ]
+                # Write the intervals
+                for start, end in out_intervals:
+                    new_interval = interval.copy()
+                    new_interval.start = start
+                    new_interval.end = end
+                    yield new_interval
             except IndexError, e:
                 try:
                     # This will only work if primary is a NiceReaderWrapper
                 except:
                     pass
                 continue
-            # Write the intervals
-            for start, end in out_intervals:
-                new_interval = interval.copy()
-                new_interval.start = start
-                new_interval.end = end
-                yield new_interval

lib/bx/intervals/operations/merge.py

 from bx.intervals.operations import *
 
 # sorting could make this a less memory intensive operation(?)
-def merge(interval, mincols=1):
+def merge( interval, mincols=1 ):
+    # Handle any ValueError, IndexError and OverflowError exceptions that may be thrown when
+    # the bitsets are being created by skipping the problem lines
+    interval = BitsetSafeReaderWrapper( interval, lens={} )
     bitsets = interval.binned_bitsets()
     if interval.header:
         yield interval.header

lib/bx/intervals/operations/subtract.py

 from bx.intervals.operations import *
 
 def subtract(readers, mincols=1, upstream_pad=0, downstream_pad=0, pieces=True, lens={}, comments=True):
-
-    # Read all but first into bitsets and union to one (if confused,
-    # read DeMorgan's...)
+    # The incoming lens dictionary is a dictionary of chromosome lengths which are used to initialize the bitsets.
+    # Read all but first into bitsets and union to one (if confused, read DeMorgan's...)
     primary = readers[0]
     union = readers[1:]
-    bitsets = union[0].binned_bitsets(upstream_pad = upstream_pad, downstream_pad = downstream_pad, lens = lens)
+    # Handle any ValueError, IndexError and OverflowError exceptions that may be thrown when
+    # the bitsets are being created by skipping the problem lines
+    union[0] = BitsetSafeReaderWrapper( union[0], lens=lens )
+    bitsets = union[0].binned_bitsets( upstream_pad=upstream_pad, downstream_pad=downstream_pad, lens=lens )
     union = union[1:]
     for andset in union:
         bitset2 = andset.binned_bitsets(upstream_pad = upstream_pad, downstream_pad = downstream_pad, lens = lens)