Commits

John Chilton  committed 4027b3a

Several more fastq groomer optimizations.

  • Participants
  • Parent commits 0544199
  • Branches fastq-groomer-optimizations

Comments (0)

Files changed (1)

File lib/galaxy_utils/sequence/fastq.py

 
     score_system = 'phred' #phred or solexa
     sequence_space = 'base' #base or color
+
     @classmethod
     def get_class_by_format( cls, format ):
         assert format in FASTQ_FORMATS, 'Unknown format type specified: %s' % format
             return max( min( score, cls.quality_max ), cls.quality_min )
         return map( restrict_score, decimal_score_list )
     @classmethod
-    def transform_scores_to_valid_range( cls, decimal_score_list ):
+    def transform_scores_to_valid_range( cls, decimal_score_list):
         cls_quality_max = cls.quality_max
         cls_quality_min = cls.quality_min
         for i in range( len( decimal_score_list ) ):
             score = decimal_score_list[i]
             if( score <= cls_quality_max and score >= cls_quality_min ):
-                decimal_score_list[i] = str(score)
+                transformed_score = score
             elif(score >= cls_quality_max):
-                decimal_score_list[i] = str(cls_quality_max)
+                transformed_score = cls_quality_max
             else:
-                decimal_score_list[i] = str(cls_quality_min)
-
+                transformed_score = cls_quality_min
+            decimal_score_list[i] = str(transformed_score)
+    @classmethod
+    def transform_scores_to_valid_range_ascii( cls, decimal_score_list ):
+        cls_quality_max = cls.quality_max
+        cls_quality_min = cls.quality_min
+        to_quality = cls.to_quality
+        for i in range( len( decimal_score_list ) ):
+            score = decimal_score_list[i]
+            if( score <= cls_quality_max and score >= cls_quality_min ):
+                transformed_score = score
+            elif(score >= cls_quality_max):
+                transformed_score = cls_quality_max
+            else:
+                transformed_score = cls_quality_min
+            transformed_score = chr(transformed_score + to_quality)
+            decimal_score_list[i] = transformed_score
     @classmethod
     def convert_base_to_color_space( cls, sequence ):
         return cls.color_space_converter.to_color_space( sequence )
         return cls.color_space_converter.to_base_space( sequence )
     def is_ascii_encoded( self ):
         #as per fastq definition only decimal quality strings can have spaces (and TABs for our purposes) in them (and must have a trailing space)
-        if ' ' in self.quality:
+        quality = self.quality
+        if ' ' in quality:
             return False
-        if '\t' in self.quality:
+        if '\t' in quality:
             return False
         return True
 
                 return []
     def get_decimal_quality_scores( self ):
         if self.is_ascii_encoded():
-            return [ ord( val ) - self.to_quality for val in self.quality ]
+            to_quality = self.to_quality
+            return [ ord( val ) - to_quality for val in self.quality ]
         else:
             quality = self.quality.rstrip() #decimal scores should have a trailing space
             if quality:
             else:
                 new_read.sequence = self.convert_color_to_base_space( self.sequence )
         new_read.description = self.description
+
+        if force_quality_encoding is None:
+            if self.is_ascii_encoded():
+                new_encoding = 'ascii'
+            else:
+                new_encoding = 'decimal'
+        else:
+            new_encoding = force_quality_encoding
+
         if self.score_system != new_read.score_system:
             if self.score_system == 'phred':
                 score_list = self.convert_score_phred_to_solexa( self.get_decimal_quality_scores() )
                 score_list = self.convert_score_solexa_to_phred( self.get_decimal_quality_scores() )
         else:
             score_list = self.get_decimal_quality_scores()
-        new_class.transform_scores_to_valid_range( score_list )
-        new_read.quality = "%s " % " ".join( score_list )
+
+        if new_encoding == 'ascii':
+            new_class.transform_scores_to_valid_range_ascii( score_list )
+            new_read.quality = "".join(score_list)
+        else:
+            new_class.transform_scores_to_valid_range( score_list )
+            #new_read.quality = "%s " % " ".join( score_list )
+            new_read.quality = " ".join(score_list) + " "
+
         #new_read.quality = "%s " % " ".join( map( str, new_class.restrict_scores_to_valid_range( score_list ) ) ) #need trailing space to be valid decimal fastq
-        if force_quality_encoding is None:
-            if self.is_ascii_encoded():
-                new_encoding = 'ascii'
-            else:
-                new_encoding = 'decimal'
-        else:
-            new_encoding = force_quality_encoding
-        if new_encoding == 'ascii':
-            new_read.quality = "".join( new_read.get_ascii_quality_scores() )
+        #if new_encoding == 'ascii':
+        #    new_read.quality = "".join( new_read.get_ascii_quality_scores() )
         return new_read
+
     def get_sequence( self ):
         return self.sequence
     def slice( self, left_column_offset, right_column_offset ):
         #lengths
         seq_len = len( fastq_read )
         self.seq_lens[ seq_len ] = self.seq_lens.get( seq_len, 0 ) + 1
+
         #decimal qualities by column
         for i, val in enumerate( fastq_read.get_decimal_quality_scores() ):
             if i == len( self.nuc_index_quality ):
-                self.nuc_index_quality.append( {} )
-            self.nuc_index_quality[ i ][ val ] = self.nuc_index_quality[ i ].get( val, 0 ) + 1
+                self.nuc_index_quality.append( {} )            
+            hash = self.nuc_index_quality[ i ]
+            hash[ val ] = hash.get( val, 0 ) + 1
         #bases by column
-        for i, nuc in enumerate( fastq_read.get_sequence() ):
-            if i == len( self.nuc_index_base ):
-                self.nuc_index_base.append( {} )
-            if nuc not in self.distinct_nucleotides:
-                self.distinct_nucleotides.append( nuc )
-            if self.count_bases_per_column:
+        if self.count_bases_per_column:
+            for i, nuc in enumerate( fastq_read.get_sequence() ):
+                if nuc not in self.distinct_nucleotides:
+                    self.distinct_nucleotides.append( nuc )
+                if i == len( self.nuc_index_base ):
+                    self.nuc_index_base.append( {} )
                 nuc = nuc.upper()
                 self.nuc_index_base[ i ][ nuc ] = self.nuc_index_base[ i ].get( nuc, 0 ) + 1
+        else:
+            for nuc in fastq_read.get_sequence():
+                if nuc not in self.distinct_nucleotides:
+                    self.distinct_nucleotides.append( nuc )
 
     def get_valid_formats( self, check_list = None ):
         if not check_list:
                 rval.description = line
                 break
             rval.append_sequence( line )
+        # Should have at least one line, right? Convert to a mock do-while
+        # loop to avoid an unneeded insufficient_quality_length
+        # check.
+        line = self.file.readline()
+        rval.append_quality( line )
         while rval.insufficient_quality_length():
             line = self.file.readline()
             if not line: