Commits

Anonymous committed 9b7d5c1

Refactor data providers to use the get_iterator/process_data framework whenever providing individual data points.
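
In outline, the change turns get_data() into a template method: each provider now inherits a single get_data() that composes two overridable hooks, get_iterator() (fetch the raw records for a region) and process_data() (convert those records into the payload sent to the client). Below is a minimal sketch of the pattern; ExampleProvider and its record format are illustrative stand-ins, not part of this commit:

    class TracksDataProvider( object ):
        def get_iterator( self, chrom, start, end ):
            # Hook: subclasses return an iterator over raw records in chrom:start-end.
            raise NotImplementedError( "get_iterator" )

        def process_data( self, iterator, start_val=0, max_vals=None, **kwargs ):
            # Hook: subclasses convert raw records into the client payload.
            raise NotImplementedError( "process_data" )

        def get_data( self, chrom, start, end, start_val=0, max_vals=None, **kwargs ):
            # Template method shared by all providers.
            iterator = self.get_iterator( chrom, start, end )
            if iterator is None:
                # e.g. chrom was not found in the underlying index.
                return None
            return self.process_data( iterator, start_val, max_vals, **kwargs )

    class ExampleProvider( TracksDataProvider ):
        # Hypothetical provider showing the division of labor.
        def get_iterator( self, chrom, start, end ):
            return iter( [ ( chrom, int( start ), int( end ) ) ] )

        def process_data( self, iterator, start_val=0, max_vals=None, **kwargs ):
            data = [ list( rec ) for rec in iterator ]
            return { 'dataset_type': 'example', 'data': data }

A side benefit visible in the diff: the per-provider get_data() duplicates disappear, and write_data_to_file() can reuse each provider's get_iterator().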

Files changed (1)

lib/galaxy/visualization/tracks/data_providers.py

         # Override.
         pass
         
+    def get_iterator( self, chrom, start, end ):
+        """
+        Returns an iterator that provides data in the region chrom:start-end
+        """
+        # Override.
+        pass
+        
+    def process_data( self, iterator, start_val=0, max_vals=None, **kwargs ):
+        """
+        Process data from an iterator into a format that can be provided to the client.
+        """
+        # Override.
+        pass
+        
+        
     def get_data( self, chrom, start, end, start_val=0, max_vals=None, **kwargs ):
         """ 
         Returns data in region defined by chrom, start, and end. start_val and
         max_vals are used to denote the subset of data to return: start_val is
         the first element to return and max_vals indicates the maximum number
         of values to return.

         Return value must be a dictionary with the following attributes:
             dataset_type, data
         """
-        # Override.
-        pass
+        # max_vals=None means no limit; normalize it here so that subclasses'
+        # process_data() can compare running counts against it directly.
+        if max_vals is None:
+            max_vals = sys.maxint
+        iterator = self.get_iterator( chrom, start, end )
+        if iterator is None:
+            # Iterator could not be created, e.g. chrom not found.
+            return None
+        return self.process_data( iterator, start_val, max_vals, **kwargs )
         
     def get_filters( self ):
         """ 
                     
         bgzip_fname = self.dependencies['bgzip'].file_name
         
-        # if os.path.getsize(self.converted_dataset.file_name) == 0:
-            # return { 'kind': messages.ERROR, 'message': "Tabix converted size was 0, meaning the input file had invalid values." }
         tabix = ctabix.Tabixfile(bgzip_fname, index_filename=self.converted_dataset.file_name)
         
         # If chrom is not found in indexes, try removing the first three 
             chrom = chrom[3:]
         
         return tabix.fetch(reference=chrom, start=start, end=end)
-        
-    def get_data( self, chrom, start, end, start_val=0, max_vals=None, **kwargs ):
-        iterator = self.get_iterator( chrom, start, end )
-        return self.process_data( iterator, start_val, max_vals, **kwargs )
-        
+                
     def write_data_to_file( self, chrom, start, end, filename ):
         iterator = self.get_iterator( chrom, start, end )
         out = open( filename, "w" )
     
     def get_iterator( self, chrom, start, end ):
         raise "Unimplemented Method"
-        
-    def get_data( self, chrom, start, end, start_val=0, max_vals=None, **kwargs ):
-        iterator = self.get_iterator( chrom, start, end )
-        return self.process_data( iterator, start_val, max_vals, **kwargs )
-    
+            
     def process_data( self, iterator, start_val=0, max_vals=None, **kwargs ):
         """
         Provides
     
     col_name_data_attr_mapping = { 'Qual' : { 'index': 6 , 'name' : 'Qual' } }
     
-
-    def get_iterator( self, chrom, start, end ):
-        raise "Unimplemented Method"
-
-    def get_data( self, chrom, start, end, start_val=0, max_vals=None, **kwargs ):
-        iterator = self.get_iterator( chrom, start, end )
-        return self.process_data( iterator, start_val, max_vals, **kwargs )
-
     def process_data( self, iterator, start_val=0, max_vals=None, **kwargs ):
         """
         Returns a dict with the following attributes:
         
         # Cleanup.
         bamfile.close()
-    
-    def get_data( self, chrom, start, end, start_val=0, max_vals=sys.maxint, **kwargs ):
+        
+    def get_iterator( self, chrom, start, end ):
         """
-        Fetch reads in the region and additional metadata.
+        Returns an iterator that provides data in the region chrom:start-end
+        """
+        start, end = int(start), int(end)
+        orig_data_filename = self.original_dataset.file_name
+        index_filename = self.converted_dataset.file_name
         
+        # Attempt to open the BAM file with index
+        bamfile = csamtools.Samfile( filename=orig_data_filename, mode='rb', index_filename=index_filename )
+        try:
+            data = bamfile.fetch(start=start, end=end, reference=chrom)
+        except ValueError:
+            # Some BAM files do not prefix chromosome names with chr, try without
+            if chrom.startswith( 'chr' ):
+                try:
+                    data = bamfile.fetch( start=start, end=end, reference=chrom[3:] )
+                except ValueError:
+                    return None
+            else:
+                return None
+        return data
+                
+    def process_data( self, iterator, start_val=0, max_vals=None, **kwargs ):
+        """
         Returns a dict with the following attributes:
             data - a list of reads with the format 
                     [<guid>, <start>, <end>, <name>, <read_1>, <read_2>] 
             max_high - highest coordinate for the returned reads
             message - error/informative message
         """
-        start, end = int(start), int(end)
-        orig_data_filename = self.original_dataset.file_name
-        index_filename = self.converted_dataset.file_name
-        no_detail = "no_detail" in kwargs
-        
-        # Attempt to open the BAM file with index
-        bamfile = csamtools.Samfile( filename=orig_data_filename, mode='rb', index_filename=index_filename )
-        message = None
-        try:
-            data = bamfile.fetch(start=start, end=end, reference=chrom)
-        except ValueError, e:
-            # Some BAM files do not prefix chromosome names with chr, try without
-            if chrom.startswith( 'chr' ):
-                try:
-                    data = bamfile.fetch( start=start, end=end, reference=chrom[3:] )
-                except ValueError:
-                    return None
-            else:
-                return None
-                
         # Decode strand from read flag.
         def decode_strand( read_flag, mask ):
             strand_flag = ( read_flag & mask == 0 )
         results = []
         paired_pending = {}
         unmapped = 0
-        for count, read in enumerate( data ):
+        message = None
+        for count, read in enumerate( iterator ):
             if count < start_val:
                 continue
             if ( count - start_val - unmapped ) >= max_vals:
 
             results.append( [ "%i_%s" % ( read_start, qname ), read_start, read_end, qname, r1, r2 ] )
             
-        # Clean up.
-        bamfile.close()
+        # Clean up. TODO: is this needed? If so, we'll need a cleanup function after processing the data.
+        # bamfile.close()
         
         max_low, max_high = get_bounds( results, 1, 2 )
                 
             for interval in feature.intervals:
                 out.write(interval.raw_line + '\n')
         out.close()
-    
-    def get_data( self, chrom, start, end, start_val=0, max_vals=sys.maxint, **kwargs ):
+        
+    def get_iterator( self, chrom, start, end ):
+        """
+        Returns an iterator that provides data in the region chrom:start-end
+        """
         start, end = int(start), int(end)
-        source = open( self.original_dataset.file_name )
         index = Indexes( self.converted_dataset.file_name )
-        results = []
-        message = None
 
         # If chrom is not found in indexes, try removing the first three 
         # characters (e.g. 'chr') and see if that works. This enables the
         chrom = str(chrom)
         if chrom not in index.indexes and chrom[3:] in index.indexes:
             chrom = chrom[3:]
+            
+        return index.find(chrom, start, end)
+
+    def process_data( self, iterator, start_val=0, max_vals=None, **kwargs ):
+        results = []
+        message = None
+        source = open( self.original_dataset.file_name )
 
         #
         # Build data to return. Payload format is:
         #
         filter_cols = from_json_string( kwargs.get( "filter_cols", "[]" ) )
         no_detail = ( "no_detail" in kwargs )
-        for count, val in enumerate( index.find(chrom, start, end) ):
+        for count, val in enumerate( iterator ):
             start, end, offset = val[0], val[1], val[2]
             if count < start_val:
                 continue
     NOTE: this data provider does not use indices, and hence will be very slow
     for large datasets.
     """
-    def get_data( self, chrom, start, end, start_val=0, max_vals=sys.maxint, **kwargs ):
+    
+    def get_iterator( self, chrom, start, end ):
+        """
+        Returns an iterator that provides data in the region chrom:start-end
+        """
         start, end = int( start ), int( end )
         source = open( self.original_dataset.file_name )
+        return GFFReaderWrapper( source, fix_strand=True )
+        
+    def process_data( self, iterator, start_val=0, max_vals=None, **kwargs ):
+        """
+        Process data from an iterator into a format that can be provided to the client.
+        """
         results = []
         message = None
         offset = 0
         
-        for count, feature in enumerate( GFFReaderWrapper( source, fix_strand=True ) ):
+        for count, feature in enumerate( iterator ):
             if count < start_val:
                 continue
             if count-start_val >= max_vals: