
Commits

Jeremy Goecks committed bdd35af

Viz framework: (a) push data provider creation to registry to simplify provider creation; (b) fix bugs in filters module naming; (c) enable deeper sampling in BBI data provider.

  • Parent commits 8a8dcc0
  • Branches default


Files changed (8)
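
In outline, change (a) replaces the old two-step pattern at call sites (look up a provider class, then construct it by hand) with a registry call that returns a ready-to-use instance. A minimal sketch distilled from the datasets.py hunks below; registry, trans, and dataset stand in for the usual controller arguments:

    # Before: fetch the class, then assemble converted dataset and dependencies.
    data_provider_class = registry.get_data_provider( name=tracks_dataset_type,
                                                      original_dataset=dataset )
    converted_dataset = dataset.get_converted_dataset( trans, tracks_dataset_type )
    deps = dataset.get_converted_dataset_deps( trans, tracks_dataset_type )
    data_provider = data_provider_class( converted_dataset=converted_dataset,
                                         original_dataset=dataset,
                                         dependencies=deps )

    # After: the registry resolves the name, handles standalone vs. converted
    # datasets, and returns an instance directly.
    data_provider = registry.get_data_provider( trans, original_dataset=dataset,
                                                source='data' )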

File lib/galaxy/visualization/data_providers/basic.py

         f.close()
             
         return data
-
-class DataProviderRegistry( object ):
-    """
-    Registry for data providers that enables listing and lookup.
-    """
-
-    def __init__( self ):
-        # Mapping from dataset type name to a class that can fetch data from a file of that
-        # type. First key is converted dataset type; if result is another dict, second key
-        # is original dataset type. TODO: This needs to be more flexible.
-        self.dataset_type_name_to_data_provider = {
-            "tabix": { 
-                Vcf: VcfTabixDataProvider,
-                Bed: BedTabixDataProvider,
-                Gtf: GtfTabixDataProvider,
-                ENCODEPeak: ENCODEPeakTabixDataProvider,
-                Interval: IntervalTabixDataProvider,
-                ChromatinInteractions: ChromatinInteractionsTabixDataProvider, 
-                "default" : TabixDataProvider 
-            },
-            "interval_index": IntervalIndexDataProvider,
-            "bai": BamDataProvider,
-            "bam": SamDataProvider,
-            "summary_tree": SummaryTreeDataProvider,
-            "bigwig": BigWigDataProvider,
-            "bigbed": BigBedDataProvider
-        }
-
-    def get_data_provider( name=None, original_dataset=None ):
-        """
-        Returns data provider class by name and/or original dataset.
-        """
-        data_provider = None
-        if name:
-            value = dataset_type_name_to_data_provider[ name ]
-            if isinstance( value, dict ):
-                # Get converter by dataset extension; if there is no data provider,
-                # get the default.
-                data_provider = value.get( original_dataset.datatype.__class__, value.get( "default" ) )
-            else:
-                data_provider = value
-        elif original_dataset:
-            # Look up data provider from datatype's informaton.
-            try:
-                # Get data provider mapping and data provider for 'data'. If 
-                # provider available, use it; otherwise use generic provider.
-                _ , data_provider_mapping = original_dataset.datatype.get_track_type()
-                if 'data_standalone' in data_provider_mapping:
-                    data_provider_name = data_provider_mapping[ 'data_standalone' ]
-                else:
-                    data_provider_name = data_provider_mapping[ 'data' ]
-                if data_provider_name:
-                    data_provider = self.get_data_provider( name=data_provider_name, original_dataset=original_dataset )
-                else: 
-                    data_provider = GenomeDataProvider
-            except:
-                pass
-        return data_provider
-        

File lib/galaxy/visualization/data_providers/genome.py

         
 class GenomeDataProvider( BaseDataProvider ):
     """ Base class for genome data providers. """
+
+    data_type = None
     
     """ 
     Mapping from column name to payload data; this mapping is used to create
 
 
 class TabixDataProvider( FilterableMixin, GenomeDataProvider ):
+    data_type = 'tabix'
+
     """
     Tabix index data provider for the Galaxy track browser.
     """
 #
 
 class IntervalDataProvider( GenomeDataProvider ):
+    data_type = 'interval_index'
+
     """
-    Processes BED data from native format to payload format.
+    Processes interval data from native format to payload format.
     
     Payload format: [ uid (offset), start, end, name, strand, thick_start, thick_end, blocks ]
     """
     
     Payload format: [ uid (offset), start, end, name, strand, thick_start, thick_end, blocks ]
     """
+
+    data_type = 'interval_index'
     
     def get_iterator( self, chrom, start, end ):
         raise Exception( "Unimplemented Method" )
     for large datasets.
     """
 
+    data_type = 'interval_index'
+
     def get_iterator( self, chrom=None, start=None, end=None ):
         # Read first line in order to match chrom naming format.
         line = source.readline()
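
The comment above ("match chrom naming format") refers to the UCSC-vs-Ensembl chromosome naming mismatch ('chr1' vs. '1'). A hypothetical sketch of the normalization it implies; this helper and its names are illustrations, not code from the commit:

    # Hypothetical: indexed data may use 'chr1' while the request says '1',
    # or vice versa, so peek at the first data line and adjust the query name.
    def _match_chrom_format( chrom, first_line ):
        data_chrom = first_line.split()[0]
        if data_chrom.startswith( 'chr' ) and not chrom.startswith( 'chr' ):
            return 'chr' + chrom
        elif chrom.startswith( 'chr' ) and not data_chrom.startswith( 'chr' ):
            return chrom[ 3: ]
        return chrom
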
     """
     
     col_name_data_attr_mapping = { 'Qual' : { 'index': 6 , 'name' : 'Qual' } }
+
+    data_type = 'bai'
     
     def process_data( self, iterator, start_val=0, max_vals=None, **kwargs ):
         """
     for large datasets.
     """
 
+    data_type = 'tabix'
+
     def get_iterator( self, chrom, start, end ):
         # Read first line in order to match chrom naming format.
         line = source.readline()
     """
     Summary tree data provider for the Galaxy track browser. 
     """
+
+    data_type = 'summary_tree'
     
     CACHE = LRUCache( 20 ) # Store 20 recently accessed indices for performance
     
     Provides access to intervals from a sorted indexed BAM file. Position data
     is reported in 1-based, closed format, i.e. SAM/BAM format.
     """
+
+    data_type = 'bai'
     
     def get_filters( self ):
         """
         return { 'data': results, 'message': message, 'max_low': max_low, 'max_high': max_high }
         
 class SamDataProvider( BamDataProvider ):
+
+    data_type = 'bai'
     
     def __init__( self, converted_dataset=None, original_dataset=None, dependencies=None ):
         """ Create SamDataProvider. """
     """
     BBI data provider for the Galaxy track browser. 
     """
+
+    data_type = 'bigwig'
+
     def valid_chroms( self ):
         # No way to return this info as of now
         return None
         f.close()
         return all_dat is not None
 
-    def get_data( self, chrom, start, end, start_val=0, max_vals=None, **kwargs ):
+    def get_data( self, chrom, start, end, start_val=0, max_vals=None, num_samples=1000, **kwargs ):
         # Bigwig can be a standalone bigwig file, in which case we use
         # original_dataset, or can come from a wig->bigwig conversion, in
         # which case we use converted_dataset.
 
             return dict( data=dict( min=min, max=max, mean=mean, sd=sd ) )
 
-        # Sample from region using approximately this many samples.
-        N = 1000
-
         def summarize_region( bbi, chrom, start, end, num_points ):
             '''
             Returns results from summarizing a region using num_points.
             NOTE: num_points cannot be greater than end - start or BBI
-            will return None for all positions.s
+            will return None for all positions.
             '''
             result = []
 
             return result
 
         # Approach is different depending on region size.
-        if end - start < N:
+        num_samples = int( num_samples )
+        if end - start < num_samples:
             # Get values for individual bases in region, including start and end.
             # To do this, need to increase end to next base and request number of points.
             num_points = end - start + 1
         else:
             # 
             # The goal is to sample the region between start and end uniformly 
-            # using ~N data points. The challenge is that the size of sampled 
-            # intervals rarely is full bases, so sampling using N points will 
-            # leave the end of the region unsampled due to remainders for each
-            # interval. To recitify this, a new N is calculated based on the 
+            # using ~N (num_samples) data points. The challenge is that the size of 
+            # sampled intervals rarely is full bases, so sampling using N points 
+            # will leave the end of the region unsampled due to remainders for 
+            # each interval. To rectify this, a new N is calculated based on the 
             # step size that covers as much of the region as possible.
             #
             # However, this still leaves some of the region unsampled. This 
             #
 
             # Start with N samples.
-            num_points = N
+            num_points = num_samples
             step_size = ( end - start ) / num_points
             # Add additional points to sample in the remainder not covered by 
             # the initial N samples.
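
The comment above compresses the sampling arithmetic, so here is a hypothetical worked example (Python 2 floor division, matching the surrounding code; the remainder handling in the last two lines is an assumption, since that part of the hunk is truncated here):

    start, end, num_samples = 0, 10500, 1000

    # Small regions (end - start < num_samples) are sampled at every base;
    # the path below is for large regions.
    num_points = num_samples                      # start with ~N samples
    step_size = ( end - start ) / num_points      # 10500 / 1000 -> 10

    # The first num_points samples cover num_points * step_size = 10000
    # bases, leaving a 500-base tail of the region unsampled; cover it with
    # additional points at the same step size.
    remainder = ( end - start ) - num_points * step_size   # 500
    num_points += remainder / step_size                    # 1000 + 50 -> 1050
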
     Interval index files used only for GFF files.
     """
     col_name_data_attr_mapping = { 4 : { 'index': 4 , 'name' : 'Score' } }
+
+    data_type = 'interval_index'
     
     def write_data_to_file( self, regions, filename ):
         source = open( self.original_dataset.file_name )
     NOTE: this data provider does not use indices, and hence will be very slow
     for large datasets.
     """
+
+    data_type = 'interval_index'
     
     def get_iterator( self, chrom, start, end ):
         """

File lib/galaxy/visualization/data_providers/registry.py

             "bigbed": BigBedDataProvider
         }
 
-    def get_data_provider( self, name=None, raw=False, original_dataset=None ):
+    def get_data_provider( self, trans, name=None, source='data', raw=False, original_dataset=None ):
         """
         Returns data provider class by name and/or original dataset.
         """
 
-        # If getting raw data, use original dataset type to get data provider.
+        data_provider = None
         if raw:
+            # Working with raw data.
             if isinstance( original_dataset.datatype, Gff ):
-                return RawGFFDataProvider
+                data_provider_class = RawGFFDataProvider
             elif isinstance( original_dataset.datatype, Bed ):
-                return RawBedDataProvider
+                data_provider_class = RawBedDataProvider
             elif isinstance( original_dataset.datatype, Vcf ):
-                return RawVcfDataProvider
+                data_provider_class = RawVcfDataProvider
             elif isinstance( original_dataset.datatype, Tabular ):
-                return ColumnDataProvider
+                data_provider_class = ColumnDataProvider
 
-        # Using converted dataset, so get corrsponding data provider.
-        data_provider = None
-        if name:
-            value = self.dataset_type_name_to_data_provider[ name ]
-            if isinstance( value, dict ):
-                # Get converter by dataset extension; if there is no data provider,
-                # get the default.
-                data_provider = value.get( original_dataset.datatype.__class__, value.get( "default" ) )
-            else:
-                data_provider = value
-        elif original_dataset:
-            # Look up data provider from datatype's informaton.
-            try:
-                # Get data provider mapping and data provider for 'data'. If 
-                # provider available, use it; otherwise use generic provider.
+            data_provider = data_provider_class( original_dataset=original_dataset )
+
+        else:
+            # Working with converted or standalone dataset.
+
+            if name:
+                # Provider requested by name; get from mappings.
+                value = self.dataset_type_name_to_data_provider[ name ]
+                if isinstance( value, dict ):
+                    # Get converter by dataset extension; if there is no data provider,
+                    # get the default.
+                    data_provider_class = value.get( original_dataset.datatype.__class__, value.get( "default" ) )
+                else:
+                    data_provider_class = value
+
+                # If name is the same as original dataset's type, dataset is standalone.
+                # Otherwise, a converted dataset is being used.
+                if name == original_dataset.ext:
+                    data_provider = data_provider_class( original_dataset=original_dataset )
+                else:
+                    converted_dataset = original_dataset.get_converted_dataset( trans, name )
+                    deps = original_dataset.get_converted_dataset_deps( trans, name )
+                    data_provider = data_provider_class( original_dataset=original_dataset, 
+                                                         converted_dataset=converted_dataset,
+                                                         dependencies=deps )
+                
+            elif original_dataset:
+                # No name, so look up a provider name from datatype's information.
+
+                # The datatype must implement get_track_type to provide data.
+                if not hasattr( original_dataset.datatype, 'get_track_type' ):
+                    return None
+                
+                # Get data provider mapping and data provider.
                 _ , data_provider_mapping = original_dataset.datatype.get_track_type()
                 if 'data_standalone' in data_provider_mapping:
                     data_provider_name = data_provider_mapping[ 'data_standalone' ]
                 else:
-                    data_provider_name = data_provider_mapping[ 'data' ]
-                if data_provider_name:
-                    data_provider = self.get_data_provider( name=data_provider_name, original_dataset=original_dataset )
-                else: 
-                    data_provider = GenomeDataProvider
-            except:
-                pass
+                    data_provider_name = data_provider_mapping[ source ]
+                
+                data_provider = self.get_data_provider( trans, name=data_provider_name, original_dataset=original_dataset )
+
         return data_provider
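
Taken together, the rewritten method resolves a provider along three paths: raw providers chosen by the original datatype, named lookups through the two-level type mapping (keyed first by converted type, then by original datatype, with a "default" fallback), and, when no name is given, a recursive lookup driven by the datatype's track-type mapping. A hedged sketch of call sites exercising each path; the dataset variables are placeholders:

    registry = trans.app.data_provider_registry

    # Raw data: provider class chosen by the original datatype, then
    # instantiated with the original dataset.
    raw_provider = registry.get_data_provider( trans, raw=True,
                                               original_dataset=dataset )

    # Named lookup: 'tabix' maps to a dict keyed by original datatype, so a
    # VCF dataset resolves to VcfTabixDataProvider (TabixDataProvider when no
    # specific entry exists). If name == original_dataset.ext the dataset is
    # standalone; otherwise the registry also fetches the converted dataset
    # and its dependencies.
    vcf_provider = registry.get_data_provider( trans, name='tabix',
                                               original_dataset=vcf_dataset )

    # No name: the provider name is derived from get_track_type(), preferring
    # 'data_standalone' and otherwise using the requested source ('data' or
    # 'index'), then resolved recursively.
    indexer = registry.get_data_provider( trans, original_dataset=dataset,
                                          source='index' )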

File lib/galaxy/web/api/datasets.py

         if msg:
             return msg
             
-        # NOTE: finding valid chroms is prohibitive for large summary trees and is not currently used by
-        # the client.
-        valid_chroms = None
         # Check for data in the genome window.
         data_provider_registry = trans.app.data_provider_registry
-        if data_sources.get( 'index' ):
-            tracks_dataset_type = data_sources['index']['name']
-            converted_dataset = dataset.get_converted_dataset( trans, tracks_dataset_type )
-            indexer = data_provider_registry.get_data_provider( tracks_dataset_type )( converted_dataset, dataset )
-            if not indexer.has_data( chrom ):
-                return messages.NO_DATA
-            #valid_chroms = indexer.valid_chroms()
-        else:
-            # Standalone data provider
-            standalone_provider = data_provider_registry.get_data_provider( data_sources['data_standalone']['name'] )( dataset )
-            kwargs = {"stats": True}
-            if not standalone_provider.has_data( chrom ):
-                return messages.NO_DATA
-            #valid_chroms = standalone_provider.valid_chroms()
+        data_provider = data_provider_registry.get_data_provider( trans, original_dataset=dataset, source='index' )
+        if not data_provider.has_data( chrom ):
+            return messages.NO_DATA
             
         # Have data if we get here
-        return { "status": messages.DATA, "valid_chroms": valid_chroms }
+        return { "status": messages.DATA, "valid_chroms": None }
 
     def _search_features( self, trans, dataset, query ):
         """
         data_provider_registry = trans.app.data_provider_registry
         if mode == "Coverage":
             # Get summary using minimal cutoffs.
-            tracks_dataset_type = data_sources['index']['name']
-            converted_dataset = dataset.get_converted_dataset( trans, tracks_dataset_type )
-            indexer = data_provider_registry.get_data_provider( tracks_dataset_type )( converted_dataset, dataset )
+            indexer = data_provider_registry.get_data_provider( trans, original_dataset=dataset, source='index' )
             summary = indexer.get_data( chrom, low, high, resolution=kwargs[ 'resolution' ], detail_cutoff=0, draw_cutoff=0 )
             if summary == "detail":
                 # Use maximum level of detail--2--to get summary data no matter the resolution.
                 summary = indexer.get_data( chrom, low, high, resolution=kwargs[ 'resolution' ], level=2, detail_cutoff=0, draw_cutoff=0 )
             frequencies, max_v, avg_v, delta = summary
-            return { 'dataset_type': tracks_dataset_type, 'data': frequencies, 'max': max_v, 'avg': avg_v, 'delta': delta }
+            return { 'dataset_type': indexer.data_type, 'data': frequencies, 'max': max_v, 'avg': avg_v, 'delta': delta }
 
         if 'index' in data_sources and data_sources['index']['name'] == "summary_tree" and mode == "Auto":
             # Only check for summary_tree if it's Auto mode (which is the default)
             # 
             # Have to choose between indexer and data provider
-            tracks_dataset_type = data_sources['index']['name']
-            converted_dataset = dataset.get_converted_dataset( trans, tracks_dataset_type )
-            indexer = data_provider_registry.get_data_provider( tracks_dataset_type )( converted_dataset, dataset )
+            indexer = data_provider_registry.get_data_provider( trans, original_dataset=dataset, source='index' )
             summary = indexer.get_data( chrom, low, high, resolution=kwargs[ 'resolution' ] )
             if summary is None:
-                return { 'dataset_type': tracks_dataset_type, 'data': None }
+                return { 'dataset_type': indexer.data_type, 'data': None }
                 
             if summary == "draw":
                 kwargs["no_detail"] = True # meh
                 extra_info = "no_detail"
             elif summary != "detail":
                 frequencies, max_v, avg_v, delta = summary
-                return { 'dataset_type': tracks_dataset_type, 'data': frequencies, 'max': max_v, 'avg': avg_v, 'delta': delta }
+                return { 'dataset_type': indexer.data_type, 'data': frequencies, 'max': max_v, 'avg': avg_v, 'delta': delta }
         
         # Get data provider.
-        if "data_standalone" in data_sources:
-            tracks_dataset_type = data_sources['data_standalone']['name']
-            data_provider_class = data_provider_registry.get_data_provider( name=tracks_dataset_type, original_dataset=dataset )
-            data_provider = data_provider_class( original_dataset=dataset )
-        else:
-            tracks_dataset_type = data_sources['data']['name']
-            data_provider_class = data_provider_registry.get_data_provider( name=tracks_dataset_type, original_dataset=dataset )
-            converted_dataset = dataset.get_converted_dataset( trans, tracks_dataset_type )
-            deps = dataset.get_converted_dataset_deps( trans, tracks_dataset_type )
-            data_provider = data_provider_class( converted_dataset=converted_dataset, original_dataset=dataset, dependencies=deps )
+        data_provider = data_provider_registry.get_data_provider( trans, original_dataset=dataset, source='data' )
         
         # Allow max_vals to be set by the data provider if not passed
         if max_vals is None:
 
         # Get and return data from data_provider.
         result = data_provider.get_data( chrom, int( low ), int( high ), int( start_val ), int( max_vals ), **kwargs )
-        result.update( { 'dataset_type': tracks_dataset_type, 'extra_info': extra_info } )
+        result.update( { 'dataset_type': data_provider.data_type, 'extra_info': extra_info } )
         return result
 
     def _raw_data( self, trans, dataset, **kwargs ):
     
         # Return data.
         data = None
-        data_provider = trans.app.data_provider_registry.get_data_provider( raw=True, original_dataset=dataset )
+        data_provider = trans.app.data_provider_registry.get_data_provider( trans, raw=True, original_dataset=dataset )
         
-        if data_provider == ColumnDataProvider:
+        if isinstance( data_provider, ColumnDataProvider ):
             #pre: should have column kwargs
             #TODO??: could default to first two here
             assert 'cols' in kwargs, (
                 "ColumnDataProvider needs a 'cols' parameter in the query string" )
-            data = data_provider( original_dataset=dataset ).get_data( **kwargs )
+            data = data_provider.get_data( **kwargs )
             
         else:
             # Default to genomic data.
             # FIXME: need better way to set dataset_type.
             low, high = int( kwargs.get( 'low' ) ), int( kwargs.get( 'high' ) )
-            data = data_provider( original_dataset=dataset ).get_data( start=low, end=high, **kwargs )
+            data = data_provider.get_data( start=low, end=high, **kwargs )
             data[ 'dataset_type' ] = 'interval_index'
             data[ 'extra_info' ] = None
             if isinstance( dataset.datatype, Vcf ):

File lib/galaxy/web/api/tools.py

         if run_on_regions:
             for jida in original_job.input_datasets:
                 input_dataset = jida.dataset
-                if data_provider_registry.get_data_provider( original_dataset=input_dataset ):
-                    # Can index dataset.
-                    track_type, data_sources = input_dataset.datatype.get_track_type()
-                    # Convert to datasource that provides 'data' because we need to
-                    # extract the original data.
-                    data_source = data_sources[ 'data' ]
-                    msg = self.convert_dataset( trans, input_dataset, data_source )
-                    if msg is not None:
-                        messages_list.append( msg )
+                data_provider = data_provider_registry.get_data_provider( trans, original_dataset=input_dataset, source='data' )
+                if data_provider:
+                    if not data_provider.converted_dataset:
+                        # Convert to a datasource that provides 'data' because
+                        # we need to extract the original data.
+                        _ , data_sources = input_dataset.datatype.get_track_type()
+                        data_source = data_sources[ 'data' ]
+                        msg = self.convert_dataset( trans, input_dataset, data_source )
+                        if msg is not None:
+                            messages_list.append( msg )
 
         # Return any messages generated during conversions.
         return_message = get_highest_priority_msg( messages_list )
                     trans.app.security_agent.set_all_dataset_permissions( new_dataset.dataset, hda_permissions )
 
                     # Write subset of data to new dataset
-                    data_provider_class = data_provider_registry.get_data_provider( original_dataset=input_dataset )
-                    data_provider = data_provider_class( original_dataset=input_dataset, 
-                                                         converted_dataset=converted_dataset,
-                                                         dependencies=deps )
+                    data_provider = data_provider_registry.get_data_provider( trans, original_dataset=input_dataset, source='data' )
                     trans.app.object_store.create( new_dataset.dataset )
                     data_provider.write_data_to_file( regions, new_dataset.file_name )
 

File lib/galaxy/web/base/controller.py

                     prefs = {}
 
                 track_type, _ = dataset.datatype.get_track_type()
-                track_data_provider_class = trans.app.data_provider_registry.get_data_provider( original_dataset=dataset )
-                track_data_provider = track_data_provider_class( original_dataset=dataset )
+                track_data_provider = trans.app.data_provider_registry.get_data_provider( trans, 
+                                                                                          original_dataset=dataset, 
+                                                                                          source='data' )
                 
                 return {
                     "track_type": track_type,
         """
         # Get data provider.
         track_type, _ = dataset.datatype.get_track_type()
-        track_data_provider_class = trans.app.data_provider_registry.get_data_provider( original_dataset=dataset )
-        track_data_provider = track_data_provider_class( original_dataset=dataset )
+        track_data_provider = trans.app.data_provider_registry.get_data_provider( trans, original_dataset=dataset )
+
         
         if isinstance( dataset, trans.app.model.HistoryDatasetAssociation ):
             hda_ldda = "hda"

File static/scripts/viz/trackster/filters.js

 });
 
 return {
-    FiltersManager: FiltersManager
+    FiltersManager: FiltersManager,
+    NumberFilter: NumberFilter
 };
 
 });

File static/scripts/viz/trackster/tracks.js

 define( ["libs/underscore", "viz/visualization", "viz/trackster/util", 
          "viz/trackster/slotting", "viz/trackster/painters", "mvc/data",
          "viz/trackster/filters" ], 
-         function( _, visualization, util, slotting, painters, data, filters ) {
+         function( _, visualization, util, slotting, painters, data, filters_mod ) {
 
 var extend = _.extend;
 var get_random_color = util.get_random_color;
     moveable(this.container_div, this.drag_handle_class, ".group", this);
     
     // Set up filters.
-    this.filters_manager = new filters.FiltersManager(this);
+    this.filters_manager = new filters_mod.FiltersManager(this);
     this.header_div.after(this.filters_manager.parent_div);
     // For saving drawables' filter managers when group-level filtering is done:
     this.saved_filters_managers = [];
     if ('filters' in obj_dict) {
         // FIXME: Pass collection_dict to DrawableCollection/Drawable will make this easier.
         var old_manager = this.filters_manager;
-        this.filters_manager = new filters.FiltersManager(this, obj_dict.filters);
+        this.filters_manager = new filters_mod.FiltersManager(this, obj_dict.filters);
         old_manager.parent_div.replaceWith(this.filters_manager.parent_div);
     
         if (obj_dict.filters.visible) {
                     if (filters.length === num_feature_tracks) {
                         // Add new filter.
                         // FIXME: can filter.copy() be used?
-                        new_filter = new NumberFilter( {
+                        new_filter = new filters_mod.NumberFilter( {
                                         name: filters[0].name, 
                                         index: filters[0].index
                                         } );
     moveable(track.container_div, track.drag_handle_class, ".group", track);
     
     // Attribute init.
-    this.filters_manager = new filters.FiltersManager(this, ('filters' in obj_dict ? obj_dict.filters : null));
+    this.filters_manager = new filters_mod.FiltersManager(this, ('filters' in obj_dict ? obj_dict.filters : null));
     // HACK: set filters manager for data manager.
     // FIXME: prolly need function to set filters and update data_manager reference.
     this.data_manager.set('filters_manager', this.filters_manager);
     }
 };
 
-// Exports
 return {
     View: View,
     DrawableGroup: DrawableGroup,