# BioLite - Tools for processing gene sequence data and automating workflows
# Copyright (c) 2012-2013 Brown University. All rights reserved.
# 
# This file is part of BioLite.
# 
# BioLite is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# BioLite is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with BioLite.  If not, see <http://www.gnu.org/licenses/>.

"""
BioLite borrows from Ruffus (http://code.google.com/p/ruffus/) the idea of
using Python function decorators to delineate pipeline stages. Pipelines are
created with a sequence of ordinary Python functions decorated by a pipeline
object, which registers each function as a *stage* in the pipeline.  The
pipeline object maintains a persistent, global dictionary, called the *state*,
and runs each stage by looking up the argument names in the stage function's
signature, and calling the function with the values in the state dictionary
whose keys match the function's argument names.  This is implemented using the
function inspection methods available from the :mod:`inspect` module in the
Python standard library.  If the stage function returns a dictionary, it is
*ingested* into the pipeline's state by adding values for any new keys and
updating values for existing keys.  Arguments passed on the command-line to the
pipeline script form the initial data in the pipeline's state.

As an example, the following code sets up a pipeline with two command-line
arguments and one stage. Note how the variable names in the stage function's
signature match the names of the arguments.  The stage uses the `ingest` call
to pull the `output` path into the pipeline's state. This way, it is accessible
to other stages that might be added to this pipeline.

::

  from biolite.pipeline import BasePipeline
  from biolite.wrappers import FilterIllumina
  
  pipe = BasePipeline('filter', "Example pipeline")
  
  pipe.add_argument('input', short='i',
  	help="Input FASTA or FASTQ file to filter.")
  
  pipe.add_argument('quality', short='q', type=int, metavar='MIN',
  	default=28, help="Filter out reads that have a mean quality < MIN.")
  
  @pipe.stage
  def filter(input, quality):
  	'''
  	Filter out low-quality and adapter-contaminated reads
  	'''
  	output = input + '.filtered'
  	FilterIllumina([input], [output], quality=quality)
  	ingest('output')
  
  if __name__ == "__main__":
  	pipe.parse_args()
  	pipe.run()

This script is available in `examples/filter-pipeline.py` and produces the
following help message:

::

  $ python examples/filter-pipeline.py -h
  usage: filter-pipeline.py [-h] [--restart [CHK]] [--stage N] [--input INPUT]
                            [--quality MIN]
  
  Example pipeline
  
  optional arguments:
    -h, --help            show this help message and exit
    --restart [CHK]       Restart the pipeline from the last available
                          checkpoint, or from the specified checkpoint file CHK.
    --stage N             Start at stage number N. Note that some stages require
                          the output of previous stages, so starting in the
                          middle of a pipeline may not work.
    --input INPUT, -i INPUT
                          Input FASTA or FASTQ file to filter.
    --quality MIN, -q MIN
                          Filter out reads that have a mean quality < MIN. [28]
  
  pipeline stages:
    0) [filter] 
  	Filter out low-quality and adapter-contaminated reads

The pipeline module allows you to rapidly create full-featured pipeline scripts
with help messages, checkpointing and restart capabilities, and integration
with the BioLite diagnostics and catalog databases (using the `Pipeline` or
`IlluminaPipeline` derived classes).
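
For example, a pipeline that integrates with the catalog and diagnostics looks
much the same. The sketch below is hypothetical: it assumes the catalog already
contains an entry with ID `ABC` that points to a pair of FASTQ files.
`IlluminaPipeline` automatically adds a stage 0 that resolves the FASTQ input
paths into the state key `data`, and its `run` method parses the command-line
arguments itself.

::

  from biolite.pipeline import IlluminaPipeline

  pipe = IlluminaPipeline('count', "Example catalog-aware pipeline")

  @pipe.stage
  def count_reads(data):
  	'''Count the reads in the forward FASTQ file'''
  	nreads = sum(1 for line in open(data[-1][0])) / 4
  	finish('nreads')

  if __name__ == "__main__":
  	pipe.run()

Such a script could be invoked as `python count.py -i ABC` (the script name is
hypothetical).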

Meta-Pipelines
--------------

Modularity is a key design goal, and it is possible to reuse one or more stages
of an existing pipeline when building a new pipeline. It is also possible to
build meta-pipelines that connect together several sub-pipelines.
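
For example, a meta-pipeline might import an existing pipeline module and then
append stages of its own. The sketch below is hypothetical: it assumes a module
`filter_pipeline` that defines a module-level `pipe` object, as in the filter
example above, whose last stage ingests an `output` path.

::

  from biolite.pipeline import BasePipeline
  import filter_pipeline

  pipe = BasePipeline('filter_and_count', "Example meta-pipeline")

  # Pull in the arguments and stages of the sub-pipeline.
  pipe.import_module(filter_pipeline)

  @pipe.stage
  def count(output):
  	'''Count the reads that survived filtering (assumes FASTQ output)'''
  	nreads = sum(1 for line in open(output)) / 4
  	ingest('nreads')

  if __name__ == "__main__":
  	pipe.parse_args()
  	pipe.run()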

Checkpoints
-----------

The pipeline object also incorporates fault tolerance.  At the end of each
stage, the pipeline stores a *checkpoint* by dumping its current state to a
binary file with the :mod:`cPickle` module. This way, if a run is interrupted,
either due to an internal error or to external conditions, such as a kill
signal from a batch system or a hardware failure, the run can be restarted from
the last completed stage (or, optionally, from any previous stage in the
checkpoint).
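
For example, if a run of the `filter` pipeline above is interrupted, it can be
resumed from its checkpoint file (`filter.chk` by default, written to the
current working directory). The input file name below is hypothetical:

::

  $ python examples/filter-pipeline.py -i reads.fq          # interrupted run
  $ python examples/filter-pipeline.py --restart            # resume from filter.chk
  $ python examples/filter-pipeline.py --restart --stage 0  # redo from stage 0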
"""

import argparse
import cPickle
import glob
import inspect
import os
import shutil
import sys
import time

from collections import OrderedDict
from copy import deepcopy

import catalog
import config
import diagnostics
import utils



class BasePipeline:
	"""
	BasePipeline is the more generic class. It is designed to be used
	independently of the BioLite diagnostics and catalog features.
	"""

	def __init__(self, name, desc=""):
		self.name = name
		self.desc = desc
		self.args = OrderedDict()
		self.stages = list()
		self.nstage = -1
		self.state = dict()
		self.checkpoints = list()
		self.skip = list()
		self.modules = list()
		self.pipelines = list()
		# Delayed creation of the parser happens in parse_args()
		self.parser = None
		self.parsed_args = dict()
		# Create a URL-safe name to use for the checkpoint filename, etc.
		self.safe_name = utils.safe_str(name)
		self.state['chkfile'] = self.safe_name + '.chk'
		# Add common arguments for all pipelines.
		self.add_argument('restart', metavar='CHK', nargs='?', help="""
			Restart the pipeline from the last available checkpoint, or from
			the specified checkpoint file CHK.""")
		self.add_argument('stage', type=int, metavar='N', help="""
			Start at stage number N. Note that some stages require the output
			of previous stages, so starting in the middle of a pipeline may
			not work.""")
		self.add_argument('skip', type=str, action='append', help="""
			A list of stages and pipelines to skip. Give the names of stages,
			separated by commas, and any of those which are encountered will
			not be run. Given the name of a pipeline, all of the stages in
			that pipeline will not be run. This argument can be specified
			multiple times and the list is concatenated.""")

	### Methods for building meta-pipelines from imported pipelines.
	def import_stages(self, pipe, start=0):
		for func in pipe.stages[start:]:
			self.add_stage(func)

	def import_arguments(self, pipe, names=None):
		if names is None:
			self.args.update(pipe.args)
		else:
			for name in names:
				self.args[name] = pipe.args[name]

	def import_module(self, module, names=None, start=0):
		"""
		Imports another pipeline module. Adds the pipeline as a subpipeline
		and links to the module itself so that it can be referenced later.
		"""
		self.modules.append(module)
		self.import_pipeline(module.pipe, names, start)

	def import_pipeline(self, pipe, names=None, start=0):
		"""
		Imports another pipeline. This should only be used in cases where the
		pipeline is in the same file as another pipeline.
		"""
		self.pipelines.append(pipe)
		self.import_arguments(pipe, names)
		self.import_stages(pipe, start)

	def make_state(self, *args):
		state = dict()
		for arg in args:
			state[arg] = deepcopy(self.state[arg])
		return state

	def get(self, key):
		return self.state.get(key)

	### This decorator adds stages to the pipeline.
	def stage(self, func):
		"""
		Decorator to add functions as stages of this pipeline.
		"""
		# Check that the input is a function.
		if not inspect.isfunction(func):
			utils.info("""
				The object you have decorated as a stage:
				%s
				is not an inspectable Python function.""" % func)
		else:
			self.add_stage(func)
		return func

	def add_stage(self, func):
		self.stages.append(func)
		# Provide references to this pipeline object for ingestion.
		if 'pipeline' not in func.__dict__:
			func.__dict__['pipeline'] = self.safe_name
		func.func_globals['ingest'] = self.ingest
		func.func_globals['make_state'] = self.make_state

	def list_stages(self):
		lines = ["pipeline stages:"]
		for i in range(0, self.size()):
			f = self.stages[i]
			lines.append(" %2d) [%s] %s" % (i, f.__name__, f.__doc__))
		return '\n'.join(lines)

	def size(self):
		"""
		Returns the size of the pipeline (the number of stages it contains).
		"""
		return len(self.stages)

	def parse_args(self):
		"""
		Reads values passed as arguments into the pipeline's *state*.
		"""
		# Create the parser on-the-fly so that we can incorporate the list of
		# stages as the epilog in the auto-generated help message.
		self.parser = argparse.ArgumentParser(
			description=self.desc,
			epilog=self.list_stages(),
			argument_default=argparse.SUPPRESS,
			formatter_class=argparse.RawDescriptionHelpFormatter)
		# All arguments added to the pipeline were cached in 'self.args'.
		for args, kwargs in self.args.values():
			self.parser.add_argument(*args, **kwargs)
		args = vars(self.parser.parse_args())
		if 'stage' in args:
			self.nstage = args.pop('stage')
			if self.nstage >= self.size():
				utils.die("Pipeline has no stage %d" % self.nstage)
		if 'skip' in args:
			self.skip = sum(map(lambda x: x.split(","), args.pop('skip')), [])
		# Restart from a checkpoint, if indicated.
		if 'restart' in args:
			chkfile = args.pop('restart')
			# If a checkpoint file was not specified on the command line, use
			# the default name.
			if not chkfile:
				chkfile = self.state['chkfile']
			self.restart(chkfile)
			self.parsed_args['restart'] = chkfile
			# Prune any arguments with default values that would override the
			# values in the state from the restart.
			for key in self.state:
				if key in args:
					if args[key] == self.args[key][1].get('default'):
						args.pop(key)
		for key, val in args.iteritems():
			self.state[key] = val
			self.parsed_args[key] = val

	def add_argument(self, name, **kwargs):
		"""
		Adds an argument `--name` to the pipeline. The single-character
		keyword argument 'short' is used as the short version of the argument
		(e.g. :samp:`short='n'` for `-n`). All other keyword arguments are
		passed through to the ArgumentParser when `parse_args` is called.
		"""
		args = ['--' + name]
		if 'short' in kwargs:
			args.append('-' + kwargs.pop('short'))
		if 'default' in kwargs:
			self.state[name] = kwargs['default']
			# Auto-add default values to the help string.
			kwargs['help'] = str(kwargs.get('help')) + " [%(default)s]"
		# Cache this argument for now: it will be added to the parser
		# when self.parse_args() is called.
		self.args[name] = (args, kwargs)

	def checkpoint(self):
		"""
		Writes the checkpoint file by making a deep copy of the pipeline's
		current *state* and pickling it to the value of `chkfile` in the
		state (by default, this is the pipeline's name followed by '.chk' in
		the current working directory).
		"""
		# Expand the checkpoint list so it is big enough to accommodate the
		# current stage.
		nchk = len(self.checkpoints)
		if self.nstage >= nchk:
			for i in range(nchk, self.nstage + 1):
				self.checkpoints.append(dict())
		# Make a deep copy of the current stage's state.
		self.checkpoints[self.nstage] = deepcopy(self.state)
		with open(self.state['chkfile'], 'w') as f:
			cPickle.dump(self.checkpoints, f)

	def restart(self, chkfile):
		"""
		Restart the pipeline from the last stage written to the checkpoint
		file `chkfile`, which is unpickled and loaded as the current *state*
		using a deepcopy.
		"""
		utils.info("Restarting from checkpoint '%s'" % chkfile)
		with open(chkfile, 'r') as f:
			self.checkpoints = cPickle.load(f)
		# By default (if the current stage is unset), use the latest stage
		# stored in the checkpoint.
		nchk = len(self.checkpoints)
		utils.info("Checkpoint has %d stage(s)" % nchk)
		if self.nstage < 0:
			# Verify the number of stages in the checkpoint.
			if nchk >= len(self.stages):
				utils.die("""
					This checkpoint has already completed the pipeline.
					To restart from an earlier stage in this checkpoint, use
					the --stage option.""")
			self.nstage = nchk
		elif self.nstage > nchk:
			utils.die("Can't restart at stage %d" % self.nstage)
		# Load the previous stage, since a checkpoint represents a completed
		# stage.
		if self.nstage > 0:
			self.state = deepcopy(self.checkpoints[self.nstage - 1])

	def run(self):
		"""
		Starts the pipeline at the stage specified with `--stage`, or at
		stage 0 if no stage was specified.
		"""
		self.nstage = max(self.nstage, 0)
		utils.info("Starting at stage %d" % self.nstage)
		for s in self.stages[self.nstage:]:
			self.run_stage(s)
			self.nstage += 1

	def rerun(self, state, start=0, stdout=None):
		"""
		Starts the pipeline without loading the command line arguments (e.g.
		for calling a full pipeline from within the stage of another
		pipeline), instead using the provided `state`. The pipeline's stdout
		stream can be temporarily redirected to a log file using `stdout`.
		"""
		# Clear checkpoints from previous run.
		self.checkpoints = list()
		# Replace the state.
		self.state = deepcopy(state)
		self.state['chkfile'] = self.safe_name + '.chk'
		# Set the start point and run.
		self.nstage = start
		if stdout:
			sys.stdout = stdout
		BasePipeline.run(self)
		if stdout:
			sys.stdout = sys.__stdout__

	def run_stage(self, func):
		"""
		Runs the current stage (from *self.nstage*) by using the
		:mod:`inspect` module to read the function signature of the decorated
		stage function, then injecting values from the *state* where the key
		matches the variable name in the function signature.
		"""
		name = func.__name__
		diagnostics.prefix = [func.__dict__['pipeline'], name]
		utils.info("""
			STAGE %d [%s]
			%s""" % (self.nstage, name, func.__doc__))
		if func.__name__ not in self.skip and func.__dict__['pipeline'] not in self.skip:
			diagnostics.log("skipped", False)
			argspec = inspect.getargspec(func)
			if argspec.defaults is not None:
				utils.info("""
					Stage '%s' has an argument with a default value, but this
					value will be ignored by the pipeline.""" % name)
			argdict = dict()
			for arg in argspec.args:
				if not arg in self.state:
					utils.die("error: argument '%s' is required" % arg)
				else:
					argdict[arg] = self.state[arg]
			ret = func(**argdict)
			# Try to ingest any dictionaries returned by the stage function.
			if ret is not None:
				try:
					self.state.update(ret)
				except ValueError:
					utils.info("""
						Could not ingest return object from the stage
						function. It must be a dictionary or a list of
						(key,val) tuples.""")
		else:
			diagnostics.log("skipped", True)
			utils.info("\n (Skipped.)")
		# Checkpoint is created at stage completion.
		self.checkpoint()

	def ingest(self, *args):
		"""
		Called from inside a pipeline stage to ingest values back into the
		pipeline's *state*. It uses the :mod:`inspect` module to get the
		calling function's (i.e. the stage function's) local variable
		dictionary, and copies the variable names specified in the `args`
		list.
		"""
		lvars = utils.get_caller_locals()
		for name in args:
			try:
				self.state[name] = lvars[name]
			except KeyError:
				utils.info("""
					Could not ingest variable '%s'
					The list of local variables does not contain that
					variable name.""" % name)


class Pipeline (BasePipeline):
	"""
	Extends BasePipeline to make use of the BioLite diagnostics and catalog
	databases.
	"""

	def __init__(self, name, desc=""):
		self.start = time.time()
		self.file = inspect.currentframe().f_back.f_code.co_filename
		BasePipeline.__init__(self, name, desc + """
			To input the results from a previous pipeline run, use the
			(--previous, -p) argument with a 'RUN_SPEC', which is either a
			specific run ID to lookup in the diagnostics, or the wildcard
			'*', meaning the latest of any previous run found in the
			diagnostics for the given catalog ID.""")
		diagnostics.prefix = [self.safe_name]
		# Command-line arguments.
		self.add_argument('id', short='i', type=utils.safe_str,
			default='NoID', help="""
			BioLite catalog ID of the input sequences.""")
		self.add_argument('newrun', action='store_true', help="""
			Create a new run ID for this run, even if it is a restart.""")
		self.add_argument('previous', short='p', metavar='RUN_SPEC',
			default=None, help="""
			Use the outputs of a previous pipeline run as inputs.""")
		self.add_argument('outdir', short='o',
			default=config.get_resource('outdir'), help="""
			Path to the permanent storage location. The pipeline will use
			the output directory: OUTDIR/ID/RUN_ID""")

	def set_outdir(self):
		"""Setup the output directory."""
		self.state['outdir'] = utils.safe_mkdir(
			os.path.join(
				self.state['outdir'],
				self.state['id'],
				str(self.state['_run_id'])))
		self.parsed_args['outdir'] = self.state['outdir']

	def get_file(self):
		"""Returns the absolute path to the file that this pipeline was
		created in."""
		if self.file is not None:
			return os.path.abspath(self.file)
		else:
			# This probably doesn't work. It returns the path to the
			# pipeline class location, which is probably biolite.pipeline
			return os.path.abspath(inspect.getmodule(self).__file__)

	def get_all_files(self):
		"""Returns a flat list of all the files this pipeline and its
		subpipelines are created in."""
		files = [self.get_file()]
		for mod in self.modules:
			files += mod.pipe.get_all_files()
		return files

	def run(self):
		# Parse arguments.
		self.parse_args()
		# Initialize the diagnostics file, used for global logging.
		id = self.state.get('id', None)
		if self.state.pop('newrun', False):
			# Create a new run ID by passing 'None' to init.
			run_id = None
			if 'outdir' not in self.parsed_args:
				utils.die("you must respecify the output directory for a new run")
		else:
			# Try to use the previous run ID.
			run_id = self.state.get('_run_id', None)
		self.state['_run_id'] = diagnostics.init(id, self.safe_name, run_id)
		# The output directory is set to: <outdir>/<id>/<run_id>
		# Only set if an outdir was specified as a parameter.
		# (Otherwise, if the outdir came from a restart, it has already been
		# set to the full path.)
		if 'outdir' in self.parsed_args:
			self.set_outdir()
		# Load the local diagnostics file into the diagnostics cache,
		# since some pipeline stages may depend on previous diagnostics
		# entries during a restart.
		diagnostics.load_cache()
		diagnostics.prefix = [diagnostics.INIT]
		diagnostics.log_dict(self.parsed_args)
		diagnostics.log_dict(config.resources, 'config.resources')
		diagnostics.log_dict(config.executables, 'config.executables', True)
		diagnostics.log_path(os.getcwd(), 'scratch')
		# Stick the configuration file into the output directory.
		# Don't use get_resource here because this is a special entry,
		# not specified in biolite.cfg!
		for path in config.resources['configpaths']:
			shutil.copy2(path, self.state['outdir'])
		diagnostics.log('configpaths', config.resources['configpaths'])
		# Log all the paths to the pipeline and subpipeline source code.
		diagnostics.log('pipelinepaths', self.get_all_files())
		# Set an exit handler for resource profiling.
		diagnostics.register_exit_profiler(self.start)
		BasePipeline.run(self)
		diagnostics.exit_profiler(self.start)

	# Call this at the end of a pipeline to store the paths to output files,
	# etc. in the diagnostics.
	def finish(self, *args):
		lvars = utils.get_caller_locals()
		# Always log into the current entity of the calling stage.
		# Only log output if this is the final stage.
		if self.nstage == len(self.stages) - 1:
			diagnostics.prefix = [diagnostics.EXIT]
			diagnostics.prefix.append('scratch')
			diagnostics.log_path(os.getcwd())
			diagnostics.prefix.pop()
			for name in args:
				try:
					diagnostics.log(name, lvars[name])
				except KeyError:
					utils.info("""
						Could not output final variable '%s'
						The list of local variables does not contain that
						variable name.""" % name)
		# Otherwise, ingest the values (e.g. in a meta-pipeline when reaching
		# the end of a sub-pipeline).
		else:
			self.state["scratch"] = os.getcwd()
			for name in args:
				try:
					self.state[name] = lvars[name]
				except KeyError:
					utils.info("""
						Could not ingest variable '%s'
						The list of local variables does not contain that
						variable name.""" % name)

	# Add a hook for 'finish'.
	def add_stage(self, func):
		BasePipeline.add_stage(self, func)
		func.func_globals['finish'] = self.finish

	def log_state(self, *names):
		prefix_save = diagnostics.prefix
		diagnostics.prefix = [diagnostics.EXIT]
		for name in names:
			try:
				diagnostics.log(name, self.state[name])
			except KeyError:
				utils.info("no variable '%s' in pipeline state", name)
		diagnostics.prefix = prefix_save


# Initial stage (0) used by IlluminaPipeline.
def setup_fastq_paths(id, previous, fastq):
	"""Determine the paths of the FASTQ input sequence data"""
	if fastq:
		# Manually specified paths.
		data = [map(os.path.abspath, fastq)]
	elif previous:
		# Lookup output paths from a previous pipeline run.
		values = diagnostics.lookup_prev_run(id, previous)
		try:
			data = diagnostics.str2list(values['data'])
		except KeyError:
			utils.die("No output FASTQ paths in diagnostics for previous run.")
	else:
		# Use the paths in the catalog entry.
		record = catalog.select(id)
		# Split the paths, and make sure there are two.
		try:
			data = [catalog.split_paths(record.paths)]
		except AttributeError:
			utils.die("Catalog entry missing for id '%s'." % id)
		except IndexError:
			utils.die("Catalog entry does not have 2 paths.")
	diagnostics.log('data', data)
	ingest('data')


class IlluminaPipeline (Pipeline):
	"""
	An extension of Pipeline that assumes that the input model is a forward
	and reverse FASTQ pair, such as a paired-end Illumina data set.
	"""

	def __init__(self, name, desc=""):
		Pipeline.__init__(self, name, desc + """
			By default, the paths in the catalog are used as the input
			sequence files, if you have specified a catalog ID. You can
			manually specify your own input files using the (--fastq, -f)
			argument.""")
		# Override the file found by the Pipeline constructor (which is this
		# file, pipeline.py).
		self.file = inspect.currentframe().f_back.f_code.co_filename
		# Command-line arguments.
		self.add_argument('fastq', short='f', nargs=2, default=None, help="""
			Manually specify paths to the forward and reverse FASTQ inputs.""")
		self.add_argument('gzip', short='z', action='store_true',
			default=False, help="""
			Use gzip compression for output files.""")
		# Add setup stage.
		self.add_stage(setup_fastq_paths)

	# Default to starting imports at index 1, since we always add
	# 'setup_fastq_paths' as the 0 stage.
	def import_stages(self, pipe, start=1):
		BasePipeline.import_stages(self, pipe, start)