# BioLite - Tools for processing gene sequence data and automating workflows
# Copyright (c) 2012-2013 Brown University. All rights reserved.
#
# This file is part of BioLite.
#
# BioLite is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# BioLite is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with BioLite. If not, see <http://www.gnu.org/licenses/>.
"""
A series of wrappers for external calls to various bioinformatics tools.
"""
import glob
import os
import random
import shlex
import subprocess
import sys
from collections import namedtuple
import config
import diagnostics
import utils
class BaseWrapper:
	"""
	A base class that handles generic wrapper functionality.

	Wrappers for specific programs should inherit this class, call `self.init`
	to specify their `name` (which is a key into the executable entries in the
	BioLite configuration file), and append their arguments to the `self.args`
	list.

	By convention, a wrapper should call `self.run()` as the final line in its
	`__init__` function. This allows for clean syntax and use of the wrapper
	directly, without assigning it to a variable name, e.g.

	wrappers.MyWrapper(arg1, arg2, ...)

	When your wrapper runs, BaseWrapper will do the following:

	* log the complete command line to diagnostics;
	* optionally call the program with a version flag (invoked with `version`)
	  to obtain a version string, then log this to the :ref:`programs-table`
	  along with a hash of the binary executable file;
	* append the command's stderr to a file called `name`.log in the CWD;
	* also append the command's stdout to the same log file, unless you set
	  `self.stdout`, in which case stdout is redirected to a file of that name;
	* on Linux, add a memory profiling library to the LD_PRELOAD environment
	  variable;
	* call the command and check its return code (which should be 0 on success,
	  unless you specify a different code with `self.return_ok`), optionally
	  using the CWD specified in `self.cwd` or the environment specified in
	  `self.env`;
	* parse the stderr of the command to find [biolite.profile] markers and
	  use the rusage values from `utils.safe_call` to populate a profile
	  entity in the diagnostics with walltime, usertime, systime, mem, and
	  vmem attributes.
	"""
	def __init__(self, name):
		self.name = name
		self.cmd = config.get_command(name)
		self.args = []
		self.return_ok = 0
		self.cwd = None
		self.stdout = None
		self.env = None
		self.max_concurrency = sys.maxint
		self.output_patterns = None

	init = __init__
	"""A shortcut for calling the BaseWrapper __init__ from a subclass."""
	def check_arg(self, flag, value):
		"""
		If `value` is not None, append `flag` and `value` to the argument
		list.
		"""
		if value is not None:
			self.args.append(flag)
			self.args.append(value)
	def add_threading(self, flag):
		"""
		Indicates that this wrapper should use threading by appending an
		argument with the specified `flag` followed by the number of threads
		specified in the BioLite configuration file.
		"""
		threads = min(int(config.get_resource('threads')), self.max_concurrency)
		if threads > 1:
			self.args.append(flag)
			self.args.append(threads)
	def add_openmp(self):
		"""
		Indicates that this wrapper should use OpenMP by setting the
		$OMP_NUM_THREADS environment variable equal to the number of threads
		specified in the BioLite configuration file.
		"""
		threads = min(int(config.get_resource('threads')), self.max_concurrency)
		self.env = os.environ.copy()
		self.env['OMP_NUM_THREADS'] = str(threads)
	def version(self, flag=None, cmd=None, path=None):
		"""
		Generates and logs a hash to distinguish this particular installation
		of the program (on a certain host, with a certain compiler, program
		version, etc.)

		Specify the optional `cmd` argument if the command to run for version
		information is different from what will be invoked by `run` (e.g. if
		the program has a Perl wrapper script, but you want to version an
		underlying binary executable).

		Specify the optional `path` argument if the file to hash is not the
		first entry of the version command, e.g. the binary executable that
		is likely to change between versions rather than its wrapper script.
		"""
		# Setup the command to run.
		if not cmd:
			cmd = list(self.cmd)
		if flag:
			cmd.append(flag)
		# Run the command.
		try:
			vstring = subprocess.check_output(cmd, stderr=subprocess.STDOUT)
		except subprocess.CalledProcessError as e:
			vstring = e.output
		except OSError as e:
			utils.failed_executable(cmd[0], e)
		if not path:
			path = cmd[0]
		# Generate a hash.
		vhash = diagnostics.log_program_version(self.name, vstring, path)
		if vhash:
			diagnostics.prefix.append(self.name)
			diagnostics.log('version', vhash)
			diagnostics.prefix.pop()
	def version_jar(self):
		"""
		Special case of version() when the executable is a JAR file.
		"""
		cmd = config.get_command('java')
		cmd.append('-jar')
		cmd += self.cmd
		self.version(cmd=cmd, path=self.cmd[0])
	def run(self, cmd=None):
		"""
		Call this function at the end of your class's `__init__` function.
		"""
		if not cmd:
			cmd = self.cmd
		diagnostics.log('command_executable', ' '.join(cmd))
		cmd += map(str, self.args)
		diagnostics.prefix.append(self.name)
		diagnostics.log('command', ' '.join(cmd))
		diagnostics.log('command_list', str(cmd))
		stderr_path = os.path.join(self.name + '.log')
		stderr_file = open(stderr_path, 'a')
		print >> stderr_file, "[biolite] timestamp=%s" % utils.timestamp()
		stderr_file.flush()
		diagnostics.log('log', stderr_path)
		# Write to a stdout file if it was set by the derived class.
		# Otherwise, stdout and stderr will be combined into the log file.
		if self.stdout:
			stdout_file = open(self.stdout, 'w')
			diagnostics.log('stdout', self.stdout)
		else:
			stdout_file = stderr_file
		# Setup memory profiling on Linux.
		if config.kernel == 'Linux':
			if not self.env:
				self.env = os.environ.copy()
			old_preload = self.env.get('LD_PRELOAD')
			self.env['LD_PRELOAD'] = os.path.join(config.libdir, 'libmemusage.so')
			if old_preload:
				self.env['LD_PRELOAD'] += ":%s" % old_preload
			diagnostics.log("LD_PRELOAD", self.env['LD_PRELOAD'])
		retcode, walltime, rusage = utils.safe_call(
			cmd,
			stdout=stdout_file,
			stderr=stderr_file,
			cwd=self.cwd,
			return_ok=self.return_ok,
			env=self.env)
		# Log profile.
		diagnostics.prefix.append('profile')
		diagnostics.log('name', self.name)
		diagnostics.log('return', retcode)
		diagnostics.log('walltime', walltime)
		diagnostics.log('usertime', rusage.ru_utime)
		diagnostics.log('systime', rusage.ru_stime)
		if config.kernel == 'Darwin':
			diagnostics.log('mem', rusage.ru_maxrss / 1024)
		diagnostics.prefix.pop()
		stderr_file.close()
		if self.stdout:
			stdout_file.close()
		# Reverse any output patterns, since they will be matched against
		# program output from the last line backward.
		if self.output_patterns:
			self.output_patterns.reverse()
			diagnostics.log_program_output(stderr_path, self.output_patterns)
		diagnostics.prefix.pop()
	def run_jar(self, mem=None):
		"""
		Special case of run() when the executable is a JAR file.
		"""
		cmd = config.get_command('java')
		if mem:
			cmd.append('-Xmx%s' % mem)
		cmd.append('-jar')
		cmd += self.cmd
		self.run(cmd)

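# Example: a minimal wrapper sketch following the conventions described in
# BaseWrapper's docstring. The 'mytool' executable entry and its '-i'/'-o'
# flags are hypothetical, not part of BioLite.
#
#	class MyTool (BaseWrapper):
#		def __init__(self, input, output):
#			self.init('mytool')    # key into the config's executable entries
#			self.version('-v')     # log a version hash to diagnostics
#			self.args += ('-i', input, '-o', output)
#			self.run()             # log, profile, and check the return code
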
def estimate_insert_size():
	"""
	For tools that need insert sizes, use available estimates from the
	diagnostics database, or resort to the default values in the BioLite
	configuration file.

	Returns an AttributeDict with the fields `mean`, `stddev` and `max`.
	"""
	entity = diagnostics.INSERT_SIZE
	size = utils.AttributeDict()
	# Try the diagnostics cache, if the estimate is from an earlier stage
	# of the current pipeline.
	size.mean = float(diagnostics.local_lookup(entity).get('mean', 0))
	size.stddev = float(diagnostics.local_lookup(entity).get('stddev', 0))
	size.max = 0
	if not size.mean:
		# Search the global diagnostics database (passing None to lookup_by_id
		# will use the currently initialized id in diagnostics).
		values = diagnostics.lookup_by_id(None, entity)
		size.mean = float(values.get('mean', 0))
		size.stddev = float(values.get('stddev', 0))
	if not size.mean:
		size.mean = int(config.get_resource('max_insert_size'))
		size.stddev = int(config.get_resource('mean_insert_size'))
	if not size.max:
		size.max = size.mean + \
			size.stddev * int(config.get_resource('max_insert_stddev'))
	return size

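# Example: this is how the Bowtie2 and Trinity wrappers below consume the
# estimate when the caller does not supply an insert size explicitly:
#
#	if max_insert is None:
#		max_insert = int(estimate_insert_size().max)
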
### BioLite command line tools ###
class CountLines (BaseWrapper):
	"""
	usage: count_lines [-t THREADS] [INPUT ...]

	Count the number of lines in the INPUT files using multiple threads to
	increase throughput.
	"""
	def __init__(self, *inputs):
		self.init('count_lines')
		self.version('-v')
		self.add_threading('-t')
		self.args += inputs
		self.run()

class Coverage (BaseWrapper):
	"""
	usage: coverage [-i SAM] [-o STATS]

	Parses a SAM alignment file and writes a coverage table to STATS with
	columns for the reference name, the length of the reference, and the
	number of reads covering it in the alignment.
	"""
	def __init__(self, sam, stats):
		self.init('coverage')
		self.version('-v')
		self.args += ('-i', sam)
		self.stdout = stats
		self.run()

class Exclude (BaseWrapper):
	"""
	usage: exclude -x EXCLUDE_FILE [-k] [...] [-i INPUT ...] [-o OUTPUT ...]

	Filters all the reads in the input files (FASTA or FASTQ is automatically
	detected) and excludes those with ids found in any of the EXCLUDE_FILEs.

	If multiple input files are specified, these are treated as paired files.
	So if a sequence in one input is excluded, its pair is also excluded from
	the same position in all other input files.

	If the -k flag is specified, invert the selection to keep instead of
	exclude.
	"""
	def __init__(self, exclude_files, input_files, output_files, keep=False):
		self.init('exclude')
		self.version('-v')
		if keep:
			self.args += ('-k',)
		for x in exclude_files:
			self.args += ('-x', x)
		for i in input_files[:2]:
			self.args += ('-i', i)
		for o in output_files[:2]:
			self.args += ('-o', o)
		self.run()

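# Example usage (a sketch with hypothetical file names): drop the reads whose
# ids are listed in rrna.ids from both ends of a paired-end data set.
#
#	wrappers.Exclude(['rrna.ids'], ['reads.1.fq', 'reads.2.fq'],
#	                 ['filtered.1.fq', 'filtered.2.fq'])
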
class Fastq2Fasta (BaseWrapper):
	"""
	usage: fastq2fasta -i FASTQ [...] [-o FASTA ...] [-q QUAL ...] [-a]
	                   [-t OFFSET] [-s SUFFIX]

	Converts each FASTQ input file to a FASTA file and quality score file
	with the names <basename>.fasta and <basename>.fasta.qual, where
	<basename> is the name of the FASTQ input up to the last period (or with
	the names FASTA and QUAL if specified).

	FASTA and QUAL are *appended* to (not truncated).
	"""
	def __init__(self, fastq_path, fasta_path=None, qual_path=None, suffix=None):
		self.init('fastq2fasta')
		self.version('-v')
		self.args += ('-i', fastq_path)
		self.check_arg('-o', fasta_path)
		self.check_arg('-q', qual_path)
		self.check_arg('-s', suffix)
		self.run()

class Fasta2Fastq (BaseWrapper):
	"""
	usage: fasta2fastq -i FASTA [...] -q QUAL [...] [-o FASTQ] [-a] [-t OFFSET]

	Merges each FASTA file and its corresponding QUAL file into a FASTQ file
	with the name <basename>.fastq, where <basename> is the FASTA name up to
	the last period (or with the name FASTQ if specified).

	FASTQ is *appended* to (not truncated).
	"""
	def __init__(self, fasta_path, qual_path, fastq_path=None):
		self.init('fasta2fastq')
		self.version('-v')
		self.args += ('-i', fasta_path, '-q', qual_path)
		self.check_arg('-o', fastq_path)
		self.run()

class FilterIllumina (BaseWrapper):
	"""
	usage: filter_illumina [-i INPUT ...] [-o OUTPUT ...] [-u UNPAIRED-OUTPUT]
	                       [-t OFFSET] [-q QUALITY] [-n NREADS] [-a] [-b]
	                       [-s SEP]

	Filters out low-quality and adapter-contaminated reads from Illumina data.

	If multiple input files are specified, these are treated as paired files.
	So if a sequence in one input is filtered, its pair is also filtered from
	the same position in all other input files.
	"""
	def __init__(self, inputs, outputs, unpaired_output=None, offset=None, quality=None, nreads=None, adapters=True, bases=True, sep=None):
		self.init('filter_illumina')
		self.version('-v')
		for input in inputs[:2]:
			self.args += ('-i', input)
		for output in outputs[:2]:
			self.args += ('-o', output)
		self.check_arg('-u', unpaired_output)
		self.check_arg('-t', offset)
		self.check_arg('-q', quality)
		self.check_arg('-n', nreads)
		if adapters:
			self.args.append('-a')
		if bases:
			self.args.append('-b')
		self.check_arg('-s', sep)
		self.run()

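# Example usage (hypothetical file names and quality cutoff): filter a
# paired-end data set, writing reads whose mate was filtered to a separate
# unpaired file.
#
#	wrappers.FilterIllumina(
#		['reads.1.fq', 'reads.2.fq'], ['filtered.1.fq', 'filtered.2.fq'],
#		unpaired_output='unpaired.fq', quality=28)
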
class Interleave (BaseWrapper):
	"""
	usage: interleave -i INPUT [...] [-o OUTPUT] [-s SEP]

	Interleaves the records in the input files (FASTA or FASTQ is
	automatically detected) and writes them to OUTPUT, or to stdout if no
	OUTPUT is specified.
	"""
	def __init__(self, inputs, output, sep=None):
		self.init('interleave')
		self.version('-v')
		for i in inputs[:2]:
			self.args += ('-i', i)
		self.check_arg('-s', sep)
		self.args += ('-o', output)
		self.run()

class Randomize (BaseWrapper):
	"""
	usage: randomize [-i INPUT] [-o OUTPUT] [-r READ-ORDER] [-w WRITE-ORDER]

	Randomizes the order of sequences in each INPUT file and writes these to
	a corresponding OUTPUT file. By default, a new random write order is
	generated, and saved to WRITE-ORDER if specified. Alternatively,
	specifying a READ-ORDER file uses that order instead of a random one.
	"""
	def __init__(self, input, output, order_mode=None, order_file="order.txt"):
		self.init('randomize')
		self.version('-v')
		self.args += ('-i', input, '-o', output)
		if order_mode is None or order_mode is True:
			self.args += ('-w', order_file)
		elif isinstance(order_mode, str):
			self.args += ('-%c' % order_mode, order_file)
		elif order_mode is False:
			self.args += ('-r', order_file)
		self.run()

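# Example usage (hypothetical file names): randomize a paired-end data set
# consistently by writing the random order for the first file, then replaying
# it for the second.
#
#	wrappers.Randomize('reads.1.fq', 'random.1.fq', order_file='order.txt')
#	wrappers.Randomize('reads.2.fq', 'random.2.fq', order_mode=False,
#	                   order_file='order.txt')
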
class InsertStats (BaseWrapper):
	"""
	usage: insert_stats -i SAM -o HIST -m MAX_INSERT

	Reads a SAM alignment file and uses it to estimate the mean and std. dev.
	of the insert size of the mapped paired-end reads. A histogram of all
	insert sizes encountered is written to the HIST file.
	"""
	def __init__(self, input, histogram=None, histogram_max=None):
		self.init('insert_stats')
		self.version('-v')
		self.args += ('-i', input)
		if histogram:
			self.args += ('-o', histogram)
		if histogram_max:
			self.args += ('-m', histogram_max)
		else:
			self.args += ('-m', config.get_resource('max_insert_size'))
		self.run()

class PileupStats (BaseWrapper):
	"""
	usage: pileup_stats -i PILEUP -o HIST -m OVERLAP

	Reads a SAMtools pileup file and uses it to find potential sequence
	disconnects. A histogram of all disconnect events encountered is written
	to the HIST file.
	"""
	def __init__(self, input, histogram=None, overlap=None):
		self.init('pileup_stats')
		self.version('-v')
		self.args += ('-i', input)
		if histogram is not None:
			self.args += ('-o', histogram)
		if overlap is not None:
			self.args += ('-m', overlap)
		self.run()

### Third-party command line tools ###
class FastQC (BaseWrapper):
	"""
	A wrapper for FastQC.
	http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/
	"""
	def __init__(self, input, outdir):
		self.init('fastqc')
		self.version('-v')
		# FastQC can run in a multithreaded mode with the -t option.
		self.add_threading('-t')
		self.args += ('-o', outdir, input)
		self.run()

class Dustmasker (BaseWrapper):
	"""
	A wrapper for dustmasker from NCBI Blast+.
	http://nebc.nerc.ac.uk/bioinformatics/docs/blast+.html
	"""
	def __init__(self, input, output, window=None, level=None, linker=None, infmt='fasta', outfmt='fasta'):
		self.init('dustmasker')
		self.version('-version-full')
		self.check_arg('-window', window)
		self.check_arg('-level', level)
		self.check_arg('-linker', linker)
		self.args += (
			'-in', input, '-infmt', infmt, '-out', output,
			'-outfmt', outfmt)
		self.run()

class Segmasker (BaseWrapper):
	"""
	A wrapper for segmasker from NCBI Blast+.
	http://nebc.nerc.ac.uk/bioinformatics/docs/blast+.html
	"""
	def __init__(self, input, output, window=None, locut=None, hicut=None, infmt="fasta", outfmt="fasta"):
		self.init('segmasker')
		self.version('-version-full')
		self.check_arg('-window', window)
		self.check_arg('-locut', locut)
		self.check_arg('-hicut', hicut)
		self.args += (
			'-in', input, '-infmt', infmt, '-out', output,
			'-outfmt', outfmt, '-parse_seqids')
		self.run()

class Blastn (BaseWrapper):
	"""
	A wrapper for blastn from NCBI Blast. http://blast.ncbi.nlm.nih.gov/
	"""
	def __init__(self, query, db, out, outfmt=5, evalue=0.0001, targets=20):
		self.init('blastn')
		self.version('-version')
		self.args += (
			'-query', query, '-db', os.path.abspath(db), '-out', out,
			'-outfmt', outfmt, '-evalue', evalue, '-max_target_seqs', targets)
		self.add_threading('-num_threads')
		self.run()

class Blastp (BaseWrapper):
	"""
	A wrapper for blastp from NCBI Blast. http://blast.ncbi.nlm.nih.gov/
	"""
	def __init__(self, query, db, out, outfmt=5, evalue=0.0001, targets=20):
		self.init('blastp')
		self.version('-version')
		self.args += (
			'-query', query, '-db', os.path.abspath(db), '-out', out,
			'-outfmt', outfmt, '-evalue', evalue, '-max_target_seqs', targets)
		self.add_threading('-num_threads')
		self.run()

class Blastx (BaseWrapper):
	"""
	A wrapper for blastx from NCBI Blast. http://blast.ncbi.nlm.nih.gov/
	"""
	def __init__(self, query, db, out, outfmt=5, evalue=0.0001, targets=20):
		self.init('blastx')
		self.version('-version')
		self.args += (
			'-query', query, '-db', os.path.abspath(db), '-out', out,
			'-outfmt', outfmt, '-evalue', evalue, '-max_target_seqs', targets)
		self.add_threading('-num_threads')
		self.run()

class Rpsblast (BaseWrapper):
	"""
	A wrapper for rpsblast from NCBI Blast. http://blast.ncbi.nlm.nih.gov/
	"""
	def __init__(self, query, db, out, outfmt=5, evalue=0.0001):
		self.init('rpsblast')
		self.version('-version')
		self.args += (
			'-query', query, '-db', os.path.abspath(db), '-out', out,
			'-outfmt', outfmt, '-evalue', evalue)
		self.add_threading('-num_threads')
		self.run()

class MultiBlast (BaseWrapper):
	"""
	usage: multiblast PROGRAM THREADS QUERY_LIST OUT [ARGS]

	Runs a BLAST program (e.g. blastx, blastn, blastp) in parallel on a list
	of queries (in QUERY_LIST). Additional arguments to PROGRAM can be
	appended as ARGS.

	The PROGRAM is called on each query with threading equal to THREADS.
	Recommendation: set THREADS to the number of cores divided by the number
	of query files.

	The individual XML outputs for each query file are concatenated into a
	single output file OUT.

	Example usage:

	multiblast blastn 4 "query1.fa query2.fa" all-queries.xml -e 1e-6
	"""
	def __init__(self, blast, threads, qlist, db, out, evalue=0.0001, targets=20):
		if not glob.glob(db + '.*'):
			utils.die("missing blast database '%s'" % db)
		self.init(blast)
		self.version('-version')
		self.args += (
			threads, ' '.join(qlist), out,
			'-db', os.path.abspath(db), '-evalue', evalue,
			'-max_target_seqs', targets)
		self.run(config.get_command('multiblast') + self.cmd)

class MakeBlastDB (BaseWrapper):
	"""
	A wrapper for makeblastdb from NCBI Blast. http://blast.ncbi.nlm.nih.gov/
	"""
	def __init__(self, dbtype, in_name, db_name):
		self.init('makeblastdb')
		self.version('-version')
		self.args += (
			'-dbtype', dbtype, '-in', in_name,
			'-out', os.path.abspath(db_name))
		self.run()

class Bowtie2 (BaseWrapper):
	"""
	A wrapper for the bowtie2 short-read aligner.
	http://bowtie-bio.sourceforge.net/
	"""
	def __init__(self, inputs, mapping_file, output_path, local=True, sensitive=True, all=True, max_insert=None):
		self.init('bowtie2')
		self.version('--version', config.get_command('bowtie2-align'))
		inputs = tuple(inputs)
		if len(inputs) == 1:
			self.args += ('-U', inputs[0])
		elif len(inputs) == 2:
			if max_insert is None:
				max_insert = int(estimate_insert_size().max)
			self.args += ('-1', inputs[0], '-2', inputs[1], '-X', max_insert)
		else:
			utils.die("Bowtie2 wrapper expects either 1 (SE) or 2 (PE) inputs")
		if local:
			self.args.append('--local')
		else:
			self.args.append('--end-to-end')
		if sensitive:
			self.args.append('--very-sensitive')
		if all:
			self.args.append('-a')
		else:
			self.args += ('-M', 2)
		self.add_threading('-p')
		self.args += (
			'--reorder', '--no-unal', '-t', '-S', output_path, mapping_file)
		self.output_patterns = map(diagnostics.OutputPattern._make, [
			(r"(\d+) reads; of these:$", 0, "nreads"),
			(r" (\d+) \S+ were paired; of these:$", 0, "npairs"),
			(r" (\d+) \S+ aligned concordantly 0 times$", 0, "nconcord0"),
			(r" (\d+) \S+ aligned concordantly exactly 1 time$", 0, "nconcord1"),
			(r" (\d+) \S+ aligned concordantly >1 times$", 0, "nconcord2"),
			(r" (\d+) \S+ aligned discordantly 1 time$", 0, "ndiscord1"),
			(r" (\d+) mates make up the pairs; of these:$", 0, "nunpaired"),
			(r" (\d+) \S+ aligned 0 times$", 0, "nunpaired0"),
			(r" (\d+) \S+ aligned exactly 1 time$", 0, "nunpaired1"),
			(r" (\d+) \S+ aligned >1 times$", 0, "nunpaired2")])
		self.run()

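# Example usage (hypothetical paths): map a paired-end data set against an
# index prefix built with Bowtie2Build, writing all alignments to hits.sam.
# The insert size limit is estimated from diagnostics when not given.
#
#	wrappers.Bowtie2(['reads.1.fq', 'reads.2.fq'], 'ref-index', 'hits.sam')
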
class Bowtie2Build (BaseWrapper):
	"""
	A wrapper for the bowtie2-build component of Bowtie2.
	http://bowtie-bio.sourceforge.net/
	"""
	def __init__(self, input_path, outdir_path):
		self.init('bowtie2-build')
		self.version('--version')
		self.args += (input_path, outdir_path)
		self.run()

class SamToBam (BaseWrapper):
	"""A wrapper for `samtools view` that converts a SAM file to BAM."""
	def __init__(self, input_path, output_path):
		self.init('samtools')
		self.version()
		self.args += ('view', '-bS', '-o', output_path, input_path)
		self.run()

class SamView (BaseWrapper):
	"""A wrapper for `samtools view` restricted to the given regions."""
	def __init__(self, input_path, regions, output_path):
		self.init('samtools')
		self.version()
		self.args += ('view', '-o', output_path, input_path)
		self.args += regions
		self.run()

class SamSort (BaseWrapper):
	"""A wrapper for `samtools sort`."""
	def __init__(self, input_path, output_path):
		self.init('samtools')
		self.version()
		self.args += ('sort', input_path, output_path)
		self.run()

class SamIndex (BaseWrapper):
	"""A wrapper for `samtools index`."""
	def __init__(self, input_path):
		self.init('samtools')
		self.version()
		self.args += ('index', input_path)
		self.run()

class SamPileup (BaseWrapper):
	"""A wrapper for `samtools mpileup` against a reference FASTA."""
	def __init__(self, reference_path, bam_path, output_path):
		self.init('samtools')
		self.version()
		self.args += (
			'mpileup', '-BQ0', '-d1000000000', '-f', reference_path, bam_path)
		self.stdout = output_path
		self.run()

class Trinity (BaseWrapper):
	"""
	A wrapper for the Trinity *de novo* transcriptome assembler.
	"""
	def __init__(self, inputs, outdir, max_insert=None, min_length=None, seq_type='fq'):
		self.init('trinity')
		self.version('--version')
		# We have to turn return code checking off, because Trinity can exit
		# on errors during the print_butterfly_assemblies.pl script even
		# though it has completed successfully.
		self.return_ok = None
		if max_insert is None:
			max_insert = int(estimate_insert_size().max)
		if len(inputs) == 1:
			self.args += ['--single', inputs[0]]
		elif len(inputs) == 2:
			self.args += ['--left', inputs[0], '--right', inputs[1]]
		else:
			utils.die("Trinity wrapper expects either 1 (SE) or 2 (PE) inputs")
		mem = config.get_resource('memory').upper()
		self.args += (
			'--no_run_butterfly', '--seqType', seq_type, '--output', outdir,
			'--JM', mem)
		self.check_arg('--min_contig_length', min_length)
		self.check_arg('--group_pairs_distance', max_insert)
		self.add_threading('--CPU')
		self.run()

class ParallelButterfly (BaseWrapper):
	"""
	Runs the Butterfly component of Trinity in parallel, using the multijar
	helper and a param file listing the Butterfly commands.
	"""
	def __init__(self, commands):
		self.init('butterfly')
		java = config.get_command('java')[0]
		butterfly = config.get_command('butterfly')[0]
		threads = int(config.get_resource('threads'))
		mem = int(0.9 * utils.mem_to_mb(config.get_resource('memory')))
		self.args += (
			'--butterfly', '--jar', butterfly, '--java', java,
			'--cpus', threads, '--mem', mem, '--paramfile', commands)
		self.run(config.get_command('multijar'))

class Oases (BaseWrapper):
	"""
	A wrapper for Oases, a *de novo* transcriptome assembler.
	http://www.ebi.ac.uk/~zerbino/oases/
	"""
	def __init__(self, outdir, ins_length=None, ins_length_sd=None, min_length=None, merge=False):
		self.init('oases')
		self.version()
		self.args.append(outdir)
		if merge:
			self.args += ('-merge', 'yes')
		else:
			if ins_length is None:
				estimates = estimate_insert_size()
				ins_length = int(estimates.mean)
				if estimates.stddev > 0:
					ins_length_sd = int(estimates.stddev)
			self.args += ('-ins_length', ins_length)
			self.check_arg('-ins_length_sd', ins_length_sd)
		self.check_arg('-min_trans_lgth', min_length)
		self.run()

class VelvetH (BaseWrapper):
	"""
	A wrapper for the velveth component of the Velvet *de novo* assembler.
	http://www.ebi.ac.uk/~zerbino/velvet/

	If `merge` is True, `inputs` must be a list of transcript FASTA files.
	Otherwise, `inputs` should be a list of either one FASTQ file of unpaired
	short reads (SE) or two FASTQ files that form a pair (PE).
	"""
	def __init__(self, inputs, outdir, kmer=61, merge=False):
		self.init('velveth')
		self.version()
		self.max_concurrency = 16
		self.args += (outdir, kmer)
		if merge:
			self.args += ('-fasta', '-long')
			self.args += inputs
		elif len(inputs) == 1:
			self.args += ('-fastq', '-short', inputs[0])
		elif len(inputs) == 2:
			self.args += (
				'-fastq', '-shortPaired', '-separate', inputs[0], inputs[1])
		else:
			utils.die("VelvetH wrapper expects either 1 (SE) or 2 (PE) inputs")
		self.add_openmp()
		self.run()

class VelvetG (BaseWrapper):
	"""
	A wrapper for the velvetg component of the Velvet *de novo* assembler.
	http://www.ebi.ac.uk/~zerbino/velvet/
	"""
	def __init__(self, outdir, ins_length=None, ins_length_sd=None, min_length=None, merge=False, exp_cov='auto'):
		self.init('velvetg')
		self.version()
		self.max_concurrency = 16
		self.args += (outdir, '-read_trkg', 'yes', '-exp_cov', exp_cov)
		if merge:
			self.args += ('-conserveLong', 'yes')
		else:
			if ins_length is None:
				estimates = estimate_insert_size()
				ins_length = int(estimates.mean)
				if estimates.stddev > 0:
					ins_length_sd = int(estimates.stddev)
			self.args += ('-ins_length', ins_length)
			self.check_arg('-ins_length_sd', ins_length_sd)
		self.check_arg('-min_contig_lgth', min_length)
		self.add_openmp()
		self.run()

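# Example usage (hypothetical paths): a typical velveth -> velvetg -> oases
# sequence for a paired-end assembly, sharing one working directory. Insert
# lengths default to the estimate from the diagnostics database.
#
#	wrappers.VelvetH(['reads.1.fq', 'reads.2.fq'], 'assembly', kmer=61)
#	wrappers.VelvetG('assembly')
#	wrappers.Oases('assembly')
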
class Macse (BaseWrapper):
	"""
	Multiple alignment of coding sequences.
	"""
	def __init__(self, input, output, frameshift=-40, stopcodon=-150):
		self.init('macse')
		self.version_jar()
		self.args += (
			'-i', input, '-o', output, '-f', frameshift, '-s', stopcodon)
		mem = 0.9 * utils.mem_to_mb(config.get_resource('memory'))
		self.run_jar('%dM' % mem)

class ParallelMacse (BaseWrapper):
	"""
	Multiple alignment of coding sequences, run in parallel.
	"""
	def __init__(self, inputs, outputs, frameshift=-40, stopcodon=-150, workfile='paramacse.commands.txt'):
		self.init('macse')
		self.version_jar()
		if len(inputs) != len(outputs):
			utils.die("ParallelMacse: input/output lists of different lengths")
		with open(workfile, 'w') as f:
			for i, _ in enumerate(inputs):
				print >>f, "-i\t%s\t-o\t%s\t-f\t%d\t-s\t%d\t%s.macse.log" % (
					inputs[i], outputs[i], frameshift, stopcodon, inputs[i])
		java = config.get_command('java')[0]
		macse = config.get_command('macse')[0]
		threads = int(config.get_resource('threads'))
		mem = int(0.6 * utils.mem_to_mb(config.get_resource('memory')))
		self.args += (
			'--jar', macse, '--java', java,
			'--cpus', threads, '--mem', mem, '--paramfile', workfile)
		self.run(config.get_command('multijar'))

class Raxml (BaseWrapper):
	"""
	Maximum likelihood-based inference of phylogenetic trees.
	"""
	def __init__(self, input, output, model, output_dir, pars_rseed=None, extra_flags=None):
		self.init('raxml')
		self.version('-v')
		self.add_threading('-T')
		if pars_rseed is None:
			pars_rseed = random.randint(0, sys.maxint)
		self.args += (
			'-s', input, '-n', output, '-m', model, '-w', output_dir,
			'-p', pars_rseed)
		if extra_flags:
			self.args += shlex.split(extra_flags)
		self.run()

class Gblocks (BaseWrapper):
	"""
	Selection of conserved blocks from multiple sequence alignments for
	phylogenetics.
	"""
	def __init__(self, input, t='p', b1=None, b2=None, b3=10, b4=5, b5='a'):
		self.init('Gblocks')
		# Ignore Gblocks' broken exit code.
		self.return_ok = None
		self.args += (input, '-t=' + t)
		if b1 is not None:
			self.args.append('-b1=' + str(b1))
		if b2 is not None:
			self.args.append('-b2=' + str(b2))
		self.args += ('-b3=' + str(b3), '-b4=' + str(b4), '-b5=' + str(b5))
		self.run()

class Mcl (BaseWrapper):
	"""
	Analysis of networks.
	"""
	def __init__(self, input, output, inflation=2.1):
		self.init('mcl')
		self.version('--version')
		self.args += (input, '--abc', '-I', inflation, '-o', output)
		self.add_threading('-te')
		self.run()

class Parallel (BaseWrapper):
	"""
	A wrapper for the GNU parallel utility.
	http://www.gnu.org/software/parallel/
	"""
	def __init__(self, commands, *args, **kwargs):
		self.init('parallel')
		self.version('--version')
		self.args += ('-a', commands)
		self.add_threading('-j')
		self.args += args
		self.stdout = kwargs.get('stdout')
		self.cwd = kwargs.get('cwd')
		self.run()

class Sga (BaseWrapper):
	"""
	String Graph Assembler.
	"""
	def __init__(self, command, *args, **kwargs):
		self.init('sga')
		self.version('--version')
		self.args.append(command)
		self.args += args
		if command in frozenset(('index', 'merge', 'correct', 'fm-merge', 'overlap', 'filter')):
			self.add_threading('-t')
		self.cwd = kwargs.get('cwd')
		self.run()
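
# Example usage (a sketch; the subcommand arguments and read file are
# hypothetical): arguments are passed through verbatim, and a thread count
# is appended automatically for the subcommands that support it.
#
#	wrappers.Sga('index', '-a', 'ropebwt', 'reads.fq')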