# BioLite - Tools for processing gene sequence data and automating workflows
# Copyright (c) 2012-2013 Brown University. All rights reserved.
#
# This file is part of BioLite.
#
# BioLite is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# BioLite is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with BioLite. If not, see <http://www.gnu.org/licenses/>.
"""
Provides a collection of helper functions that coordinate multiple wrappers
from the :ref:`wrappers` module to accomplish a unified goal or automate a
common analysis task.

Workflows are available for the following groups of tasks:

* Assembly statistics and sweeps
* Contig parsing
* Blast result parsing
* SamTools automation
* Transcript cleaning
"""
import os
import re
import shutil
import sys
from Bio.Blast import NCBIXML
from Bio import SeqIO
from collections import namedtuple, OrderedDict
import config
import diagnostics
import utils
import wrappers
ContigHeader = namedtuple('ContigHeader', "locus transcript confidence length")
"""
A namedtuple for storing the information from a transcript header.
"""
standard_contig_header = \
"Locus_{0}_Transcript_{1}/0_Confidence_{2}_Length_{3}\n"
oases_header_pattern = re.compile(\
r'Locus_([\d\.]+)_Transcript_(\d+)/\d+_Confidence_([\.\d]+)_Length_(\d+)$')
trinity_header_pattern = re.compile(\
r'comp(\d+)_c(\d+)_seq(\d+) len=(\d+) path=\[([-: \d]+)\]$')
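
# As a hedged illustration of how the header patterns above pair with
# ContigHeader, an unpack helper might look like the following sketch
# ('unpack_oases_header' is a hypothetical name, not defined in this module):
#
#   def unpack_oases_header(header):
#       match = oases_header_pattern.match(header)
#       if match is None:
#           raise ValueError("not an Oases header: %s" % header)
#       locus, transcript, confidence, length = match.groups()
#       return ContigHeader(locus, transcript, float(confidence), int(length))
#
#   unpack_oases_header('Locus_1_Transcript_1/2_Confidence_0.667_Length_424')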
def contig_stats(fasta_path, hist_path):
"""
Parses the assembled contigs in `fasta_path` and writes a histogram of
contig length to `hist_path`.
Writes the total contig count, mean length, and N50 length to
the diagnostics.
"""
count = 0
mean = 0.0
n50 = 0
# Parse the fasta file to construct a histogram, stored as a dictionary
# {contig_length: frequency}, and to calculate total contig count and
# mean length.
hist = dict()
total_length = 0
lengths = list()
for record in SeqIO.parse(fasta_path, 'fasta'):
count += 1
length = len(record.seq)
lengths.append(length)
hist[length] = hist.get(length, 0) + 1
lengths.sort()
total_length = sum(lengths)
if count:
mean = total_length / float(count)
# Find the n50 value
cum_length = 0
for length in lengths:
cum_length += length
if cum_length >= (total_length / 2):
n50 = length
break
# Loop through the histogram to dump it to a text file.
with open(hist_path, 'w') as f:
for key in sorted(hist.keys()):
print >>f, key, hist[key]
diagnostics.prefix.append('contig_stats')
diagnostics.log('count', count)
diagnostics.log('mean', mean)
diagnostics.log('n50', n50)
diagnostics.log_path(hist_path)
diagnostics.prefix.pop()
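
# A small worked example of the N50 computation above (assumed data, for
# illustration only): for contig lengths [2, 3, 4, 5, 6], total_length is 20
# and the cumulative sums over the sorted lengths are 2, 5, 9, 14, 20. The
# first length where the cumulative sum reaches total_length/2 = 10 is 5, so
# n50 = 5, with count = 5 and mean = 4.0.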
def oases_clean(workdir='./'):
"""
Cleans up a work directory that was used for an Oases assembly.
"""
files = ('Sequences', 'Roadmaps', 'PreGraph', 'Graph2', 'LastGraph', 'contigs.fa', 'Log')
for f in files:
utils.safe_remove(os.path.join(workdir, f))
def oases_assemblies(inputs, kmers=[61], workdir='./', min_length=None, ins_length=None):
"""
Automates Oases assemblies that sweep multiple `kmers`.
If `inputs` is a list of FASTQ files, they are automatically shuffled
together. Or, provide a singleton list with the path to a pre-shuffled
FASTQ file.
"""
diagnostics.prefix.append('oases_assemblies')
contig_list = list()
for k in kmers:
diagnostics.prefix.append('k%d' % k)
subdir = os.path.join(workdir, 'k%d' % k)
# clear out any previous runs
shutil.rmtree(subdir, ignore_errors=True)
utils.safe_mkdir(subdir)
#oases_clean(subdir)
wrappers.VelvetH(inputs, subdir, kmer=k)
wrappers.VelvetG(subdir, ins_length, min_length=min_length)
# makes transcripts.fa
wrappers.Oases(subdir, ins_length, min_length=min_length)
contigs = os.path.join(subdir, 'transcripts.fa')
contigs_k = os.path.join(workdir, 'transcripts-k%d.fa' % k)
shutil.move(contigs, contigs_k)
contig_list.append(contigs_k)
diagnostics.prefix.pop()
diagnostics.prefix.pop()
return contig_list
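
# Example usage (a sketch; the FASTQ paths, k-mer values, and insert length
# are assumptions, not BioLite defaults):
#
#   contigs = oases_assemblies(
#       ['reads.1.fastq', 'reads.2.fastq'],
#       kmers=[51, 61, 71],
#       workdir='oases',
#       ins_length=300)
#   # contigs == ['oases/transcripts-k51.fa', 'oases/transcripts-k61.fa',
#   #             'oases/transcripts-k71.fa']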
def oases_concat_assembly(inputs, concat_path, kmers, workdir='./', ins_length=None):
"""
Performs Oases assemblies sweeping over the provided `kmers` list, and
concatenates all contigs to `concat_path`.
If `inputs` is a list of FASTQ files, they are automatically shuffled
together. Or, provide a singleton list with the path to a pre-shuffled
FASTQ file.
"""
utils.truncate_file(concat_path)
	# Pass ins_length by keyword so it is not mistaken for min_length.
	for contigs in oases_assemblies(inputs, kmers, workdir, ins_length=ins_length):
utils.cat_to_file(contigs, concat_path)
def oases_merge_assembly(inputs, merge_path, merge_kmer, kmers, min_length=None, workdir='./', ins_length=None):
"""
Implements the Oases-M protocol for merging several Oases assemblies, as
described in:
Schulz, M. H., Zerbino, D. R., Vingron, M., & Birney, E. (2012). Oases:
Robust de novo RNA-seq assembly across the dynamic range of expression
levels. Bioinformatics (Oxford, England), 1-7.
doi:10.1093/bioinformatics/bts094
Performs Oases assemblies sweeping over the provided `kmers` list, then
performs a Oases merge assembly with `merge_kmer`.
"""
diagnostics.prefix.append('oases_merge_assembly')
oases_clean(workdir)
contig_list = oases_assemblies(inputs, kmers, workdir, min_length, ins_length)
wrappers.VelvetH(contig_list, workdir, kmer=merge_kmer, merge=True)
wrappers.VelvetG(workdir, merge=True, min_length=min_length)
wrappers.Oases(workdir, merge=True, min_length=min_length)
contigs = os.path.join(workdir, 'transcripts.fa')
shutil.move(contigs, merge_path)
diagnostics.log('kmer', merge_kmer)
diagnostics.log_path(merge_path)
diagnostics.prefix.pop()
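
# Example usage of the Oases-M protocol (a sketch; the paths and k-mer
# values are assumptions):
#
#   oases_merge_assembly(
#       ['reads.1.fastq', 'reads.2.fastq'],
#       merge_path='merged_transcripts.fa',
#       merge_kmer=27,
#       kmers=[51, 61, 71],
#       workdir='oases')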
def trinity_assembly(out, inputs, workdir='./', min_length=None):
	"""
	Automates a Trinity assembly of the FASTQ files in `inputs`, running the
	butterfly commands in parallel, then rewrites the Trinity contig headers
	into the standard contig header format and writes the contigs to `out`.
	"""
	wrappers.Trinity(inputs, workdir, min_length=min_length, seq_type='fq')
commands = os.path.join(workdir, 'chrysalis', 'butterfly_commands')
contigs = os.path.join(workdir, 'Trinity.fasta')
if os.path.exists(commands):
wrappers.ParallelButterfly(commands)
else:
utils.die("no butterfly command output from Trinity")
if os.path.exists(contigs):
with open(contigs, 'r') as f1, open(out, 'w') as f2:
for record in SeqIO.parse(f1, 'fasta'):
match = trinity_header_pattern.match(record.description)
if match is not None:
num, sub, seq, length, path = match.groups()
record.name = ''
record.description = ''
					# Strip the trailing newline from the header template so
					# the FASTA header stays on one line.
					record.id = standard_contig_header.format(
						"%s.%s" % (num, sub), seq,
						sum(c == ' ' for c in path), length).strip()
					SeqIO.write(record, f2, 'fasta')
else:
utils.die("no contig output from Trinity")
BlastHit = namedtuple('BlastHit', "query title definition id evalue rank orient mask score bitscore length percent")
"""
A namedtuple for storing several fields of a Blast hit.
"""
def blast_hits(xml_path, nlimit=None):
	"""
	Reads an XML formatted BLAST report, and yields one named tuple per
	alignment, i.e. per hit between a query and a subject. Each named tuple
	has the following elements:

	query title definition id evalue rank orient mask score bitscore length percent

	where:

	* orient is 1 if query and subject are in the same direction, -1 if they
	  are in the opposite direction, and 0 if direction is inconsistent
	  across hsp's
	* evalue is the minimum evalue across hsp's
	* score, bitscore and length are maximal across hsp's
	* mask is a string the same length as the query, with '1' at sites
	  covered by at least one hsp and '0' elsewhere
	* percent is the length of the longest hsp divided by the subject length
	"""
with open(xml_path, 'r') as f:
# Loop over the blast records.
for entry in NCBIXML.parse(f):
# Limit the number of alignments per query, if nlimit is set.
if nlimit:
alignments = entry.alignments[:nlimit]
else:
alignments = entry.alignments
rank = 0
# Loop over the alignments.
for alignment in alignments:
rank = rank + 1
evalue = sys.maxint
score = -1
bitscore = -1.0
length = -1
orient = 0
				# Initialize a mask of the sites that are within hsp's. The
				# mask has the same length as the query: sites marked '0' are
				# not in any hsp, and sites marked '1' are in at least one.
mask = ['0'] * entry.query_length
# Loop over the hsp's to get the lowest evalue and the
# orientation.
				for hsp in alignment.hsps:
					# Check if the orientation is the same by comparing the
					# direction of the query and subject coordinates.
					if (hsp.query_start < hsp.query_end) == \
						(hsp.sbjct_start < hsp.sbjct_end):
						orient |= 1
					# Or different.
					else:
						orient |= 2
					# Record the lowest e-value.
					evalue = min(evalue, hsp.expect)
					# Create a 2 element list with the starting and end point
					# of the hsp, adjusted to be 0 offset rather than 1 offset.
					ends = [hsp.query_start-1, hsp.query_end-1]
					# Sort the list so that start is less than finish, then
					# set the sites in the hsp range to '1'.
					ends.sort()
					for i in range(ends[0], ends[1] + 1):
						mask[i] = '1'
					# Record the highest score.
					score = max(score, hsp.score)
					# Record the highest bitscore.
					bitscore = max(bitscore, hsp.bits)
					# Record the longest HSP.
					length = max(length, hsp.align_length)
				# Convert the mask list to a string.
				mask_string = ''.join(mask)
				# Set orient to 0 if it is a mix between hsp's (=3),
				# otherwise map the consistent value to 1 (same direction)
				# or -1 (opposite direction).
				orient = -1 if orient == 2 else orient % 3
yield BlastHit(
entry.query,
alignment.title,
alignment.hit_def,
alignment.hit_id,
evalue,
rank,
orient,
mask_string,
score,
bitscore,
length,
length / float(alignment.length))
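
# Example usage (a sketch; 'report.xml' is an assumed path):
#
#   for hit in blast_hits('report.xml', nlimit=5):
#       if hit.evalue < 1e-6 and hit.orient == 1:
#           print hit.query, hit.title, hit.evalue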
def blast_top_hits(xml_path):
"""
Similar to `blast_hits`, but returns an OrderedDict keyed by query name
with only one hit (the top hit) per query.
"""
hits = OrderedDict()
for hit in blast_hits(xml_path, nlimit=1):
hits[hit.query] = hit
return hits
def blast_annotate_seqs(hits, fasta_in, hits_out, misses_out, all_out=False, rpkms={}):
"""
	Iterates through the records in `fasta_in` and looks for a hit in a dict
	of BlastHit objects, `hits`.
For each record with a hit, the RPKM (if provided), hit title, and evalue
are added to the ID and the record is written to `hits_out`.
If there is no hit, the record is written to `misses_out`.
If `all_out` is True, then hits are also written to `misses_out`.
"""
with open(hits_out, 'w') as f1, open(misses_out, 'w') as f2:
for seq in SeqIO.parse(open(fasta_in), 'fasta'):
hit = hits.get(seq.id)
new_id = [str(seq.id).strip().replace('|', '_')]
seq.description = ''
if seq.id in rpkms:
new_id += ('RPKM', '%.2f' % rpkms[seq.id])
if hit:
new_id += ('BlastHit', hit.title, '%g' % hit.evalue)
seq.id = '|'.join(new_id)
SeqIO.write(seq, f1, 'fasta')
if all_out:
SeqIO.write(seq, f2, 'fasta')
else:
seq.id = '|'.join(new_id)
SeqIO.write(seq, f2, 'fasta')
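
# Example usage (a sketch chaining blast_top_hits with blast_annotate_seqs;
# the paths are assumptions):
#
#   hits = blast_top_hits('report.xml')
#   blast_annotate_seqs(hits, 'contigs.fa', 'hits.fa', 'misses.fa')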
rRNAhit = namedtuple('rRNAhit', "locus gene confidence orient query")
"""
A named tuple that associates a Blast hit for a transcript with the locus and
confidence values for the transcript.
"""
def rrna_blast_hits(xml_path, unpack_header_func):
	"""
	Reads an XML formatted BLAST report, and saves one top hit per locus,
	using the transcript with the highest confidence for the locus.

	The locus name and confidence are extracted from the query name with
	the supplied `unpack_header_func` function.

	Returns both a set of all the queries in the XML report, and a dictionary
	keyed by locus and storing the rRNA hits:

	set(queries), dict(hits)

	The rRNA hits are tuples with the following fields:

	(locus gene confidence orient query)
	"""
queries = set()
hits = OrderedDict()
for hit in blast_hits(xml_path, nlimit=1):
queries.add(hit.query)
# Select exemplar based on confidence for each locus with rRNA
# similarity.
header = unpack_header_func(hit.query)
# Extract gene name from the hit definition.
# Example definition:
		# 18s_gi|60545004|gb|AY937338.1| Nanomia bijuga small subunit ribosomal RNA gene, partial sequence
gene = hit.definition.partition('|')[0]
# Create rRNA hit tuple.
new_hit = rRNAhit(header.locus, gene, header.confidence, hit.orient, hit.query)
# Insert new exemplar, or replace existing exemplars that have lower
# confidence.
existing_hit = hits.get(header.locus, None)
if not existing_hit or new_hit.confidence > existing_hit.confidence:
hits[header.locus] = new_hit
return queries, hits
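
# Example usage (a sketch; it assumes the hypothetical unpack_oases_header
# helper illustrated near the top of this module):
#
#   queries, hits = rrna_blast_hits('rrna_blast.xml', unpack_oases_header)
#   for locus, hit in hits.iteritems():
#       print locus, hit.gene, hit.confidence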
def multiblast(blast, query, db, out, evalue=0.0001, cores=4, targets=20):
	"""
	Prepares a single `query` file for a parallel Blast run by dividing the
	queries into :samp:`nodes = threads/cores` many chunks, where `threads`
	is from the BioLite configuration file.

	Executes the Blast operation `blast` (e.g. 'blastx') in parallel on each
	node, then concatenates the XML output into a single XML file `out`.
	"""
threads = int(config.get_resource('threads'))
nodes = threads / cores
if threads % cores:
nodes += 1
# Break up query.
qbase = os.path.basename(query)
qlist = list()
outfiles = list()
for i in xrange(nodes):
qlist.append("%s.multiblast.%d" % (qbase, i))
outfiles.append(open(qlist[-1], 'w'))
	# Distribute the query records round-robin across the chunk files.
	with open(query, 'r') as f:
		i = 0
		for record in SeqIO.parse(f, 'fasta'):
			outfiles[i].write(">%s\n%s\n" % (record.id, record.seq))
			if nodes > 0:
				i = (i + 1) % nodes
for f in outfiles:
f.close()
wrappers.MultiBlast(blast, cores, qlist, db, out, evalue, targets)
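
# Example usage (a sketch; the database path and file names are assumptions):
#
#   multiblast('blastx', 'contigs.fa', 'swissprot', 'swissprot.xml',
#              evalue=1e-6, cores=2)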
def sort_and_index_sam(sam_path):
	"""
	Uses SamTools to convert a SAM file at `sam_path` to BAM, then sort and
	index the BAM.

	Returns the filename of the final output, which is `sam_path` with its
	extension replaced by '_sorted.bam'.
	"""
basename = os.path.splitext(sam_path)[0]
bam_path = basename + '.bam'
sorted_path = basename + '_sorted.bam'
diagnostics.prefix.append('bam')
wrappers.SamToBam(sam_path, bam_path)
diagnostics.prefix.pop()
diagnostics.prefix.append('sort')
wrappers.SamSort(bam_path, basename + '_sorted')
diagnostics.prefix.pop()
diagnostics.prefix.append('index')
wrappers.SamIndex(sorted_path)
diagnostics.prefix.pop()
return sorted_path
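
# Example usage (a sketch; 'aligned.sam' is an assumed path):
#
#   sorted_bam = sort_and_index_sam('aligned.sam')
#   # sorted_bam == 'aligned_sorted.bam'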
def calculate_rpkms(coverage_table):
	"""
	Loads a tab-delimited `coverage_table` with columns (id, length,
	coverage) and returns a dict mapping each id to its RPKM value.
	"""
	rpkms = dict()
	# Load the coverage table into a dict and calculate RPKM as follows:
	# (1e9 * coverage) / (length * coverage_sum)
coverage_sum = 0
with open(coverage_table, 'r') as f:
for line in f:
col = line.strip().split('\t')
length = float(col[1])
coverage = float(col[2])
rpkms[col[0]] = 1e9 * coverage / length
coverage_sum += coverage
coverage_norm = 1.0 / coverage_sum
for key in rpkms:
rpkms[key] *= coverage_norm
return rpkms
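
# A small worked example of the RPKM calculation above (assumed numbers, for
# illustration only): for a table with rows ('a', 1000, 5) and ('b', 2000, 15),
# coverage_sum is 20, so:
#
#   rpkms['a'] = 1e9 * 5 / (1000 * 20) = 250000.0
#   rpkms['b'] = 1e9 * 15 / (2000 * 20) = 375000.0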
def dustmasker(fasta_in, clean_out, dirty_out, max_lowc=0.8, min_region=0.1):
	#unpack_func=unpack_oases_header):
	"""
	Runs Dustmasker on the transcripts in `fasta_in` to soft-mask
	low-complexity regions. Transcripts with more than a `max_lowc` fraction
	of low-complexity bases, or without a high-complexity region covering at
	least a `min_region` fraction of their length, are written to
	`dirty_out`; all other transcripts are written to `clean_out`. Summary
	counts are logged to the diagnostics.
	"""
dust_out = 'dust.fa'
wrappers.Dustmasker(fasta_in, dust_out)
stats = {
'bases': 0,
'transcripts': 0,
'lowc_bases': 0,
'lowc_regions': 0,
#'lowc_loci': 0,
'lowc_transcripts': 0,
'lowc_min_regions': 0}
	def count_upper_regions(seq):
		# Accumulate the lengths of runs of uppercase (high-complexity)
		# characters in the dust-masked sequence.
		lengths = [0]
		for i in xrange(len(seq) - 1):
			if seq[i].isupper():
				lengths[-1] += 1
				# An uppercase run ends where the next character is masked.
				if seq[i+1].islower():
					lengths.append(0)
		# Account for the final character, which the loop above skips.
		if seq[-1].isupper():
			lengths[-1] += 1
		return lengths
# last_locus = ''
# lowc_locus = False
with open(clean_out, 'w') as f1, open(dirty_out, 'w') as f2:
for record in SeqIO.parse(open(dust_out), 'fasta'):
stats['transcripts'] += 1
#locus = unpack_func(record.id).locus
#if locus != last_locus:
# last_locus = locus
# stats['lowc_loci'] += lowc_locus
# lowc_locus = True
l = len(record.seq)
stats['bases'] += l
lowc_len = sum(1 for c in record.seq if c.islower())
stats['lowc_bases'] += lowc_len
if lowc_len:
stats['lowc_regions'] += 1
if float(lowc_len) / float(l) > max_lowc:
stats['lowc_transcripts'] += 1
record.description = 'maxlowc'
SeqIO.write(record, f2, 'fasta')
elif float(max(count_upper_regions(record.seq))) < min_region * l:
stats['lowc_min_regions'] += 1
record.description = 'minregion'
SeqIO.write(record, f2, 'fasta')
else:
lowc_locus = False
SeqIO.write(record, f1, 'fasta')
diagnostics.prefix.append('dustmasker')
diagnostics.log_dict(stats)
diagnostics.prefix.pop()
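
# A hedged illustration of the count_upper_regions helper above (assumed
# sequence, for illustration only): for the masked sequence
# 'ACGTacgtACGTACGTacgt', the uppercase run lengths are [4, 8, 0]. With the
# defaults, the low-complexity fraction is 8/20 = 0.4 <= max_lowc (0.8) and
# the longest high-complexity region spans 8 >= min_region * 20 = 2 bases,
# so the record would be written to clean_out.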
def clean_rrna(fasta_in, clean_out, rrna_out):
"""
Blastn against rRNA, transferring sequences with or without a hit to their
own files. Even when rRNA reads are removed prior to assembly, some may
make it through and be assembled from the full dataset (including low
frequency contaminant rRNAs).
"""
# BLAST transcripts against known rRNA database.
rrna_blast = 'rrna_blast.xml'
dbname = os.path.join(os.getcwd(), 'rRNA_blast')
wrappers.MakeBlastDB('nucl', config.get_resource('rrna_fasta'), dbname)
multiblast('blastn', fasta_in, dbname, rrna_blast, cores=1)
hits = blast_top_hits(rrna_blast)
utils.info("Total of %d matches against ribosomal RNA." % len(hits))
blast_annotate_seqs(hits, fasta_in, rrna_out, clean_out)
def clean_univec(fasta_in, clean_out, vector_out):
	"""
	Blastn against UniVec, transferring sequences with or without a hit to
	their own files. This removes sequences that still have adapters, or that
	are contaminated with plasmids (including the protein expression plasmids
	used to manufacture sample prep enzymes).
	"""
# BLAST transcripts against UniVec.
univec_blast = 'univec_blast.xml'
multiblast(
'blastn',
fasta_in,
config.get_resource('univec_blastdb'),
univec_blast,
cores=1)
hits = blast_top_hits(univec_blast)
utils.info("Total of %d matches against UniVec." % len(hits))
blast_annotate_seqs(hits, fasta_in, vector_out, clean_out)
def clean_swissprot(fasta_in, clean_out, annotated_out, blast_out, rpkms=None):
	"""
	Blastx against SwissProt, transferring sequences with or without a hit to
	their own files, used in comparing assemblies.
	"""
# BLAST transcripts against SwissProt.
multiblast(
'blastx',
fasta_in,
config.get_resource('swissprot_blastdb'),
blast_out,
evalue=1e-6,
cores=1)
hits = blast_top_hits(blast_out)
utils.info("Total of %d matches against SwissProt." % len(hits))
blast_annotate_seqs(hits, fasta_in, clean_out, annotated_out, True, rpkms)
return hits
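
# Example usage chaining the cleaning steps (a sketch; all paths are
# assumptions):
#
#   clean_rrna('contigs.fa', 'norrna.fa', 'rrna.fa')
#   clean_univec('norrna.fa', 'clean.fa', 'vectors.fa')
#   hits = clean_swissprot('clean.fa', 'hits.fa', 'annotated.fa',
#                          'swissprot.xml')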
def filter_coverage_table(coverage_table, seq_ids, filtered_table):
"""
Filters a `coverage_table` so that only entries with IDs in the list
`seq_ids` remain and writes output to the path `filtered_table`.
"""
diagnostics.prefix.append('filter_coverage_table')
with open(coverage_table, 'r') as file_in:
with open(filtered_table, 'w') as file_out:
for line in file_in:
if line.partition('\t')[0] in seq_ids:
file_out.write(line)
diagnostics.log_path(filtered_table)
diagnostics.prefix.pop()
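
# Example usage (a sketch; it assumes seq_ids collected from a cleaned
# FASTA file):
#
#   seq_ids = set(record.id for record in SeqIO.parse('clean.fa', 'fasta'))
#   filter_coverage_table('coverage.txt', seq_ids, 'coverage_clean.txt')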