1. Casey Dunn
  2. agalma

Source

agalma / agalma / genetree.py

#!/usr/bin/env python
#
# Agalma - Tools for processing gene sequence data and automating workflows
# Copyright (c) 2012-2013 Brown University. All rights reserved.
# 
# This file is part of Agalma.
# 
# Agalma is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# Agalma is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with Agalma.  If not, see <http://www.gnu.org/licenses/>.

'''
Builds gene trees for each set of homologous sequences, it builds a
phylogenetic tree using the maximum likelihood optimality criterion as
implemented in RAxML; see http://www.exelixis-lab.org/ and
doi:10.1093/bioinformatics/btl446
'''

import glob
import os

from dendropy import Tree
import numpy

from Bio import AlignIO

from biolite import diagnostics
from biolite import report
from biolite import utils
from biolite import wrappers

from biolite.pipeline import Pipeline

pipe = Pipeline('genetree', __doc__)

# Command-line arguments
pipe.add_argument('align_id', default=None, metavar='ID', help="""
	Use the alignments from a previous multalign run.""")
pipe.add_argument('align_dir', default=None, metavar='DIR', help="""
	Use an explicit path to a directory containing a FASTA (.fa) file for
	each alignment.""")
pipe.add_argument('seq_type', default='protein', help="""
	Choose whether gene trees will be built using protein or nucleotide
	sequences.""")
pipe.add_argument('model', short='m', default='PROTGAMMAWAG', help="""
	Choose a substitution model of evolution for RAxML.""")
pipe.add_argument('bootstrap', type=int, default=100, help="""
	Number of bootstrap replicates to run for RAxML.""")
pipe.add_argument('raxml_flags', default='', help="""
	Extra parameters to pass to RAxML. Make sure the flags make sense: refer to
	the RAxML Manual for help.""")
pipe.add_argument('min_support', short='s', type=float, default=0, help="""
	Filter out trees with mean bootstrap below min_support.""")

# Pipeline stages
@pipe.stage
def setup_path(id, align_dir, align_id, seq_type):
	"""Determine the path of the input alignments"""

	if not align_id:
		prev = diagnostics.lookup_last_run(id, 'multalign')
		if prev:
			align_id = int(prev.run_id)
			utils.info("using previous 'multalign' run id %d" % align_id)

	if align_dir:
		align_dir = os.path.abspath(align_dir)
	elif align_id:
		# Lookup values for a previous run
		values = diagnostics.lookup(align_id, diagnostics.EXIT)
		if seq_type == 'protein':
			try:
				align_dir = values['protdir']
			except KeyError:
				utils.die("No 'protdir' entry found in run %s" % align_id)
		elif seq_type == 'nucleotide':
			try:
				align_dir = values['nucdir']
			except KeyError:
				utils.die("No 'nucdir' entry found in run %s" % align_id)
	else:
		utils.die(
			"No directory or run id provided: use --align_dir or --align_id")
		
	ingest('align_dir')


@pipe.stage
def convert_format(align_dir, outdir):
	"""Convert alignments from fasta to phylip format"""
	
	phylip_dir = "phylip-aligns"
	utils.safe_mkdir(phylip_dir)

	for fasta in glob.glob(os.path.join(align_dir, '*.fa')):
		basename = os.path.splitext(os.path.basename(fasta))[0]
		phylip = os.path.join(phylip_dir, basename + '.phy')
		# Convert to phylip-relaxed to allow taxon names longer than 10	
		try:
			AlignIO.convert(fasta, 'fasta', phylip, 'phylip-relaxed')
		except ValueError:
			utils.info("warning: skipping empty cluster '%s'" % basename)
	
	ingest('phylip_dir')


@pipe.stage
def genetrees(phylip_dir, seq_type, model, bootstrap, raxml_flags, outdir):
	"""Build gene trees"""
	
	prot_models = ('PROTCATWAG', 'PROTCATLG',
	               'PROTGAMMAWAG', 'PROTGAMMALG')
	nuc_models = ('GTRMIX', 'GTRCAT', 'GTRGAMMA')

	if seq_type == 'protein' and model not in prot_models:
		utils.die(
			"%s is not a valid model for %s sequences" % (model, seq_type))
	elif seq_type == 'nucleotide' and model not in nuc_models:
		utils.die(
			"%s is not a valid model for %s sequences" % (model, seq_type))

	all_trees_dir = os.path.join(outdir, 'all-trees')
	utils.safe_mkdir(all_trees_dir)
	
	if bootstrap:
		# Configure raxml to run both bootstraps and ml
		seed = 12345
		raxml_flags = '-f a -N {0} -x {1} '.format(bootstrap, seed) + raxml_flags
	
	with open('genetree.commands.sh', 'w') as f:
		for phylip in glob.glob(os.path.join(phylip_dir, '*.phy')):
			basename = os.path.splitext(os.path.basename(phylip))[0]
			print >>f, '{0} {1} {2} {3} {4} {5} -T {6}'.format(
				config.get_command('raxml')[0], phylip, basename, model, 
				all_trees_dir, extra_flags=raxml_flags, config.get_resource('threads'))
	# Run RAxML
	wrappers.Parallel(
                'genetree.commands.sh', '--joblog', 'genetree.commands.log', '--resume',
                return_ok=None, threads=1)

	diagnostics.log('raxml_dir', all_trees_dir)
	ingest('all_trees_dir')

@pipe.stage
def filtertrees(all_trees_dir, min_support, bootstrap, outdir):
	"""Filter trees according to their mean support"""

	if not bootstrap:
		utils.info("skipping: boot-strapping is disabled")
		filtered_trees_dir = all_trees_dir
		finish('filtered_trees_dir')

	filtered_trees_dir = os.path.join(outdir, 'filtered_trees')
	utils.safe_mkdir(filtered_trees_dir)	

	hist = {}
	for tree_path in glob.glob(os.path.join(all_trees_dir, 'RAxML_bipartitions.*')):
		basename = os.path.basename(tree_path)
		tree = Tree(stream=open(tree_path), schema='newick')
		nodes = tree.internal_nodes()
		support_vals = []

		# Loop over the nodes in the tree
		for node in nodes:
			# Skip nodes without support, ie the root
			if node.label:
				support = float(node.label)
				support_vals.append( support )
		
		mean_support = 0
		if len(support_vals) > 0:
			mean_support = numpy.mean( support_vals )
		hist[mean_support] = hist.get(mean_support, 0) + 1

		if mean_support >= min_support:
			tree.write_to_path(
							os.path.join(filtered_trees_dir, basename), 
							'newick', as_unrooted=True)			

	diagnostics.log('histogram', str(hist))
	finish('filtered_trees_dir')

if __name__ == "__main__":
	# Run the pipeline.
	pipe.run()
	# Push the local diagnostics to the global database.
	diagnostics.merge()

# Report. #
class Report(report.BaseReport):
	def init(self):
		self.name = pipe.name
		# Lookups
		self.lookup('outdir', diagnostics.INIT, 'outdir')
		self.lookup('raxml_dir', 'genetree.genetrees', 'raxml_dir')
		self.lookup('histogram', 'genetree.filtertrees', attribute='histogram')
		# Generators
		self.generator(self.mean_support_histogram)
		self.generator(self.supermatrix_tree)

	def mean_support_histogram(self):
		"""
		Histograms of mean bootstrap supports within each tree.
		"""
		if self.check('histogram'):
			hist = eval(self.data.histogram)
			title = "Distribution of Mean Bootstrap Supports"
			imgname = '%d.mean_supports.png' % self.run_id
			props = {
				'xlabel': "Mean Bootstrap Support",
				'xlim': [0, max(hist.iterkeys())],
				'ylabel': "Frequency"}
			return [self.histogram(imgname, hist, props=props)]

	def supermatrix_tree(self):
		"""
		Maximum-likelihood tree for the supermatrix.
		"""
		if self.check('raxml_dir'):
			newick = os.path.join(
						self.data.raxml_dir,
						'RAxML_bestTree.supermatrix')
			if os.path.exists(newick):
				self.add_js('jsphylosvg-min.js')
				self.add_js('raphael-min.js')
				return ["""
					<div id="run_{0}_canvas"></div>
					<script type="text/javascript">
					var run_{0}_newick="{1}";
					Smits.PhyloCanvas.Render.Style.text["font-style"]="italic";
					run_{0}_canvas = new Smits.PhyloCanvas(
						run_{0}_newick,
						"run_{0}_canvas",
						800, 800);
					</script>
					""".format(
						self.run_id,
						open(newick).readline().rstrip().replace('_',' '))]