Commits

Mark Howison committed 3e305ef

Fixes #58: improved multalign performance and robustness

Comments (0)

Files changed (1)

agalma/multalign.py

 import database
 
 from biolite import catalog
+from biolite import config
 from biolite import diagnostics
 from biolite import report
 from biolite import utils
 	taxa = {}
 	nseqs = 0
 
-	utils.safe_mkdir('untrimmed_clusters')
+	untrim_dir = utils.safe_mkdir('untrimmed_clusters')
 
 	# Load components from database
 	sql = """
 			# than 5.
 			if len(taxa_count) >= min_taxa and taxa_mean < 5:
 				name = 'homologs_{0}_{1}'.format(_run_id, comp_id)
-				fasta  = os.path.join('untrimmed_clusters', name + '.fa')
+				fasta  = os.path.join(untrim_dir, name + '.fa')
 				SeqIO.write(records, open(fasta, 'w'), 'fasta')
 				clusters.append(Cluster(name, size, weight, [fasta]))
 				nseqs += size
 def trim_sequences(clusters):
 	"""Compare homologs within a cluster by all-to-all BLAST and trim sequence ends not included in any HSP"""
 
-	utils.safe_mkdir('trimmed_clusters')
-	
-	# Loop through each fasta file
-	for cluster in clusters:
-
-		fasta = cluster.fasta[-1]
-		trimmed_fasta = os.path.join('trimmed_clusters', cluster.name + '.fa')
-		cluster.fasta.append(trimmed_fasta)
-
-		# Create blast DB
-		dbname = os.path.join(os.getcwd(), "trim_allvall") 
-		wrappers.MakeBlastDB('nucl', fasta, dbname)
+	# Create an allvall tblastx command for each cluster
+	with open('trim.commands.sh', 'w') as f:
+		for cluster in clusters:
+			print >>f, '{0} -dbtype nucl -in {1} && {2} -db {1} -query {1} -evalue 1e-6 -outfmt "6 qseqid sseqid qstart qend" -out {1}.tblastx.txt'.format(
+				config.get_command('makeblastdb')[0], cluster.fasta[-1],
+				config.get_command('tblastx')[0])
 
-		# Run tblastx
-		report = 'trim_allvall.xml'
-		workflows.multiblast('tblastx', fasta, dbname, report, evalue='1e-6')
+	# Run tblastx
+	wrappers.Parallel(
+		'trim.commands.sh', '--joblog', 'trim.commands.log', '--resume',
+		return_ok=None)
 
+	trim_dir = utils.safe_mkdir('trimmed_clusters')
+	
+	clusters_keep = []
+	for cluster in clusters:
+		report = cluster.fasta[-1] + '.tblastx.txt'
+		if not os.path.exists(report):
+			utils.info("dropping cluster '%s': no tblastx report" % cluster.name)
+			continue
+		# Trim sequences using blast report
+		endpoints = {}
+		for line in open(report):
+			query, subject, qstart, qend = utils.multimap(
+				(str, str, int, int), line.rstrip().split())
+			# Skip self hits
+			if query != subject:
+				if query in endpoints:
+					endpoints[query] = (
+						min(endpoints[query][0], qstart, qend),
+						max(endpoints[query][1], qstart, qend))
+				else:
+					endpoints[query] = (min(qstart, qend), max(qstart, qend))
 		# Load original sequences into a dict
-		seqs = dict()
-		for record in SeqIO.parse(open(fasta), 'fasta'):
-			seqs[record.id] = record
-
+		trimmed_fasta = os.path.join(trim_dir, cluster.name + '.fa')
 		with open(trimmed_fasta, 'w') as f:
-			for record in NCBIXML.parse(open(report)):
-				seq = seqs[record.query]
-				# Make sure the sequence in our dict matches up with the query
-				assert len(seq.seq) == record.query_length
-				# Initialize list as long as query
-				mask = ['0'] * record.query_length
-				for align in record.alignments:
-					# Skip hit to itself
-					if record.query != align.hit_def:
-						for hsp in align.hsps:
-							# Mask the query range, adjusting the range by -1
-							# because BLAST reports 1-based coordinates
-							for i in range(hsp.query_start-1, hsp.query_end):
-								mask[i] = '1'
-				# Apply mask
-				mask = ''.join(mask)
-				start = mask.find('1')
-				end = mask.rfind('1')
-				if (start >= 0) and (start < end):
-					seq.seq = seq.seq[start:end+1]
-				SeqIO.write(seq, f, 'fasta')
+			for record in SeqIO.parse(open(cluster.fasta[-1]), 'fasta'):
+				# Trim to endpoints
+				if record.id in endpoints:
+					record.seq = record.seq[endpoints[record.id][0]:endpoints[record.id][1]]
+				SeqIO.write(record, f, 'fasta')
+		cluster.fasta.append(trimmed_fasta)
+		clusters_keep.append(cluster)
+
+	return {'clusters': clusters_keep}
 
 
 @pipe.stage
 	utils.safe_mkdir(workdir)
 
 	for cluster in clusters:
+		# Skip any clusters that didn't finish MACSE
+		if not (os.path.exists(cluster.fasta[-2]) and os.path.exists(cluster.fasta[-1])):
+			utils.info("dropping cluster %s: no MACSE output" % cluster.name)
+			continue
+
 		aa_keep = []
 		dna_keep = []
 
 
 	for cluster in clusters:
 
-		# Skip empty clusters, which were removed in the last stage
-		if not cluster.size:
-			continue
-
 		aa_name = cluster.fasta[-2]
 		dna_name = cluster.fasta[-1]
 
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.