"""Find and assist in removal of splice sites from coding sequences.
This utilizes the GeneSplicer program from TIGR:
http://cbcb.umd.edu/software/GeneSplicer/
"""
import time
import os
import subprocess
from Bio import Application
from Bio.Application import _Option, _Argument
from Bio import Fasta
from SynBio.Codons.Data import splice_train_map, eukaryotic_organisms
class SpliceSiteRemover:
"""Remove splice sites from coding sequences.
"""
def __init__(self, work_dir, no_use_thresh, count_compensate,
splice_thresh_a, splice_thresh_d, logger):
self._runner = GeneSplicerRunner(work_dir, splice_thresh_a,
splice_thresh_d)
self._no_use_thresh = no_use_thresh
self._count_compensate = count_compensate
self._logger = logger
self._try_max = 5
def remove_splice_sites(self, in_name, in_cds, usage_orgs):
"""Remove splice sites from the given coding sequence.
"""
nt_seq = in_cds.get_sequence()
site_counts = {}
while 1:
nt_seq = in_cds.get_sequence()
splice_sites = self._runner.find_splice_sites(in_name, nt_seq,
usage_orgs)
remain_sites = [x for x in splice_sites if
x not in site_counts.keys()
or site_counts[x] <= self._try_max]
if len(remain_sites) == 0:
break
for site in splice_sites:
try:
site_counts[site] += 1
except KeyError:
site_counts[site] = 1
in_cds = self._remove_site(in_cds, site)
return in_cds.get_codons()
def _remove_site(self, cds, site):
"""Remove the given splice site region from the sequence.
"""
to_remove = cds.get_sequence()[site.start:site.end]
codon_info = cds.get_codons_in_region(
max(site.start - 1, 0),
min(site.end + 1, len(cds.get_sequence()) - 1))
initial_seq = cds.codon_list_seq(codon_info)
self._logger.info("\t Start region: %s; remove %s" % (initial_seq,
to_remove))
for work_index, w_codon in codon_info:
start_codon = w_codon.cur_codon
cur_cai_index = 0
# adjust all of the codons to be the best codon that does
# not match the cur codon in the region
while 1:
try:
new_codon = w_codon.get_codon_at_frequency(
cur_cai_index, self._no_use_thresh,
self._count_compensate)
except IndexError:
new_codon = None
if new_codon != start_codon:
break
cur_cai_index += 1
if cur_cai_index > 6:
new_codon = None
break
if new_codon is not None:
w_codon.change_codon(new_codon)
cds.update_codon_index(work_index, w_codon)
return cds
class GeneSplicerRunner:
"""Run gene splicer on a fasta file, returning acceptor sites.
"""
def __init__(self, work_dir, splice_thresh_a, splice_thresh_d):
self._work_dir = work_dir
self._splice_thresh_a = splice_thresh_a
self._splice_thresh_d = splice_thresh_d
# We don't have a windows version of genesplice so just ignore
try:
self._train_dir = os.environ['GENESPLICER_DIR']
except KeyError:
self._train_dir = None
def find_splice_sites(self, in_name, in_sequence, usage_orgs):
"""Find the splice sites in a Fasta Record.
"""
if self._train_dir is None:
return []
# find the training databases to use for this sequence
if type(usage_orgs) != type([]):
usage_orgs = [usage_orgs]
all_train_org_dirs = []
for usage_org in usage_orgs:
if usage_org in eukaryotic_organisms:
train_org_dirs = []
for test_org_dir, test_orgs in splice_train_map.iteritems():
if usage_org in test_orgs:
train_org_dirs.append(test_org_dir)
assert len(train_org_dirs) > 0
for train_org_dir in train_org_dirs:
if train_org_dir not in all_train_org_dirs:
all_train_org_dirs.append(train_org_dir)
final_to_dirs = []
for train_org_dir in all_train_org_dirs:
final_to_dirs.append(os.path.join(self._train_dir, train_org_dir))
# if no eukaryotic sites to remove, don't return anything
if len(final_to_dirs) == 0:
return []
in_file = os.path.join(self._work_dir, 'splice_find.fa')
out_file = os.path.join(self._work_dir, 'splice_out.txt')
in_rec = Fasta.Record()
in_rec.title = in_name
in_rec.sequence = in_sequence
try:
splice_sites = self._do_work(in_rec, in_file, out_file,
final_to_dirs)
finally:
if os.path.exists(in_file):
os.remove(in_file)
if os.path.exists(out_file):
os.remove(out_file)
return splice_sites
def _do_work(self, in_rec, in_file, out_file, final_to_dirs):
in_handle = open(in_file, 'w')
in_handle.write(str(in_rec) + '\n')
in_handle.close()
all_splice_sites = []
for train_org_dir in final_to_dirs:
gs_cl = GeneSplicerCommandline()
gs_cl.set_parameter('infile', in_file)
gs_cl.set_parameter('train_dir', train_org_dir)
gs_cl.set_parameter('-f', out_file)
gs_cl.set_parameter('-a', self._splice_thresh_a)
gs_cl.set_parameter('-d', self._splice_thresh_d)
child = subprocess.Popen(str(gs_cl).split(), stdout =
subprocess.PIPE, stderr = subprocess.PIPE)
child.wait()
splice_sites = self._parse_output(out_file)
all_splice_sites.extend(splice_sites)
# remove duplicates
all_splice_sites = list(set(all_splice_sites))
all_splice_sites.sort()
return all_splice_sites
def _parse_output(self, out_file):
splice_sites = []
out_handle = open(out_file)
for line in out_handle:
parts = line.strip().split()
(start, end, score, category, splice_type) = parts
start = int(start)
end = int(end)
if start > end:
tmp = start
start = end
end = tmp
start = start - 1 # 0-based coordinates
splice_sites.append(_SpliceSite(start, end,
float(score), category, splice_type))
out_handle.close()
return splice_sites
class _SpliceSite:
def __init__(self, start, end, score, category, splice_type):
self.start = start
self.end = end
self.score = score
self.category = category
self.splice_type = splice_type
def __str__(self):
return '%s %s %s %s %s' % (self.start, self.end, self.score,
self.category, self.splice_type)
def __cmp__(self, other):
assert isinstance(other, _SpliceSite)
return cmp((self.start, self.end), (other.start, other.end))
def __hash__(self):
return hash((self.start, self.end))
class GeneSplicerCommandline(Application.AbstractCommandline):
"""Commandline for the GeneSplicer alignment program.
"""
def __init__(self, cmd = 'genesplicer'):
Application.AbstractCommandline.__init__(self)
self.program_name = cmd
self.parameters = \
[_Argument(['infile'], ['input'], None, 1,
'Input file'),
_Argument(['train_dir'], ['input'], None, 1,
'Specific genome training directory'),
_Option(['-f'], ['output'], None, 1,
'Write result in file name'),
_Option(['-a'], ['input'], None, 0,
'Acceptor site threshold'),
_Option(['-d'], ['input'], None, 0,
'Donor site threshold'),
_Option(['-e'], ['input'], None, 0,
'The maximum acceptor score within n bp is chosen'),
_Option(['-i'], ['input'], None, 0,
'The maximum donor score within n bp is chosen')
]