chapmanb / synbio (http://bcbio.wordpress.com/)

Python Synthetic Biology libraries

Clone this repository (size: 8.4 MB): HTTPS / SSH
$ hg clone http://bitbucket.org/chapmanb/synbio/
commit 2: 775d9e9a5c4d
parent 1: d62271144b61
branch: default
tags: tip
Fix setup to reflect actual modules. Add in SQLAlchemy database models.
cha...@sobchak.mgh.harvard.edu
10 months ago
synbio / SynBio / Codons / SpliceSites.py
r2:775d9e9a5c4d 223 loc 8.3 KB embed / history / annotate / raw /
"""Find and assist in removal of splice sites from coding sequences.

This utilizes the GeneSplicer program from TIGR:

    http://cbcb.umd.edu/software/GeneSplicer/
"""
import time
import os
import subprocess

from Bio import Application
from Bio.Application import _Option, _Argument
from Bio import Fasta

from SynBio.Codons.Data import splice_train_map, eukaryotic_organisms

class SpliceSiteRemover:
    """Remove splice sites from coding sequences.
    """
    def __init__(self, work_dir, no_use_thresh, count_compensate,
            splice_thresh_a, splice_thresh_d, logger):
        self._runner = GeneSplicerRunner(work_dir, splice_thresh_a,
                splice_thresh_d)
        self._no_use_thresh = no_use_thresh
        self._count_compensate = count_compensate
        self._logger = logger
        
        self._try_max = 5

    def remove_splice_sites(self, in_name, in_cds, usage_orgs):
        """Remove splice sites from the given coding sequence.
        """
        nt_seq = in_cds.get_sequence()

        site_counts = {}
        while 1:
            nt_seq = in_cds.get_sequence()
            splice_sites = self._runner.find_splice_sites(in_name, nt_seq,
                    usage_orgs)
            remain_sites = [x for x in splice_sites if 
                    x not in site_counts.keys()
                    or site_counts[x] <= self._try_max]
            if len(remain_sites) == 0:
                break
            for site in splice_sites:
                try:
                    site_counts[site] += 1
                except KeyError:
                    site_counts[site] = 1
                in_cds = self._remove_site(in_cds, site)
        return in_cds.get_codons()

    def _remove_site(self, cds, site):
        """Remove the given splice site region from the sequence.
        """
        to_remove = cds.get_sequence()[site.start:site.end]
        codon_info = cds.get_codons_in_region(
                max(site.start - 1, 0),
                min(site.end + 1, len(cds.get_sequence()) - 1))
        initial_seq = cds.codon_list_seq(codon_info)
        self._logger.info("\t Start region: %s; remove %s" % (initial_seq,
            to_remove))
        for work_index, w_codon in codon_info:
            start_codon = w_codon.cur_codon
            cur_cai_index = 0
            # adjust all of the codons to be the best codon that does
            # not match the cur codon in the region
            while 1:
                try:
                    new_codon = w_codon.get_codon_at_frequency(
                            cur_cai_index, self._no_use_thresh,
                            self._count_compensate)
                except IndexError:
                    new_codon = None
                if new_codon != start_codon:
                    break
                cur_cai_index += 1
                if cur_cai_index > 6:
                    new_codon = None
                    break
            if new_codon is not None:
                w_codon.change_codon(new_codon)
                cds.update_codon_index(work_index, w_codon)

        return cds

class GeneSplicerRunner:
    """Run gene splicer on a fasta file, returning acceptor sites.
    """
    def __init__(self, work_dir, splice_thresh_a, splice_thresh_d):
        self._work_dir = work_dir
        self._splice_thresh_a = splice_thresh_a
        self._splice_thresh_d = splice_thresh_d
        # We don't have a windows version of genesplice so just ignore
        try:
            self._train_dir = os.environ['GENESPLICER_DIR']
        except KeyError: 
            self._train_dir = None
            
    def find_splice_sites(self, in_name, in_sequence, usage_orgs):
        """Find the splice sites in a Fasta Record.
        """
        if self._train_dir is None:
            return []
        # find the training databases to use for this sequence
        if type(usage_orgs) != type([]):
            usage_orgs = [usage_orgs]
        all_train_org_dirs = []
        for usage_org in usage_orgs:
            if usage_org in eukaryotic_organisms:
                train_org_dirs = []
                for test_org_dir, test_orgs in splice_train_map.iteritems():
                    if usage_org in test_orgs:
                        train_org_dirs.append(test_org_dir)
                assert len(train_org_dirs) > 0
                for train_org_dir in train_org_dirs:
                    if train_org_dir not in all_train_org_dirs:
                        all_train_org_dirs.append(train_org_dir)
        final_to_dirs = []
        for train_org_dir in all_train_org_dirs:
            final_to_dirs.append(os.path.join(self._train_dir, train_org_dir))
        # if no eukaryotic sites to remove, don't return anything
        if len(final_to_dirs) == 0:
            return []
        in_file = os.path.join(self._work_dir, 'splice_find.fa')
        out_file = os.path.join(self._work_dir, 'splice_out.txt')
        in_rec = Fasta.Record()
        in_rec.title = in_name
        in_rec.sequence = in_sequence
        try:
            splice_sites = self._do_work(in_rec, in_file, out_file,
                    final_to_dirs)
        finally:
            if os.path.exists(in_file):
                os.remove(in_file)
            if os.path.exists(out_file):
                os.remove(out_file)
        return splice_sites

    def _do_work(self, in_rec, in_file, out_file, final_to_dirs):
        in_handle = open(in_file, 'w')
        in_handle.write(str(in_rec) + '\n')
        in_handle.close()

        all_splice_sites = []
        for train_org_dir in final_to_dirs:
            gs_cl = GeneSplicerCommandline()
            gs_cl.set_parameter('infile', in_file)
            gs_cl.set_parameter('train_dir', train_org_dir)
            gs_cl.set_parameter('-f', out_file)
            gs_cl.set_parameter('-a', self._splice_thresh_a)
            gs_cl.set_parameter('-d', self._splice_thresh_d)
            child = subprocess.Popen(str(gs_cl).split(), stdout =
                    subprocess.PIPE, stderr = subprocess.PIPE)
            child.wait()
            splice_sites = self._parse_output(out_file)
            all_splice_sites.extend(splice_sites)
        # remove duplicates
        all_splice_sites = list(set(all_splice_sites))
        all_splice_sites.sort()
        return all_splice_sites

    def _parse_output(self, out_file):
        splice_sites = []
        out_handle = open(out_file)
        for line in out_handle:
            parts = line.strip().split()
            (start, end, score, category, splice_type) = parts
            start = int(start)
            end = int(end)
            if start > end:
                tmp = start
                start = end
                end = tmp
            start = start - 1 # 0-based coordinates
            splice_sites.append(_SpliceSite(start, end,
                float(score), category, splice_type))

        out_handle.close()
        return splice_sites

class _SpliceSite:
    def __init__(self, start, end, score, category, splice_type):
        self.start = start
        self.end = end
        self.score = score
        self.category = category
        self.splice_type = splice_type

    def __str__(self):
        return '%s %s %s %s %s' % (self.start, self.end, self.score,
                self.category, self.splice_type)

    def __cmp__(self, other):
        assert isinstance(other, _SpliceSite)
        return cmp((self.start, self.end), (other.start, other.end))

    def __hash__(self):
        return hash((self.start, self.end))

class GeneSplicerCommandline(Application.AbstractCommandline):
    """Commandline for the GeneSplicer alignment program.
    """
    def __init__(self, cmd = 'genesplicer'):
        Application.AbstractCommandline.__init__(self)
        self.program_name = cmd

        self.parameters = \
          [_Argument(['infile'], ['input'], None, 1,
              'Input file'),
           _Argument(['train_dir'], ['input'], None, 1,
               'Specific genome training directory'),
           _Option(['-f'], ['output'], None, 1,
              'Write result in file name'),
           _Option(['-a'], ['input'], None, 0,
               'Acceptor site threshold'),
           _Option(['-d'], ['input'], None, 0,
               'Donor site threshold'),
           _Option(['-e'], ['input'], None, 0,
               'The maximum acceptor score within n bp is chosen'),
           _Option(['-i'], ['input'], None, 0,
               'The maximum donor score within n bp is chosen')
           ]