chapmanb / synbio (http://bcbio.wordpress.com/)

Python Synthetic Biology libraries

Clone this repository (size: 8.4 MB): HTTPS / SSH
$ hg clone http://bitbucket.org/chapmanb/synbio/
commit 2: 775d9e9a5c4d
parent 1: d62271144b61
branch: default
tags: tip
Fix setup to reflect actual modules. Add in SQLAlchemy database models.
cha...@sobchak.mgh.harvard.edu
10 months ago
synbio / SynBio / OligoSequences / AmbiguousOligoDesign.py
r2:775d9e9a5c4d 244 loc 10.5 KB embed / history / annotate / raw /
"""Provide design algorithms to work around ambiguous designs.
"""
import re

from Bio.Seq import Seq
from Bio.Alphabet.IUPAC import unambiguous_dna

from SynBio.OligoSequences.Oligo import (DesignLinkerEndCorrect,
        OligoDesignerMixin, AssemblyDesignError)
from OligoRegion import FixedEndOligo

class AmbiguousOverlapDesign(OligoDesignerMixin):
    """Design that avoid ambiguous regions in oligo overlaps.

    This attempts to do oligo designs based on avoiding putting ambiguous
    regions in overlaps. By avoiding this, the necessary ambiguous oligos
    can be ordered separately and combined into a PAM reaction following
    duplexing and MutS filtering.

    Currently, ambiguous regions are recognized by a hack -- special non GATC
    bases embedded in the sequence.
    """
    def __init__(self, max_size, special_max, parameters,
            assembly_props, pre_builder, post_builder, ambig_letters):
        assert len(parameters) >= 0
        (self._buffer, self._overlap, self._temp_thresh, self._five_size,
                self._three_size) = parameters[0]
        self._extra_params = parameters[1:]
     
        self._min_size = max(self._overlap + 1, 20)
        self._max_size = max_size
        self._special_max = special_max

        self._assembly_props = assembly_props
        self._pre_builder = pre_builder
        self._post_builder = post_builder
        self._ambig_letters = ambig_letters

        self._backup_design = DesignLinkerEndCorrect(self._max_size,
                self._five_size, self._three_size,
                [(self._buffer, self._overlap, self._temp_thresh)],
                self._assembly_props, self._pre_builder, self._post_builder)
    
    def get_builders(self):
        """Retrieve the pre and post builders associated with this breakdown.
        """
        new_pre = self._pre_builder.copy()
        new_post = self._post_builder.copy()
        return new_pre, new_post

    def get_flex_splitter(self):
        if len(self._extra_params) == 0:
            raise ValueError('No additional designers available.')
        else:
            new_design = AmbiguousOverlapDesign(self._max_size,
              self._special_max, self._extra_params, self._assembly_props,
              self._pre_builder, self._post_builder, self._ambig_letters)
            new_design.set_melt_calc(self._melt_calc)
            return new_design

    def get_overlap(self):
        return self._overlap
    
    def set_melt_calc(self, melt_calc):
        self._melt_calc = melt_calc
        self._backup_design.set_melt_calc(melt_calc)
    
    def get_min_max(self, conservative = False):
        return self._min_size, self._max_size
    
    def split(self, parent_name, sequence, is_first, is_last):
        """Split the sequence, avoiding ambiguous positions in the sequence.

        The algorithm is:
        Find ambiguous positions in the sequence
        If no ambiguous positions in the sequence, use the standard splitter
        Otherwise:
          Find possible overlaps in the sequence, which are of the specified
           minimum melting temperature and contain no ambiguous bases.
          Starting at 0
          While we haven't reached the end of the sequence
            Find the best overlap end within the start and end range
            Add piece, move start position to overlap start.
        """
        ambig_positions = self._find_ambiguous(sequence)
        # if we have no ambiguous positions, use the backup
        if len(ambig_positions) == 0:
            return self._backup_design.split(parent_name, sequence, is_first,
                    is_last)
        else:
            print ambig_positions
            overlaps = self._find_overlaps(sequence, ambig_positions)
            print '\n'.join([str(x) for x in overlaps])
            all_oligos = self._design_breakpoints(parent_name, sequence,
                    overlaps)
            for oligo in all_oligos:
                if oligo.bad_size(self._min_size, self._special_max) != 0:
                    raise AssemblyDesignError("Oligo not in size: %s" % oligo)
            return all_oligos

    def _design_breakpoints(self, parent_name, sequence, overlaps):
        """Choose oligos in the sequence with allowed overlaps.

        This walks from left to right across the sequence and picks oligos
        which fall in the allowed overlaps. Oligos are chosen to maximize
        size (and thus minimize the number of oligos) and to have reasonable
        buffers around ambiguous pieces for adjustment.
        """
        all_oligos = []
        pos_overlaps = [x for x in overlaps if x.end == self._five_size]
        if len(pos_overlaps) == 0:
            raise AssemblyDesignError('No overlap for initial oligo: %s' %
                    self._five_size)
        cur_overlap = pos_overlaps[0]
        cur_oligo = FixedEndOligo(self._get_name(parent_name, len(all_oligos)),
                sequence, 0, self._five_size, self._assembly_props, None,
                self._buffer, five_fixed = True)
        final_overlaps = [x for x in overlaps if x.start == (len(sequence) -
            self._three_size)]
        if len(final_overlaps) == 0:
            raise AssemblyDesignError('No overlap for final oligo: %s' %
                    (len(sequence) - self._three_size))
        final_overlap = final_overlaps[0]
        while cur_oligo is not None:
            all_oligos.append(cur_oligo)
            cur_min = cur_overlap.start + self._min_size + self._buffer
            cur_max = cur_overlap.start + self._max_size - self._buffer
            special_max = cur_overlap.start + self._special_max - self._buffer
            pos_overlaps = [x for x in overlaps if x.end in 
                    range(cur_min, cur_max)]
            special_overlaps = [x for x in overlaps if x.end in 
                    range(cur_min, special_max)]
            # terminating condition -- we are done splitting it
            if cur_min >= final_overlap.end:
                next_overlap = None
            elif len(pos_overlaps) > 0:
                next_overlap = self._choose_overlap(pos_overlaps,
                        cur_overlap, final_overlap)
            elif len(special_overlaps) > 0:
                next_overlap = self._choose_overlap(special_overlaps,
                        cur_overlap, final_overlap)
            else:
                raise AssemblyDesignError('No overlap at: %s %s' %
                        (cur_min, cur_max))
            if next_overlap:
                cur_oligo = FixedEndOligo(self._get_name(parent_name,
                    len(all_oligos)), sequence, cur_overlap.start,
                    next_overlap.end, self._assembly_props, None, self._buffer)
                print cur_overlap.start, next_overlap.end
                cur_overlap = next_overlap
            else:
                cur_oligo = None
        final_oligo = FixedEndOligo(self._get_name(parent_name,
            len(all_oligos)), sequence, final_overlap.start, len(sequence),
            self._assembly_props, None, self._buffer, three_fixed = True)
        all_oligos.append(final_oligo)
        if len(all_oligos) % 2 != 0:
            raise AssemblyDesignError('Non-even number of oligos designed')
        return all_oligos

    def _choose_overlap(self, pos_overlaps, cur_overlap, final_overlap):
        """Pick an overlap to use from the given choices.

        We want to pick this with the following restrictions:
        - Keep us to an even number of oligos
        - Do not choose right on ambiguous position junctions
        - Work with the final overlap.

        XXX this does not guarantee an even number of oligos right now.
        """
        # print final_overlap
        # print [str(x) for x in pos_overlaps]
        overlap_final = [x for x in pos_overlaps if x.start ==
                final_overlap.start]
        if len(overlap_final) > 0:
            return overlap_final[0]
        else:
            pos_overlaps.reverse()
            for ol in pos_overlaps:
                if ol.buffer_distance >= 3:
                    return ol
            return pos_overlaps[0]

    def _find_overlaps(self, sequence, ambiguous_positions):
        """Find available overlaps in the sequence.

        Overlaps are:
         - At least as long as _overlap
         - Have a melting temperature at least _temp_thresh
         - Do not contain any ambiguous positions
        """
        overlaps = []
        for index in range(len(sequence) - self._overlap):
            cur_end = index + self._overlap
            cur_melt = -1.0
            while (cur_melt < self._temp_thresh and cur_end < len(sequence)):
                cur_seq = sequence[index:cur_end]
                cur_ambig = self._find_ambiguous(cur_seq)
                if len(cur_ambig) > 0:
                    break
                cur_comp = Seq(cur_seq, unambiguous_dna).complement().data
                cur_melt = self._melt_calc.calculate_melting(cur_seq, cur_comp)
                cur_end += 1
            # if we are above the melt temp and no ambiguous positions
            if cur_melt >= self._temp_thresh:
                if (len(set(ambiguous_positions) &
                    set(range(index, cur_end))) == 0):
                    buffer_distances = []
                    for ambig in ambiguous_positions:
                        buffer_distances.append(abs(ambig - index))
                        buffer_distances.append(abs(ambig - cur_end))
                    overlaps.append(_OverlapRegion(index, cur_end, cur_melt,
                        min(buffer_distances)))
        return overlaps

    def _find_ambiguous(self, sequence):
        """Find any ambiguous positions in the sequence to design.

        Currently this looks for specific letters which are use to
        estimate ambiguity. Eventually this should be determined from database
        information on ambiguous regions.
        """
        ambig_positions = []
        for check_ambig in self._ambig_letters:
            pat = re.compile(r'%s' % check_ambig)
            for match in pat.finditer(sequence):
                start, end = match.span()
                assert end == start + 1
                assert sequence[start] == check_ambig
                ambig_positions.append(start)
        return ambig_positions

class _OverlapRegion:
    """Represent details on a potential overlap region in the sequence.
    """
    def __init__(self, start, end, melt_temp, buffer_distance):
        self.start = start
        self.end = end
        self.melt_temp = melt_temp
        self.buffer_distance = buffer_distance

    def __str__(self):
        return 'OL: %s to %s; %s %s' % (self.start, self.end, self.melt_temp,
                self.buffer_distance)