chapmanb / synbio (http://bcbio.wordpress.com/)

Python Synthetic Biology libraries

Clone this repository (size: 8.4 MB): HTTPS / SSH
$ hg clone http://bitbucket.org/chapmanb/synbio/
commit 2: 775d9e9a5c4d
parent 1: d62271144b61
branch: default
tags: tip
Fix setup to reflect actual modules. Add in SQLAlchemy database models.
cha...@sobchak.mgh.harvard.edu
10 months ago
synbio / SynBio / Sequencing / ProcessManagers.py
r2:775d9e9a5c4d 169 loc 6.9 KB embed / history / annotate / raw /
"""Managers which handle the details of processing sequencing reads on clones.
"""
from Parts import ClonedSequence, QualitySequence
from SynBio.OligoDatabase.Database import SequenceMissingError
from SynBio.Sequencing.Process import (LocalReadAligner, PositionMapperPlus,
        AnalyzedReadPlus)

class ReadAlignmentManager:
    """Manage the alignment of reads to a reference sequence.
    """
    def __init__(self, origin_finder, work_dir, logger, only_ref_seq = True):
        """Initialize a manager for dealing with reads

        only_ref_seq is a boolean specifying whether we should only report
        sequencing information in the reference sequence. The alternative is
        to report read info in the upstream and downstream vector regions.
        """
        self._origin_finder = origin_finder
        self._read_aligner = LocalReadAligner(work_dir, logger)
        self._logger = logger
        self._only_ref_seq = only_ref_seq

        self._name_match_thresh = 0.9

    def align_read(self, qual_seq, clone_ref):
        """Do the work on aligning a read to the reference sequence

        qual_seq -- QualitySequence object which can be used to query the
        actual read and quality information

        clone_ref -- ClonedSequence object used to query the read

        Returns an AnalyzedRead object which will be used to provide
        information about the alignment.
        """
        unique_parts = clone_ref.unique_name()
        ref_name = '_'.join([str(x) for x in unique_parts])
        self._logger.info('===== Aligning read %s to %s' %
                (qual_seq.name, ref_name))
        #print qual_seq.name
        clone_ref_seqs = clone_ref.get_clone_ref_seqs()
        if clone_ref_seqs.is_pool():
            raise NotImplementedError('Pooled sequences not done')
        else:
            base_seq, wvec_seq = clone_ref_seqs.get_base_seqs()
            ref_match = self._read_aligner.align_read_to_ref(qual_seq, ref_name,
                    wvec_seq)
            # log the match details
            length_percent = ref_match.match_length_percent()
            match_score = ref_match.match_score()
            self._logger.info("== %s %s %s"  % (ref_match.is_rc,
                length_percent, match_score))
            self._logger.info(str(ref_match))

            final_a = None
            if ref_match.is_decent():
                final_a = self._get_aligned_read(ref_match, wvec_seq,
                        base_seq)
            if final_a is None or final_a.is_vector():
                final_a = self._check_non_aligned(qual_seq, clone_ref)
            self._logger.info('==================')
            return final_a

    def _get_aligned_read(self, ref_match, wvec_seq, base_seq):
        """Return details for a read that is correctly aligned to the reference.
        """
        mapper = PositionMapperPlus(ref_match.ref_match_seq,
                wvec_seq, base_seq, self._only_ref_seq)
        return AnalyzedReadPlus(mapper, ref_match.read_match_seq,
                ref_match.read_qual_values, ref_match.read_orig_seq,
                ref_match.is_rc)

    def _check_non_aligned(self, qual_seq, clone_ref):
        """Deal with a sequence that did not align to the reference sequence.
        """
        for_seq_rec, for_qual_values = qual_seq.fasta_plus_quality()
        design_a = clone_ref.get_design_assembly()
        cur_a_name = design_a.get_name()
        alt_origin = self._origin_finder.find_origin(
                cur_a_name, for_seq_rec.sequence)
        if alt_origin is None:
            self._logger.info('Matches nothing')
            return None
        else:
            self._logger.info('Alt origin: %s %s %s %s' %
                    (clone_ref.unique_name(), alt_origin.alt_name,
                     alt_origin.match_self, alt_origin.match_vector))
            return alt_origin

class TrimmedReadManager:
    """Manage the sorting of reads onto associated clones.

    Utilizing the information stored about a sequencing plate in the database,
    this takes in trimmed reads and orders them according to which clone they
    belong to on the plate.
    """
    def __init__(self, name_parser, plate_db, work_dir, logger):
        self._name_parser = name_parser
        self._plate_db = plate_db
        self._work_dir = work_dir
        self._logger = logger
        
        self._clones = []
        self._read_info = {"total" : None, "quality" : 0}
    
    def set_read_count(self, total_reads):
        assert self._read_info["total"] is None, "Already set read count"
        self._read_info["total"] = total_reads

    def get_read_info(self):
        return self._read_info
    
    def add_qual_sequence(self, name, seq_list, quality_list, orig_read_seq):
        """Add a quality sequence, getting info from the file name.
        """
        plate, pos, final_pos = self._name_parser.parse_abi_name(name)
        self._plate_db.verify_plate(plate)

        self._process_read(name, plate, pos, final_pos, seq_list, quality_list,
                orig_read_seq)
    
    def _process_read(self, name, plate, pos, final_pos, seq_list,
            quality_list, orig_read_seq):
        self._logger.info("  *> Adding read on plate %s, position %s" %
                (plate, pos))

        plate_well = self._plate_db.get_well(pos, self._work_dir)
        clone_index = self._add_clone(plate_well)
        
        if len(seq_list) > 0 and len(quality_list) > 0:
            self._read_info["quality"] += 1
            clip_left, clip_right = self._find_clip_values(seq_list,
                    orig_read_seq)
            self._add_qual_read(clone_index, plate, final_pos, name,
                    seq_list, quality_list, clip_left, clip_right)
    
    def _find_clip_values(self, clip_seq_list, orig_read_seq):
        """Find the regions where the left and right of this read are trimmed.

        This returns the total trim (vector plus quality) used to find
        the region of a ab1 read under use.
        """
        clip_seq = "".join(clip_seq_list)
        clip_start = orig_read_seq.find(clip_seq)
        assert clip_start > 0, "We lost our sequence in trimming %s : %s" % \
          (clip_seq, orig_read_seq)
        clip_end = clip_start + len(clip_seq)

        return clip_start, clip_end
   
    def _add_clone(self, plate_well):
        new_clone = ClonedSequence(plate_well)
        if new_clone not in self._clones:
            self._clones.append(new_clone)
        clone_index = self._clones.index(new_clone)
        return clone_index

    def _add_qual_read(self, clone_index, plate, final_pos, name,
            seq_list, quality_list, clip_left, clip_right):
        """Add a quality trimmed read to our current reads.
        """
        new_read = QualitySequence(name, plate, final_pos,
            seq_list, quality_list, clip_left, clip_right)
        self._clones[clone_index].add_read(new_read)
    
    def get_sequenced_clones(self):
        """Return all unique sequenced clones.
        """
        return self._clones