Source

utils / mergeAptamer.py

Full commit
#!/usr/bin/env python
from difflib import SequenceMatcher
from Bio import pairwise2
from Bio.Align import AlignInfo
from Bio.Alphabet import generic_dna
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
import HTSeq
import sys
import itertools

def consensus(ali_tmp):
  ali = MultipleSeqAlignment([SeqRecord(Seq(ali_tmp[0], generic_dna), id="R1"), SeqRecord(Seq(ali_tmp[1], generic_dna), id="R2")])
  return AlignInfo.SummaryInfo(ali).dumb_consensus().tostring()
    
def main():
  const1 = 'GGAGACAAGAATAAACGCT'
  const2 = 'TTCGACAGGAGGCTCACA' # reverse complement of the R2 constant part after P5
  barcode_l = 4
  
  (read1, read2) = sys.argv[1:3]

  sqm = SequenceMatcher(None, read1, read2)
  fname_out = read1[:sqm.get_matching_blocks()[0][2] - 2]  + read1[sqm.get_matching_blocks()[1][1]:read1.rfind(".fastq.gz")] + "_aptamers.txt"
  
  
  
  it1 = itertools.chain(HTSeq.FastqReader(read1, "phred"))
  it2 = itertools.chain(HTSeq.FastqReader(read2, "phred"))
  
  good = 0
  bad = 0
  while 1:
    try:
      s1 = it1.next()  
      s2 = it2.next()
    except StopIteration:
      break
    br1 = s1.name.split(':')[-1]
    br2 = s2.name.split(':')[-1]
    rs1 = s1.seq
    rs2 = HTSeq.reverse_complement(s2.seq)
    ali_tmp = pairwise2.align.globalms(rs1,rs2,2,-1,-2,-1)[0]
    if ali_tmp[2] >= 90:
      fragment = consensus(ali_tmp)
      sys.stdout.write("\n".join(['@' + fragment, '#' + ali_tmp[0], '#' + ali_tmp[1],'']))
    


if __name__ == "__main__": main()