Created by Iakov Davydov last modified 2016-02-29

examples.py

      import fasta
from timeit import timeit

import numpy as np

from Bio import SeqIO
from Bio import AlignIO
from Bio.SeqIO.FastaIO import SimpleFastaParser


def count_adenines_fast(fn):
    """Count number of adenines per position of the alignment, fast-fasta"""
    ali = fasta.read_matrix(open(fn))
    return np.sum(ali == 'A', axis=0)

def count_adenines_biopython(fn):
    """Count number of adenines per position of the alignment, biopython"""
    ali = AlignIO.read(open(fn), 'fasta')
    return [sum((ali[j, i] == 'A'
                     for j in xrange(len(ali))))
            for i in xrange(ali.get_alignment_length())]


if __name__ == '__main__':
    N = 10
    print 'Reading large files'
    print 'fast-fasta time:', timeit(lambda: list(fasta.read(open('long.fasta'))), number=N)
    print 'biopython time:', timeit(lambda: list(SeqIO.parse(open('long.fasta'), 'fasta')), number=N)
    print 'SimpleFastaParser time:', timeit(lambda: list(SimpleFastaParser(open('long.fasta'))), number=N)
    N = 1000
    print 'Accessing alignment positions'
    print 'fast-fasta time:', timeit(lambda: count_adenines_fast('x.fst'), number=N)
    print 'biopython time:', timeit(lambda: count_adenines_biopython('x.fst'), number=N)

    # Output:
    # Reading large files
    # Reading large files
    # fast-fasta time: 1.35685396194
    # biopython time: 9.09956002235
    # SimpleFastaParser time: 2.85863208771
    # Accessing alignment positions
    # fast-fasta time: 0.568293094635
    # biopython time: 10.8877089024

    

fasta.py

Raw

      """Very simple fasta I/O functions.

These functions are working much faster compared to Biopython.
This comes at a cost: they are less general.

"""

import numpy as np


def read_matrix(f):
    """Read alignment from fasta file as numpy matrix"""
    seqs = []
    for r in read(f):
        seqs.append([c for c in r[1]])
    return np.array(seqs, dtype=np.character)

def read(f):
    """Read fasta sequences from file f and return as tuples (accesion, seq)"""
    seq = None
    for line in f:
        if line.startswith('>'):
            if not seq is None:
                yield (ac, seq)
            # You can edit this line depending on your fasta files
            ac = line.split(' ', 1)[0][1:].strip()
            seq = ''
        else:
            seq += line.strip()
    if seq:
        yield (ac, seq)

def write(l, f):
    """Write list of sequences in form of tuples (accession, sequence) to file f"""
    for ac, seq in l:
        f.write('>%s\n' % ac)
        f.write(seq)
        f.write('\n')

    

Comments (0)

HTTPS

SSH

You can clone a snippet to your computer for local editing. Learn more.

Snippets

Iakov Davydov fast fasta

Comments (0)