Snippets

Iakov Davydov fast fasta

Created by Iakov Davydov last modified
import fasta
from timeit import timeit

import numpy as np

from Bio import SeqIO
from Bio import AlignIO
from Bio.SeqIO.FastaIO import SimpleFastaParser


def count_adenines_fast(fn):
    """Count number of adenines per position of the alignment, fast-fasta"""
    ali = fasta.read_matrix(open(fn))
    return np.sum(ali == 'A', axis=0)

def count_adenines_biopython(fn):
    """Count number of adenines per position of the alignment, biopython"""
    ali = AlignIO.read(open(fn), 'fasta')
    return [sum((ali[j, i] == 'A'
                     for j in xrange(len(ali))))
            for i in xrange(ali.get_alignment_length())]


if __name__ == '__main__':
    N = 10
    print 'Reading large files'
    print 'fast-fasta time:', timeit(lambda: list(fasta.read(open('long.fasta'))), number=N)
    print 'biopython time:', timeit(lambda: list(SeqIO.parse(open('long.fasta'), 'fasta')), number=N)
    print 'SimpleFastaParser time:', timeit(lambda: list(SimpleFastaParser(open('long.fasta'))), number=N)
    N = 1000
    print 'Accessing alignment positions'
    print 'fast-fasta time:', timeit(lambda: count_adenines_fast('x.fst'), number=N)
    print 'biopython time:', timeit(lambda: count_adenines_biopython('x.fst'), number=N)

    # Output:
    # Reading large files
    # Reading large files
    # fast-fasta time: 1.35685396194
    # biopython time: 9.09956002235
    # SimpleFastaParser time: 2.85863208771
    # Accessing alignment positions
    # fast-fasta time: 0.568293094635
    # biopython time: 10.8877089024
"""Very simple fasta I/O functions.

These functions are working much faster compared to Biopython.
This comes at a cost: they are less general.

"""

import numpy as np


def read_matrix(f):
    """Read alignment from fasta file as numpy matrix"""
    seqs = []
    for r in read(f):
        seqs.append([c for c in r[1]])
    return np.array(seqs, dtype=np.character)

def read(f):
    """Read fasta sequences from file f and return as tuples (accesion, seq)"""
    seq = None
    for line in f:
        if line.startswith('>'):
            if not seq is None:
                yield (ac, seq)
            # You can edit this line depending on your fasta files
            ac = line.split(' ', 1)[0][1:].strip()
            seq = ''
        else:
            seq += line.strip()
    if seq:
        yield (ac, seq)

def write(l, f):
    """Write list of sequences in form of tuples (accession, sequence) to file f"""
    for ac, seq in l:
        f.write('>%s\n' % ac)
        f.write(seq)
        f.write('\n')

Comments (0)

HTTPS SSH

You can clone a snippet to your computer for local editing. Learn more.