bx-python / scripts / mask_quality.py

#!/usr/bin/env python

"""
Masks an AXT or MAF file based on quality (from a binned_array) and
outputs AXT or MAF.

Binned array form of quality scores can be generated with `qv_to_bqv.py`.

usage: %prog input output
    -i, --input=N: Format of input (axt or maf)
    -o, --output=N: Format of output (axt or maf)
    -m, --mask=N: Character to use as mask character
    -q, --quality=N: Min quality allowed
    -t, --type=N: base_pair or nqs
    -l, --list=N: colon seperated list of species,len_file[,qualityfile].
"""

import sys
import bx.align.axt
import bx.align.maf
import bx.binned_array
from bx.cookbook import doc_optparse
import fileinput
from bx.align.sitemask.quality import *

def main():
    
    options, args = doc_optparse.parse( __doc__ )
    try:
        inputformat = options.input
        outputformat = options.output
        mask = options.mask
        minqual = int(options.quality)
        qtype = options.type
        speciesAndLens = options.list
        inputfile = args[0]
        outputfile = args[1]
    except:
        doc_optparse.exception()

    outstream = open( outputfile, "w" )
    instream = open( inputfile, "r" )

    qualfiles = {}

    # read lens
    specieslist = speciesAndLens.split(":")
    species_to_lengths = {}
    
    for entry in specieslist:
        fields = entry.split(",")
        lenstream = fileinput.FileInput( fields[1] )
        lendict = dict()
        for line in lenstream:
            region = line.split()
            lendict[region[0]] = int(region[1])
        species_to_lengths[fields[0]] = lendict
        if len(fields) >= 3:
            qualfiles[fields[0]] = fields[2]

    specieslist = map( lambda(a): a.split(":")[0], specieslist )
    
    # open quality binned_arrays
    reader = None
    writer = None
    
    if inputformat == "axt":
        # load axt
        if len(specieslist) != 2:
            print "AXT is pairwise only."
            sys.exit()
        reader = bx.align.axt.Reader(instream, species1=specieslist[0], \
                                     species2=specieslist[1], \
                                     species_to_lengths = species_to_lengths)
    elif outputformat == "maf":
        # load maf
        reader = bx.align.maf.Reader(instream, species_to_lengths=species_to_lengths)

    if outputformat == "axt":
        # setup axt
        if len(specieslist) != 2:
            print "AXT is pairwise only."
            sys.exit()
        writer = bx.align.axt.Writer(outstream, attributes=reader.attributes)
    elif outputformat == "maf":
        # setup maf
        writer = bx.align.maf.Writer(outstream, attributes=reader.attributes)

    qualfilter = Simple( mask=mask, qualspecies = species_to_lengths, \
                         qualfiles = qualfiles, minqual = minqual, cache=50 )

    qualfilter.run( reader, writer.write )

    print "For "+str(qualfilter.total)+" base pairs, "+str(qualfilter.masked)+" base pairs were masked."
    print str(float(qualfilter.masked)/float(qualfilter.total) * 100)+"%"
    
if __name__ == "__main__":
    main()
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.