Commits

Ross Lazarus committed 00b0933

eesh

Comments (0)

Files changed (84)

__init__.py

Empty file removed.

bwa_wrapper.py

-#!/usr/bin/env python
-
-"""
-Runs BWA on single-end or paired-end data.
-Produces a SAM file containing the mappings.
-Works with BWA version 0.5.3-0.5.5.
-
-usage: bwa_wrapper.py [options]
-    -t, --threads=t: The number of threads to use
-    -r, --ref=r: The reference genome to use or index
-    -f, --fastq=f: The (forward) fastq file to use for the mapping
-    -F, --rfastq=F: The reverse fastq file to use for mapping if paired-end data
-    -u, --output=u: The file to save the output (SAM format)
-    -g, --genAlignType=g: The type of pairing (single or paired)
-    -p, --params=p: Parameter setting to use (pre_set or full)
-    -s, --fileSource=s: Whether to use a previously indexed reference sequence or one from history (indexed or history)
-    -n, --maxEditDist=n: Maximum edit distance if integer
-    -m, --fracMissingAligns=m: Fraction of missing alignments given 2% uniform base error rate if fraction
-    -o, --maxGapOpens=o: Maximum number of gap opens
-    -e, --maxGapExtens=e: Maximum number of gap extensions
-    -d, --disallowLongDel=d: Disallow a long deletion within specified bps
-    -i, --disallowIndel=i: Disallow indel within specified bps
-    -l, --seed=l: Take the first specified subsequences
-    -k, --maxEditDistSeed=k: Maximum edit distance to the seed
-    -M, --mismatchPenalty=M: Mismatch penalty
-    -O, --gapOpenPenalty=O: Gap open penalty
-    -E, --gapExtensPenalty=E: Gap extension penalty
-    -R, --suboptAlign=R: Proceed with suboptimal alignments even if the top hit is a repeat
-    -N, --noIterSearch=N: Disable iterative search
-    -T, --outputTopN=T: Output top specified hits
-    -S, --maxInsertSize=S: Maximum insert size for a read pair to be considered mapped good
-    -P, --maxOccurPairing=P: Maximum occurrences of a read for pairings
-    -D, --dbkey=D: Dbkey for reference genome
-    -H, --suppressHeader=h: Suppress header
-"""
-
-import optparse, os, shutil, subprocess, sys, tempfile
-
def stop_err( msg ):
    """Write an error message to standard error, then terminate the script."""
    message = '%s\n' % msg
    sys.stderr.write( message )
    sys.exit()
-
def __main__():
    """Drive a complete BWA mapping run from command-line options.

    Steps: (1) parse options; (2) if the reference comes from the user's
    history, symlink it into a temp dir and index it with ``bwa index``;
    (3) run ``bwa aln`` on the forward fastq (and the reverse fastq for
    paired-end data); (4) combine alignments into SAM with ``bwa samse``
    (single) or ``bwa sampe`` (paired); (5) optionally strip SAM header
    lines.  Any failure is reported via stop_err(); the temporary
    directories are removed in all cases.
    """
    #Parse Command Line
    parser = optparse.OptionParser()
    parser.add_option( '-t', '--threads', dest='threads', help='The number of threads to use' )
    parser.add_option( '-r', '--ref', dest='ref', help='The reference genome to use or index' )
    parser.add_option( '-f', '--fastq', dest='fastq', help='The (forward) fastq file to use for the mapping' )
    parser.add_option( '-F', '--rfastq', dest='rfastq', help='The reverse fastq file to use for mapping if paired-end data' )
    parser.add_option( '-u', '--output', dest='output', help='The file to save the output (SAM format)' )
    parser.add_option( '-g', '--genAlignType', dest='genAlignType', help='The type of pairing (single or paired)' )
    parser.add_option( '-p', '--params', dest='params', help='Parameter setting to use (pre_set or full)' )
    # NOTE(review): 'one form history' below should read 'one from history'
    # (help text is user-visible; left unchanged here as it is runtime data)
    parser.add_option( '-s', '--fileSource', dest='fileSource', help='Whether to use a previously indexed reference sequence or one form history (indexed or history)' )
    parser.add_option( '-n', '--maxEditDist', dest='maxEditDist', help='Maximum edit distance if integer' )
    parser.add_option( '-m', '--fracMissingAligns', dest='fracMissingAligns', help='Fraction of missing alignments given 2% uniform base error rate if fraction' )
    parser.add_option( '-o', '--maxGapOpens', dest='maxGapOpens', help='Maximum number of gap opens' )
    parser.add_option( '-e', '--maxGapExtens', dest='maxGapExtens', help='Maximum number of gap extensions' )
    parser.add_option( '-d', '--disallowLongDel', dest='disallowLongDel', help='Disallow a long deletion within specified bps' )
    parser.add_option( '-i', '--disallowIndel', dest='disallowIndel', help='Disallow indel within specified bps' )
    parser.add_option( '-l', '--seed', dest='seed', help='Take the first specified subsequences' )
    parser.add_option( '-k', '--maxEditDistSeed', dest='maxEditDistSeed', help='Maximum edit distance to the seed' )
    parser.add_option( '-M', '--mismatchPenalty', dest='mismatchPenalty', help='Mismatch penalty' )
    parser.add_option( '-O', '--gapOpenPenalty', dest='gapOpenPenalty', help='Gap open penalty' )
    parser.add_option( '-E', '--gapExtensPenalty', dest='gapExtensPenalty', help='Gap extension penalty' )
    parser.add_option( '-R', '--suboptAlign', dest='suboptAlign', help='Proceed with suboptimal alignments even if the top hit is a repeat' )
    parser.add_option( '-N', '--noIterSearch', dest='noIterSearch', help='Disable iterative search' )
    parser.add_option( '-T', '--outputTopN', dest='outputTopN', help='Output top specified hits' )
    parser.add_option( '-S', '--maxInsertSize', dest='maxInsertSize', help='Maximum insert size for a read pair to be considered mapped good' )
    parser.add_option( '-P', '--maxOccurPairing', dest='maxOccurPairing', help='Maximum occurrences of a read for pairings' )
    parser.add_option( '-D', '--dbkey', dest='dbkey', help='Dbkey for reference genome' )
    parser.add_option( '-H', '--suppressHeader', dest='suppressHeader', help='Suppress header' )
    (options, args) = parser.parse_args()
    # make temp directory for placement of indices
    tmp_index_dir = tempfile.mkdtemp()
    tmp_dir = tempfile.mkdtemp()
    # index if necessary
    if options.fileSource == 'history':
        # NamedTemporaryFile is opened and closed immediately: we only want
        # a unique path under tmp_index_dir to symlink the reference as.
        # NOTE(review): reusing the name after close() is a classic TOCTOU
        # race, but matches this tool's original behavior.
        ref_file = tempfile.NamedTemporaryFile( dir=tmp_index_dir )
        ref_file_name = ref_file.name
        ref_file.close()
        os.symlink( options.ref, ref_file_name )
        # determine which indexing algorithm to use, based on size
        try:
            size = os.stat( options.ref ).st_size
            if size <= 2**30:
                indexingAlg = 'is'
            else:
                # 'bwtsw' is required for references larger than ~1 GB
                indexingAlg = 'bwtsw'
        except:
            # if the reference cannot be stat'ed, fall back to the default
            indexingAlg = 'is'
        indexing_cmds = '-a %s' % indexingAlg
        cmd1 = 'bwa index %s %s' % ( indexing_cmds, ref_file_name )
        try:
            # capture bwa's stderr in a temp file rather than a pipe so a
            # very large stderr cannot deadlock the child process
            tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name
            tmp_stderr = open( tmp, 'wb' )
            proc = subprocess.Popen( args=cmd1, shell=True, cwd=tmp_index_dir, stderr=tmp_stderr.fileno() )
            returncode = proc.wait()
            tmp_stderr.close()
            # get stderr, allowing for case where it's very large
            tmp_stderr = open( tmp, 'rb' )
            stderr = ''
            buffsize = 1048576
            try:
                while True:
                    stderr += tmp_stderr.read( buffsize )
                    # a short (or empty) read indicates EOF
                    if not stderr or len( stderr ) % buffsize != 0:
                        break
            except OverflowError:
                pass
            tmp_stderr.close()
            if returncode != 0:
                raise Exception, stderr
        except Exception, e:
            # clean up temp dirs
            if os.path.exists( tmp_index_dir ):
                shutil.rmtree( tmp_index_dir )
            if os.path.exists( tmp_dir ):
                shutil.rmtree( tmp_dir )
            stop_err( 'Error indexing reference sequence. ' + str( e ) )
    else:
        # pre-built index: use the supplied reference path as-is
        ref_file_name = options.ref
    # set up aligning and generate aligning command options
    if options.params == 'pre_set':
        aligning_cmds = '-t %s' % options.threads
        gen_alignment_cmds = ''
    else:
        # -n accepts either an integer edit distance or a fraction;
        # a maxEditDist of '0' means "use the fraction instead"
        if options.maxEditDist != '0':
            editDist = options.maxEditDist
        else:
            editDist = options.fracMissingAligns
        # seed of '-1' means seeding is disabled (omit -l entirely)
        if options.seed != '-1':
            seed = '-l %s' % options.seed
        else:
            seed = ''
        if options.suboptAlign == 'true':
            suboptAlign = '-R'
        else:
            suboptAlign = ''
        if options.noIterSearch == 'true':
            noIterSearch = '-N'
        else:
            noIterSearch = ''
        aligning_cmds = '-n %s -o %s -e %s -d %s -i %s %s -k %s -t %s -M %s -O %s -E %s %s %s' % \
                        ( editDist, options.maxGapOpens, options.maxGapExtens, options.disallowLongDel,
                          options.disallowIndel, seed, options.maxEditDistSeed, options.threads,
                          options.mismatchPenalty, options.gapOpenPenalty, options.gapExtensPenalty,
                          suboptAlign, noIterSearch )
        if options.genAlignType == 'single':
            gen_alignment_cmds = '-n %s' % options.outputTopN
        elif options.genAlignType == 'paired':
            gen_alignment_cmds = '-a %s -o %s' % ( options.maxInsertSize, options.maxOccurPairing )
    # set up output files (closed immediately: only the unique names are used)
    tmp_align_out = tempfile.NamedTemporaryFile( dir=tmp_dir )
    tmp_align_out_name = tmp_align_out.name
    tmp_align_out.close()
    tmp_align_out2 = tempfile.NamedTemporaryFile( dir=tmp_dir )
    tmp_align_out2_name = tmp_align_out2.name
    tmp_align_out2.close()
    # prepare actual aligning and generate aligning commands
    cmd2 = 'bwa aln %s %s %s > %s' % ( aligning_cmds, ref_file_name, options.fastq, tmp_align_out_name )
    cmd2b = ''
    if options.genAlignType == 'paired':
        # second aln pass for the reverse reads, then sampe joins both
        cmd2b = 'bwa aln %s %s %s > %s' % ( aligning_cmds, ref_file_name, options.rfastq, tmp_align_out2_name )
        cmd3 = 'bwa sampe %s %s %s %s %s %s >> %s' % ( gen_alignment_cmds, ref_file_name, tmp_align_out_name, tmp_align_out2_name, options.fastq, options.rfastq, options.output )
    else:
        cmd3 = 'bwa samse %s %s %s %s >> %s' % ( gen_alignment_cmds, ref_file_name, tmp_align_out_name, options.fastq, options.output )
    # perform alignments
    buffsize = 1048576
    try:
        # need to nest try-except in try-finally to handle 2.4
        try:
            # align
            try:
                tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
                tmp_stderr = open( tmp, 'wb' )
                proc = subprocess.Popen( args=cmd2, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
                returncode = proc.wait()
                tmp_stderr.close()
                # get stderr, allowing for case where it's very large
                tmp_stderr = open( tmp, 'rb' )
                stderr = ''
                try:
                    while True:
                        stderr += tmp_stderr.read( buffsize )
                        if not stderr or len( stderr ) % buffsize != 0:
                            break
                except OverflowError:
                    pass
                tmp_stderr.close()
                if returncode != 0:
                    raise Exception, stderr
            except Exception, e:
                raise Exception, 'Error aligning sequence. ' + str( e )
            # and again if paired data
            try:
                if cmd2b:
                    tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
                    tmp_stderr = open( tmp, 'wb' )
                    proc = subprocess.Popen( args=cmd2b, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
                    returncode = proc.wait()
                    tmp_stderr.close()
                    # get stderr, allowing for case where it's very large
                    tmp_stderr = open( tmp, 'rb' )
                    stderr = ''
                    try:
                        while True:
                            stderr += tmp_stderr.read( buffsize )
                            if not stderr or len( stderr ) % buffsize != 0:
                                break
                    except OverflowError:
                        pass
                    tmp_stderr.close()
                    if returncode != 0:
                        raise Exception, stderr
            except Exception, e:
                raise Exception, 'Error aligning second sequence. ' + str( e )
            # generate align (samse/sampe: combine .sai alignments into SAM)
            try:
                tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
                tmp_stderr = open( tmp, 'wb' )
                proc = subprocess.Popen( args=cmd3, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
                returncode = proc.wait()
                tmp_stderr.close()
                # get stderr, allowing for case where it's very large
                tmp_stderr = open( tmp, 'rb' )
                stderr = ''
                try:
                    while True:
                        stderr += tmp_stderr.read( buffsize )
                        if not stderr or len( stderr ) % buffsize != 0:
                            break
                except OverflowError:
                    pass
                tmp_stderr.close()
                if returncode != 0:
                    raise Exception, stderr
            except Exception, e:
                raise Exception, 'Error generating alignments. ' + str( e )
            # remove header if necessary: move output aside, then copy back
            # only the lines that are not SAM header records (@HD/@SQ/...)
            if options.suppressHeader == 'true':
                tmp_out = tempfile.NamedTemporaryFile( dir=tmp_dir)
                tmp_out_name = tmp_out.name
                tmp_out.close()
                try:
                    shutil.move( options.output, tmp_out_name )
                except Exception, e:
                    raise Exception, 'Error moving output file before removing headers. ' + str( e )
                fout = file( options.output, 'w' )
                for line in file( tmp_out.name, 'r' ):
                    if not ( line.startswith( '@HD' ) or line.startswith( '@SQ' ) or line.startswith( '@RG' ) or line.startswith( '@PG' ) or line.startswith( '@CO' ) ):
                        fout.write( line )
                fout.close()
            # check that there are results in the output file
            if os.path.getsize( options.output ) > 0:
                sys.stdout.write( 'BWA run on %s-end data' % options.genAlignType )
            else:
                raise Exception, 'The output file is empty. You may simply have no matches, or there may be an error with your input file or settings.'
        except Exception, e:
            stop_err( 'The alignment failed.\n' + str( e ) )
    finally:
        # clean up temp dir
        if os.path.exists( tmp_index_dir ):
            shutil.rmtree( tmp_index_dir )
        if os.path.exists( tmp_dir ):
            shutil.rmtree( tmp_dir )

if __name__=="__main__": __main__()

listFiles.py

-#Provides Upload tool with access to list of available files
-import glob,sys
-import galaxy.app as thisapp
-import galaxy.util
-
-from elementtree.ElementTree import XML
-
-librepos = '/usr/local/galaxy/data/rg'
-myrepos = '/home/rerla/galaxy'
-marchinirepos = '/usr/local/galaxy/data/rg/snptest'
-
-from galaxy.tools.parameters import DataToolParameter
-
#Provides Upload tool with access to list of available builds

# builds holds (display_name, dbkey, selected) triples for select lists;
# every entry starts out unselected (False).
builds = []
#Read build names and keys from galaxy.util
for dbkey, build_name in galaxy.util.dbnames:
    builds.append((build_name,dbkey,False))
-
-#Return available builds
def get_available_builds(defval='hg18'):
    """Return the shared builds list with the entry whose dbkey equals
    *defval* flagged as the selected default (mutates the module-level
    ``builds`` list in place, as the original did)."""
    for idx, entry in enumerate(builds):
        name, key, selected = entry
        if key == defval:
            builds[idx] = (name, key, True)
    return builds
-
-
-
def get_tabular_cols( input, outformat='gg' ):
    """Build column select-list options for a tabular Galaxy dataset.

    Scans up to the first ~30 lines of ``input.file_name``.  For strict
    genome-graphs output (``outformat == 'gg'``) only numeric columns are
    offered, except column 1 which must be the marker and is always kept;
    otherwise every column is offered.  A header row (line 0 with
    non-numeric values) supplies the column names; columns are otherwise
    named Col1..ColN.  Returns (name, column-number-string, selected)
    triples with the first five preselected.

    Fixes vs. the original: mixed tab/space indentation (a TabError under
    ``python -tt``/Python 3), the Python-2-only ``file()`` builtin, an
    unclosed file handle, and a bare ``except`` narrowed to ValueError.
    """
    columns = []
    seenCnames = {}
    colnames = ['Col%d' % x for x in range(input.metadata.columns+1)]
    strict = (outformat == 'gg')
    with open( input.file_name ) as handle:
        for i, line in enumerate( handle ):
            if line and not line.startswith( '#' ):
                line = line.rstrip('\r\n')
                elems = line.split( '\t' )
                # Strict gg note: restrict the select list to numeric
                # columns, other than the first column which must be a marker
                if len(elems) > 0:
                    for col in range(1, input.metadata.columns+1):
                        isFloat = False # short circuit common result
                        try:
                            val = float(elems[col-1])
                            isFloat = True
                        except ValueError:
                            val = elems[col-1]
                            if val:
                                if i == 0: # header row supplies column names
                                    colnames[col] = val
                        if isFloat or (not strict) or (col == 1): # all in if not GG
                            option = colnames[col]
                            if not seenCnames.get(option, None): # new column name
                                columns.append((option, str(col), False))
                                seenCnames[option] = option
                # select list is built once we have columns and >10 data lines
                if len(columns) > 0 and i > 10:
                    break
            if i == 30:
                break # Hopefully we never get here...
    # preselect (at most) the first five columns
    for option in range(min(5, len(columns))):
        (x, y, z) = columns[option]
        columns[option] = (x, y, True)
    return columns # sorted select options
-
def get_marchini_dir():
    """Return the filesystem directory holding Marchini/snptest style files.

    Currently fixed to the module-level ``marchinirepos`` constant.
    """
    return marchinirepos
-
-
def get_lib_SNPTESTCaCofiles():
    """Return a list of file set names - without extensions - available for
    case/control (snptest) studies.  These share a common base name with
    both _1 and _2 suffixes; we glob the '.gen_1' member and strip the
    suffix.  The first match is preselected; if none are found, a single
    explanatory placeholder option is returned.

    Fix vs. the original: a tab-indented line (mixed with spaces) that
    fails under ``python -tt`` and Python 3 has been normalized.
    """
    d = get_marchini_dir()
    testsuffix = '.gen_1' # glob these
    flist = glob.glob('%s/*%s' % (d, testsuffix))
    flist = [x.split(testsuffix)[0] for x in flist] # leaves a list of file set names
    if len(flist) > 0:
        dat = [(flist[0], flist[0], True), ]
        dat += [(x, x, False) for x in flist[1:]]
    else:
        dat = [('No Marchini CaCo files found in %s - convert some using the Marchini converter tool' % d, 'None', True), ]
    return dat
-
def getChropt():
    """Return dynamic chromosome select options: X, Y, then 1-22,
    with the fourth entry ('2') preselected."""
    labels = ['X', 'Y'] + [str(num) for num in range(1, 23)]
    options = []
    for idx, label in enumerate(labels):
        options.append((label, label, idx == 3))
    return options
-
-
def get_phecols(fname=''):
    """Return (name, name, False) triples for each whitespace-separated
    token on the first line of *fname*, for a multi-select phenotype list.

    If the file cannot be opened or has no first line, a single
    explanatory option is returned instead (matching the original's
    behavior, where an empty file raised StopIteration into the handler).

    Fixes vs. the original: tab/space indentation mix, the Python-2-only
    ``file()``/``.next()`` idioms, an unclosed handle, and a bare
    ``except`` narrowed to the errors that can actually occur.
    """
    try:
        with open(fname, 'r') as handle:
            header = next(handle).split()
    except (EnvironmentError, StopIteration):
        return [('get_phecols unable to open file %s' % fname, 'None', False), ]
    return [(x, x, False) for x in header]
-
-#Return various kinds of files
-
def get_lib_pedfiles():
    """List linkage *.ped files from the library and personal repositories.

    Each basename (extension stripped) is returned flagged 'True'
    (preserving the original's string flag); when none are found a single
    explanatory placeholder option is returned.
    """
    found = sorted(glob.glob('%s/ped/*.ped' % librepos) +
                   glob.glob('%s/ped/*.ped' % myrepos))
    if found:
        bases = [name.split('.ped')[0] for name in found]
        return [(base, base, 'True') for base in bases]
    return [('No ped files - add some to %s/ped or %s/ped' % (librepos, myrepos), 'None', True), ]
-
def get_lib_phefiles():
    """List phenotype (*.phe) files from the library and personal repos
    as (path, path, selected) triples, none preselected.

    Fix: the selected flag was the string 'False', which is truthy in a
    boolean context and so marked every entry as selected - the opposite
    of the intent; it is now the boolean False, consistent with the other
    get_lib_* helpers in this module.
    """
    ext = 'phe'
    dat = glob.glob('%s/pheno/*.%s' % (librepos, ext))
    dat += glob.glob('%s/pheno/*.%s' % (myrepos, ext))
    dat.sort()
    if len(dat) > 0:
        dat = [(x, x, False) for x in dat]
    else:
        dat = [('No %s files - add some to %s/pheno or %s/pheno' % (ext, librepos, myrepos), 'None', True), ]
    return dat
-
def get_lib_bedfiles():
    """List plink binary (*.bed) file sets from both repositories,
    extension stripped, none preselected; placeholder option if empty."""
    found = sorted(glob.glob('%s/plinkbed/*.bed' % librepos) +
                   glob.glob('%s/plinkbed/*.bed' % myrepos))
    if not found:
        return [('No bed files - Please import some to %s/plinkbed or %s/plinkbed' % (librepos, myrepos), 'None', True), ]
    stems = [name.split('.bed')[0] for name in found]
    return [(stem, stem, False) for stem in stems]
-
def get_lib_fbatfiles():
    """List fbat-format *.ped files from both repositories, none
    preselected; placeholder option if empty."""
    found = sorted(glob.glob('%s/plinkfbat/*.ped' % librepos) +
                   glob.glob('%s/plinkfbat/*.ped' % myrepos))
    if not found:
        return [('No fbat bed files - Please import some to %s/plinkfbat or %s/plinkfbat' % (librepos, myrepos), 'None', True), ]
    return [(name, name, False) for name in found]
-
def get_lib_mapfiles():
    """List linkage *.map files from both repositories, none preselected;
    placeholder option if empty."""
    found = sorted(glob.glob('%s/ped/*.map' % librepos) +
                   glob.glob('%s/ped/*.map' % myrepos))
    if not found:
        return [('No map files - add some to %s/ped' % librepos, 'None', True), ]
    return [(name, name, False) for name in found]
-
def get_my_pedfiles():
    """List *.ped files in the personal repository, none preselected.

    Fix: the empty-result message pointed users at ``librepos`` although
    this function only searches ``myrepos``.
    """
    dat = glob.glob('%s/*.ped' % myrepos)
    if len(dat) > 0:
        dat = [(x, x, False) for x in dat]
    else:
        dat = [('No ped files - add some to %s' % myrepos, 'None', True), ]
    return dat
-
def get_my_mapfiles():
    """List *.map files in the personal repository, all flagged 'True'
    (the original's string flag is preserved for callers).

    Fix: the empty-result message said 'No ped files' in ``librepos``
    although this function searches ``myrepos`` for map files.
    """
    dat = glob.glob('%s/*.map' % myrepos)
    if len(dat) > 0:
        dat = [(x, x, 'True') for x in dat]
    else:
        dat = [('No map files - add some to %s' % myrepos, 'None', True), ]
    return dat
-
def get_lib_xlsfiles():
    """List *.xls files in the library repository, none preselected.

    Fix: the empty-result message said 'No ped files' for an xls search
    (copy-paste error shared by several siblings).
    """
    dat = glob.glob('%s/*.xls' % librepos)
    if len(dat) > 0:
        dat = [(x, x, False) for x in dat]
    else:
        dat = [('No xls files - add some to %s' % librepos, 'None', True), ]
    return dat
-
def get_lib_htmlfiles():
    """List *.html files in the library repository, none preselected.

    Fix: the empty-result message said 'No ped files' for an html search.
    """
    dat = glob.glob('%s/*.html' % librepos)
    if len(dat) > 0:
        dat = [(x, x, False) for x in dat]
    else:
        dat = [('No html files - add some to %s' % librepos, 'None', True), ]
    return dat
-
def get_my_xlsfiles():
    """List *.xls files in the personal repository, none preselected.

    Fix: the empty-result message was wrong on both counts - it said
    'No ped files' and pointed at ``librepos`` while this function
    searches ``myrepos`` for xls files.
    """
    dat = glob.glob('%s/*.xls' % myrepos)
    if len(dat) > 0:
        dat = [(x, x, False) for x in dat]
    else:
        dat = [('No xls files - add some to %s' % myrepos, 'None', True), ]
    return dat
-
def get_my_htmlfiles():
    """List *.html files in the personal repository, none preselected.

    Fix: the empty-result message said 'No ped files' in ``librepos``
    while this function searches ``myrepos`` for html files.
    """
    dat = glob.glob('%s/*.html' % myrepos)
    if len(dat) > 0:
        dat = [(x, x, False) for x in dat]
    else:
        dat = [('No html files - add some to %s' % myrepos, 'None', True), ]
    return dat
-
-

plinkbinJZ.py

-#!/usr/bin/env python2.4
-"""
-"""
-
-import optparse,os,subprocess,gzip,struct,time,commands
-from array import array
-
-#from AIMS import util
-#from pga import util as pgautil
-
-__FILE_ID__ = '$Id: plinkbinJZ.py,v 1.14 2009/07/13 20:16:50 rejpz Exp $'
-
-VERBOSE = True
-
-MISSING_ALLELES = set(['N', '0', '.', '-',''])
-
-AUTOSOMES = set(range(1, 23) + [str(c) for c in range(1, 23)])
-
-MAGIC_BYTE1 = '00110110'
-MAGIC_BYTE2 = '11011000'
-FORMAT_SNP_MAJOR_BYTE = '10000000'
-FORMAT_IND_MAJOR_BYTE = '00000000'
-MAGIC1 = (0, 3, 1, 2)
-MAGIC2 = (3, 1, 2, 0)
-FORMAT_SNP_MAJOR = (2, 0, 0, 0)
-FORMAT_IND_MAJOR = (0, 0, 0, 0)
-HEADER_LENGTH = 3
-
-HOM0 = 3
-HOM1 = 0
-MISS = 2
-HET  = 1
-HOM0_GENO = (0, 0)
-HOM1_GENO = (1, 1)
-HET_GENO = (0, 1)
-MISS_GENO = (-9, -9)
-
-GENO_TO_GCODE = {
-    HOM0_GENO: HOM0, 
-    HET_GENO: HET, 
-    HOM1_GENO: HOM1, 
-    MISS_GENO: MISS, 
-    }
-
-CHROM_REPLACE = {
-    'X': '23',
-    'Y': '24',
-    'XY': '25',
-    'MT': '26',
-    'M': '26',
-}
-
-MAP_LINE_EXCEPTION_TEXT = """
-One or more lines in the *.map file has only three fields.
-The line was:
-
-%s
-
-If you are running rgGRR through EPMP, this is usually a
-sign that you are using an old version of the map file.
-You can correct the problem by re-running Subject QC.  If
-you have already tried this, please contact the developers,
-or file a bug.
-"""
-
-INT_TO_GCODE = {
-     0: array('i', (0, 0, 0, 0)),   1: array('i', (2, 0, 0, 0)),   2: array('i', (1, 0, 0, 0)),   3: array('i', (3, 0, 0, 0)), 
-     4: array('i', (0, 2, 0, 0)),   5: array('i', (2, 2, 0, 0)),   6: array('i', (1, 2, 0, 0)),   7: array('i', (3, 2, 0, 0)), 
-     8: array('i', (0, 1, 0, 0)),   9: array('i', (2, 1, 0, 0)),  10: array('i', (1, 1, 0, 0)),  11: array('i', (3, 1, 0, 0)), 
-    12: array('i', (0, 3, 0, 0)),  13: array('i', (2, 3, 0, 0)),  14: array('i', (1, 3, 0, 0)),  15: array('i', (3, 3, 0, 0)), 
-    16: array('i', (0, 0, 2, 0)),  17: array('i', (2, 0, 2, 0)),  18: array('i', (1, 0, 2, 0)),  19: array('i', (3, 0, 2, 0)), 
-    20: array('i', (0, 2, 2, 0)),  21: array('i', (2, 2, 2, 0)),  22: array('i', (1, 2, 2, 0)),  23: array('i', (3, 2, 2, 0)), 
-    24: array('i', (0, 1, 2, 0)),  25: array('i', (2, 1, 2, 0)),  26: array('i', (1, 1, 2, 0)),  27: array('i', (3, 1, 2, 0)), 
-    28: array('i', (0, 3, 2, 0)),  29: array('i', (2, 3, 2, 0)),  30: array('i', (1, 3, 2, 0)),  31: array('i', (3, 3, 2, 0)), 
-    32: array('i', (0, 0, 1, 0)),  33: array('i', (2, 0, 1, 0)),  34: array('i', (1, 0, 1, 0)),  35: array('i', (3, 0, 1, 0)), 
-    36: array('i', (0, 2, 1, 0)),  37: array('i', (2, 2, 1, 0)),  38: array('i', (1, 2, 1, 0)),  39: array('i', (3, 2, 1, 0)), 
-    40: array('i', (0, 1, 1, 0)),  41: array('i', (2, 1, 1, 0)),  42: array('i', (1, 1, 1, 0)),  43: array('i', (3, 1, 1, 0)), 
-    44: array('i', (0, 3, 1, 0)),  45: array('i', (2, 3, 1, 0)),  46: array('i', (1, 3, 1, 0)),  47: array('i', (3, 3, 1, 0)), 
-    48: array('i', (0, 0, 3, 0)),  49: array('i', (2, 0, 3, 0)),  50: array('i', (1, 0, 3, 0)),  51: array('i', (3, 0, 3, 0)), 
-    52: array('i', (0, 2, 3, 0)),  53: array('i', (2, 2, 3, 0)),  54: array('i', (1, 2, 3, 0)),  55: array('i', (3, 2, 3, 0)), 
-    56: array('i', (0, 1, 3, 0)),  57: array('i', (2, 1, 3, 0)),  58: array('i', (1, 1, 3, 0)),  59: array('i', (3, 1, 3, 0)), 
-    60: array('i', (0, 3, 3, 0)),  61: array('i', (2, 3, 3, 0)),  62: array('i', (1, 3, 3, 0)),  63: array('i', (3, 3, 3, 0)), 
-    64: array('i', (0, 0, 0, 2)),  65: array('i', (2, 0, 0, 2)),  66: array('i', (1, 0, 0, 2)),  67: array('i', (3, 0, 0, 2)), 
-    68: array('i', (0, 2, 0, 2)),  69: array('i', (2, 2, 0, 2)),  70: array('i', (1, 2, 0, 2)),  71: array('i', (3, 2, 0, 2)), 
-    72: array('i', (0, 1, 0, 2)),  73: array('i', (2, 1, 0, 2)),  74: array('i', (1, 1, 0, 2)),  75: array('i', (3, 1, 0, 2)), 
-    76: array('i', (0, 3, 0, 2)),  77: array('i', (2, 3, 0, 2)),  78: array('i', (1, 3, 0, 2)),  79: array('i', (3, 3, 0, 2)), 
-    80: array('i', (0, 0, 2, 2)),  81: array('i', (2, 0, 2, 2)),  82: array('i', (1, 0, 2, 2)),  83: array('i', (3, 0, 2, 2)), 
-    84: array('i', (0, 2, 2, 2)),  85: array('i', (2, 2, 2, 2)),  86: array('i', (1, 2, 2, 2)),  87: array('i', (3, 2, 2, 2)), 
-    88: array('i', (0, 1, 2, 2)),  89: array('i', (2, 1, 2, 2)),  90: array('i', (1, 1, 2, 2)),  91: array('i', (3, 1, 2, 2)), 
-    92: array('i', (0, 3, 2, 2)),  93: array('i', (2, 3, 2, 2)),  94: array('i', (1, 3, 2, 2)),  95: array('i', (3, 3, 2, 2)), 
-    96: array('i', (0, 0, 1, 2)),  97: array('i', (2, 0, 1, 2)),  98: array('i', (1, 0, 1, 2)),  99: array('i', (3, 0, 1, 2)), 
-   100: array('i', (0, 2, 1, 2)), 101: array('i', (2, 2, 1, 2)), 102: array('i', (1, 2, 1, 2)), 103: array('i', (3, 2, 1, 2)), 
-   104: array('i', (0, 1, 1, 2)), 105: array('i', (2, 1, 1, 2)), 106: array('i', (1, 1, 1, 2)), 107: array('i', (3, 1, 1, 2)), 
-   108: array('i', (0, 3, 1, 2)), 109: array('i', (2, 3, 1, 2)), 110: array('i', (1, 3, 1, 2)), 111: array('i', (3, 3, 1, 2)), 
-   112: array('i', (0, 0, 3, 2)), 113: array('i', (2, 0, 3, 2)), 114: array('i', (1, 0, 3, 2)), 115: array('i', (3, 0, 3, 2)), 
-   116: array('i', (0, 2, 3, 2)), 117: array('i', (2, 2, 3, 2)), 118: array('i', (1, 2, 3, 2)), 119: array('i', (3, 2, 3, 2)), 
-   120: array('i', (0, 1, 3, 2)), 121: array('i', (2, 1, 3, 2)), 122: array('i', (1, 1, 3, 2)), 123: array('i', (3, 1, 3, 2)), 
-   124: array('i', (0, 3, 3, 2)), 125: array('i', (2, 3, 3, 2)), 126: array('i', (1, 3, 3, 2)), 127: array('i', (3, 3, 3, 2)), 
-   128: array('i', (0, 0, 0, 1)), 129: array('i', (2, 0, 0, 1)), 130: array('i', (1, 0, 0, 1)), 131: array('i', (3, 0, 0, 1)), 
-   132: array('i', (0, 2, 0, 1)), 133: array('i', (2, 2, 0, 1)), 134: array('i', (1, 2, 0, 1)), 135: array('i', (3, 2, 0, 1)), 
-   136: array('i', (0, 1, 0, 1)), 137: array('i', (2, 1, 0, 1)), 138: array('i', (1, 1, 0, 1)), 139: array('i', (3, 1, 0, 1)), 
-   140: array('i', (0, 3, 0, 1)), 141: array('i', (2, 3, 0, 1)), 142: array('i', (1, 3, 0, 1)), 143: array('i', (3, 3, 0, 1)), 
-   144: array('i', (0, 0, 2, 1)), 145: array('i', (2, 0, 2, 1)), 146: array('i', (1, 0, 2, 1)), 147: array('i', (3, 0, 2, 1)), 
-   148: array('i', (0, 2, 2, 1)), 149: array('i', (2, 2, 2, 1)), 150: array('i', (1, 2, 2, 1)), 151: array('i', (3, 2, 2, 1)), 
-   152: array('i', (0, 1, 2, 1)), 153: array('i', (2, 1, 2, 1)), 154: array('i', (1, 1, 2, 1)), 155: array('i', (3, 1, 2, 1)), 
-   156: array('i', (0, 3, 2, 1)), 157: array('i', (2, 3, 2, 1)), 158: array('i', (1, 3, 2, 1)), 159: array('i', (3, 3, 2, 1)), 
-   160: array('i', (0, 0, 1, 1)), 161: array('i', (2, 0, 1, 1)), 162: array('i', (1, 0, 1, 1)), 163: array('i', (3, 0, 1, 1)), 
-   164: array('i', (0, 2, 1, 1)), 165: array('i', (2, 2, 1, 1)), 166: array('i', (1, 2, 1, 1)), 167: array('i', (3, 2, 1, 1)), 
-   168: array('i', (0, 1, 1, 1)), 169: array('i', (2, 1, 1, 1)), 170: array('i', (1, 1, 1, 1)), 171: array('i', (3, 1, 1, 1)), 
-   172: array('i', (0, 3, 1, 1)), 173: array('i', (2, 3, 1, 1)), 174: array('i', (1, 3, 1, 1)), 175: array('i', (3, 3, 1, 1)), 
-   176: array('i', (0, 0, 3, 1)), 177: array('i', (2, 0, 3, 1)), 178: array('i', (1, 0, 3, 1)), 179: array('i', (3, 0, 3, 1)), 
-   180: array('i', (0, 2, 3, 1)), 181: array('i', (2, 2, 3, 1)), 182: array('i', (1, 2, 3, 1)), 183: array('i', (3, 2, 3, 1)), 
-   184: array('i', (0, 1, 3, 1)), 185: array('i', (2, 1, 3, 1)), 186: array('i', (1, 1, 3, 1)), 187: array('i', (3, 1, 3, 1)), 
-   188: array('i', (0, 3, 3, 1)), 189: array('i', (2, 3, 3, 1)), 190: array('i', (1, 3, 3, 1)), 191: array('i', (3, 3, 3, 1)), 
-   192: array('i', (0, 0, 0, 3)), 193: array('i', (2, 0, 0, 3)), 194: array('i', (1, 0, 0, 3)), 195: array('i', (3, 0, 0, 3)), 
-   196: array('i', (0, 2, 0, 3)), 197: array('i', (2, 2, 0, 3)), 198: array('i', (1, 2, 0, 3)), 199: array('i', (3, 2, 0, 3)), 
-   200: array('i', (0, 1, 0, 3)), 201: array('i', (2, 1, 0, 3)), 202: array('i', (1, 1, 0, 3)), 203: array('i', (3, 1, 0, 3)), 
-   204: array('i', (0, 3, 0, 3)), 205: array('i', (2, 3, 0, 3)), 206: array('i', (1, 3, 0, 3)), 207: array('i', (3, 3, 0, 3)), 
-   208: array('i', (0, 0, 2, 3)), 209: array('i', (2, 0, 2, 3)), 210: array('i', (1, 0, 2, 3)), 211: array('i', (3, 0, 2, 3)), 
-   212: array('i', (0, 2, 2, 3)), 213: array('i', (2, 2, 2, 3)), 214: array('i', (1, 2, 2, 3)), 215: array('i', (3, 2, 2, 3)), 
-   216: array('i', (0, 1, 2, 3)), 217: array('i', (2, 1, 2, 3)), 218: array('i', (1, 1, 2, 3)), 219: array('i', (3, 1, 2, 3)), 
-   220: array('i', (0, 3, 2, 3)), 221: array('i', (2, 3, 2, 3)), 222: array('i', (1, 3, 2, 3)), 223: array('i', (3, 3, 2, 3)), 
-   224: array('i', (0, 0, 1, 3)), 225: array('i', (2, 0, 1, 3)), 226: array('i', (1, 0, 1, 3)), 227: array('i', (3, 0, 1, 3)), 
-   228: array('i', (0, 2, 1, 3)), 229: array('i', (2, 2, 1, 3)), 230: array('i', (1, 2, 1, 3)), 231: array('i', (3, 2, 1, 3)), 
-   232: array('i', (0, 1, 1, 3)), 233: array('i', (2, 1, 1, 3)), 234: array('i', (1, 1, 1, 3)), 235: array('i', (3, 1, 1, 3)), 
-   236: array('i', (0, 3, 1, 3)), 237: array('i', (2, 3, 1, 3)), 238: array('i', (1, 3, 1, 3)), 239: array('i', (3, 3, 1, 3)), 
-   240: array('i', (0, 0, 3, 3)), 241: array('i', (2, 0, 3, 3)), 242: array('i', (1, 0, 3, 3)), 243: array('i', (3, 0, 3, 3)), 
-   244: array('i', (0, 2, 3, 3)), 245: array('i', (2, 2, 3, 3)), 246: array('i', (1, 2, 3, 3)), 247: array('i', (3, 2, 3, 3)), 
-   248: array('i', (0, 1, 3, 3)), 249: array('i', (2, 1, 3, 3)), 250: array('i', (1, 1, 3, 3)), 251: array('i', (3, 1, 3, 3)), 
-   252: array('i', (0, 3, 3, 3)), 253: array('i', (2, 3, 3, 3)), 254: array('i', (1, 3, 3, 3)), 255: array('i', (3, 3, 3, 3)), 
-   }
-
# Inverse of INT_TO_GCODE: map a 4-genotype code tuple back to its packed byte value.
GCODE_TO_INT = dict([(tuple(v),k) for (k,v) in INT_TO_GCODE.items()])
-
### Exceptions
# Duplicate marker name encountered while reading a *.map file
class DuplicateMarkerInMapFile(Exception): pass
# A *.map line had fewer fields than expected (see MAP_LINE_EXCEPTION_TEXT)
class MapLineTooShort(Exception): pass
# More than two distinct alleles seen for a single marker
class ThirdAllele(Exception): pass
# Generic problem while parsing a *.ped file
class PedError(Exception): pass
class BadMagic(Exception):
    """ Raised when one of the MAGIC bytes in a bed file does not match
    """
    pass
class BedError(Exception):
    """ Raised when parsing a bed file runs into problems
    """
    pass
class UnknownGenocode(Exception):
    """ Raised when we get a 2-bit genotype that is undecipherable (is it possible?)
    """
    pass
# A genotype tuple that maps to no known gcode
class UnknownGeno(Exception): pass
-
-### Utility functions
-
-def timenow():
-    """return current time as a string
-    """
-    return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
-
-def ceiling(n, k):
-    ''' Return the least multiple of k which is greater than n
-    '''
-    m = n % k
-    if m == 0:
-        return n
-    else:
-        return n + k - m
-
-def nbytes(n):
-    ''' Return the number of bytes required for n subjects
-    '''
-    return 2*ceiling(n, 4)/8
-
-### Primary module functionality
-class LPed:
-    """ The uber-class for processing the Linkage-format *.ped/*.map files
-    """
-    def __init__(self,  base):
-        self.base = base
-        self._ped = Ped('%s.ped' % (self.base))
-        self._map = Map('%s.map' % (self.base))
-
-        self._markers = {}
-        self._ordered_markers = []
-        self._marker_allele_lookup = {}
-        self._autosomal_indices = set()
-        
-        self._subjects = {}
-        self._ordered_subjects = []
-        
-        self._genotypes = []
-
-    def parse(self):
-        """
-        """
-        if VERBOSE: print 'plinkbinJZ: Analysis started: %s' % (timenow())
-        self._map.parse()
-        self._markers = self._map._markers
-        self._ordered_markers = self._map._ordered_markers
-        self._autosomal_indices = self._map._autosomal_indices
-        
-        self._ped.parse(self._ordered_markers)
-        self._subjects = self._ped._subjects
-        self._ordered_subjects = self._ped._ordered_subjects
-        self._genotypes = self._ped._genotypes
-        self._marker_allele_lookup = self._ped._marker_allele_lookup
-        
-        ### Adjust self._markers based on the allele information
-        ### we got from parsing the ped file
-        for m,  name in enumerate(self._ordered_markers):
-            a1,  a2 = self._marker_allele_lookup[m][HET]
-            self._markers[name][-2] = a1
-            self._markers[name][-1] = a2
-        if VERBOSE: print 'plinkbinJZ: Analysis finished: %s' % (timenow())
-
-    def getSubjectInfo(self, fid, oiid):
-        """
-        """
-        return self._subject_info[(fid, oiid)]
-
-    def getSubjectInfoByLine(self, line):
-        """
-        """
-        return self._subject_info[self._ordered_subjects[line]]
-    
-    def getGenotypesByIndices(self, s, mlist, format):
-        """ needed for grr if lped - deprecated but..
-        """
-        mlist = dict(zip(mlist,[True,]*len(mlist))) # hash quicker than 'in' ?
-        raw_array = array('i', [row[s] for m,row in enumerate(self._genotypes) if mlist.get(m,None)])            
-        if format == 'raw':
-            return raw_array
-        elif format == 'ref':
-            result = array('i', [0]*len(mlist))
-            for m, gcode in enumerate(raw_array):
-                if gcode == HOM0:
-                    nref = 3
-                elif gcode == HET:
-                    nref = 2
-                elif gcode == HOM1:
-                    nref = 1
-                else:
-                    nref = 0
-                result[m] = nref
-            return result
-        else:
-            result = []
-            for m, gcode in enumerate(raw_array):
-                result.append(self._marker_allele_lookup[m][gcode])
-            return result
-    
-    def writebed(self, base):
-        """
-        """
-        dst_name = '%s.fam' % (base)        
-        print 'Writing pedigree information to [ %s ]' % (dst_name)
-        dst = open(dst_name, 'w')
-        for skey in self._ordered_subjects:            
-            (fid, iid, did, mid, sex, phe, sid, d_sid, m_sid) = self._subjects[skey]
-            dst.write('%s %s %s %s %s %s\n' % (fid, iid, did, mid, sex, phe))
-        dst.close()
-
-        dst_name = '%s.bim' % (base)        
-        print 'Writing map (extended format) information to [ %s ]' % (dst_name)
-        dst = open(dst_name, 'w')        
-        for m, marker in enumerate(self._ordered_markers):
-            chrom, name, genpos, abspos,  a1,  a2 = self._markers[marker]
-            dst.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (chrom, name, genpos, abspos, a1, a2))
-        dst.close()
-
-        bed_name = '%s.bed' % (base)        
-        print 'Writing genotype bitfile to [ %s ]' % (bed_name)
-        print 'Using (default) SNP-major mode'
-        bed = open(bed_name, 'w')
-
-        ### Write the 3 header bytes
-        bed.write(struct.pack('B', int(''.join(reversed(MAGIC_BYTE1)), 2)))
-        bed.write(struct.pack('B', int(''.join(reversed(MAGIC_BYTE2)), 2)))
-        bed.write(struct.pack('B', int(''.join(reversed(FORMAT_SNP_MAJOR_BYTE)), 2)))
-        
-        ### Calculate how many "pad bits" we should add after the last subject
-        nsubjects = len(self._ordered_subjects)
-        nmarkers = len(self._ordered_markers)
-        total_bytes = nbytes(nsubjects)
-        nbits = nsubjects  * 2
-        pad_nibbles = ((total_bytes * 8) - nbits)/2
-        pad = array('i', [0]*pad_nibbles)
-
-        ### And now write genotypes to the file
-        for m in xrange(nmarkers):
-            geno = self._genotypes[m]
-            geno.extend(pad)
-            bytes = len(geno)/4
-            for b in range(bytes):
-                idx = b*4
-                gcode = tuple(geno[idx:idx+4])
-                try:
-                    byte = struct.pack('B', GCODE_TO_INT[gcode])
-                except KeyError:
-                    print m, b, gcode
-                    raise
-                bed.write(byte)
-        bed.close()
-        
-    def autosomal_indices(self):
-        """ Return the indices of markers in this ped/map that are autosomal.
-            This is used by rgGRR so that it can select a random set of markers
-            from the autosomes (sex chroms screw up the plot)
-        """
-        return self._autosomal_indices
-
-class Ped:
-    def __init__(self, path):
-        self.path = path
-        self._subjects = {}
-        self._ordered_subjects = []
-        self._genotypes = []
-        self._marker_allele_lookup = {}
-        
-    def lineCount(self,infile):
-        """ count the number of lines in a file - efficiently using wget
-        """
-        return int(commands.getoutput('wc -l %s' % (infile)).split()[0])    
-         
-
-    def parse(self,  markers):
-        """ Parse a given file -- this needs to be memory-efficient so that large
-            files can be parsed (~1 million markers on ~5000 subjects?).  It
-            should also be fast, if possible.
-        """
-                
-        ### Find out how many lines are in the file so we can ...
-        nsubjects = self.lineCount(self.path)
-        ### ... Pre-allocate the genotype arrays
-        nmarkers = len(markers)
-        _marker_alleles = [['0', '0'] for _ in xrange(nmarkers)]
-        self._genotypes = [array('i', [-1]*nsubjects) for _ in xrange(nmarkers)]
-
-        if self.path.endswith('.gz'):
-            pfile = gzip.open(self.path, 'r')
-        else:
-            pfile = open(self.path, 'r')
-
-        for s, line in enumerate(pfile):
-            line = line.strip()
-            if not line:
-                continue
-            
-            fid, iid, did, mid, sex, phe, genos = line.split(None, 6)
-            sid = iid.split('.')[0]
-            d_sid = did.split('.')[0]
-            m_sid = mid.split('.')[0]
-            
-            skey = (fid, iid)            
-            self._subjects[skey] = (fid, iid, did, mid, sex, phe, sid, d_sid, m_sid)
-            self._ordered_subjects.append(skey)
-
-            genotypes = genos.split()
-            
-            for m, marker in enumerate(markers):
-                idx = m*2
-                a1, a2 = genotypes[idx:idx+2] # Alleles for subject s, marker m
-                s1, s2 = seen = _marker_alleles[m] # Alleles seen for marker m
-                
-                ### FIXME: I think this can still be faster, and simpler to read
-                # Two pieces of logic intertwined here:  first, we need to code
-                # this genotype as HOM0, HOM1, HET or MISS.  Second, we need to
-                # keep an ongoing record of the genotypes seen for this marker
-                if a1 == a2:
-                    if a1 in MISSING_ALLELES:
-                        geno = MISS_GENO
-                    else:
-                        if s1 == '0':
-                            seen[0] = a1
-                        elif s1 == a1 or s2 == a2:
-                            pass
-                        elif s2 == '0':
-                            seen[1] = a1
-                        else:
-                            raise ThirdAllele('a1=a2=%s, seen=%s?' % (a1, str(seen)))
-                    
-                        if a1 == seen[0]:
-                            geno = HOM0_GENO
-                        elif a1 == seen[1]:
-                            geno = HOM1_GENO
-                        else:
-                            raise PedError('Cannot assign geno for a1=a2=%s from seen=%s' % (a1, str(seen)))
-                elif a1 in MISSING_ALLELES or a2 in MISSING_ALLELES:
-                    geno = MISS_GENO
-                else:
-                    geno = HET_GENO
-                    if s1 == '0':
-                        seen[0] = a1
-                        seen[1] = a2
-                    elif s2 == '0':
-                        if s1 == a1:
-                            seen[1] = a2
-                        elif s1 == a2:
-                            seen[1] = a1
-                        else:
-                            raise ThirdAllele('a1=%s, a2=%s, seen=%s?' % (a1, a2, str(seen)))
-                    else:
-                        if sorted(seen) != sorted((a1, a2)):
-                            raise ThirdAllele('a1=%s, a2=%s, seen=%s?' % (a1, a2, str(seen)))
-                                        
-                gcode = GENO_TO_GCODE.get(geno, None)
-                if gcode is None:
-                    raise UnknownGeno(str(geno))
-                self._genotypes[m][s] = gcode
-
-        # Build the _marker_allele_lookup table
-        for m,  alleles in enumerate(_marker_alleles):                
-            if len(alleles) == 2:
-                a1,  a2 = alleles
-            elif len(alleles) == 1:
-                a1 = alleles[0]
-                a2 = '0'
-            else:
-                print 'All alleles blank for %s: %s' % (m,  str(alleles))
-                raise
-
-            self._marker_allele_lookup[m] = {
-                HOM0: (a2, a2),
-                HOM1: (a1, a1),
-                HET : (a1, a2),
-                MISS: ('0','0'),
-                }
-
-        if VERBOSE: print '%s(%s) individuals read from [ %s ]' % (len(self._subjects),  nsubjects,  self.path)
-        
-class Map:
-    def __init__(self, path=None):
-        self.path = path
-        self._markers = {}
-        self._ordered_markers = []
-        self._autosomal_indices = set()
-
-    def __len__(self):
-        return len(self._markers)
-
-    def parse(self):
-        """ Parse a Linkage-format map file
-        """
-        if self.path.endswith('.gz'):
-            fh = gzip.open(self.path, 'r')
-        else:
-            fh = open(self.path, 'r')
-            
-        for i, line in enumerate(fh):
-            line = line.strip()
-            if not line:
-                continue
-
-            fields = line.split()
-            if len(fields) < 4:
-                raise MapLineTooShort(MAP_LINE_EXCEPTION_TEXT % (str(line),  len(fields)))
-            else:
-                chrom, name, genpos, abspos = fields
-            if name in self._markers:
-                raise DuplicateMarkerInMapFile('Marker %s was found twice in map file %s' % (name, self.path))
-            abspos = int(abspos)
-            if abspos < 0:
-                continue
-            if chrom in AUTOSOMES:
-                self._autosomal_indices.add(i)
-            chrom = CHROM_REPLACE.get(chrom, chrom)
-            self._markers[name] = [chrom, name, genpos, abspos,  None,  None]
-            self._ordered_markers.append(name)
-        fh.close()
-        if VERBOSE: print '%s (of %s) markers to be included from [ %s ]' % (len(self._ordered_markers),  i,  self.path)
-
-class BPed:
-    """ The uber-class for processing Plink's Binary Ped file format *.bed/*.bim/*.fam
-    """
-    def __init__(self,  base):
-        self.base = base
-        self._bed = Bed('%s.bed' % (self.base))
-        self._bim = Bim('%s.bim' % (self.base))
-        self._fam = Fam('%s.fam' % (self.base))
-
-        self._markers = {}
-        self._ordered_markers = []
-        self._marker_allele_lookup = {}
-        self._autosomal_indices = set()
-        
-        self._subjects = {}
-        self._ordered_subjects = []
-        
-        self._genotypes = []
-        
-    def parse(self,  quick=False):
-        """
-        """
-        self._quick = quick
-        
-        self._bim.parse()
-        self._markers = self._bim._markers
-        self._ordered_markers = self._bim._ordered_markers
-        self._marker_allele_lookup = self._bim._marker_allele_lookup
-        self._autosomal_indices = self._bim._autosomal_indices
-        
-        self._fam.parse()
-        self._subjects = self._fam._subjects
-        self._ordered_subjects = self._fam._ordered_subjects
-
-        self._bed.parse(self._ordered_subjects,  self._ordered_markers,  quick=quick)
-        self._bedf = self._bed._fh
-        self._genotypes = self._bed._genotypes
-        self.nsubjects = len(self._ordered_subjects)
-        self.nmarkers = len(self._ordered_markers)
-        self._bytes_per_marker = nbytes(self.nsubjects)
-
-    def writeped(self, path=None):
-        """
-        """
-        path = self.path = path or self.path
-        
-        map_name = self.path.replace('.bed', '.map')
-        print 'Writing map file [ %s ]' % (map_name)
-        dst = open(map_name, 'w')
-        for m in self._ordered_markers:
-            chrom, snp, genpos, abspos, a1, a2 = self._markers[m]
-            dst.write('%s\t%s\t%s\t%s\n' % (chrom, snp, genpos, abspos))
-        dst.close()
-
-        ped_name = self.path.replace('.bed', '.ped')
-        print 'Writing ped file [ %s ]' % (ped_name)
-        ped = open(ped_name, 'w')
-        firstyikes = False
-        for s, skey in enumerate(self._ordered_subjects):
-            idx = s*2
-            (fid, iid, did, mid, sex, phe, oiid, odid, omid) = self._subjects[skey]
-            ped.write('%s %s %s %s %s %s' % (fid, iid, odid, omid, sex, phe))
-            genotypes_for_subject = self.getGenotypesForSubject(s)
-            for m, snp in enumerate(self._ordered_markers):
-                #a1, a2 = self.getGenotypeByIndices(s, m)
-                a1,a2 = genotypes_for_subject[m]
-                ped.write(' %s %s' % (a1, a2))
-            ped.write('\n')
-        ped.close()
-
-    def getGenotype(self, subject, marker):
-        """ Retrieve a genotype for a particular subject/marker pair
-        """
-        m = self._ordered_markers.index(marker)
-        s = self._ordered_subjects.index(subject)
-        return self.getGenotypeByIndices(s, m)
-
-    def getGenotypesForSubject(self, s, raw=False):
-        """ Returns list of genotypes for all m markers
-            for subject s.  If raw==True, then an array
-            of raw integer gcodes is returned instead
-        """
-        if self._quick:
-            nmarkers = len(self._markers)
-            raw_array = array('i', [0]*nmarkers)
-            seek_nibble = s % 4
-            for m in xrange(nmarkers):
-                seek_byte = m * self._bytes_per_marker + s/4 + HEADER_LENGTH
-                self._bedf.seek(seek_byte)
-                geno = struct.unpack('B', self._bedf.read(1))[0]
-                quartet = INT_TO_GCODE[geno]
-                gcode = quartet[seek_nibble]
-                raw_array[m] = gcode
-        else:
-            raw_array = array('i', [row[s] for row in self._genotypes])
-            
-        if raw:
-            return raw_array
-        else:
-            result = []
-            for m, gcode in enumerate(raw_array):
-                result.append(self._marker_allele_lookup[m][gcode])
-            return result
-        
-    def getGenotypeByIndices(self, s, m):
-        """
-        """
-        if self._quick:
-            # Determine which byte we need to seek to, and
-            # which nibble within the byte we need
-            seek_byte = m * self._bytes_per_marker + s/4 + HEADER_LENGTH
-            seek_nibble = s % 4
-            self._bedf.seek(seek_byte)
-            geno = struct.unpack('B', self._bedf.read(1))[0]
-            quartet = INT_TO_GCODE[geno]
-            gcode = quartet[seek_nibble]
-        else:
-            # Otherwise, just grab the genotypes from the
-            # list of arrays
-            genos_for_marker = self._genotypes[m]
-            gcode = genos_for_marker[s]
-
-        return self._marker_allele_lookup[m][gcode]
-
-    def getGenotypesByIndices(self, s, mlist, format):
-        """
-        """
-        if self._quick:
-            raw_array = array('i', [0]*len(mlist))
-            seek_nibble = s % 4
-            for i,m in enumerate(mlist):
-                seek_byte = m * self._bytes_per_marker + s/4 + HEADER_LENGTH
-                self._bedf.seek(seek_byte)
-                geno = struct.unpack('B', self._bedf.read(1))[0]
-                quartet = INT_TO_GCODE[geno]
-                gcode = quartet[seek_nibble]
-                raw_array[i] = gcode
-            mlist = set(mlist)
-        else:
-            mlist = set(mlist)
-            raw_array = array('i', [row[s] for m,row in enumerate(self._genotypes) if m in mlist])
-            
-        if format == 'raw':
-            return raw_array
-        elif format == 'ref':
-            result = array('i', [0]*len(mlist))
-            for m, gcode in enumerate(raw_array):
-                if gcode == HOM0:
-                    nref = 3
-                elif gcode == HET:
-                    nref = 2
-                elif gcode == HOM1:
-                    nref = 1
-                else:
-                    nref = 0
-                result[m] = nref
-            return result
-        else:
-            result = []
-            for m, gcode in enumerate(raw_array):
-                result.append(self._marker_allele_lookup[m][gcode])
-            return result
-    
-    def getSubject(self, s):
-        """
-        """
-        skey = self._ordered_subjects[s]
-        return self._subjects[skey]
-    
-    def autosomal_indices(self):
-        """ Return the indices of markers in this ped/map that are autosomal.
-            This is used by rgGRR so that it can select a random set of markers
-            from the autosomes (sex chroms screw up the plot)
-        """
-        return self._autosomal_indices
-
-class Bed:
-
-    def __init__(self, path):
-        self.path = path
-        self._genotypes = []
-        self._fh = None
-
-    def parse(self, subjects,  markers,  quick=False):
-        """ Parse the bed file, indicated either by the path parameter,
-            or as the self.path indicated in __init__.  If quick is
-            True, then just parse the bim and fam, then genotypes will
-            be looked up dynamically by indices
-        """
-        self._quick = quick
-        
-        ordered_markers = markers
-        ordered_subjects = subjects
-        nsubjects = len(ordered_subjects)
-        nmarkers = len(ordered_markers)
-        
-        bed = open(self.path, 'rb')
-        self._fh = bed
-
-        byte1 = bed.read(1)
-        byte2 = bed.read(1)
-        byte3 = bed.read(1)
-        format_flag = struct.unpack('B', byte3)[0]
-
-        h1 = tuple(INT_TO_GCODE[struct.unpack('B', byte1)[0]])
-        h2 = tuple(INT_TO_GCODE[struct.unpack('B', byte2)[0]])
-        h3 = tuple(INT_TO_GCODE[format_flag])
-
-        if h1 != MAGIC1 or h2 != MAGIC2:
-            raise BadMagic('One or both MAGIC bytes is wrong: %s==%s or %s==%s' % (h1, MAGIC1, h2, MAGIC2))
-        if format_flag:
-            print 'Detected that binary PED file is v1.00 SNP-major mode (%s, "%s")\n' % (format_flag, h3)
-        else:
-            raise 'BAD_FORMAT_FLAG? (%s, "%s")\n' % (format_flag, h3)
-
-        print 'Parsing binary ped file for %s markers and %s subjects' % (nmarkers, nsubjects)
-
-        ### If quick mode was specified, we're done ...
-        self._quick = quick
-        if quick:
-            return
-            
-        ### ... Otherwise, parse genotypes into an array, and append that
-        ### array to self._genotypes
-        ngcodes = ceiling(nsubjects, 4)
-        bytes_per_marker = nbytes(nsubjects)
-        for m in xrange(nmarkers):
-            genotype_array = array('i', [-1]*(ngcodes))
-            for byte in xrange(bytes_per_marker):
-                intval = struct.unpack('B', bed.read(1))[0]
-                idx = byte*4
-                genotype_array[idx:idx+4] = INT_TO_GCODE[intval]
-            self._genotypes.append(genotype_array)
-        
-class Bim:
-    def __init__(self, path):
-        """
-        """
-        self.path = path
-        self._markers = {}
-        self._ordered_markers = []
-        self._marker_allele_lookup = {}
-        self._autosomal_indices = set()
-
-    def parse(self):
-        """
-        """
-        print 'Reading map (extended format) from [ %s ]' % (self.path)
-        bim = open(self.path, 'r')
-        for m, line in enumerate(bim):
-            chrom, snp, gpos, apos, a1, a2 = line.strip().split()
-            self._markers[snp] = (chrom, snp, gpos, apos, a1, a2)
-            self._marker_allele_lookup[m] = {
-                HOM0: (a2, a2),
-                HOM1: (a1, a1),
-                HET : (a1, a2),
-                MISS: ('0','0'),
-                }
-            self._ordered_markers.append(snp)
-            if chrom in AUTOSOMES:
-                self._autosomal_indices.add(m)
-        bim.close()
-        print '%s markers to be included from [ %s ]' % (m+1, self.path)
-
-class Fam:
-    def __init__(self, path):
-        """
-        """
-        self.path = path
-        self._subjects = {}
-        self._ordered_subjects = []
-
-    def parse(self):
-        """
-        """
-        print 'Reading pedigree information from [ %s ]' % (self.path)
-        fam = open(self.path, 'r')
-        for s, line in enumerate(fam):
-            fid, iid, did, mid, sex, phe = line.strip().split()
-            sid = iid.split('.')[0]
-            d_sid = did.split('.')[0]
-            m_sid = mid.split('.')[0]
-            skey = (fid, iid)
-            self._ordered_subjects.append(skey)
-            self._subjects[skey] = (fid, iid, did, mid, sex, phe, sid, d_sid, m_sid)
-        fam.close()
-        print '%s individuals read from [ %s ]' % (s+1, self.path)
-
-### Command-line functionality and testing
-def test(arg):
-    '''
-    '''
-    
-    import time
-
-    if arg == 'CAMP_AFFY.ped':
-        print 'Testing bed.parse(quick=True)'
-        s = time.time()
-        bed = Bed(arg.replace('.ped', '.bed'))
-        bed.parse(quick=True)
-        print bed.getGenotype(('400118', '10300283'), 'rs2000467')
-        print bed.getGenotype(('400118', '10101384'), 'rs2294019')
-        print bed.getGenotype(('400121', '10101149'), 'rs2294019')        
-        print bed.getGenotype(('400123', '10200290'), 'rs2294019')        
-        assert bed.getGenotype(('400118', '10101384'), 'rs2294019') == ('4','4')
-        e = time.time()
-        print 'e-s = %s\n' % (e-s)
-    
-    print 'Testing bed.parse'
-    s = time.time()
-    bed = BPed(arg)
-    bed.parse(quick=False)
-    e = time.time()
-    print 'e-s = %s\n' % (e-s)
-
-    print 'Testing bed.writeped'
-    s = time.time()
-    outname = '%s_BEDTEST' % (arg)
-    bed.writeped(outname)
-    e = time.time()
-    print 'e-s = %s\n' % (e-s)
-    del(bed)
-
-    print 'Testing ped.parse'
-    s = time.time()
-    ped = LPed(arg)
-    ped.parse()
-    e = time.time()
-    print 'e-s = %s\n' % (e-s)
-
-    print 'Testing ped.writebed'
-    s = time.time()
-    outname = '%s_PEDTEST' % (arg)
-    ped.writebed(outname)
-    e = time.time()
-    print 'e-s = %s\n' % (e-s)
-    del(ped)
-    
-def profile_bed(arg):
-    """
-    """
-    bed = BPed(arg)
-    bed.parse(quick=False)
-    outname = '%s_BEDPROFILE' % (arg)
-    bed.writeped(outname)
-
-def profile_ped(arg):
-    """
-    """
-    ped = LPed(arg)
-    ped.parse()
-    outname = '%s_PEDPROFILE' % (arg)
-    ped.writebed(outname)
-
-if __name__ == '__main__':
-    """ Run as a command-line, this script should get one or more arguments,
-        each one a ped file to be parsed with the PedParser (unit tests?)
-    """
-    op = optparse.OptionParser()
-    op.add_option('--profile-bed', action='store_true', default=False)
-    op.add_option('--profile-ped', action='store_true', default=False)
-    opts, args = op.parse_args()
-    
-    if opts.profile_bed:
-        import profile
-        import pstats
-        profile.run('profile_bed(args[0])', 'fooprof')
-        p = pstats.Stats('fooprof')
-        p.sort_stats('cumulative').print_stats(10)
-    elif opts.profile_ped:
-        import profile
-        import pstats
-        profile.run('profile_ped(args[0])', 'fooprof')
-        p = pstats.Stats('fooprof')
-        p.sort_stats('cumulative').print_stats(10)
-    else:
-        for arg in args:
-            test(arg)
-    
-    ### Code used to generate the INT_TO_GCODE dictionary
-    #print '{\n  ',
-    #for i in range(256):
-    #   b = INT2BIN[i]
-    #    ints = []
-    #    s = str(i).rjust(3)
-    #    #print b
-    #    for j in range(4):
-    #        idx = j*2
-    #        #print i, j, idx, b[idx:idx+2], int(b[idx:idx+2], 2)
-    #        ints.append(int(b[idx:idx+2], 2))
-    #    print '%s: array(\'i\', %s),' % (s,tuple(ints)),
-    #    if i > 0 and (i+1) % 4 == 0:
-    #        print '\n  ',
-    #print '}'
-
-

rgBWASampe.py

-#!/usr/bin/env python
-
-"""
-Runs BWA on single-end or paired-end data.
-Produces a SAM file containing the mappings.
-Works with BWA version 0.5.8 with
-read group patch from Broad at http://www.broadinstitute.org/gsa/wiki/index.php/BWA_patch_to_generate_read_group
-
-usage: bwa_wrapper.py [options]
-    -t, --threads=t: The number of threads to use
-    -r, --ref=r: The reference genome to use or index
-    -f, --fastq=f: The (forward) fastq file to use for the mapping
-    -F, --rfastq=F: The reverse fastq file to use for mapping if paired-end data
-    -u, --output=u: The file to save the output (SAM format)
-    -g, --genAlignType=g: The type of pairing (single or paired)
-    -p, --params=p: Parameter setting to use (pre_set or full)
-    -s, --fileSource=s: Whether to use a previously indexed reference sequence or one from history (indexed or history)
-    -n, --maxEditDist=n: Maximum edit distance if integer
-    -m, --fracMissingAligns=m: Fraction of missing alignments given 2% uniform base error rate if fraction
-    -o, --maxGapOpens=o: Maximum number of gap opens
-    -e, --maxGapExtens=e: Maximum number of gap extensions
-    -d, --disallowLongDel=d: Disallow a long deletion within specified bps
-    -i, --disallowIndel=i: Disallow indel within specified bps
-    -l, --seed=l: Take the first specified subsequences
-    -k, --maxEditDistSeed=k: Maximum edit distance to the seed
-    -M, --mismatchPenalty=M: Mismatch penalty
-    -O, --gapOpenPenalty=O: Gap open penalty
-    -E, --gapExtensPenalty=E: Gap extension penalty
-    -R, --suboptAlign=R: Proceed with suboptimal alignments even if the top hit is a repeat
-    -N, --noIterSearch=N: Disable iterative search
-    -T, --outputTopN=T: Output top specified hits
-    -S, --maxInsertSize=S: Maximum insert size for a read pair to be considered mapped good
-    -P, --maxOccurPairing=P: Maximum occurrences of a read for pairings
-    -D, --dbkey=D: Dbkey for reference genome
-    -H, --suppressHeader=h: Suppress header
-    --rgID
-    --rgSM
-    --rgLB
-    --rgPL   
-"""
-
-import optparse, os, shutil, subprocess, sys, tempfile
-
-def stop_err( msg ):
-    sys.stderr.write( '%s\n' % msg )
-    sys.exit()
-
-def __main__():
-    #Parse Command Line
-    parser = optparse.OptionParser()
-    parser.add_option( '-t', '--threads', dest='threads', help='The number of threads to use' )
-    parser.add_option( '-r', '--ref', dest='ref', help='The reference genome to use or index' )
-    parser.add_option( '-f', '--fastq', dest='fastq', help='The (forward) fastq file to use for the mapping' )
-    parser.add_option( '-F', '--rfastq', dest='rfastq', help='The reverse fastq file to use for mapping if paired-end data' )
-    parser.add_option( '-u', '--output', dest='output', help='The file to save the output (SAM format)' )
-    parser.add_option( '-g', '--genAlignType', dest='genAlignType', help='The type of pairing (single or paired)' )
-    parser.add_option( '-p', '--params', dest='params', help='Parameter setting to use (pre_set or full)' )
-    parser.add_option( '-s', '--fileSource', dest='fileSource', help='Whether to use a previously indexed reference sequence or one form history (indexed or history)' )
-    parser.add_option( '-n', '--maxEditDist', dest='maxEditDist', help='Maximum edit distance if integer' )
-    parser.add_option( '-m', '--fracMissingAligns', dest='fracMissingAligns', help='Fraction of missing alignments given 2% uniform base error rate if fraction' )
-    parser.add_option( '-o', '--maxGapOpens', dest='maxGapOpens', help='Maximum number of gap opens' )
-    parser.add_option( '-e', '--maxGapExtens', dest='maxGapExtens', help='Maximum number of gap extensions' )
-    parser.add_option( '-d', '--disallowLongDel', dest='disallowLongDel', help='Disallow a long deletion within specified bps' )
-    parser.add_option( '-i', '--disallowIndel', dest='disallowIndel', help='Disallow indel within specified bps' )
-    parser.add_option( '-l', '--seed', dest='seed', help='Take the first specified subsequences' )
-    parser.add_option( '-k', '--maxEditDistSeed', dest='maxEditDistSeed', help='Maximum edit distance to the seed' )
-    parser.add_option( '-M', '--mismatchPenalty', dest='mismatchPenalty', help='Mismatch penalty' )
-    parser.add_option( '-O', '--gapOpenPenalty', dest='gapOpenPenalty', help='Gap open penalty' )
-    parser.add_option( '-E', '--gapExtensPenalty', dest='gapExtensPenalty', help='Gap extension penalty' )
-    parser.add_option( '-R', '--suboptAlign', dest='suboptAlign', help='Proceed with suboptimal alignments even if the top hit is a repeat' )
-    parser.add_option( '-N', '--noIterSearch', dest='noIterSearch', help='Disable iterative search' )
-    parser.add_option( '-T', '--outputTopN', dest='outputTopN', help='Output top specified hits' )
-    parser.add_option( '-S', '--maxInsertSize', dest='maxInsertSize', help='Maximum insert size for a read pair to be considered mapped good' )
-    parser.add_option( '-P', '--maxOccurPairing', dest='maxOccurPairing', help='Maximum occurrences of a read for pairings' )
-    parser.add_option( '-D', '--dbkey', dest='dbkey', help='Dbkey for reference genome' )
-    parser.add_option( '-H', '--suppressHeader', dest='suppressHeader', help='Suppress header' )
-    # set up aligning and generate aligning command options
-    # added october 20 ross lazarus for patch from Broad to allow bwa to add read group annotation
-    parser.add_option( '--rgID', dest='rgID', help='Read Group ID',default='' )
-    parser.add_option( '--rgSM', dest='rgSM', help='Read Group Sample',default='' )
-    parser.add_option( '--rgLB', dest='rgLB', help='Read Group Library' )
-    parser.add_option( '--rgPL', dest='rgPL', help='Read Group Platform' )
-    parser.add_option( '--title', dest='title', help='Job title' )
-
-    (options, args) = parser.parse_args()
-    # make temp directory for placement of indices
-    tmp_index_dir = tempfile.mkdtemp()
-    tmp_dir = tempfile.mkdtemp()
-    # index if necessary
-    if options.fileSource == 'history':
-        ref_file = tempfile.NamedTemporaryFile( dir=tmp_index_dir )
-        ref_file_name = ref_file.name
-        ref_file.close()
-        os.symlink( options.ref, ref_file_name )
-        # determine which indexing algorithm to use, based on size
-        try:
-            size = os.stat( options.ref ).st_size
-            if size <= 2**30: 
-                indexingAlg = 'is'
-            else:
-                indexingAlg = 'bwtsw'
-        except:
-            indexingAlg = 'is'
-        indexing_cmds = '-a %s' % indexingAlg
-        cmd1 = 'bwa index %s %s' % ( indexing_cmds, ref_file_name )
-        try:
-            tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name
-            tmp_stderr = open( tmp, 'wb' )
-            proc = subprocess.Popen( args=cmd1, shell=True, cwd=tmp_index_dir, stderr=tmp_stderr.fileno() )
-            returncode = proc.wait()
-            tmp_stderr.close()
-            # get stderr, allowing for case where it's very large
-            tmp_stderr = open( tmp, 'rb' )
-            stderr = ''
-            buffsize = 1048576
-            try:
-                while True:
-                    stderr += tmp_stderr.read( buffsize )
-                    if not stderr or len( stderr ) % buffsize != 0:
-                        break
-            except OverflowError:
-                pass
-            tmp_stderr.close()
-            if returncode != 0:
-                raise Exception, stderr
-        except Exception, e:
-            # clean up temp dirs
-            if os.path.exists( tmp_index_dir ):
-                shutil.rmtree( tmp_index_dir )
-            if os.path.exists( tmp_dir ):
-                shutil.rmtree( tmp_dir )
-            stop_err( 'Error indexing reference sequence. ' + str( e ) )
-    else:
-        ref_file_name = options.ref
-    # set up aligning and generate aligning command options
-    # added october 20 ross lazarus for patch from Broad to allow bwa to add read group annotation
-    read_group_cmds = ''
-    if options.rgLB > '':
-        read_group_cmds = '-l %s -p %s' % (options.rgLB, options.rgPL)
-    if options.rgID > '':
-        read_group_cmds = '-i %s -m %s %s' % (options.rgID, options.rgSM, read_group_cmds)
-    if options.params == 'pre_set':
-        aligning_cmds = '-t %s' % options.threads
-        gen_alignment_cmds = ''
-    else:
-        if options.maxEditDist != '0':
-            editDist = options.maxEditDist
-        else:
-            editDist = options.fracMissingAligns
-        if options.seed != '-1':
-            seed = '-l %s' % options.seed
-        else:
-            seed = ''
-        if options.suboptAlign == 'true':
-            suboptAlign = '-R'
-        else:
-            suboptAlign = ''
-        if options.noIterSearch == 'true':
-            noIterSearch = '-N'
-        else:
-            noIterSearch = ''
-        aligning_cmds = '-n %s -o %s -e %s -d %s -i %s %s -k %s -t %s -M %s -O %s -E %s %s %s' % \
-                        ( editDist, options.maxGapOpens, options.maxGapExtens, options.disallowLongDel,
-                          options.disallowIndel, seed, options.maxEditDistSeed, options.threads,
-                          options.mismatchPenalty, options.gapOpenPenalty, options.gapExtensPenalty,
-                          suboptAlign, noIterSearch )
-        if options.genAlignType == 'single':
-            gen_alignment_cmds = '-n %s' % options.outputTopN
-        elif options.genAlignType == 'paired':
-            gen_alignment_cmds = '-a %s -o %s' % ( options.maxInsertSize, options.maxOccurPairing )
-    # set up output files
-    tmp_align_out = tempfile.NamedTemporaryFile( dir=tmp_dir )
-    tmp_align_out_name = tmp_align_out.name
-    tmp_align_out.close()
-    tmp_align_out2 = tempfile.NamedTemporaryFile( dir=tmp_dir )
-    tmp_align_out2_name = tmp_align_out2.name 
-    tmp_align_out2.close()
-    # prepare actual aligning and generate aligning commands
-    cmd2 = 'bwa aln %s %s %s > %s' % ( aligning_cmds, ref_file_name, options.fastq, tmp_align_out_name )
-    cmd2b = ''
-    if options.genAlignType == 'paired':
-        cmd2b = 'bwa aln %s %s %s > %s' % ( aligning_cmds, ref_file_name, options.rfastq, tmp_align_out2_name )
-        cmd3 = 'bwa sampe %s %s %s %s %s %s %s >> %s' % ( gen_alignment_cmds, read_group_cmds, ref_file_name, tmp_align_out_name, tmp_align_out2_name, options.fastq, options.rfastq, options.output )
-    else:
-        cmd3 = 'bwa samse %s %s %s %s %s >> %s' % ( gen_alignment_cmds, read_group_cmds, ref_file_name, tmp_align_out_name, options.fastq, options.output )
-    # perform alignments
-    buffsize = 1048576
-    try:
-        # need to nest try-except in try-finally to handle 2.4
-        try:
-            # align
-            try:
-                tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
-                tmp_stderr = open( tmp, 'wb' )
-                proc = subprocess.Popen( args=cmd2, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
-                returncode = proc.wait()
-                tmp_stderr.close()
-                # get stderr, allowing for case where it's very large
-                tmp_stderr = open( tmp, 'rb' )
-                stderr = ''
-                try:
-                    while True:
-                        stderr += tmp_stderr.read( buffsize )
-                        if not stderr or len( stderr ) % buffsize != 0:
-                            break
-                except OverflowError:
-                    pass
-                tmp_stderr.close()
-                if returncode != 0:
-                    raise Exception, stderr
-            except Exception, e:
-                raise Exception, 'Error aligning sequence. ' + str( e )
-            # and again if paired data
-            try:
-                if cmd2b:
-                    tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
-                    tmp_stderr = open( tmp, 'wb' )
-                    proc = subprocess.Popen( args=cmd2b, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
-                    returncode = proc.wait()
-                    tmp_stderr.close()
-                    # get stderr, allowing for case where it's very large
-                    tmp_stderr = open( tmp, 'rb' )
-                    stderr = ''
-                    try:
-                        while True:
-                            stderr += tmp_stderr.read( buffsize )
-                            if not stderr or len( stderr ) % buffsize != 0:
-                                break
-                    except OverflowError:
-                        pass
-                    tmp_stderr.close()
-                    if returncode != 0:
-                        raise Exception, stderr
-            except Exception, e:
-                raise Exception, 'Error aligning second sequence. ' + str( e )
-            # generate align
-            try:
-                tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
-                tmp_stderr = open( tmp, 'wb' )
-                proc = subprocess.Popen( args=cmd3, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
-                returncode = proc.wait()
-                tmp_stderr.close()
-                # get stderr, allowing for case where it's very large
-                tmp_stderr = open( tmp, 'rb' )
-                stderr = ''
-                try:
-                    while True:
-                        stderr += tmp_stderr.read( buffsize )
-                        if not stderr or len( stderr ) % buffsize != 0:
-                            break
-                except OverflowError:
-                    pass
-                tmp_stderr.close()
-                if returncode != 0:
-                    raise Exception, stderr
-            except Exception, e:
-                raise Exception, 'Error generating alignments. ' + str( e ) 
-            # remove header if necessary
-            if options.suppressHeader == 'true':
-                tmp_out = tempfile.NamedTemporaryFile( dir=tmp_dir)
-                tmp_out_name = tmp_out.name
-                tmp_out.close()
-                try:
-                    shutil.move( options.output, tmp_out_name )
-                except Exception, e:
-                    raise Exception, 'Error moving output file before removing headers. ' + str( e )
-                fout = file( options.output, 'w' )
-                for line in file( tmp_out.name, 'r' ):
-                    if not ( line.startswith( '@HD' ) or line.startswith( '@SQ' ) or line.startswith( '@RG' ) or line.startswith( '@PG' ) or line.startswith('@CO') ):
-                        fout.write( line )
-                fout.close()
-            # check that there are results in the output file
-            if os.path.getsize( options.output ) > 0:
-                sys.stdout.write( 'BWA run on %s-end data' % options.genAlignType )
-            else:
-                raise Exception, 'The output file is empty. You may simply have no matches, or there may be an error with your input file or settings.'
-        except Exception, e:
-            stop_err( 'The alignment failed.\n' + str( e ) )
-    finally:
-        # clean up temp dir
-        if os.path.exists( tmp_index_dir ):
-            shutil.rmtree( tmp_index_dir )
-        if os.path.exists( tmp_dir ):
-            shutil.rmtree( tmp_dir )
-
-if __name__=="__main__": __main__()
-

rgBWASampe.xml

-<tool id="channing_bwa" name="Map with BWA" version="1.0.4">
-  <description>with Read Group options</description>
-  <command interpreter="python">rgBWASampe.py
---threads="4"
-#if $genomeSource.refGenomeSource == "history":
---ref=$genomeSource.ownFile
-#else:
---ref=$genomeSource.indices
-#end if
---fastq=$paired.input1
-#if $paired.sPaired == "paired":
---rfastq=$paired.input2
-#else:
---rfastq="None"
-#end if
---rgID "$params.rgID" --rgSM "$params.rgSM" --rgLB "$params.rgLB" --rgPL "$params.rgPL" --title "$params.title"
---output=$output --genAlignType=$paired.sPaired --params=$params.source_select --fileSource=$genomeSource.refGenomeSource
-#if $params.source_select == "pre_set":
---maxEditDist="None" --fracMissingAligns="None" --maxGapOpens="None" --maxGapExtens="None" --disallowLongDel="None" --disallowIndel="None" --seed="None" --maxEditDistSeed="None" --mismatchPenalty="None" --gapOpenPenalty="None" --gapExtensPenalty="None" --suboptAlign="None" --noIterSearch="None" --outputTopN="None" --maxInsertSize="None" --maxOccurPairing="None"
-#else:
---maxEditDist=$params.maxEditDist --fracMissingAligns=$params.fracMissingAligns --maxGapOpens=$params.maxGapOpens --maxGapExtens=$params.maxGapExtens --disallowLongDel=$params.disallowLongDel --disallowIndel=$params.disallowIndel --seed=$params.seed --maxEditDistSeed=$params.maxEditDistSeed --mismatchPenalty=$params.mismatchPenalty --gapOpenPenalty=$params.gapOpenPenalty --gapExtensPenalty=$params.gapExtensPenalty --suboptAlign=$params.suboptAlign --noIterSearch=$params.noIterSearch --outputTopN=$params.outputTopN --maxInsertSize=$params.maxInsertSize --maxOccurPairing=$params.maxOccurPairing
-#end if
-#if $genomeSource.refGenomeSource == "history":
---dbkey=$dbkey
-#else:
---dbkey="None"
-#end if
---suppressHeader=$suppressHeader
-  </command>
-  <requirements>
-    <requirement type='package'>bwa</requirement>
-  </requirements>
-  <inputs>
-    <conditional name="genomeSource">
-      <param name="refGenomeSource" type="select" label="Will you select a reference genome from your history or use a built-in index?">
-        <option value="indexed">Use a built-in index</option>
-        <option value="history">Use one from the history</option>
-      </param>
-      <when value="indexed">
-        <param name="indices" type="select" label="Select a reference genome">
-          <options from_data_table="bwa_indexes"/>
-          <!--
-          <options from_file="bwa_index.loc">
-            <column name="value" index="1" />
-            <column name="name" index="0" />
-          </options>
-          -->
-        </param>
-      </when>
-      <when value="history">
-        <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select a reference from history" />
-      </when>
-    </conditional>
-    <conditional name="paired">
-      <param name="sPaired" type="select" label="Is this library mate-paired?">
-        <option value="single">Single-end</option>
-        <option value="paired">Paired-end</option>
-      </param>
-      <when value="single">
-        <param name="input1" type="data" format="fastqsanger" label="FASTQ file" help="Must have Sanger-scaled quality values with ASCII offset 33"/>
-      </when>
-      <when value="paired">
-        <param name="input1" type="data" format="fastqsanger" label="Forward FASTQ file" help="Must have Sanger-scaled quality values with ASCII offset 33"/>
-        <param name="input2" type="data" format="fastqsanger" label="Reverse FASTQ file" help="Must have Sanger-scaled quality values with ASCII offset 33"/>
-      </when>
-    </conditional>
-    <conditional name="params">
-      <param name="source_select" type="select" label="BWA settings to use" help="For most mapping needs use Commonly Used settings. If you want full control use Full Parameter List">
-        <option value="pre_set">Commonly Used</option>
-        <option value="full">Full Parameter List</option>
-      </param>
-      <when value="pre_set">
-        <param name="title" size="200" type="text" value="BWA map job notes go here" label="Job name - put things here to remind you what you did " 
-          help="Brief note of whatever will help you remember what this was for?" />
-        <param name="rgID" size="200" type="text" value="" label="Read Group ID" help="Optional: Leave blank to ignore. Needed to add sample read groups" />
-        <param name="rgSM" size="200" type="text" value="" label="Read Group Sample" help="If Read Group ID supplied only: Add read group parameter" />
-        <param name="rgLB" size="200" type="text" value="" label="Read Group Library" help="Optional: Leave blank to ignore. Add read group parameter" />
-        <param name="rgPL" size="200" type="text" value="" label="Read Group Platform" help="Add read group parameter"/>
-      </when>
-      <when value="full">
-        <param name="title" size="200" type="text" value="BWA map job notes go here" label="Job name - put things here to remind you what you did " 
-          help="Brief note of whatever will help you remember what this was for?" />
-        <param name="rgID" size="200" type="text" value="" label="Read Group ID" help="Optional: Leave blank to ignore. Needed to add sample read groups" />
-        <param name="rgSM" size="200" type="text" value="" label="Read Group Sample" help="If Read Group ID supplied only: Add read group parameter" />
-        <param name="rgLB" size="200" type="text" value="" label="Read Group Library" help="Optional: Leave blank to ignore. Add read group parameter" />
-        <param name="rgPL" size="200" type="text" value="" label="Read Group Platform" help="Add read group parameter"/>
-        <param name="maxEditDist" type="integer" value="0" label="Maximum edit distance (-n)" help="Enter this value OR a fraction of missing alignments, not both" />
-        <param name="fracMissingAligns" type="float" value="0.04" label="Fraction of missing alignments given 2% uniform base error rate (-n)" help="Enter this value OR maximum edit distance, not both" />
-        <param name="maxGapOpens" type="integer" value="1" label="Maximum number of gap opens (-o)" />
-        <param name="maxGapExtens" type="integer" value="-1" label="Maximum number of gap extensions (-e)" help="-1 for k-difference mode (disallowing long gaps)" />
-        <param name="disallowLongDel" type="integer" value="16" label="Disallow long deletion within [value] bp towards the 3'-end (-d)" />
-        <param name="disallowIndel" type="integer" value="5" label="Disallow insertion/deletion within [value] bp towards the end (-i)" />
-        <param name="seed" type="integer" value="-1" label="Number of first subsequences to take as seed (-l)" help="Enter -1 for infinity" />
-        <param name="maxEditDistSeed" type="integer" value="2" label="Maximum edit distance in the seed (-k)" />
-        <param name="mismatchPenalty" type="integer" value="3" label="Mismatch penalty (-M)" help="BWA will not search for suboptimal hits with a score lower than [value]" />
-        <param name="gapOpenPenalty" type="integer" value="11" label="Gap open penalty (-O)" />
-        <param name="gapExtensPenalty" type="integer" value="4" label="Gap extension penalty (-E)" />
-        <param name="suboptAlign" type="boolean" truevalue="true" falsevalue="false" checked="no" label="Proceed with suboptimal alignments even if the top hit is a repeat" help="By default, BWA only searches for suboptimal alignments if the top hit is unique. Using this option has no effect on accuracy for single-end reads. It is mainly designed for improving the alignment accuracy of paired-end reads. However, the pairing procedure will be slowed down, especially for very short reads (~32bp) (-R)" />
-        <param name="noIterSearch" type="boolean" truevalue="true" falsevalue="false" checked="no" label="Disable iterative search" help="All hits with no more than maxDiff differences will be found. This mode is much slower than the default (-N)" />
-        <param name="outputTopN" type="integer" value="-1" label="Output top [value] hits" help="For single-end reads only. Enter -1 to disable outputting multiple hits. NOTE: If you put in a positive value here, your output will NOT be in SAM format (-n)" />
-        <param name="maxInsertSize" type="integer" value="500" label="Maximum insert size for a read pair to be considered as being mapped properly" help="For paired-end reads only. Only used when there are not enough good alignments to infer the distribution of insert sizes (-a)" />
-        <param name="maxOccurPairing" type="integer" value="100000" label="Maximum occurrences of a read for pairing" help="For paired-end reads only. A read with more occurrences will be treated as a single-end read. Reducing this parameter helps faster pairing (-o)" />
-      </when>
-    </conditional>
-    <param name="suppressHeader" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Suppress the header in the output SAM file" 
-    help="Headers contain metadata. You probably want to preserve your metadata, so leave this unset" />
-  </inputs>
-  <outputs>
-    <data format="sam" name="output" label="${params.title}">
-      <actions>
-        <conditional name="genomeSource.refGenomeSource">
-          <when value="indexed">
-            <action type="metadata" name="dbkey">
-              <option type="from_data_table" name="bwa_indexes" column="0">
-                <filter type="param_value" ref="genomeSource.indices" column="1"/>
-              </option>
-            </action>
-          </when>
-        </conditional>
-      </actions>
-    </data>
-  </outputs>
-  <tests>
-    <test>
-      <!--
-      BWA commands:
-      bwa aln -t 4 phiX test-data/bwa_wrapper_in1.fastq > bwa_wrapper_out1.sai
-      bwa samse phiX bwa_wrapper_out1.sai test-data/bwa_wrapper_in1.fastq >> bwa_wrapper_out1.sam
-      phiX.fa is the prefix for the reference files (phiX.fa.amb, phiX.fa.ann, phiX.fa.bwt, ...)
-      remove the comment lines (beginning with '@') from the resulting sam file
-      -->
-      <param name="refGenomeSource" value="indexed" />
-      <param name="indices" value="phiX" />
-      <param name="sPaired" value="single" />
-      <param name="input1" value="bwa_wrapper_in1.fastq" ftype="fastqsanger" />
-      <param name="source_select" value="pre_set" />
-      <param name="suppressHeader" value="true" />
-      <output name="output" file="bwa_wrapper_out1.sam" ftype="sam" sort="True" />
-    </test>
-    <test>
-      <!--
-      BWA commands:
-      cp test-data/phiX.fasta phiX.fasta
-      bwa index -a is phiX.fasta
-      bwa aln -n 0.04 -o 1 -e -1 -d 16 -i 5 -k 2 -t 4 -M 3 -O 11 -E 4 -R -N phiX.fasta test-data/bwa_wrapper_in1.fastq > bwa_wrapper_out2.sai
-      bwa samse phiX.fasta bwa_wrapper_out2.sai test-data/bwa_wrapper_in1.fastq > bwa_wrapper_out2.sam
-      phiX.fa is the prefix for the reference files (phiX.fa.amb, phiX.fa.ann, phiX.fa.bwt, ...)
-      remove the comment lines (beginning with '@') from the resulting sam file
-      -->
-      <param name="refGenomeSource" value="history" />
-      <param name="ownFile" value="phiX.fasta" />
-      <param name="sPaired" value="single" />
-      <param name="input1" value="bwa_wrapper_in1.fastq" ftype="fastqsanger" />
-      <param name="source_select" value="full" />
-      <param name="maxEditDist" value="0" />
-      <param name="fracMissingAligns" value="0.04" />
-      <param name="maxGapOpens" value="1" />
-      <param name="maxGapExtens" value="-1" />
-      <param name="disallowLongDel" value="16" />
-      <param name="disallowIndel" value="5" />
-      <param name="seed" value="-1" />
-      <param name="maxEditDistSeed" value="2" />
-      <param name="mismatchPenalty" value="3" />
-      <param name="gapOpenPenalty" value="11" />
-      <param name="gapExtensPenalty" value="4" />
-      <param name="suboptAlign" value="true" />
-      <param name="noIterSearch" value="true" />
-      <param name="outputTopN" value="-1" />
-      <param name="maxInsertSize" value="500" />
-      <param name="maxOccurPairing" value="100000" />
-      <param name="suppressHeader" value="true" />
-      <output name="output" file="bwa_wrapper_out2.sam" ftype="sam" sort="True" />
-    </test>
-    <test>
-      <!--
-      BWA commands:
-      bwa aln -n 0.04 -o 1 -e -1 -d 16 -i 5 -k 2 -t 4 -M 3 -O 11 -E 4 -R -N phiX.fa test-data/bwa_wrapper_in2.fastq > bwa_wrapper_out3a.sai
-      bwa aln -n 0.04 -o 1 -e -1 -d 16 -i 5 -k 2 -t 4 -M 3 -O 11 -E 4 -R -N phiX.fa test-data/bwa_wrapper_in3.fastq > bwa_wrapper_out3b.sai
-      bwa sampe -a 500 -o 100000 phiX.fasta bwa_wrapper_out3a.sai bwa_wrapper_out3b.sai test-data/bwa_wrapper_in2.fastq test-data/bwa_wrapper_in3.fastq > bwa_wrapper_out3.sam
-      phiX.fa is the prefix for the reference
-      remove the comment lines (beginning with '@') from the resulting sam file
-      -->
-      <param name="refGenomeSource" value="indexed" />
-      <param name="indices" value="phiX" />
-      <param name="sPaired" value="paired" />
-      <param name="input1" value="bwa_wrapper_in2.fastq" ftype="fastqsanger" />
-      <param name="input2" value="bwa_wrapper_in3.fastq" ftype="fastqsanger" />
-      <param name="source_select" value="full" />
-      <param name="maxEditDist" value="0" />
-      <param name="fracMissingAligns" value="0.04" />
-      <param name="maxGapOpens" value="1" />
-      <param name="maxGapExtens" value="-1" />
-      <param name="disallowLongDel" value="16" />
-      <param name="disallowIndel" value="5" />
-      <param name="seed" value="-1" />
-      <param name="maxEditDistSeed" value="2" />
-      <param name="mismatchPenalty" value="3" />
-      <param name="gapOpenPenalty" value="11" />
-      <param name="gapExtensPenalty" value="4" />
-      <param name="suboptAlign" value="true" />
-      <param name="noIterSearch" value="true" />
-      <param name="outputTopN" value="-1" />
-      <param name="maxInsertSize" value="500" />
-      <param name="maxOccurPairing" value="100000" />
-      <param name="suppressHeader" value="true" />
-      <output name="output" file="bwa_wrapper_out3.sam" ftype="sam" sort="True" />
-    </test>
-  </tests>
-  <help>
-
-**What it does**
-
-BWA is a fast light-weighted tool that aligns relatively short sequences (queries) to a sequence database (large), such as the human reference genome. It is developed by Heng Li at the Sanger Insitute. Li H. and Durbin R. (2009) Fast and accurate short read alignment with Burrows-Wheeler transform. Bioinformatics, 25, 1754-60.
-
-------
-
-**Know what you are doing**
-
-.. class:: warningmark
-
-There is no such thing (yet) as an automated gearshift in short read mapping. It is all like stick-shift driving in San Francisco. In other words = running this tool with default parameters will probably not give you meaningful results. A way to deal with this is to **understand** the parameters by carefully reading the `documentation`__ and experimenting. Fortunately, Galaxy makes experimenting easy.
-
- .. __: http://bio-bwa.sourceforge.net/
-
-------
-
-**Input formats**
-
-BWA accepts files in Sanger FASTQ format. Use the FASTQ Groomer to prepare your files.
-
-------
-
-**Outputs**
-
-The output is in SAM format, and has the following columns::
-
-    Column  Description
-  --------  --------------------------------------------------------
-  1  QNAME  Query (pair) NAME
-  2  FLAG   bitwise FLAG
-  3  RNAME  Reference sequence NAME
-  4  POS    1-based leftmost POSition/coordinate of clipped sequence
-  5  MAPQ   MAPping Quality (Phred-scaled)
-  6  CIGAR  extended CIGAR string
-  7  MRNM   Mate Reference sequence NaMe ('=' if same as RNAME)
-  8  MPOS   1-based Mate POSition
-  9  ISIZE  Inferred insert SIZE
-  10 SEQ    query SEQuence on the same strand as the reference
-  11 QUAL   query QUALity (ASCII-33 gives the Phred base quality)
-  12 OPT    variable OPTional fields in the format TAG:VTYPE:VALU
-
-The flags are as follows::
-
-    Flag  Description
-  ------  -------------------------------------
-  0x0001  the read is paired in sequencing
-  0x0002  the read is mapped in a proper pair
-  0x0004  the query sequence itself is unmapped
-  0x0008  the mate is unmapped
-  0x0010  strand of the query (1 for reverse)
-  0x0020  strand of the mate
-  0x0040  the read is the first read in a pair
-  0x0080  the read is the second read in a pair
-  0x0100  the alignment is not primary
-
-It looks like this (scroll sideways to see the entire example)::
-
-  QNAME	FLAG	RNAME	POS	MAPQ	CIAGR	MRNM	MPOS	ISIZE	SEQ	QUAL	OPT
-  HWI-EAS91_1_30788AAXX:1:1:1761:343	4	*	0	0	*	*	0	0	AAAAAAANNAAAAAAAAAAAAAAAAAAAAAAAAAAACNNANNGAGTNGNNNNNNNGCTTCCCACAGNNCTGG	hhhhhhh;;hhhhhhhhhhh^hOhhhhghhhfhhhgh;;h;;hhhh;h;;;;;;;hhhhhhghhhh;;Phhh
-  HWI-EAS91_1_30788AAXX:1:1:1578:331	4	*	0	0	*	*	0	0	GTATAGANNAATAAGAAAAAAAAAAATGAAGACTTTCNNANNTCTGNANNNNNNNTCTTTTTTCAGNNGTAG	hhhhhhh;;hhhhhhhhhhhhhhhhhhhhhhhhhhhh;;h;;hhhh;h;;;;;;;hhhhhhhhhhh;;hhVh
-
--------
-
-**BWA settings**
-
-All of the options have a default value. You can change any of them. All of the options in BWA have been implemented here.
-
-------
-
-**BWA parameter list**
-
-This is an exhaustive list of BWA options:
-
-For **aln**::
-
-  -n NUM  Maximum edit distance if the value is INT, or the fraction of missing
-          alignments given 2% uniform base error rate if FLOAT. In the latter
-          case, the maximum edit distance is automatically chosen for different
-          read lengths. [0.04]
-  -o INT  Maximum number of gap opens [1]
-  -e INT  Maximum number of gap extensions, -1 for k-difference mode
-          (disallowing long gaps) [-1]
-  -d INT  Disallow a long deletion within INT bp towards the 3'-end [16]
-  -i INT  Disallow an indel within INT bp towards the ends [5]
-  -l INT  Take the first INT subsequence as seed. If INT is larger than the
-          query sequence, seeding will be disabled. For long reads, this option
-          is typically ranged from 25 to 35 for '-k 2'. [inf]
-  -k INT  Maximum edit distance in the seed [2]
-  -t INT  Number of threads (multi-threading mode) [1]
-  -M INT  Mismatch penalty. BWA will not search for suboptimal hits with a score
-          lower than (bestScore-misMsc). [3]
-  -O INT  Gap open penalty [11]
-  -E INT  Gap extension penalty [4]
-  -c      Reverse query but not complement it, which is required for alignment
-          in the color space.
-  -R      Proceed with suboptimal alignments even if the top hit is a repeat. By
-          default, BWA only searches for suboptimal alignments if the top hit is
-          unique. Using this option has no effect on accuracy for single-end
-          reads. It is mainly designed for improving the alignment accuracy of
-          paired-end reads. However, the pairing procedure will be slowed down,
-          especially for very short reads (~32bp).
-  -N      Disable iterative search. All hits with no more than maxDiff
-          differences will be found. This mode is much slower than the default.
-
-For **samse**::
-
-  -n INT  Output up to INT top hits. Value -1 to disable outputting multiple
-          hits. NOTE: Entering a value other than -1 will result in output that
-          is not in SAM format, and therefore not usable further down the
-          pipeline. Check the BWA documentation for details on the format of
-          the output. [-1]
-
-For **sampe**::
-
-  -a INT  Maximum insert size for a read pair to be considered as being mapped
-          properly. Since version 0.4.5, this option is only used when there
-          are not enough good alignment to infer the distribution of insert
-          sizes. [500]
-  -o INT  Maximum occurrences of a read for pairing. A read with more
-          occurrences will be treated as a single-end read. Reducing this
-          parameter helps faster pairing. [100000]
-
-  </help>
-</tool>
-
-

rgCaCo.py

-#!/usr/local/bin/python
-# hack to run and process a plink case control association
-# expects args as  
-# bfilepath outname jobname outformat (wig,xls)
-# ross lazarus 
-# for wig files, we need annotation so look for map file or complain
-"""
-Parameters for wiggle track definition lines
-All options are placed in a single line separated by spaces:
-
-  track type=wiggle_0 name=track_label description=center_label \
-        visibility=display_mode color=r,g,b altColor=r,g,b \
-        priority=priority autoScale=on|off \
-        gridDefault=on|off maxHeightPixels=max:default:min \
-        graphType=bar|points viewLimits=lower:upper \
-        yLineMark=real-value yLineOnOff=on|off \
-        windowingFunction=maximum|mean|minimum smoothingWindow=off|2-16
-"""
-
-import sys,math,shutil,subprocess,os,time,tempfile,string
-from os.path import abspath
-from rgutils import timenow, plinke
-
-imagedir = '/static/rg' # if needed for images
-myversion = 'V000.1 April 2007'
-verbose = False
-
-def makeGFF(resf='',outfname='',logf=None,twd='.',name='track name',description='track description',topn=1000):
-    """
-    score must be scaled to 0-1000
-    
-    Want to make some wig tracks from each analysis
-    Best n -log10(p). Make top hit the window.
-    we use our tab output which has
-    rs	chrom	offset	ADD_stat	ADD_p	ADD_log10p
-    rs3094315	1	792429	1.151	0.2528	0.597223
-
-    """
-
-    def is_number(s):
-        try:
-            float(s)
-            return True
-        except ValueError:
-            return False
-    header = 'track name=%s description="%s" visibility=2 useScore=1 color=0,60,120\n' % (name,description)          
-    column_names = [ 'Seqname', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Group' ]
-    halfwidth=100
-    resfpath = os.path.join(twd,resf)
-    resf = open(resfpath,'r')
-    resfl = resf.readlines() # dumb but convenient for millions of rows
-    resfl = [x.split() for x in resfl]
-    headl = resfl[0]
-    resfl = resfl[1:]
-    headl = [x.strip().upper() for x in headl]
-    headIndex = dict(zip(headl,range(0,len(headl))))
-    chrpos = headIndex.get('CHR',None)
-    rspos = headIndex.get('RS',None)
-    offspos = headIndex.get('OFFSET',None)
-    ppos = headIndex.get('LOG10ARMITAGEP',None)