Commits

Miha Stajdohar committed adc5783

Created Python package.

Comments (0)

Files changed (50)

+syntax: glob
+
+# Ignore dot files.
+.*
+
+# Ignore backup files
+*.orig
+
+# Ignore temporary editor files.
+*.swp
+*~
+
+# Ignore binaries.
+*.py[cod]
+*.so
+*.dll
+
+
+# Ignore files created by setup.py.
+_build
+build
+dist
+MANIFEST
+*.egg-info
+*.egg
+distribute*.tar.gz
+
+# Built documentation.
+docs/*/html
+include *.txt
+recursive-include docs *.txt
+BIOx
+====
+
+A Python library which `PIPA <http://pipa.biolab.si>`_ uses to perform bioinformatics analysis.
+
+Installation
+------------
+
+To install, run::
+
+    python setup.py install
+
+Installation for Developers
+---------------------------
+
+To install in `development mode`_, run::
+
+    python setup.py develop
+
+.. _development mode: http://packages.python.org/distribute/setuptools.html#development-mode
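
A minimal usage sketch after installation (hypothetical FASTA file name; assumes ``biox/config.py`` has been created from ``config_example.py``)::

    import biox

    f = biox.data.Fasta("genome.fasta")   # hypothetical input file
    while f.read():
        print f.id, len(f.sequence)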

__init__.py

-"""
-Biox
-====
-
-Biox is a set of python modules, designed to ease next-generation sequencing data analysis.
-"""
-
-import biox
-import data
-import map
-import utils
-import expression
-from config import *
-import math
-
-def gff3_from_fasta(fasta_file):
-    f = biox.data.Fasta(fasta_file)
-    while f.read():
-        row = [f.id, "biox", "chromosome", "1", len(f.sequence), ".", ".", ".", "ID=%s;Name=%s" % (f.id, f.id)]
-        print "\t".join(str(x) for x in row)
+"""
+Biox
+====
+
+Biox is a set of python modules, designed to ease next-generation sequencing data analysis.
+"""
+
+import biox
+import data
+import map
+import utils
+import expression
+from config import *
+import math
+
+def gff3_from_fasta(fasta_file):
+    f = biox.data.Fasta(fasta_file)
+    while f.read():
+        row = [f.id, "biox", "chromosome", "1", len(f.sequence), ".", ".", ".", "ID=%s;Name=%s" % (f.id, f.id)]
+        print "\t".join(str(x) for x in row)
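
A quick sketch of how ``gff3_from_fasta`` might be called (hypothetical file name); it prints one chromosome-level GFF3 row per FASTA record to stdout::

    import biox

    biox.gff3_from_fasta("genome.fasta")   # hypothetical FASTA file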

biox/config_example.py

+bowtie_folder = ""
+bowtie_index_folder = ""
+samtools_folder = ""
+os_shell = ""

biox/data/Bam.py

+import biox
+from os.path import join as pjoin
+
+class Bam():
+    
+    def __init__(self, filename):
+        self.samtools_exec = pjoin(biox.samtools_folder, "samtools")
+        self.filename = filename
+        
+    def get_coverage(self, chr, strand, start, stop):
+        raw_count = 0
+        command = "{samtools_exec} view -F 4 -q 30 {filename} {chr}:{start}-{stop}".format(samtools_exec = self.samtools_exec, filename = self.filename, chr = chr, start = start, stop = stop)
+        output, err = biox.utils.cmd(command)
+        output = output.split("\n")
+        for line in output:
+            line = line.split("\t")
+            if len(line)>3:
+                if int(line[3])<=start and int(line[7])<=stop:
+                    raw_count += 1
+        return raw_count
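
A usage sketch for ``Bam.get_coverage`` (hypothetical BAM file and coordinates; assumes ``samtools_folder`` points at a samtools installation). It counts the alignments samtools returns for the region, filtered at mapping quality 30::

    import biox

    bam = biox.data.Bam("sample.bam")                  # hypothetical, indexed BAM file
    print bam.get_coverage("chr1", "+", 1000, 2000)    # read count in chr1:1000-2000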

biox/data/Bedgraph.py

+import gzip
+
+class Bedgraph():
+    def __init__(self, filename1=None, min_value=None, only_positions=None):
+        self.trackname = {"type":"bedGraph", "color": "120,101,172", "altColor" : "200,120,59", "maxHeightPixels":"100:50:0", "visibility":"full", "priority":"20"}
+        self.track_header = ""
+        self.data = {}
+        self.sites = 0
+        self.data_sum = 0
+        self.norm_factor = 1000000
+        self.strand_file = {"+":"", "-": "-"}
+        if filename1 != None:
+            self.load_from_file(filename1, min_value=min_value, only_positions = only_positions)
+            
+    def load_from_file(self, filename1, min_value=0, only_positions=None):
+        if filename1.endswith(".gz"):
+            f = gzip.open(filename1, "rb")
+        else:
+            f = open(filename1, "rb")
+        self.track_header = f.readline()
+        r = f.readline()
+        while r:
+            r = r.rstrip("\r").rstrip("\n").split("\t")
+            chr = r[0]
+            start = int(r[1])
+            stop = int(r[2])
+            cDNA = float(r[3])
+            if abs(cDNA) < min_value:
+                r = f.readline()
+                continue
+            strand = "+" if cDNA>=0 else "-"
+            self.data.setdefault(chr, {}).setdefault(strand, {})
+            for p in xrange(start, stop):
+                if only_positions!=None:
+                    if p not in only_positions.get("%s%s" % (strand, chr), set()):
+                        continue
+                self.data[chr][strand][p] = self.data[chr][strand].setdefault(p, 0) + abs(cDNA)
+            self.data_sum += abs(cDNA) * (stop-start)
+            r = f.readline()
+        f.close()
+        
+    def chromosomes(self):
+        return self.data.keys()
+        
+    def get_value(self, chr, strand, pos):
+        return self.data.get(chr, {}).get(strand, {}).get(pos, 0)
+
+    def peaks(self, flanking):
+        data_peaks = {}
+        self.data_sum = 0
+        for chr, chr_data in self.data.iteritems():
+            data_peaks[chr] = {}
+            for strand, strand_data in chr_data.iteritems():
+                data_peaks[chr][strand] = {}
+                pos_sorted = []
+                for (pos, val) in strand_data.iteritems():
+                    pos_sorted.append((val, pos))
+                pos_sorted.sort(reverse=True)
+                for (value, pos) in pos_sorted:
+                    if self.data[chr][strand].get(pos, 0)==0:
+                        continue
+                    sum_peak = 0
+                    pos_from = max(0, pos-flanking)
+                    pos_to = max(0, pos+flanking+1)
+                    for i in range(pos_from, pos_to):
+                        at_pos = self.get_value(chr, strand, i)
+                        sum_peak += self.get_value(chr, strand, i)
+                        if at_pos>0:
+                            self.data[chr][strand][i] = 0
+                    data_peaks[chr][strand][pos] = sum_peak
+                    self.data_sum += sum_peak
+        # replace original data with peak data
+        self.data = data_peaks
+
+    def region(self, chr, strand, pos_from, pos_to):
+        return sum([self.get_value(chr, strand, i) for i in xrange(pos_from, pos_to+1)])
+        
+    def get_region_max(self, chr, strand, pos_from, pos_to):
+        return max([self.get_value(chr, strand, i) for i in xrange(pos_from, pos_to+1)])
+        
+    def window_sum(self, chr, strand, pos, window_size=50):
+        sum_window = 0
+        pos_from = max(0, pos-int(window_size/2))
+        pos_to = max(0, pos+int(window_size/2)+1)
+        return sum([self.get_value(chr, strand, i) for i in range(pos_from, pos_to)])
+		
+    def window(self, window_size, smoothing=False):
+        data_window = {}
+        self.data_sum = 0
+        for chr, chr_data in self.data.iteritems():
+            print "window on ", chr
+            data_window[chr] = {}
+            for strand, strand_data in chr_data.iteritems():
+                data_window[chr][strand] = {}
+                for pos, value in strand_data.iteritems():
+                    position_range = [pos] if not smoothing else range(max(0, pos-window_size/2), max(0, pos+window_size/2+1))
+                    for pos_set in position_range:
+                        data_window[chr][strand][pos_set] = self.window_sum(chr, strand, pos_set, window_size=window_size)
+        # replace original data with windowed data
+        self.data = data_window		
+                    
+    def set_name(self, name):
+        self.trackname["name"] = name
+        
+    def normalize(self):
+        data_sum_new = 0
+        for chr, strand_data in self.data.iteritems():
+            for strand, position_data in strand_data.iteritems():
+                for position, value in position_data.iteritems():
+                    new_value = (value/float(self.data_sum)) * self.norm_factor
+                    self.data[chr][strand][position] = new_value
+                    data_sum_new += new_value
+                    
+    def addup(self, Bg):
+        for chr, strand_data in Bg.data.iteritems():
+            for strand, position_data in strand_data.iteritems():
+                for position, value_new in position_data.iteritems():
+                    value_current = self.data.setdefault(chr, {}).setdefault(strand, {}).setdefault(position, 0)
+                    value_new = value_current + value_new
+                    delta = value_new - value_current
+                    self.data[chr][strand][position] = value_new
+                    self.data_sum += delta                    
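
A usage sketch for ``Bedgraph`` (hypothetical track file): load a bedGraph, collapse the signal into peaks and normalize the values to ``norm_factor``::

    import biox

    bg = biox.data.Bedgraph("sample.bedGraph")   # hypothetical bedGraph track
    bg.peaks(flanking=25)                        # sum signal +/-25 nt around local maxima
    bg.normalize()                               # rescale so all values sum to norm_factor
    print bg.get_value("chr1", "+", 12345)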

biox/data/Fasta.py

+import biox
+import gzip
+import os
+
+class Fasta:
+
+    """
+    This class handles FASTA files.
+    """
+
+    def __init__(self, filename):
+        self.filename = filename
+        if os.path.exists(self.filename+".index"):
+            self.read_index()
+        else:
+            self.create_index()
+            self.read_index()
+        self.open_file()
+        
+    def open_file(self):
+        self.id = None
+        self.next_id = None
+        if self.filename.endswith(".gz"):
+            self.f = gzip.open(self.filename, "rt")
+        else:
+            self.f = open(self.filename, "rt")
+        
+    def read(self):
+        self.sequence = []
+        if self.next_id!=None:
+            self.id = self.next_id
+            self.next_id = None
+        r = self.f.readline()
+        if not r:
+            return False
+        while r:
+            r = r.replace("\r", "").replace("\n", "")
+            if r=="":
+                r = self.f.readline()
+                continue
+            if r[0]==">" and self.id==None:
+                self.id = r[1:]
+                r = self.f.readline()
+                continue
+            elif r[0]==">":
+                self.next_id = r[1:]
+                self.sequence = "".join(self.sequence)
+                return True
+            self.sequence.append(r)
+            r = self.f.readline()
+        self.sequence = "".join(self.sequence)
+        return True
+        
+    def create_index(self):
+        self.open_file()
+        f_linear = open(self.filename + ".linear", "wt")
+        f_index = open(self.filename + ".index", "wt")
+        while self.read():
+            f_linear.write(self.sequence)
+            f_index.write("%s\t%s\n" % (self.id, len(self.sequence)))
+        f_linear.close()
+        f_index.close()
+        
+    def read_index(self):
+        self.index = {}
+        f = open(self.filename+".index", "rt")
+        r = f.readline()
+        start = 0
+        while r:
+            r = r.replace("\r", "").replace("\n", "").split("\t")
+            self.index[r[0]] = start
+            start += int(r[1])
+            r = f.readline()
+
+    def get_seq(self, chr, start, stop, reverse_complement=False):
+        f = open(self.filename+".linear", "rb")
+        num_bytes = stop - start + 1
+        pos_from = self.index[chr] + start - 1
+        pos_to = pos_from + num_bytes
+        f.seek(pos_from)
+        seq = f.read(num_bytes)
+        f.close()
+        if reverse_complement:
+            seq = biox.data.Sequence.rev_comp(seq)
+        return seq
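
A usage sketch for ``Fasta`` (hypothetical file name); the ``.index`` and ``.linear`` helper files are created on first use and then reused for random access::

    import biox

    f = biox.data.Fasta("genome.fasta")      # hypothetical FASTA file
    while f.read():
        print f.id, len(f.sequence)
    print f.get_seq("chr1", 100, 150)        # 1-based, inclusive coordinates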

biox/data/Fastq.py

+"""
+Reads FASTQ files
+
+Example:
+
+f = Fastq(file_name)
+while f.read():
+    print f.id
+"""
+
+import gzip
+
+class Fastq:
+    def read_line(self):
+        return self.f.readline().rstrip("\r").rstrip("\n")
+        
+    def read(self):
+        self.id = self.read_line()
+        self.sequence = self.read_line()
+        self.plus = self.read_line() # +
+        self.quality = self.read_line()
+        self.uncut_sequence = self.sequence
+        self.uncut_quality = self.quality
+        self.records += 1
+        if self.id == "":
+            return False
+        if self.cut_bad:
+            qual = self.quality.rstrip("B")
+            if len(qual) != len(self.quality):
+                self.sequence = self.sequence[:len(qual)]
+                self.quality = qual
+        return True
+
+    def __init__(self, file_name, cut_bad=False):
+        self.cut_bad = cut_bad
+        if file_name.endswith(".gz"):
+            self.f = gzip.open(file_name)
+        else:
+            self.f = open(file_name, "rt")
+        self.id = ""
+        self.sequence = ""
+        self.plus = ""
+        self.quality = ""
+        self.records = 0

biox/data/Gene.py

+class Gene():
+
+    def __init__(self, id = None, chr = None, strand = None, attrs = None, name = None, alias = None):
+        self.id = id
+        self.name = name
+        self.chr = chr
+        self.strand = strand
+        self.attrs = attrs
+        self.alias = alias
+        self.features = []
+        self.start = 0
+        self.stop = 0
+    
+    def add_feature(self, feature_new):
+        if len(self.features)==0:
+            self.features.append(feature_new)
+            self.start = feature_new.start
+            self.stop = feature_new.stop
+            return True
+        insert_at = 0
+        for feature in self.features:
+            if feature.start>=feature_new.start:
+                break
+            insert_at += 1
+        self.start = min(feature_new.start, self.start)
+        self.stop = max(feature_new.stop, self.stop)
+        self.features.insert(insert_at, feature_new)

biox/data/GeneFeature.py

+class GeneFeature():
+
+    def __init__(self, start, stop, type, gene):
+        self.start = start
+        self.stop = stop
+        self.type = type
+        self.gene = gene

biox/data/Gff3.py

+import biox
+import os
+import sys
+
+class Gff3():
+
+    def __init__(self, filename):
+        fasta_part = 0
+        mRNA_part = 0
+        f = open(filename, "rt")
+        r = f.readline()
+        self.genes = {}
+        self.mRNA_genes = {}
+        l = 1
+        while r:
+          if r.startswith("##FASTA") or fasta_part==1:
+            r = f.readline()
+            fasta_part = 1
+            l+=1
+            continue
+          if r.startswith("#"):
+            r = f.readline()
+            l+=1
+            continue
+          r = r.rstrip("\r\n").split("\t")
+          if len(r)==1:
+            r = f.readline()
+            continue
+          if r[0]=="##gff-version   3":
+            fasta_part = 0
+            r = f.readline()
+            l+=1
+            continue
+          #print r, len(r), l, fasta_part
+          seqid = r[0]
+          source = r[1]
+          type = r[2]
+          start = r[3]
+          stop = r[4]
+          strand = r[6]
+          attributes = {}
+          chromosome = r[0]
+          for att in r[-1].split(";"):
+            att = att.split("=")
+            attributes[att[0]] = att[1]
+          if type=="gene":
+            mRNA_part = 0
+            self.genes[attributes["ID"]] = {'chromosome':chromosome, 'strand':strand, 'data':{}, 'attributes':attributes}
+          if type=="mRNA":
+            mRNA_part = 1
+            gene_id = attributes["Parent"]
+            mRNA_id = attributes["ID"]
+            gene_data = self.genes.get(gene_id)["data"]
+            gene_data[mRNA_id] = {'exons':[], 'CDS':[], 'attributes':attributes}
+            self.genes[gene_id]["data"] = gene_data
+            self.mRNA_genes[attributes["ID"]] = attributes["Parent"]
+          if type=="pseudogene" or type=="tRNA":
+            mRNA_part = 0
+          if mRNA_part==0:
+            r = f.readline()
+            continue
+          if type=="CDS":
+            gene_id = self.mRNA_genes[attributes["Parent"]]
+            mRNA_id = attributes["Parent"]
+            self.genes[gene_id]["data"][mRNA_id]["exons"].append((start, stop))  
+          r = f.readline()
+          l+=1
+    
+    def write_gtf(self, filename):
+        f = open(filename, "wt")
+        gene_names = self.genes.keys()
+        for gene_id, gene_data in self.genes.items():
+          gene_strand = gene_data["strand"]
+          gene_chromosome = gene_data["chromosome"]
+          transcripts = gene_data["data"]
+          for mRNA_id, mRNA_data in transcripts.items():
+            for (exon_start, exon_stop) in mRNA_data["exons"]:
+              f.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (gene_chromosome, "", "exon", exon_start, exon_stop, ".", gene_strand, ".", "gene_id \"%s\"; transcript_id \"%s\";" % (gene_id, mRNA_id)))
+          for mRNA_id, mRNA_data in transcripts.items():
+            for (CDS_start, CDS_stop) in mRNA_data["CDS"]:
+              f.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (gene_chromosome, "", "CDS", CDS_start, CDS_stop, ".", gene_strand, ".", "gene_id \"%s\"; transcript_id \"%s\";" % (gene_id, mRNA_id)))
+
+    def return_genes(self):
+        pass
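
A usage sketch for ``Gff3`` (hypothetical file names): parse a GFF3 annotation and write out its gene models in GTF form::

    import biox

    gff = biox.data.Gff3("annotation.gff3")   # hypothetical annotation file
    gff.write_gtf("annotation.gtf")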

biox/data/Gtf.py

+import biox
+import os
+import sys
+import cPickle
+
+cache_data = {}
+def cache_string(string):
+    if cache_data.get(string, None)==None:
+        cache_data[string] = string
+        return string
+    else:
+        return cache_data[string]
+
+class Gtf():
+
+    def __init__(self, filename):
+        self.bin_size = 1000
+        self.genes = {}
+        self.filename = filename
+        f = biox.data.TabReader(filename)
+        while f.readline():
+            chr = f.r[0]
+            gene_type = f.r[2]
+            start = int(f.r[3])
+            stop = int(f.r[4])
+            strand = f.r[6]
+            attrs = {}
+            temp = f.r[-1].split(";")
+            for att in temp:
+                att = att.replace("\"", "")
+                att = att.lstrip(" ")
+                att = att.split(" ")
+                attrs[att[0]] = " ".join(att[1:])
+            if attrs.get("gene_id", None)==None:
+                continue
+            gene = self.genes.get(attrs["gene_id"], biox.data.Gene(attrs["gene_id"], chr, strand, attrs=attrs))
+            feature = biox.data.GeneFeature(start, stop, gene_type, gene)
+            gene.add_feature(feature)
+            self.genes[gene.id] = gene
+        self.load_index()
+    
+    def return_genes(self):
+        return self.genes
+        
+    def load_index(self):
+        if not os.path.exists(self.filename+".pindex"):
+            pindex = {}
+            for gene_id, gene in self.genes.items():
+                pindex_chr = pindex.get(gene.chr, {})
+                for feature in gene.features:
+                    if feature.type!="exon":
+                        continue
+                    for pos in range(feature.start, feature.stop+1):
+                        bin = pos/self.bin_size
+                        GL = pindex_chr.get(bin, set())
+                        GL.add(cache_string(gene_id))
+                        pindex_chr[bin] = GL
+                pindex[gene.chr] = pindex_chr
+            cPickle.dump(pindex, open(self.filename+".pindex", "wb"), -1)
+            self.pindex = pindex
+        else:
+            self.pindex = cPickle.load(open(self.filename+".pindex", "rb"))
+    
+    def get_genes(self, chr, pos):
+        bin = pos/self.bin_size
+        candidate_genes = self.pindex.get(chr, {}).get(bin, [])
+        position_genes = set()
+        for gene_id in candidate_genes:
+            for feature in self.genes[gene_id].features:
+                if feature.type!="exon":
+                    continue
+                if feature.start<=pos<=feature.stop:
+                    position_genes.add(gene_id)
+        return position_genes
+    
+    def find_overlap(self, gene_source):
+        """
+        Return overlapping feature to given feature
+        """
+        overlapping_genes = []
+        for gene_id, gene in self.genes.iteritems():
+            if gene.strand != gene_source.strand or gene.chr != gene_source.chr:
+                continue
+            overlap = biox.utils.interval_overlap(gene.start, gene.stop, gene_source.start, gene_source.stop)
+            if overlap>0:
+                overlapping_genes.append((overlap, gene))
+        return overlapping_genes
+
+    def write_gff3(self, filename):
+        f = open(filename, "wt")
+        for gene_id, gene in self.genes.iteritems():
+            row = [gene.chr, "biox", "gene", gene.start, gene.stop, "", gene.strand, ".", "ID=%s;Name=%s" % (gene_id, gene_id)] # gene
+            f.write("\t".join(str(x) for x in row) + "\n")
+            row = [gene.chr, "biox", "mRNA", gene.start, gene.stop, "", gene.strand, ".", "ID=%s.t1;Parent=%s" % (gene_id, gene_id)] # mRNA
+            f.write("\t".join(str(x) for x in row) + "\n")
+            for exon_index, feature in enumerate(gene.features):
+                if feature.type not in ["exon", "CDS"]:
+                    continue
+                row = [gene.chr, "biox", "CDS", feature.start, feature.stop, "", gene.strand, ".", "ID=%s.t1.cds;Parent=%s.t1" % (gene_id, gene_id)] # CDS
+                f.write("\t".join(str(x) for x in row) + "\n")
+                row = [gene.chr, "biox", "exon", feature.start, feature.stop, "", gene.strand, ".", "ID=%s.t1.exon%s;Parent=%s.t1" % (gene_id, exon_index+1, gene_id)] # exon
+                f.write("\t".join(str(x) for x in row) + "\n")
+            f.write("\n")
+        f.close()
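
A usage sketch for ``Gtf`` (hypothetical file name): the exon index (``.pindex``) is pickled on first load, after which genes overlapping a position can be looked up::

    import biox

    gtf = biox.data.Gtf("annotation.gtf")    # hypothetical GTF annotation
    print gtf.get_genes("chr1", 123456)      # set of gene ids with an exon at this position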

biox/data/Sequence.py

+bc = {'A': 'T', 'C': 'G', 'T': 'A', 'G': 'C', 'N': 'N', 'a': 't', 'c': 'g', 't': 'a', 'g': 'c', 'n': 'n'}
+
+def rev_comp(seq):
+    seq = [bc[s] for s in seq]
+    return "".join(seq[::-1])
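
For instance, ``rev_comp`` (re-exported through ``biox.data``) reverse-complements a nucleotide string::

    import biox

    print biox.data.rev_comp("ACGTN")   # prints NACGT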

biox/data/Sissrs.py

+import gzip
+import biox
+
+class Sissrs():
+    def __init__(self, filename):
+        self.data = []
+        f = open(filename, "rt")
+        r = f.readline()
+        while not r.startswith("Chr\tcStart\tcEnd\tNumTags"):
+            r = f.readline()
+        r = f.readline()
+        r = f.readline()
+        while r:
+            if r.startswith("==========="):
+                r = f.readline()
+                continue
+            r = r.replace("\r", "").replace("\n", "").split("\t")
+            self.data.append(r)
+            r = f.readline()
+        f.close()
+        
+    def bedgraph(self, filename, p_value = None):
+        f = open(filename, "wt")
+        for r in self.data:
+            if len(r)>4 and p_value!=None:
+                if float(r[-1])>p_value:
+                    continue
+            f.write("%s\t%s\t%s\t%s\n" % (r[0], r[1], r[2], r[3]))
+        f.close()

biox/data/TabReader.py

+import gzip
+
+class TabReader():
+
+    """
+    This class reads text data (.tab separated).
+    """
+
+    def __init__(self, filename="", header_at = 0, fast = False):
+        self.f = None
+        self.header = None
+        self.fast = fast
+        if filename!="":
+            self.initialize(filename, header_at = header_at)
+            
+    def initialize(self, filename, header_at = 0):
+        if filename.endswith(".gz"):
+            self.f = gzip.open(filename)
+        else:
+            self.f = open(filename)
+        while header_at>0:
+            self.f.readline()
+            header_at -= 1
+        self.header = self.f.readline()
+        self.header = self.header.replace("\"", "").replace("\r", "").replace("\n", "").split("\t")
+    
+    def readline(self):
+        line = self.f.readline()
+        if not line:
+            return False
+        line = line.replace("\r", "").replace("\n", "").split("\t")
+        if not self.fast:
+            for index, value in enumerate(line):
+                try:
+                    line[index] = line[index].replace("\"", "")
+                    line[index] = float(value)
+                except:
+                    pass
+        self.r = line
+        if not self.fast:
+            for index, name in enumerate(self.header):
+                self.set_value(name, line[index])
+                self.set_value("col%s" % index, line[index])
+        return True
+        
+    def set_value(self, name, value):
+        name = name.replace(" ", "_").rstrip("\r").rstrip("\n")
+        try:
+            value = float(value)
+        except:
+            pass
+        setattr(self, name, value)
+    
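
A usage sketch for ``TabReader`` (hypothetical file name): each column is available through ``self.r`` and, unless ``fast`` is set, also as attributes named after the header and as ``col0``, ``col1``, ...::

    import biox

    f = biox.data.TabReader("results.tab")   # hypothetical tab-separated file with a header line
    while f.readline():
        print f.col0, f.r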

biox/data/Wig.py

+import gzip
+import biox
+
+class Wig():
+    def __init__(self, filename=None, min_value=None, only_positions=None):
+        self.trackname = {}
+        self.data = {}
+        if not filename==None:
+            self.load(filename, min_value=min_value, only_positions = only_positions)
+            
+    def load(self, filename, min_value=0, only_positions=None):
+        if filename.endswith(".gz"):
+            f = gzip.open(filename, "rb")
+        else:
+            f = open(filename, "rb")
+        r = f.readline()
+        r = f.readline()
+        chr = None
+        while r:
+            r = r.rstrip("\r").rstrip("\n").split("\t")
+            if r[0].startswith("variableStep"):
+                r = r[0].split(" ")
+                if chr!=None:
+                    self.data[chr] = L
+                chr = r[1].split("=")[1]
+                span = int(r[2].split("=")[1])
+                self.data[chr] = {'+':[], '-':[]}
+                L = self.data[chr]
+                r = f.readline()
+                continue
+            start = int(r[0])
+            value = int(r[1])
+            strand = "+" if value>=0 else "-"
+            L[strand].append((span, start, value))
+            r = f.readline()
+        f.close()
+        for chr in self.data.keys():
+            self.data[chr]["+"].sort()
+            self.data[chr]["-"].sort()
+        
+    def chromosomes(self):
+        return self.data.keys()
+        
+    def value(self, chr, strand, pos):
+        L = self.data.get(chr, {}).get(strand, [])
+        for (span, start, value) in L:
+            if start<=pos<=start+span-1:
+                return value
+        return 0
+        
+    def region(self, chr, strand, pos_from, pos_to):
+        L = self.data.get(chr, {}).get(strand, [])
+        count = 0
+        for (span, start, value) in L:
+            if start>pos_to:
+                break
+            if start+span-1 < pos_from:
+                continue
+            count += biox.data.overlaps(start, start+span-1, pos_from, pos_to) * value
+        return count

biox/data/__init__.py

+import biox
+
+"""
+biox.data module for managing FASTQ and FASTA data files.
+"""
+
+# modules
+from TabReader import *
+from Fastq import *
+from Fasta import *
+from Bedgraph import *
+from Gff3 import *
+from Gtf import *
+from Gene import *
+from GeneFeature import *
+from Bam import *
+from Wig import *
+from Sissrs import *
+from Sequence import *
+
+def overlaps(start_a, stop_a, start_b, stop_b):
+    return max(0, min(stop_a, stop_b) - max(start_a, start_b) + 1)
+
+def fastq_qminmax(filename):
+    """
+    Returns min and max quality ord value from fastq file.
+    """
+    qmin = ()
+    qmax = 0
+    f = biox.data.Fastq(filename)
+    while f.read():
+        qmin = min(qmin, ord(f.quality[-1]))
+        qmax = max(qmax, ord(f.quality[0]))        
+    return qmin, qmax
+    
+def fasta_check(filename, allowed_chars=["A", "C", "T", "G", "N"]):
+    """
+    Checks if FASTA file format is valid.
+    """
+    f = biox.data.Fasta(filename)
+    valid_fasta = True
+    while f.read():
+        seq_len = 0
+        if len(f.sequence)==0:
+            return False, "Sequence with ID %s has length 0" % (f.id)
+        for allowed in allowed_chars:
+            seq_len += f.sequence.upper().count(allowed)
+        if seq_len!=len(f.sequence):
+            return False, "Sequence with characters other than %s" % str(allowed_chars)
+    return True, "File in FASTA format"
+
+def prepare_fasta_gbrowse(filename):
+    f = biox.data.Fasta(filename)
+    f_gff3 = open("chromosomes.gff3", "wt")
+    while f.read():
+        fout = open("%s.fa" % f.id, "wt")
+        fout.write(">%s\n" % f.id)
+        sequences = [f.sequence[i:i+50] for i in range(0, len(f.sequence), 50)]
+        fout.write("\n".join(sequences) + "\n")
+        fout.close()
+        row = [f.id, "biox", "chromosome", 1, len(f.sequence), ".", ".", ".", "ID=%s;Name=%s" % (f.id, f.id)]
+        f_gff3.write("\t".join(str(x) for x in row) + "\n")
+    f_gff3.close()
+    
+def prepare_fasta_mapability(input, output, seq_len):
+    f_output = open(output, "wt")
+    id_seq = 1
+    f_input = biox.data.Fasta(input)
+    while f_input.read():
+      for i in xrange(0, len(f_input.sequence)-seq_len):
+        f_output.write(">%s\n%s\n" % (id_seq, f_input.sequence[i:i+seq_len]))
+        id_seq += 1
+    f_output.close()
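
The helper functions above can be used directly, for example to validate a FASTA file before indexing it (hypothetical file name)::

    import biox

    valid, message = biox.data.fasta_check("genome.fasta")   # hypothetical FASTA file
    if not valid:
        print message
    print biox.data.overlaps(10, 20, 15, 30)                  # 6 overlapping positions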

biox/expression/__init__.py

+import biox
+import math
+import os
+
+pysam_enabled = False
+
+try:
+    import pysam
+    pysam_enabled = True
+except:
+    pysam_enabled = False
+
+def testBit(int_type, offset):
+    mask = 1 << offset
+    return int_type & mask
+
+def region_expression_pileup(sam_filename, chr, start, stop):
+    import pysam
+    f = pysam.Samfile(sam_filename)
+    sum = 0
+    for rec in f.pileup(chr, start, stop):
+        if not start<=rec.pos<=stop:
+            continue
+        sum += rec.n
+    return round(sum/float(stop-start+1))
+   
+def gene_expression_overlap(gtf_file, bam_file, quality = 30, genes = None):
+    if pysam_enabled:
+        pysam_bam = pysam.Samfile(bam_file)
+    gtf = biox.data.Gtf(gtf_file)
+    genes_exp = {}
+    if genes==None:
+        for gene_id in gtf.genes:
+            genes_exp[gene_id] = 0
+    else:
+        for gene_id in genes:
+            genes_exp[gene_id] = 0
+            
+    current = 0
+    for gene_id, gene in gtf.genes.items():
+        current += 1
+        if current%300==0:
+            print "%.2f" % (float(current)/len(gtf.genes)), bam_file
+        if genes!=None and gene.id not in genes:
+            continue
+        for feature in gene.features:
+            if feature.type!="exon":
+                continue
+            assert(feature.start<=feature.stop)
+            if pysam_enabled:
+                genes_exp[gene_id] = genes_exp.get(gene_id, 0) + pysam_bam.count(gene.chr, feature.start-1, feature.stop-1)
+            else:
+                command = "samtools view -F 4 -q {quality} -c {bam_file} {chr}:{start}-{stop}".format(bam_file = bam_file, quality = quality, chr=gene.chr, start=feature.start, stop=feature.stop)
+                output, error = biox.utils.cmd(command)
+                if output!="":
+                    genes_exp[gene_id] = genes_exp.get(gene_id, 0) + int(output)
+    return genes_exp
+   
+def gene_expression(gtf_file, bam_file, quality = 30, genes = None):
+    gtf = biox.data.Gtf(gtf_file)
+    genes_exp = {}
+    if genes==None:
+        for gene_id in gtf.genes:
+            genes_exp[gene_id] = 0
+    else:
+        for gene_id in genes:
+            genes_exp[gene_id] = 0
+            
+    command = "samtools view -F 4 -q {quality} -c {bam_file}".format(bam_file = bam_file, quality = quality)
+    output, error = biox.utils.cmd(command)
+    reads = int(output)            
+    
+    command = "samtools view -F 4 -q {quality} {bam_file}".format(bam_file = bam_file, quality = quality)
+    current = 0
+    for line in biox.utils.cmd_pipe(command):
+        current += 1
+        if current%200000==0:
+            print "%.2f" % (current/float(reads)), bam_file
+        line = line.split("\t")
+        if len(line)>3:
+            chr = line[2]
+            flag = int(line[1])
+            seq_len = len(line[9])
+            strand = flag & 16 # is bit 5 set?
+            strand = "+" if strand==0 else "-"
+            pos = int(line[3]) if strand=="+" else int(line[3])+seq_len-1
+            position_genes = gtf.get_genes(chr, pos)
+            for gene_id in position_genes:
+                if genes!=None and gene_id not in genes:
+                    continue
+                genes_exp[gene_id] = genes_exp.get(gene_id, 0) + 1
+    return genes_exp
+    
+def gene_expression_promoters(gtf_file, bam_file, quality = 30, genes = None):
+    gtf = biox.data.Gtf(gtf_file)
+    genes_exp = {}
+    if genes==None:
+        for gene_id in gtf.genes:
+            genes_exp[gene_id] = 0
+    else:
+        for gene_id in genes:
+            genes_exp[gene_id] = 0
+            
+    command = "samtools view -F 4 -q {quality} -c {bam_file}".format(bam_file = bam_file, quality = quality)
+    output, error = biox.utils.cmd(command)
+    reads = int(output)            
+    
+    command = "samtools view -F 4 -q {quality} {bam_file}".format(bam_file = bam_file, quality = quality)
+    current = 0
+    for line in biox.utils.cmd_pipe(command):
+        current += 1
+        if current%200000==0:
+            print "%.2f" % (current/float(reads)), bam_file
+        line = line.split("\t")
+        if len(line)>3:
+            chr = line[2]
+            flag = int(line[1])
+            seq_len = len(line[9])
+            strand = flag & 16 # is bit 5 set?
+            strand = "+" if strand==0 else "-"
+            pos = int(line[3]) if strand=="+" else int(line[3])+seq_len-1
+            position_genes = gtf.get_genes(chr, pos)
+            for gene_id in position_genes:
+                if genes!=None and gene_id not in genes:
+                    continue
+                genes_exp[gene_id] = genes_exp.get(gene_id, 0) + 1
+    return genes_exp    
+
+def bam_chromosomes(bam_file):
+    chrs = {}
+    command = "samtools view -H {bam_file}".format(bam_file = bam_file)
+    output, error = biox.utils.cmd(command)
+    output = output.split("\n")
+    for line in output:
+        line = line.split("\t")
+        if line[0]=="@SQ":
+            chrs[line[1].split("SN:")[1]] = int(line[2].split("LN:")[1])
+    return chrs
+    
+def write_bam_chr(bam_file, chr_file):
+    chrs = bam_chromosomes(bam_file)
+    f = open(chr_file, "wt")
+    for chr, chr_len in chrs.items():
+        f.write("%s\t%s\n" % (chr, chr_len))
+    f.close()
+    return chr_file
+    
+def write_fasta_chr(fasta_file, chr_file):
+    fin = biox.data.Fasta(fasta_file)
+    f = open(chr_file, "wt")
+    while fin.read():
+        f.write("%s\t%s\n" % (fin.id, len(fin.sequence)))
+    f.close()
+    
+def bam_coverage(bam_file, chr="chr1", strand=None, start=1, stop=None, position = '5prime'):
+    if strand=="+":
+        strand_par = "-F 0x0010"
+    elif strand=="-":
+        strand_par = "-f 0x0010"
+    else:
+        strand_par = ""
+    chrs = bam_chromosomes(bam_file)
+    if start==None:
+        start = 1
+    if stop==None:
+        stop = chrs[chr]
+    start = int(start)
+    stop = int(stop)
+    command = "samtools view {strand_par} {bam_file} {chr}:{start}-{stop}".format(bam_file = bam_file, strand_par=strand_par, chr=chr, start=start, stop=stop)
+    result = {}
+    for line in biox.utils.cmd_pipe(command):
+        line = line.split("\t")
+        if len(line)>3:
+            strand = "+" if int(line[1])==0 else "-"
+            if position=='5prime':
+                pos = int(line[3]) if strand=="+" else int(line[3])+len(line[9])-1
+                if pos<start or pos>stop:
+                    continue
+                result[pos] = result.setdefault(pos, 0) + 1
+            if position=='span':
+                for pos in range(int(line[3]), int(line[3])+len(line[9])):
+                    if pos<start or pos>stop:
+                        continue                
+                    result[pos] = result.setdefault(pos, 0) + 1
+            if position=='span_coverage':
+                for pos in range(int(line[3]), int(line[3])+len(line[9])):
+                    if pos<start or pos>stop:
+                        continue                
+                    result[pos] = 1                    
+    result_list = []
+    for pos in range(start, stop+1):
+        result_list.append(result.get(pos, 0))
+    return result_list
+
+# All that is required to make a BigWig from a Bam is the bam file itself (with the header)
+def bam2wig(bam_filename, bw_filename, strand=None, position='span', scale=None):
+    strand_dic = {1:'+', -1:'-', '1':'+', '-1':'-', '+': '+', '-': '-'}
+    chrs_filename = bam_filename+".chrs"
+    bed_filename = bw_filename+".bed"
+    write_bam_chr(bam_filename, chrs_filename)
+    strand_parameter = "-strand %s" % strand_dic[strand] if strand!=None else ''
+    scale_parameter = "-scale %.5f" % float(scale) if scale!=None else '' # values are multiplied (*) by scale
+    command = "genomeCoverageBed -bg -ibam {bam_filename} -g {chrs_filename} {strand_parameter} {scale_parameter} > {bed_filename}".format(bam_filename=bam_filename, chrs_filename=chrs_filename, strand_parameter=strand_parameter, scale_parameter=scale_parameter, bed_filename=bed_filename)
+    output, error = biox.utils.cmd(command)
+    command = "bedGraphToBigWig {bed_filename} {chrs_filename} {bw_filename}".format(bed_filename=bed_filename, chrs_filename=chrs_filename, bw_filename=bw_filename)
+    output, error = biox.utils.cmd(command)
+    os.remove(bed_filename)
+    os.remove(chrs_filename)
+
+def bam_statistics(bam_filename):
+    stats = pysam.idxstats(bam_filename)
+    del stats[-1] # drop the trailing "*" record (unmapped summary line)
+    mapped_reads = sum([int(el.split("\t")[2]) for el in stats])
+    notmapped_reads = sum([int(el.split("\t")[3]) for el in stats])
+    return {'mapped':mapped_reads, 'notmapped':notmapped_reads, 'all':mapped_reads+notmapped_reads}
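
A usage sketch for the expression helpers (hypothetical file names; ``samtools`` must be on the path, and ``bam2wig`` additionally needs ``genomeCoverageBed`` and ``bedGraphToBigWig``)::

    import biox

    # per-gene read counts from a sorted, indexed BAM
    counts = biox.expression.gene_expression_overlap("annotation.gtf", "sample.bam")

    # strand-specific BigWig track
    biox.expression.bam2wig("sample.bam", "sample.plus.bw", strand="+")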

biox/map/Bowtie.py

+import biox
+from os.path import join as pjoin
+import os
+import sys
+import locale
+
+#locale.setlocale(locale.LC_ALL, '')
+
+class Bowtie():
+
+    """
+    Mapping of reads to a reference genome
+    """
+
+    def __init__(self):
+        self.bowtie_exec = pjoin(biox.bowtie_folder, "bowtie")
+        self.bowtie_build_exec = pjoin(biox.bowtie_folder, "bowtie-build")
+        self.samtools_exec = pjoin(biox.samtools_folder, "samtools")
+        self.trim3 = False
+        self.sam = True
+        self.processors = 2
+        self.un_enabled = False
+        self.max_enabled = False
+        self.mode_fasta = False
+        self.m = 1
+        self.u = None
+        self.l = None
+        self.X = 1000
+        self.quality = "phred64-quals"
+        self.strata = False
+        self.enable_n()
+        
+    def enable_n(self, n=2):
+        """
+        Enable mode n of bowtie mapper.
+        """
+        self.n = n
+        self.v = None
+        
+    def set_X(self, X):
+        self.X = X
+        
+    def set_l(self, l):
+        """
+        Set seed length (l) and enable n mode.
+        """
+        self.l = l
+        self.enable_n()
+        
+    def enable_strata(self, strata):
+        """
+        Enable strata.
+        """
+        self.strata = strata
+
+    def enable_u(self, u):
+        """
+        Enable mode u. Map all reads.
+        """
+        self.u = u
+    
+    def enable_v(self, v=2):
+        """
+        Enable mode v
+        """
+        self.v = v
+        self.n = None
+        
+    def set_m(self, m):
+        """
+        Set allowed number of multiple hits per read for the hits to be reported.
+        """
+        self.m = m
+        
+    def set_sam(self, sam):
+        """
+        Enable sam output.
+        """
+        self.sam = sam
+        
+    def set_processors(self, processors):
+        """
+        Set number of processors to use.
+        """
+        self.processors = processors
+        
+    def set_mode_fasta(self):
+        """
+        Enable fasta mode.
+        """
+        self.mode_fasta = True
+
+    def set_mode_fastq(self):
+        """
+        Enable fastq mode.
+        """
+        self.mode_fasta = False
+        self.enable_n()
+        
+    def enable_trim3(self, trim3_iter=None, trim3_step=None):
+        """
+        Enable iterative trimming of unmapped reads from 3' end.
+        
+        :param trim3_iter: number of iterations
+        :param trim3_step: step of trimming (in nucleotides)
+        """
+        self.trim3 = True
+        self.trim3_iter = trim3_iter if trim3_iter!=None else 5
+        self.trim3_step = trim3_step if trim3_step!=None else 2
+        
+    def disable_trim3(self):
+        """
+        Disable iterative trimming of unmapped reads from 3' end.
+        """
+        self.trim3 = False
+        
+    def enable_un(self, un):
+        self.un_enabled = un
+        
+    def enable_max(self, mx):
+        self.max_enabled = mx
+        
+    def set_quality(self, quality):
+        """
+        Set qualities of data from FASTQ. Possible options as in bowtie manual.
+        :param quality: "solexa-quals", "phred33-quals", "phred64-quals", "integer-quals"
+        """
+        self.quality = quality
+        
+    def read_statfile(self, file):
+        reads = mapped = 0
+        f = open(file, "rt")
+        r = f.readline()
+        while r:
+            print r
+            r = r.rstrip("\r").rstrip("\n")
+            if r.startswith("# reads processed: "):
+                reads = int(r.split("# reads processed: ")[1])
+            if r.startswith("# reads with at least one reported alignment: "):
+                mapped = int(r.split("# reads with at least one reported alignment: ")[1].split(" ")[0])
+            r = f.readline()
+        return (reads, mapped)
+        
+    def map(self, index, input, output, index_path=None, create_stats=True, bam=True, delete_temp=True, bam_include_unmapped = False, simulate = False, verbose=False, keep_unmapped = True):
+        """
+        Map reads to the reference.
+        
+        :param index: the name of the reference sequence index (dd/dp)
+        :param input: full path to FASTA or FASTQ file with reads
+        :param output: output folder
+        :param index_path: specify full path to genome index, instead of using the indexes in biox.config.bowtie_index_folder folder
+        :param verbose: Print each command that is executed (verbose mode)
+        :param simulate: Only simulate the mapping and don't execute any commands (together with verbose, prints out all the mapping commands)
+        """
+        
+        executed_commands = []
+        output_log = open("%s.log" % output, "wt")
+        paired_input = False
+        
+        if type(input)==list:
+            if len(input)==2:
+                paired_input = True
+                input_decompressed = []
+                input_decompressed.append(biox.utils.decompress(input[0]))
+                input_decompressed.append(biox.utils.decompress(input[1]))
+            else:
+                input_decompressed = biox.utils.decompress(input[0])
+        else:
+            input_decompressed = biox.utils.decompress(input)
+       
+        sam_par = "--sam" if self.sam else ""
+        bam_unmapped_par = "" if bam_include_unmapped else "-F 4" # -F 4 filters out unmapped reads
+        u_par = "-u %s" % self.u if self.u != None else ""
+        n_par = "-n %s" % self.n if self.n != None else ""
+        v_par = "-v %s" % self.v if self.v != None else ""
+        m_par = "-m %s" % self.m if self.m != None else ""
+        strata_par = "--strata --best" if self.strata != False else ""
+        fasta_par = "-f" if self.mode_fasta==True else ""
+        l_par = "-l %s" % self.l if self.l != None else ""
+        if index_path!=None: # specify direct path to bowtie index
+            index_par = index_path
+        else:
+            index_par = pjoin(biox.bowtie_index_folder, index)
+        sam_files = []
+        stat_files = []
+        un_files = []
+        bam_files = []
+        if self.trim3==False:
+            stats_par = "%s.stats" % (output) if create_stats else "/dev/null" 
+            if type(input_decompressed)==list:
+                if len(input_decompressed)==2:
+                    input_par = "-1 %s -2 %s -X %s" % (input_decompressed[0], input_decompressed[1], self.X)
+                else:
+                    input_par = input_decompressed[0]
+            else:
+                input_par = input_decompressed
+                if input_decompressed.endswith(".fasta"):
+                    self.set_mode_fasta()
+            output_par = "%s.sam" % output
+            un_par = "--un "+output+".unmapped" if self.un_enabled else ""
+            max_par = "--max "+output+".maxmulti" if self.max_enabled else ""
+            command = "{bowtie_exec} --{quality} {strata_par} {l_par} -a {u_par} {processors} {un_par} {max_par} {n_par} {v_par} {sam_par} {m_par} {index_par} {fasta_par} {input_par} 1>{output_par} 2>{output_stats}".format \
+            (bowtie_exec = self.bowtie_exec, fasta_par = fasta_par, index_par = index_par, input_par = input_par, output_par = output_par, \
+            sam_par = sam_par, n_par = n_par, v_par = v_par, output_stats = stats_par, m_par = m_par, \
+            un_par = un_par, max_par = max_par, processors = "-p %s" % self.processors, u_par = u_par, strata_par=strata_par, l_par = l_par, quality = self.quality)
+            output_log.write(command+"\n")
+            executed_commands.append(command)
+            if verbose:
+                print command
+            if not simulate:
+                str, err = biox.utils.cmd(command)
+            sam_files.append(output_par)
+            if self.un_enabled:
+                un_files.append(output+".unmapped")
+            if bam: # create bam file
+                bam_output = "%s.bam" % output
+                command = "{samtools_exec} view {bam_unmapped_par} -bS {sam} > {bam}".format(samtools_exec = self.samtools_exec, sam = output_par, bam = bam_output, bam_unmapped_par = bam_unmapped_par)
+                if verbose:
+                    print command
+                if not simulate:
+                    str, err = biox.utils.cmd(command)
+                output_log.write(command+"\n")   
+                executed_commands.append(command)
+            if create_stats and not simulate:
+                stat_files.append(stats_par)
+                reads, mapped = self.read_statfile(stats_par)
+                print reads,mapped
+                stats_par_tab = "%s.stats.tab" % (output)
+                f = open(stats_par_tab, "wt")
+                f.write("trim_size\treads_processed\treads_mapped\treads_mapped_perc\n")
+                f.write("0\t%s\t%s\t%.2f%%\n" % (reads, mapped, float(mapped)*100/reads))
+                f.close()
+        else:
+            reads = {}
+            mapped = {}
+            for t in range(0, self.trim3_iter+1):
+                trim3 = t * self.trim3_step
+                stats_par = "%s.trim%s.stats" % (output, trim3) if create_stats else "/dev/null" 
+                
+                if t==0:
+                    if type(input_decompressed)==list:
+                        if len(input_decompressed)==2:
+                            input_par = "-1 %s -2 %s" % (input_decompressed[0], input_decompressed[1])
+                        elif len(input_decompressed)==1:
+                            input_par = input_decompressed[0]
+                    else:
+                        input_par = input_decompressed
+                else:
+                    if type(input_decompressed)==list:
+                        if len(input_decompressed)==2:
+                            input_par = "-1 %s_1 -2 %s_2" % (un_par, un_par)
+                    else:
+                        input_par = un_par
+                        
+                # use _ instead of . here because pair-end mapping naming of
+                # bowtie is not working correctly when using dot
+                # eg.: output.trim0.unmapped -> output.trim0_1.unmapped, output.trim0_2.unmapped
+                # instead of output.trim0.unmapped_1, output.trim0.unmapped_2
+                un_par = output+"_trim%s_unmapped" % trim3
+
+                fasta_par = "-f" if self.mode_fasta==True else ""
+                
+                max_par = "--max "+output+".trim%s.maxmulti" % trim3 if self.max_enabled else ""
+                trim3_par = "--trim3 %s" % trim3
+                output_par = output+".trim%s.sam" % trim3
+                command = "{bowtie_exec} --{quality} {strata_par} {l_par} -a {u_par} {processors} --un {un_par} {max_par} {trim3_par} {n_par} {v_par} {sam_par} {m_par} {index_par} {fasta_par} {input_par} 1>{output_par} 2>{output_stats}".format \
+                (bowtie_exec = self.bowtie_exec, fasta_par = fasta_par, index_par = index_par, input_par = input_par, output_par = output_par, \
+                sam_par = sam_par, n_par = n_par, v_par = v_par, output_stats = stats_par, m_par = m_par, trim3_par = trim3_par, \
+                un_par = un_par, max_par = max_par, processors = "-p %s" % self.processors, u_par = u_par, strata_par=strata_par, l_par = l_par, quality = self.quality)
+                if verbose:
+                    print command
+                if not simulate:
+                    str, err = biox.utils.cmd(command)
+                output_log.write(command+"\n")
+                executed_commands.append(command)
+                sam_files.append(output_par)
+                if type(input_decompressed)==list:
+                    if len(input_decompressed)==2:
+                        un_files.append("%s_1" % un_par)
+                        un_files.append("%s_2" % un_par)
+                    elif len(input_decompressed)==1:
+                        un_files.append(un_par)
+                else:
+                        un_files.append(un_par)
+                if bam:
+                    bam_file = output_par[:-3]+"bam"
+                    command = "{samtools_exec} view {bam_unmapped_par} -bS {sam} > {bam}".format(samtools_exec = self.samtools_exec, sam = output_par, bam = bam_file, bam_unmapped_par = bam_unmapped_par)
+                    output_log.write(command+"\n")
+                    executed_commands.append(command)
+                    bam_files.append(output_par[:-3]+"bam")
+                    if verbose:
+                        print command                    
+                    if not simulate:
+                        str, err = biox.utils.cmd(command)
+                if create_stats and not simulate:
+                    stat_files.append(stats_par)
+                    reads[trim3], mapped[trim3] = self.read_statfile(stats_par)
+            # merge bam files
+            if bam:
+                bam_output = "%s.bam" % output
+                command = "{samtools_exec} merge -h {header_sam} -f {bam_output} {bam_files}".format(samtools_exec = self.samtools_exec, \
+                header_sam = output+".trim0.sam", bam_output = bam_output, bam_files = " ".join(bam_files))
+                if verbose:
+                    print command
+                if not simulate:
+                    str, err = biox.utils.cmd(command)
+                output_log.write(command+"\n") 
+                executed_commands.append(command)                
+            # create trim statistics
+            if create_stats and not simulate:
+                all_reads = 0
+                mapped_reads = 0
+                trims = reads.keys()
+                trims.sort()
+                stats_par_tab = "%s.stats.tab" % (output)
+                f = open(stats_par_tab, "wt")
+                f.write("trim3 size\treads processed\treads mapped\treads mapped (perc.)\n")
+                for trim in trims:
+                    perc = (float(mapped.get(trim, 0)) / max(float(reads.get(trim, 1)), 1) ) * 100
+                    f.write("%s\t%s\t%s\t%.2f%%\n" % (trim, locale.format("%d", reads.get(trim, 0), True), locale.format("%d", mapped.get(trim, 0), True), perc))
+                    all_reads = max(all_reads, float(reads.get(trim, 0)))
+                    mapped_reads += float(mapped.get(trim, 0))
+                f.write("all\t%s\t%s\t%.2f%%\n" % (locale.format("%d", all_reads, True), locale.format("%d", mapped_reads, True), (mapped_reads/max(all_reads, 1) * 100)))
+                f.close()
+                
+        if bam: # sort and index bam file
+            bam_sorted = bam_output[:-4]+"_sorted"
+            command = "{samtools_exec} sort {bam} {bam_sorted}".format(samtools_exec = self.samtools_exec, bam = bam_output, bam_sorted = bam_sorted)
+            if verbose:
+                print command
+            if not simulate:
+                str, err = biox.utils.cmd(command)
+            output_log.write(command+"\n")
+            executed_commands.append(command)
+            command = "{samtools_exec} index {bam_sorted}".format(samtools_exec = self.samtools_exec, bam_sorted = bam_sorted+".bam")
+            if verbose:
+                print command
+            if not simulate:
+                str, err = biox.utils.cmd(command)
+            output_log.write(command+"\n")
+            executed_commands.append(command)
+            if not simulate:
+                if verbose:
+                    print "mv %s %s\n" % (bam_sorted+".bam", output+".bam")
+                    print "mv %s %s\n" % (bam_sorted+".bam.bai", output+".bam.bai")
+                os.rename(bam_sorted+".bam", output+".bam")
+                os.rename(bam_sorted+".bam.bai", output+".bam.bai")
+                output_log.write("mv %s %s\n" % (bam_sorted+".bam", output+".bam"))
+                executed_commands.append("mv %s %s" % (bam_sorted+".bam", output+".bam"))
+                output_log.write("mv %s %s\n" % (bam_sorted+".bam.bai", output+".bam.bai"))
+                executed_commands.append("mv %s %s\n" % (bam_sorted+".bam.bai", output+".bam.bai"))
+                if verbose:
+                    print "mv %s %s\n" % (bam_sorted+".bam", output+".bam")
+                    print "mv %s %s\n" % (bam_sorted+".bam.bai", output+".bam.bai")
+
+        if delete_temp and not simulate:
+            for stat_file in stat_files:
+                if os.path.exists(stat_file):
+                    os.remove(stat_file)
+                    output_log.write("rm %s\n" % stat_file)
+                    executed_commands.append("rm %s" % stat_file)
+                    if verbose:
+                        print "rm %s" % stat_file
+        if delete_temp and bam and not simulate: # remove sam
+            for sam_file in sam_files:
+                if os.path.exists(sam_file):
+                    os.remove(sam_file)
+                    output_log.write("rm %s\n" % sam_file)
+                    executed_commands.append("rm %s" % sam_file)
+                    if verbose:
+                        print "rm %s" % sam_file
+            for bam_file in bam_files:
+                if os.path.exists(bam_file):
+                    os.remove(bam_file)
+                    output_log.write("rm %s\n" % bam_file)
+                    executed_commands.append("rm %s" % bam_file)
+                    if verbose:
+                        print "rm %s" % bam_file
+        if delete_temp and self.un_enabled in [False, None] and not simulate:
+            if keep_unmapped==False:
+                to_delete = un_files
+                to_keep = []
+            else:
+                if paired_input:
+                    to_delete = un_files[:-2]
+                    to_keep = un_files[-2:]
+                else:
+                    to_delete = un_files[:-1]
+                    to_keep = un_files[-1:]
+            for un_file in to_delete:
+                if os.path.exists(un_file):
+                    os.remove(un_file)
+                    output_log.write("rm %s\n" % un_file)
+                    executed_commands.append("rm %s" % un_file)
+                    if verbose:
+                        print "rm %s" % un_file
+            if len(to_keep)==1:
+                if os.path.exists(to_keep[0]):
+                    if verbose:
+                        print "mv %s %s" % (to_keep[0], "%s.unmapped" % output)
+                    os.rename(to_keep[0], "%s.unmapped" % output)
+                    output_log.write("mv %s %s\n" % (to_keep[0], "%s.unmapped" % output))
+                    executed_commands.append("mv %s %s" % (to_keep[0], "%s.unmapped" % output))
+                    if verbose:
+                        print "gzip %s" % ("%s.unmapped" % output)
+                    biox.utils.gzip("%s.unmapped" % output)
+                    output_log.write("gzip %s\n" % ("%s.unmapped" % output))
+                    executed_commands.append("gzip %s" % ("%s.unmapped" % output))
+            elif len(to_keep)==2:
+                if os.path.exists(to_keep[0]) and os.path.exists(to_keep[1]):
+                    os.rename(to_keep[0], "%s.1.unmapped" % output)
+                    output_log.write("mv %s %s\n" % (to_keep[0], "%s.1.unmapped" % output))
+                    executed_commands.append("mv %s %s" % (to_keep[0], "%s.1.unmapped" % output))
+                    os.rename(to_keep[1], "%s.2.unmapped" % output)
+                    output_log.write("mv %s %s\n" % (to_keep[1], "%s.2.unmapped" % output))
+                    executed_commands.append("mv %s %s" % (to_keep[1], "%s.2.unmapped" % output))
+                    biox.utils.gzip("%s.1.unmapped" % output)
+                    output_log.write("gzip %s\n" % ("%s.1.unmapped" % output))
+                    executed_commands.append("gzip %s" % ("%s.1.unmapped" % output))
+                    biox.utils.gzip("%s.2.unmapped" % output)
+                    output_log.write("gzip %s\n" % ("%s.2.unmapped" % output))
+                    executed_commands.append("gzip %s" % ("%s.2.unmapped" % output))
+                    if verbose:
+                        print "mv %s %s" % (to_keep[0], "%s.1.unmapped" % output)
+                        print "mv %s %s" % (to_keep[1], "%s.2.unmapped" % output)
+                        print "gzip %s" % ("%s.1.unmapped" % output)
+                        print "gzip %s" % ("%s.2.unmapped" % output)
+                 
+        # delete temp input files
+        if type(input_decompressed)==list:
+            for index, file in enumerate(input_decompressed):
+                if input[index]!=file:
+                    try:
+                        os.remove(file)
+                        output_log.write("rm %s\n" % file)
+                        executed_commands.append("rm %s" % file)
+                        if verbose:
+                            print "rm %s" % file
+                    except OSError:
+                        pass
+        else:
+            if input_decompressed!=input:
+                try:
+                    os.remove(input_decompressed)
+                    output_log.write("rm %s\n" % input_decompressed)
+                    executed_commands.append("rm %s" % input_decompressed)
+                    if verbose:
+                        print "rm %s" % input_decompressed
+                except OSError:
+                    pass
+                    
+        output_log.close()
+        return executed_commands
+        
+    def make_index(self, fasta, index_name):
+        output = pjoin(biox.bowtie_index_folder, index_name)
+        command = "{bowtie_build_exec} {fasta} {output}".format \
+        (bowtie_build_exec = self.bowtie_build_exec, fasta = fasta, output = output)
+        print command
+        std, err = biox.utils.cmd(command)
+        # also make color index
+        command = "{bowtie_build_exec} --color {fasta} {output}".format \
+        (bowtie_build_exec = self.bowtie_build_exec, fasta = fasta, output = output+"_color")
+        print command
+        std, err = biox.utils.cmd(command)
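
A minimal sketch of building an index with the wrapper above, assuming the bowtie paths in the biox config are set, that the class is exported as biox.map.Bowtie (as the from Bowtie import * line in biox/map/__init__.py suggests) and that, like Bowtie2 below, its constructor takes no arguments; the FASTA path and index name are placeholders::

    import biox

    mapper = biox.map.Bowtie()
    # writes the bowtie index files (plus a *_color colorspace index) into biox.bowtie_index_folder
    mapper.make_index("/data/genomes/hg19.fa", "hg19")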

biox/map/Bowtie2.py

+import biox
+from os.path import join as pjoin
+import os
+import sys
+import locale
+
+#locale.setlocale(locale.LC_ALL, '')
+
+class Bowtie2():
+
+    """
+    Mapping of reads to a reference genome with bowtie2.
+    """
+
+    def __init__(self):
+        self.bowtie_exec = pjoin(biox.bowtie2_folder, "bowtie2")
+        self.bowtie_build_exec = pjoin(biox.bowtie2_folder, "bowtie2-build")
+        self.samtools_exec = pjoin(biox.samtools_folder, "samtools")
+        self.trim3 = False
+        self.processors = 2
+        self.mode_fasta = False
+        self.un_enabled = False        
+        self.quality = "phred64-quals"
+        self.mode_par = "--very-fast-local"
+        self.u = None
+        self.X = 1000
+        
+    def enable_u(self, u):
+        """
+        Set the -u parameter: align only the first u reads (or read pairs) of the input.
+        """
+        self.u = u
+        
+    def set_processors(self, processors):
+        """
+        Set number of processors to use.
+        """
+        self.processors = processors
+        
+    def set_X(self, X):
+        """Set the -X parameter: maximum fragment length for valid paired-end alignments."""
+        self.X = X
+        
+    def set_mode_fasta(self):
+        """
+        Enable fasta mode.
+        """
+        self.mode_fasta = True
+
+    def set_mode_fastq(self):
+        """
+        Enable fastq mode.
+        """
+        self.mode_fasta = False
+        
+    def set_mode(self, mode_par):
+        """
+        Set the bowtie2 alignment mode preset (e.g. "--very-fast-local", "--sensitive-local").
+        """
+        self.mode_par = mode_par
+        
+    def enable_un(self, un):
+        """Enable writing of unaligned reads to <output>.unmapped.gz (bowtie2 --un-gz)."""
+        self.un_enabled = un
+        
+    def set_quality(self, quality):
+        """
+        Set the quality encoding of the FASTQ reads. Possible options are as in the bowtie2 manual.
+        :param quality: "solexa-quals", "phred33-quals", "phred64-quals", "integer-quals"
+        """
+        self.quality = quality
+        
+    def map(self, index, input, output, index_path=None, create_stats=True, bam=True, delete_temp=True, bam_include_unmapped = False, simulate = False, verbose=False, keep_unmapped = True):
+        """
+        Map reads to the reference.
+        
+        :param index: the name of the reference sequence index (dd/dp)
+        :param input: full path to the FASTA or FASTQ file with reads, or a list of two files for paired-end data
+        :param output: output file name prefix (used for the .sam, .bam, .stats and .log files)
+        :param index_path: full path to the genome index, used instead of looking the index up in the biox.config.bowtie2_index_folder folder
+        :param verbose: print each command that is executed (verbose mode)
+        :param simulate: only simulate the mapping and do not execute any commands (together with verbose, prints out all the mapping commands)
+        """
+        
+        u_par = "-u %s" % self.u if self.u != None else ""
+        
+        executed_commands = []
+        output_log = open("%s.log" % output, "wt")
+        paired_input = False
+        
+        # input decompressed is the same as input (bowtie2 reads .gz)
+        if type(input)==list:
+            if len(input)==2:
+                paired_input = True
+                input_decompressed = [input[0], input[1]]
+            else:
+                input_decompressed = input[0]
+        else:
+            input_decompressed = input
+       
+        bam_unmapped_par = "" if bam_include_unmapped else "-F 4" # -F 4 filters out unmapped reads
+        fasta_par = "-f" if self.mode_fasta==True else ""
+        if index_path!=None: # specify direct path to bowtie index
+            index_par = index_path
+        else:
+            index_par = pjoin(biox.bowtie2_index_folder, index)
+        sam_files = []
+        stat_files = []
+        un_files = []
+        bam_files = []
+        stats_par = "%s.stats" % (output) if create_stats else "/dev/null" 
+        if type(input_decompressed)==list:
+            if len(input_decompressed)==2:
+                input_par = "-1 %s -2 %s -X %s" % (input_decompressed[0], input_decompressed[1], self.X)
+            else:
+                input_par = "-U %s" % input_decompressed[0]
+        else:
+            input_par = "-U %s" % input_decompressed
+            if input_decompressed.endswith(".fasta"):
+                self.set_mode_fasta()
+        output_par = "%s.sam" % output
+        un_par = "--un-gz "+output+".unmapped.gz" if self.un_enabled else ""
+        command = "{bowtie_exec} --{quality} {processors} {un_par} {u_par} {mode_par} {index_par} {fasta_par} {input_par} 1>{output_par} 2>{output_stats}".format \
+        (bowtie_exec = self.bowtie_exec, fasta_par = fasta_par, index_par = index_par, input_par = input_par, output_par = output_par, u_par = u_par, \
+        output_stats = stats_par, un_par = un_par, processors = "-p %s" % self.processors, quality = self.quality, mode_par = self.mode_par)
+        output_log.write(command+"\n")
+        executed_commands.append(command)
+        if verbose:
+            print command
+        if not simulate:
+            str, err = biox.utils.cmd(command)
+        sam_files.append(output_par)
+        if self.un_enabled:
+            un_files.append(output+".unmapped.gz")
+        if bam: # create bam file
+            bam_output = "%s.bam" % output
+            command = "{samtools_exec} view {bam_unmapped_par} -bS {sam} > {bam}".format(samtools_exec = self.samtools_exec, sam = output_par, bam = bam_output, bam_unmapped_par = bam_unmapped_par)
+            if verbose:
+                print command
+            if not simulate:
+                str, err = biox.utils.cmd(command)
+            output_log.write(command+"\n")   
+            executed_commands.append(command)
+        if create_stats and not simulate:
+            print "stats"
+                
+        if bam: # sort and index bam file
+            bam_sorted = bam_output[:-4]+"_sorted"
+            command = "{samtools_exec} sort {bam} {bam_sorted}".format(samtools_exec = self.samtools_exec, bam = bam_output, bam_sorted = bam_sorted)
+            if verbose:
+                print command
+            if not simulate:
+                str, err = biox.utils.cmd(command)
+            output_log.write(command+"\n")
+            executed_commands.append(command)
+            command = "{samtools_exec} index {bam_sorted}".format(samtools_exec = self.samtools_exec, bam_sorted = bam_sorted+".bam")
+            if verbose:
+                print command
+            if not simulate:
+                str, err = biox.utils.cmd(command)
+            output_log.write(command+"\n")
+            executed_commands.append(command)
+            if not simulate:
+                if verbose:
+                    print "mv %s %s" % (bam_sorted+".bam", output+".bam")
+                    print "mv %s %s" % (bam_sorted+".bam.bai", output+".bam.bai")
+                os.rename(bam_sorted+".bam", output+".bam")
+                os.rename(bam_sorted+".bam.bai", output+".bam.bai")
+                output_log.write("mv %s %s\n" % (bam_sorted+".bam", output+".bam"))
+                executed_commands.append("mv %s %s" % (bam_sorted+".bam", output+".bam"))
+                output_log.write("mv %s %s\n" % (bam_sorted+".bam.bai", output+".bam.bai"))
+                executed_commands.append("mv %s %s" % (bam_sorted+".bam.bai", output+".bam.bai"))
+
+        # delete temporary SAM file
+        if delete_temp and bam and os.path.exists(output+".sam"):
+            os.remove(output+".sam")
+            output_log.write("rm %s\n" % (output+".sam"))
+            executed_commands.append("rm %s" % (output+".sam"))
+            
+        output_log.close()
+        return executed_commands
+        
+    def make_index(self, fasta, index_name):
+        output = pjoin(biox.bowtie2_index_folder, index_name)
+        command = "{bowtie_build_exec} {fasta} {output}".format \
+        (bowtie_build_exec = self.bowtie_build_exec, fasta = fasta, output = output)
+        print command
+        std, err = biox.utils.cmd(command)
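
A minimal usage sketch for the Bowtie2 wrapper, assuming bowtie2_folder, bowtie2_index_folder and samtools_folder are set in the biox config and that an index named "hg19" has already been built with make_index; the read file names and output prefix below are placeholders::

    import biox

    mapper = biox.map.Bowtie2()
    mapper.set_processors(4)
    mapper.set_mode("--sensitive-local")
    # paired-end data: pass a list of two FASTQ files; output is a file name prefix
    commands = mapper.map("hg19", ["sample_1.fastq.gz", "sample_2.fastq.gz"], "sample")
    # produces sample.bam, sample.bam.bai, sample.stats and sample.log;
    # the returned list contains the shell commands that were executed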

biox/map/__init__.py

+import biox
+
+# modules
+from Bowtie import *
+from Bowtie2 import *

biox/maths/__init__.py

+try:
+    import stats
+except ImportError:
+    pass
+    
+def chisquare(matrix):
+    """Chi-square statistic and p-value for a contingency table given as a list of rows."""
+    sum_all = float(sum([sum(x) for x in matrix]))
+    def row_total(i):
+        return sum(matrix[i])
+    def col_total(j):
+        return sum([x[j] for x in matrix])
+    dim_x = len(matrix) # number of rows
+    dim_y = len(matrix[0]) # number of columns
+
+    # expected counts under independence: row total * column total / grand total
+    matrix_expected = []
+    for i in range(0, dim_x):
+        row = []
+        for j in range(0, dim_y):
+            element = row_total(i)*col_total(j)/sum_all
+            row.append(element)
+        matrix_expected.append(row)
+
+    # per-cell contributions: (observed-expected)^2 / expected
+    matrix_chi = []
+    for i in range(0, dim_x):
+        row = []
+        for j in range(0, dim_y):
+            element = matrix[i][j]-matrix_expected[i][j]
+            element *= element
+            divide_by = matrix_expected[i][j] if matrix_expected[i][j]!=0 else 1
+            element /= divide_by
+            row.append(element)
+        matrix_chi.append(row)
+    chi = sum([sum(x) for x in matrix_chi])
+    return chi, stats.chisqprob(chi, (dim_x-1)*(dim_y-1))
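
A small worked example of the chisquare helper above (numbers chosen for illustration, and assuming the bundled stats module imported cleanly so stats.chisqprob is available): for the table [[10, 20], [30, 40]] the grand total is 100, the expected count for the top-left cell is 30*40/100 = 12, and summing (observed-expected)^2/expected over the four cells gives a chi-square of about 0.794 on 1 degree of freedom::

    from biox.maths import chisquare

    # rows are groups, columns are outcome categories
    chi, p = chisquare([[10, 20], [30, 40]])
    print chi, p  # approximately 0.794 and 0.373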

biox/maths/pstat.py

+# Copyright (c) 1999-2000 Gary Strangman; All Rights Reserved.
+#
+# This software is distributable under the terms of the GNU
+# General Public License (GPL) v2, the text of which can be found at
+# http://www.gnu.org/copyleft/gpl.html. Installing, importing or otherwise
+# using this module constitutes acceptance of the terms of this License.
+#
+# Disclaimer
+# 
+# This software is provided "as-is".  There are no expressed or implied
+# warranties of any kind, including, but not limited to, the warranties
+# of merchantability and fittness for a given application.  In no event
+# shall Gary Strangman be liable for any direct, indirect, incidental,
+# special, exemplary or consequential damages (including, but not limited
+# to, loss of use, data or profits, or business interruption) however
+# caused and on any theory of liability, whether in contract, strict
+# liability or tort (including negligence or otherwise) arising in any way
+# out of the use of this software, even if advised of the possibility of
+# such damage.
+#
+# Comments and/or additions are welcome (send e-mail to:
+# strang@nmr.mgh.harvard.edu).
+# 
+"""
+pstat.py module
+
+#################################################
+#######  Written by:  Gary Strangman  ###########
+#######  Last modified:  Jun 29, 2001 ###########
+#################################################
+
+This module provides some useful list and array manipulation routines
+modeled after those found in the |Stat package by Gary Perlman, plus a
+number of other useful list/file manipulation functions.  The list-based
+functions include:
+
+      abut (source,*args)
+      simpleabut (source, addon)
+      colex (listoflists,cnums)
+      collapse (listoflists,keepcols,collapsecols,fcn1=None,fcn2=None,cfcn=None)
+      dm (listoflists,criterion)
+      flat (l)
+      linexand (listoflists,columnlist,valuelist)
+      linexor (listoflists,columnlist,valuelist)
+      linedelimited (inlist,delimiter)
+      lineincols (inlist,colsize) 
+      lineincustcols (inlist,colsizes)
+      list2string (inlist)
+      makelol(inlist)
+      makestr(x)
+      printcc (lst,extra=2)
+      printincols (listoflists,colsize)
+      pl (listoflists)
+      printl(listoflists)
+      replace (lst,oldval,newval)
+      recode (inlist,listmap,cols='all')
+      remap (listoflists,criterion)
+      roundlist (inlist,num_digits_to_round_floats_to)
+      sortby(listoflists,sortcols)
+      unique (inlist)
+      duplicates(inlist)
+      writedelimited (listoflists, delimiter, file, writetype='w')
+
+Some of these functions have alternate versions which are defined only if
+Numeric (NumPy) can be imported.  These functions are generally named as
+above, with an 'a' prefix.
+
+      aabut (source, *args)
+      acolex (a,indices,axis=1)
+      acollapse (a,keepcols,collapsecols,sterr=0,ns=0)
+      adm (a,criterion)
+      alinexand (a,columnlist,valuelist)
+      alinexor (a,columnlist,valuelist)
+      areplace (a,oldval,newval)
+      arecode (a,listmap,col='all')
+      arowcompare (row1, row2)
+      arowsame (row1, row2)
+      asortrows(a,axis=0)
+      aunique(inarray)
+      aduplicates(inarray)
+
+Currently, the code is all but completely un-optimized.  In many cases, the
+array versions of functions amount simply to aliases to built-in array
+functions/methods.  Their inclusion here is for function name consistency.
+"""
+
+## CHANGE LOG:
+## ==========
+## 01-11-15 ... changed list2string() to accept a delimiter
+## 01-06-29 ... converted exec()'s to eval()'s to make compatible with Py2.1
+## 01-05-31 ... added duplicates() and aduplicates() functions
+## 00-12-28 ... license made GPL, docstring and import requirements
+## 99-11-01 ... changed version to 0.3
+## 99-08-30 ... removed get, getstrings, put, aget, aput (into io.py)
+## 03/27/99 ... added areplace function, made replace fcn recursive
+## 12/31/98 ... added writefc function for ouput to fixed column sizes
+## 12/07/98 ... fixed import problem (failed on collapse() fcn)
+##              added __version__ variable (now 0.2)
+## 12/05/98 ... updated doc-strings
+##              added features to collapse() function
+##              added flat() function for lists
+##              fixed a broken asortrows() 
+## 11/16/98 ... fixed minor bug in aput for 1D arrays
+##
+## 11/08/98 ... fixed aput to output large arrays correctly
+
+import stats  # required 3rd party module
+import string, copy
+from types import *
+
+__version__ = 0.4
+
+###===========================  LIST FUNCTIONS  ==========================
+###
+### Here are the list functions, DEFINED FOR ALL SYSTEMS.
+### Array functions (for NumPy-enabled computers) appear below.
+###
+
+def abut (source,*args):
+    """
+Like the |Stat abut command.  It concatenates two lists side-by-side
+and returns the result.  '2D' lists are also accomodated for either argument
+(source or addon).  CAUTION:  If one list is shorter, it will be repeated
+until it is as long as the longest list.  If this behavior is not desired,
+use pstat.simpleabut().
+
+Usage:   abut(source, args)   where args=any # of lists
+Returns: a list of lists as long as the LONGEST list passed, source on the
+         'left', lists in <args> attached consecutively on the 'right'
+"""
+
+    if type(source) not in [ListType,TupleType]:
+        source = [source]
+    for addon in args:
+        if type(addon) not in [ListType,TupleType]:
+            addon = [addon]
+        if len(addon) < len(source):                # is source list longer?
+            if len(source) % len(addon) == 0:        # are they integer multiples?
+                repeats = len(source)/len(addon)    # repeat addon n times
+                origadd = copy.deepcopy(addon)
+                for i in range(repeats-1):
+                    addon = addon + origadd
+            else:
+                repeats = len(source)/len(addon)+1  # repeat addon x times,
+                origadd = copy.deepcopy(addon)      #    x is NOT an integer
+                for i in range(repeats-1):
+                    addon = addon + origadd
+                    addon = addon[0:len(source)]
+        elif len(source) < len(addon):                # is addon list longer?
+            if len(addon) % len(source) == 0:        # are they integer multiples?
+                repeats = len(addon)/len(source)    # repeat source n times
+                origsour = copy.deepcopy(source)
+                for i in range(repeats-1):
+                    source = source + origsour
+            else:
+                repeats = len(addon)/len(source)+1  # repeat source x times,
+                origsour = copy.deepcopy(source)    #   x is NOT an integer
+                for i in range(repeats-1):
+                    source = source + origsour
+                source = source[0:len(addon)]
+
+        source = simpleabut(source,addon)
+    return source
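
A quick illustration of the recycling behaviour described in the abut docstring, assuming pstat is imported from biox.maths (with its companion stats module available alongside it)::

    from biox.maths import pstat

    # the shorter list is repeated until it matches the longer one
    print pstat.abut([1, 2, 3, 4], [10, 20])
    # -> [[1, 10], [2, 20], [3, 10], [4, 20]]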
+
+
+def simpleabut (source, addon):
+    """
+Concatenates two lists as columns and returns the result.  '2D' lists
+are also accommodated for either argument (source or addon).  This DOES NOT
+repeat either list to make the 2 lists of equal length.  Beware of list pairs
+with different lengths ... the resulting list will be the length of the
+FIRST list passed.
+
+Usage:   simpleabut(source,addon)  where source, addon=list (or list-of-lists)
+Returns: a list of lists as long as source, with source on the 'left' and
+                 addon on the 'right'
+"""
+    if type(source) not in [ListType,TupleType]:
+        source = [source]
+    if type(addon) not in [ListType,TupleType]:
+        addon = [addon]
+    minlen = min(len(source),len(addon))
+    list = copy.deepcopy(source)                # start abut process
+    if type(source[0]) not in [ListType,TupleType]:
+        if type(addon[0]) not in [ListType,TupleType]:
+            for i in range(minlen):
+                list[i] = [source[i]] + [addon[i]]        # source/addon = column
+        else:
+            for i in range(minlen):
+                list[i] = [source[i]] + addon[i]        # addon=list-of-lists
+    else:
+        if type(addon[0]) not in [ListType,TupleType]:
+            for i in range(minlen):
+                list[i] = source[i] + [addon[i]]        # source=list-of-lists
+        else:
+            for i in range(minlen):
+                list[i] = source[i] + addon[i]        # source/addon = list-of-lists
+    source = list
+    return source
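
And the caution from the simpleabut docstring in action: with lists of unequal length the result keeps the length of the first list and the unmatched tail is left as-is (a hypothetical small example)::

    from biox.maths import pstat

    print pstat.simpleabut([1, 2, 3], [10, 20])
    # -> [[1, 10], [2, 20], 3]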
+
+
+def colex (listoflists,cnums):
+    """
+Extracts from listoflists the columns specified in the list 'cnums'
+(cnums can be an integer, a sequence of integers, or a string-expression that
+corresponds to a slice operation on the variable x ... e.g., 'x[3:]' will colex
+columns 3 onward from the listoflists).
+
+Usage:   colex (listoflists,cnums)
+Returns: a list-of-lists corresponding to the columns from listoflists
+         specified by cnums, in the order the column numbers appear in cnums
+"""
+    global index
+    column = 0
+    if type(cnums) in [ListType,TupleType]:   # if multiple columns to get
+        index = cnums[0]
+        column = map(lambda x: x[index], listoflists)
+        for col in cnums[1:]:
+            index = col
+            column = abut(column,map(lambda x: x[index], listoflists))
+    elif type(cnums) == StringType:              # if an 'x[3:]' type expr.
+        evalstring = 'map(lambda x: x'+cnums+', listoflists)'
+        column = eval(evalstring)
+    else:                                     # else it's just 1 col to get
+        index = cnums
+        column = map(lambda x: x[index], listoflists)
+    return column
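
For example, the three ways of addressing columns with colex (a hypothetical table; note that with the code above the string form is concatenated onto the variable x before evaluation, so only the bracketed slice is passed)::

    from biox.maths import pstat

    table = [[1, 'a', 10], [2, 'b', 20], [3, 'c', 30]]
    print pstat.colex(table, 0)        # -> [1, 2, 3]
    print pstat.colex(table, [0, 2])   # -> [[1, 10], [2, 20], [3, 30]]
    print pstat.colex(table, '[1:]')   # -> [['a', 10], ['b', 20], ['c', 30]]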
+
+
+def collapse (listoflists,keepcols,collapsecols,fcn1=None,fcn2=None,cfcn=None):
+     """
+Averages data in collapsecol, keeping all unique items in keepcols
+(using unique, which keeps unique LISTS of column numbers), retaining the
+unique sets of values in keepcols, the mean for each.  Setting fcn1
+and/or fcn2 to point to a function rather than None (e.g., stats.sterr, len)
+will append those results (e.g., the sterr, N) after each calculated mean.
+cfcn is the collapse function to apply (defaults to mean, defined here in the
+pstat module to avoid circular imports with stats.py, but harmonicmean or
+others could be passed).
+
+Usage:    collapse (listoflists,keepcols,collapsecols,fcn1=None,fcn2=None,cfcn=None)
+Returns: a list of lists with all unique permutations of entries appearing in
+     columns ("conditions") specified by keepcols, abutted with the result of
+     cfcn (if cfcn=None, defaults to the mean) of each column specified by
+     collapsecols.
+"""
+     def collmean (inlist):
+         s = 0
+         for item in inlist:
+             s = s + item
+         return s/float(len(inlist))
+
+     if type(keepcols) not in [ListType,TupleType]:
+         keepcols = [keepcols]
+     if type(collapsecols) not in [ListType,TupleType]:
+         collapsecols = [collapsecols]
+     if cfcn == None:
+         cfcn = collmean
+     if keepcols == []:
+         means = [0]*len(collapsecols)
+         for i in range(len(collapsecols)):
+             avgcol = colex(listoflists,collapsecols[i])
+             means[i] = cfcn(avgcol)
+             if fcn1:
+                 try:
+                     test = fcn1(avgcol)
+                 except:
+                     test = 'N/A'
+                 means[i] = [means[i], test]
+             if fcn2:
+                 try:
+                     test = fcn2(avgcol)
+                 except:
+                     test = 'N/A'
+                 try:
+                     means[i] = means[i] + [len(avgcol)]
+                 except TypeError:
+                     means[i] = [means[i],len(avgcol)]
+         return means
+     else:
+         values = colex(listoflists,keepcols)
+         uniques = unique(values)
+         uniques.sort()
+         newlist = []
+         if type(keepcols) not in [ListType,TupleType]:  keepcols = [keepcols]
+         for item in uniques: