Commits

Anonymous committed 7a363ca

First work on dsptools

  • Parent commits d52392e

Files changed (6)

+#!/usr/bin/env python
+
+#
+# Copyright (c) 2010-2012, Davide Cittaro
+# All rights reserved.
+# 
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#     * Neither the name of the <organization> nor the
+#       names of its contributors may be used to endorse or promote products
+#       derived from this software without specific prior written permission.
+# 
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+
+import os
+import sys
+import argparse
+
+_version = '0.0.1'
+
+def main():
+
+  parser = argparse.ArgumentParser(prog="dsptools", description="A set of tools to operate on genomic signals")
+  parser.add_argument("--version", action="version", version="%(prog)s " + _version)
+  subparsers = parser.add_subparsers(dest="predicate")
+  
+  # pileup predicate will convert input into serialized arrays for any operation
+  pileup_parser = subparsers.add_parser("pileup", help="Serialize various data into data arrays")
+  pileup_parser.add_argument("-i", "--filein", dest="input_file",  help="Input file", required=True, default=None)
+  pileup_parser.add_argument("-o", "--fileout", dest="output_file",  help="Output file prefix", default="dspout")
+  pileup_parser.add_argument("-f", "--format", dest="file_format",  help="File format", choices=['bam', 'bigwig', 'bed', 'bar', 'wig'], default='bam')
+  pileup_parser.add_argument("-c", "--csize", dest="chrom_size",  help="File containing chromosome sizes", default=None)
+  pileup_parser.add_argument("-n", "--nodup", dest="nodup", action="store_true", help="Naive duplicate removal")
+  pileup_parser.add_argument("-q", "--quality", dest="qual_filter", type=int, help="Quality filter for bam or bed", default=0)
+
+  # dump predicate will convert serialized arrays into data formats widely used in genomic research
+  dump_parser = subparsers.add_parser("dump", help="Convert serialized data into convenient file formats")
+  dump_parser.add_argument("-i", "--filein", dest="input_file",  help="Input file", required=True, default=None)
+  dump_parser.add_argument("-o", "--fileout", dest="output_file",  help="Output file prefix", default="dspout")
+  dump_parser.add_argument("-f", "--format", dest="file_format",  help="File format", choices=['bigwig', 'bedgraph'], default='bigwig')
+  dump_parser.add_argument("-w", "--winfun", dest="window_function", help="Window Function", choices=['mean', 'median', 'max'], default='mean')
+  dump_parser.add_argument("-s", "--step", dest="step_size", help="Step size", type=int, default=100)
+  
+  # fir parser will perform smoothing using a FIR filter
+  firdenoise_parser = subparsers.add_parser("firdenoise", help="Denoise data using FIR smoothing")
+  firdenoise_parser.add_argument("-i", "--filein", dest="input_file",  help="Input file", required=True, default=None)
+  firdenoise_parser.add_argument("-o", "--fileout", dest="output_file",  help="Output file prefix", default="dspout")
+  firdenoise_parser.add_argument("-f", "--fir", dest="fir_type", help="Filter type", default="hanning", choices=["hanning", "hamming", "blackman", "bartlett", "flat", "flattop", "nuttall", "gauss"])
+  firdenoise_parser.add_argument("-s", "--size", dest="window_size", help="Filter window size", type=int, default=50000)
+  
+  
+  
+  # wavelet parser will perform wavelet denoising
+  wltdenoise = subparsers.add_parser("wltdenoise", help="Denoise data using wavelets (requires pywavelets)")
+  wltdenoise.add_argument("-i", "--filein", dest="input_file",  help="Input file", required=True, default=None)
+  wltdenoise.add_argument("-o", "--fileout", dest="output_file",  help="Output file prefix", default="dspout")
+  wltdenoise.add_argument("-w", "--wavelet", dest="wavelet", help="Wavelet to use", default="coif4")
+  wltdenoise.add_argument("-s", "--size", dest="window_size", help="Expected signal size", default=50000, type=int)
+  
+  
+  # eval parser will evaluate an arithmetic expression
+  eval_parser = subparsers.add_parser("eval", help="Evaluate an arithmetic expression on serialized data")
+  eval_parser.add_argument("-o", "--fileout", dest="output_file",  help="Output file prefix", default="dspout")
+
+
+  # threshold parser will calculate a threshold
+  threshold_parser = subparsers.add_parser("threshold", help="Apply a threshold to data")
+  threshold_parser.add_argument("-i", "--filein", dest="input_file",  help="Input file", required=True, default=None)
+  threshold_parser.add_argument("-o", "--fileout", dest="output_file",  help="Output file prefix", default="dspout")
+  threshold_parser.add_argument("-t", "--threshold", dest="threshold_type", help="Thresholding approach to use", choices=["otsu","dyn_otsu","float"], default="otsu")
+  threshold_parser.add_argument("-k", "--keep", dest="keep_values", help="Rule to filter values", choices=["ge", "g", "le", "l"], default="ge")
+  threshold_parser.add_argument("-f", "--fixedt", dest="fixed_t", help="Fixed float threshold", type=float, default=0.0)
+  
+  # peak parser will find peak boundaries
+  peaks_parser = subparsers.add_parser("peak", help="Find peaks in quantitative data")
+  peaks_parser.add_argument("-i", "--filein", dest="input_file",  help="Input file", required=True, default=None)
+  peaks_parser.add_argument("-o", "--fileout", dest="output_file",  help="Output file prefix", default="dspout")
+  peaks_parser.add_argument("-e", "--epsilon", dest="dbscan_eps", help="DBScan Epsilon vale (distance)", type=int, default=500)
+  peaks_parser.add_argument("-m", "--minpeaks", dest="dbscan_min", help="DBScan min. number of peaks", type=int, default=1)
+  peaks_parser.add_argument("-t", "--feature", dest="score_feature", help="Feature to score", choices=["height", "area", "width"], default="height")
+  peaks_parser.add_argument("-d", "--distribution", dest="score_dist", help="Score distribution model (leave blank for best fit)", choices=["norm", "lognorm", "powerlaw", "gamma"], default=None)
+  
+  args = parser.parse_args(sys.argv[1:])
+  
+  print args
+  
+  
+if __name__ == '__main__':
+    try:
+        main()
+    except KeyboardInterrupt:
+        sys.stderr.write("Exiting" + os.linesep)
+        sys.exit(1)
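
A minimal usage sketch of the front end above (reads.bam is a placeholder name): each subcommand fills an argparse namespace keyed by the predicate field.

    import argparse

    # condensed reconstruction of the dsptools parser, for illustration only
    parser = argparse.ArgumentParser(prog="dsptools")
    subparsers = parser.add_subparsers(dest="predicate")
    pileup = subparsers.add_parser("pileup")
    pileup.add_argument("-i", "--filein", dest="input_file", required=True)
    pileup.add_argument("-q", "--quality", dest="qual_filter", type=int, default=0)

    # equivalent to running: dsptools pileup -i reads.bam -q 10
    args = parser.parse_args(["pileup", "-i", "reads.bam", "-q", "10"])
    print(args)  # Namespace(input_file='reads.bam', predicate='pileup', qual_filter=10)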

File lib/dsptools/IO/__init__.py

Empty file added.

File lib/dsptools/__init__.py

+__version__ = '0.1'

File lib/dsptools/__init__.pyc

Binary file added.

File lib/dsptools/fir.py

+#
+# Copyright (c) 2011, Davide Cittaro
+# All rights reserved.
+# 
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#     * Neither the name of the <organization> nor the
+#       names of its contributors may be used to endorse or promote products
+#       derived from this software without specific prior written permission.
+# 
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+import tables
+import numpy as np
+import scipy.signal as ssig
+
+# Functions
+
+def nextpow2(x):
+  return np.ceil(np.log2(np.abs(x)))
+
+
+def pad_zero(data, padded, pos = 1):
+  # ok, this is tough... we have to pad a signal that is written on disk.
+  # A better way to do it? Mmm, I would say we write a temporary copy.
+  
+  l_data = len(data)
+  l_padded = len(padded)
+  pad_array = np.zeros(l_padded / 2 - l_data / 2)
+  npad = len(pad_array)
+  if pos == 0:
+    padded[:2*npad] = np.concatenate([pad_array, pad_array])
+    padded[2 * npad:] = data
+  elif pos == 1:
+    padded[:npad] = pad_array
+    padded[npad:-npad] = data
+    padded[-npad:] = pad_array
+  elif pos == 2:
+    padded[:-2 * npad] = data
+    padded[-2 * npad:] = np.concatenate([pad_array, pad_array])
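+  # Example: with len(data) == 4 and len(padded) == 8, npad == 2 and
+  #   pos=0 -> [0, 0, 0, 0, d0, d1, d2, d3]
+  #   pos=1 -> [0, 0, d0, d1, d2, d3, 0, 0]
+  #   pos=2 -> [d0, d1, d2, d3, 0, 0, 0, 0]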
+
+# Classes
+
+
+class __fir__():
+  def __init__(self, fir_size):
+    self._FIR = np.array([])
+  def smooth(self, signal):
+    # pad signal
+    l_padded = len(signal) + len(self._FIR)
+
+    fname = 'changeme'
+    filters = tables.Filters(complevel=5, complib='blosc')
+
+    padded_file = tables.openFile(fname, mode = 'w', filters = filters)
+    padded = padded_file.createCArray("/", "pad", tables.Float64Atom(), (l_padded,))
+    pad_zero(signal, padded)  # pad_zero fills the on-disk array in place
+    
+  
+
+
+class hanning(__fir__):
+  def __init__(self, fir_size):
+    self._FIR = ssig.hanning(fir_size)
+
+class hamming(__fir__):
+  def __init__(self, fir_size):
+    self._FIR = ssig.hamming(fir_size)
+
+class blackman(__fir__):
+  def __init__(self, fir_size):
+    self._FIR = ssig.blackman(fir_size)
+
+class bartlett(__fir__):
+  def __init__(self, fir_size):
+    self._FIR = ssig.bartlett(fir_size)
+
+class flattop(__fir__):
+  def __init__(self, fir_size):
+    self._FIR = ssig.flattop(fir_size)
+
+class nuttall(__fir__):
+  def __init__(self, fir_size):
+    self._FIR = ssig.nuttall(fir_size)
+    
+class morlet(__fir__):
+  def __init__(self, fir_size):
+    self._FIR = ssig.morlet(fir_size)
+
+class chebwin(__fir__):
+  def __init__(self, fir_size, dB):
+    self._FIR = ssig.chebwin(fir_size, dB)
+
+class flat(__fir__):
+  def __init__(self, fir_size):
+    self._FIR = np.ones(fir_size, 'd')
+
+class gaussian(__fir__):
+  def __init__(self, fir_size, std_dev):
+    self._FIR = ssig.gaussian(fir_size, std_dev)
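
The smooth() method above is left as a stub; here is a minimal in-memory sketch of the zero-pad-and-convolve step it appears to be building toward (normalizing the window is an assumption, made so the smoothed signal keeps the input's scale).

    import numpy as np
    import scipy.signal as ssig

    def smooth_in_memory(signal, fir):
        fir = fir / fir.sum()  # assumed normalization
        npad = len(fir) // 2
        # zero-pad both ends, mirroring pad_zero(..., pos=1), then convolve
        padded = np.concatenate([np.zeros(npad), signal, np.zeros(npad)])
        return np.convolve(padded, fir, mode="same")[npad:npad + len(signal)]

    noisy = np.random.normal(size=2000)
    smoothed = smooth_in_memory(noisy, ssig.hanning(51))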

File lib/dsptools/ptpileup.py

+#
+# Copyright (c) 2011, Davide Cittaro
+# All rights reserved.
+# 
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#     * Neither the name of the <organization> nor the
+#       names of its contributors may be used to endorse or promote products
+#       derived from this software without specific prior written permission.
+# 
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+import tables
+import pysam
+import bx.wiggle
+import bx.bbi.bigwig_file
+import subprocess
+import numpy as np
+import os
+import struct
+
+class PTPileup():
+  """ The container for genomic data. This is a pytables file with one
+  root group and as many nodes as chromosomes.
+  """
+  def __init__(self, fname = 'pileup.h5', mode = 'w'):
+    filters = None
+    if mode == 'w':
+      # If this is a new file, write it with blosc filters
+      filters = tables.Filters(complevel=5, complib='blosc')
+    self._h5file = tables.openFile(fname, mode = mode, filters = filters)
+  def _close(self):
+    self._h5file.close()
+  def _flush(self):
+    self._h5file.flush()
+  def hasChromosome(self, cname):
+    return cname in self._h5file.root._v_children
+  def addChromosome(self, cname, csize):
+    if not self.hasChromosome(cname):
+      self._h5file.createCArray("/", cname, tables.Float64Atom(), (csize,))
+  def chromosome(self, cname):
+    try:
+      return self._h5file.root._f_getChild(cname)
+    except tables.exceptions.NoSuchNodeError:
+      return None
+  def sizeof(self, cname):
+    return len(self.chromosome(cname))
+  def sizeDict(self):
+    return dict([(x.name, len(x)) for x in self._h5file.listNodes("/", classname="CArray")])
+  def asArray(self, cname, start = None, end = None):
+    return self.chromosome(cname)[start:end]
+  def importIntervalFile(self, fname, ds = 1.0, mq = 0, rd = False, fmt = "bam", csize = None):
+    """Import interval files into a pytable container.
+    fname: interval file name
+    ds: downsample ratio
+    mq: min. quality mapping (used for BAM filtering)
+    rd: remove duplicates
+    fmt: file format (bam or bed)
+    csize: when bed file is specified, a chromosome table should be provided as dictionary
+      {'chrA_name':chrA_size,
+       'chrB_name': chrB_size, ...}
+    """
+    fname = os.path.expanduser(fname)
+    fmt = fmt.lower()
+    if fmt == "sam":
+      fmt = "bam"
+    if ds > 1.0 or ds < 0.0:
+      ds = 1.0
+    if mq < 0:
+      mq = 0
+  
+    if fmt == "bed":
+      if not csize:
+        raise IOError("chromosome sizes must be provided when %s format is specified" % fmt)
+      else:
+        giterator = open(fname)	# open file handler for later 
+        # read from csize
+        if type(csize) == str:
+          # if csize is a string, use it as filename
+          # otherwise should be a dictionary
+          csize = dict([x.strip().split() for x in open(os.path.expanduser(csize))])
+        if type(csize) != dict:
+          # if it is not a dictionary
+          raise TypeError("A chromosome table should be provided either as dictionary or file")
+        for (chrom, size) in csize.items():
+          self.addChromosome(chrom, int(size))
+    elif fmt == "bam":
+      gfh = pysam.Samfile(fname, "rb")	# we can use this handler right now
+      for sd in gfh.header["SQ"]:
+        self.addChromosome(sd["SN"], sd["LN"])
+      # also create the iterator
+      try:
+        giterator = gfh.fetch()
+      except ValueError:
+        # a ValueError is raised if index is not present
+        gfh.close()
+        pysam.index(fname)
+        gfh = pysam.Samfile(fname, "rb")
+        giterator = gfh.fetch()
+    else:
+      raise TypeError("Format %s not supported, yet" % fmt)
+    
+    pstart = -1
+    pend = -1
+    pchrom = ''
+    keepit = True
+    removed = 0
+    for n, x in enumerate(giterator):
+      if fmt == "bam":
+        if x.mapq < mq: continue
+        (chrom, start, end) = (gfh.getrname(x.rname), x.pos, x.pos + x.rlen)
+      elif fmt == "bed":
+        fields = x.strip().split()
+        (chrom, start, end) = (fields[0], int(fields[1]), int(fields[2]))
+      if rd and start == pstart and end == pend and chrom == pchrom:
+        keepit = False
+      if np.random.uniform() > ds:
+        keepit = False
+      if keepit:
+        pstart = start
+        pend = end
+        pchrom = chrom
+        self.chromosome(chrom)[start:end] += 1
+        if n % 131072 == 0:
+          self._flush()
+      else:
+        removed += 1
+      keepit = True  
+    gfh.close()
+  def importProfileFile(self, fname, fmt = 'wig', csize = None):
+    fmt = fmt.lower()
+    if fmt == "wig" or fmt == "bdg":
+      if not csize:
+        raise IOError("chromosome sizes must be provided when %s format is specified" % fmt)
+      else:
+        giterator = open(fname, mode = 'r')
+        # read from csize
+        if type(csize) == str:
+          # if csize is a string, use it as filename
+          # otherwise should be a dictionary
+          csize = dict([x.strip().split() for x in open(os.path.expanduser(csize))])
+        if type(csize) != dict:
+          # if it is not a dictionary
+          raise TypeError("A chromosome table should be provided either as dictionary or file")
+        for (chrom, size) in csize.items():
+          self.addChromosome(chrom, int(size))
+      for (chrom, position, value) in bx.wiggle.Reader(giterator):
+        self.chromosome(chrom)[position] = value
+        if position % 131072 == 0:
+          self._flush()
+    elif fmt == "bw" or fmt == "bigwig":
+      fh = open(os.path.expanduser(fname), "rb")
+      bwh = bx.bbi.bigwig_file.BigWigFile(fh)
+      for (chrom, size) in getChromosomeSizesFromBigWig(fname).items():
+        self.addChromosome(chrom, size)
+        self.chromosome(chrom)[:] = bwh.get_as_array(chrom, 0, size)
+      fh.close()
+    else:  
+      raise TypeError("Format %s not supported, yet" % fmt)
+
+
+def getChromosomeSizesFromBigWig(bwname):
+  csize = {}
+  fh = open(os.path.expanduser(bwname), "rb")
+  
+  # read magic number to guess endianness
+  magic = fh.read(4)
+  if magic == '&\xfc\x8f\x88':
+    endianness = '<'
+  elif magic == '\x88\x8f\xfc&':
+    endianness = '>'
+  else:
+    raise IOError("The file is not in bigwig format")
+  # read the header
+  (version, zoomLevels, chromosomeTreeOffset, 
+  fullDataOffset, fullIndexOffset, fieldCount, definedFieldCount, 
+  autoSqlOffset, totalSummaryOffset, uncompressBufSize, reserved) = struct.unpack(endianness + 'HHQQQHHQQIQ', fh.read(60))
+  
+  if version < 3:
+    raise IOError("Bigwig files version <3 are not supported")
+  
+  # go to the data
+  fh.seek(chromosomeTreeOffset)
+  
+  # read magic again
+  magic = fh.read(4)
+  if magic == '\x91\x8c\xcax':
+    endianness = '<'
+  elif magic == 'x\xca\x8c\x91':
+    endianness = '>'
+  else:
+    raise ValueError("Wrong magic for this bigwig data file")
+  
+  (blockSize, keySize, valSize, itemCount, reserved) = struct.unpack(endianness + 'IIIQQ', fh.read(28))
+  (isLeaf, reserved, count) = struct.unpack(endianness + 'BBH', fh.read(4))
+  if isLeaf:
+    for n in range(count):
+      (key, chromId, chromSize) = struct.unpack(endianness + str(keySize) + 'sII', fh.read(keySize + 2 * 4))
+      # we have chrom and size
+      csize[key.replace('\x00', '')] = chromSize
+    fh.close()  
+    return csize  
+#  (key, childOffset) = struct.unpack(endianness + str(keySize) + 'sQ', fh.read(keySize + 8))
+####
+  # non-leaf chromosome B-tree nodes: the walk below is not implemented yet
+  (dataCount,) = struct.unpack(endianness + 'I', fh.read(4))
+  (chromId, chromStart, chromEnd, itemStep, itemSpan, sectionType, reserved, itemCount) = struct.unpack(endianness + 'IIIIIBBH', fh.read(24))
+
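
A minimal usage sketch of PTPileup as defined above; the file names are placeholders, and the BAM index is built on the fly if missing.

    from dsptools.ptpileup import PTPileup, getChromosomeSizesFromBigWig

    # build an HDF5 pileup from a BAM file, dropping reads below
    # mapping quality 10 and removing naive duplicates
    pileup = PTPileup("sample.h5", mode="w")
    pileup.importIntervalFile("reads.bam", mq=10, rd=True, fmt="bam")
    coverage = pileup.asArray("chr1", 0, 1000)  # per-base depth as a numpy array
    pileup._close()

    # chromosome sizes straight from a bigwig header
    sizes = getChromosomeSizesFromBigWig("signal.bw")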