Source

dsptools / lib / dspomics / ptpileup.py

#
# Copyright (c) 2011, Davide Cittaro
# All rights reserved.
# 
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * Neither the name of the <organization> nor the
#       names of its contributors may be used to endorse or promote products
#       derived from this software without specific prior written permission.
# 
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#

import tables
import pysam
import bx.wiggle
import subprocess
import numpy as np
import os
import struct

class PTPileup():
  """ The container for genomic data. This is a pytables file with one
  root group and as many nodes as chromosomes.
  """
  def __init__(self, fname = 'pileup.h5', mode = 'w'):
    filters = None
    if mode == 'w':
      # If this is a new file, write it with blosc filters
      filters = tables.Filters(complevel=5, complib='blosc')
    self._h5file = tables.openFile(fname, mode = mode, filters = filters)
  def _close(self):
    self._h5file.close()
  def _flush(self):
    self._h5file.flush()
  def hasChromosome(self, cname):
    if cname in self._h5file.root._v_children.keys():
      return True
    return False
  def addChromosome(self, cname, csize):
    if not self.hasChromosome(cname):
      self._h5file.createCArray("/", cname, tables.Float64Atom(), (csize,))
  def chromosome(self, cname):
    try:
      return self._h5file.root._f_getChild(cname)
    except tables.exceptions.NoSuchNodeError:
      None
  def sizeof(self, cname):
    return self.chromosome(cname).__len__()
  def sizeDict(self):
    return dict([(x.name, x) for x in self._h5file.listNodes("/", classname="CArray")])
  def asArray(self, cname, start = None, end = None):
    return self.chromosome(cname)[start:end]
  def importIntervalFile(self, fname, ds = 1.0, mq = 0, rd = False, fmt = "bam", csize = None):
    """Import interval files into a pytable container.
    fname: interval file name
    ds: downsample ratio
    mq: min. quality mapping (used for BAM filtering)
    rd: remove duplicates
    fmt: file format (bam or bed)
    cisze: when bed file is specified, a chromosome table should be provided as dictionary
      {'chrA_name':chrA_size,
       'chrB_name': chrB_size, ...}
    """
    fname = os.path.expanduser(fname)
    fmt = fmt.lower()
    if fmt == "sam":
      fmt = "bam"
    if ds > 1.0 or ds < 0.0:
      ds = 1.0
    if mq < 0:
      mq = 0
  
    if fmt == "bed":
      if not csize:
        raise IOError("chromosome sizes must be provided when %s format is specified" % fmt)
      else:
        giterator = open(fname)	# open file handler for later 
        # read from csize
        if type(csize) == str:
          # if csize is a string, use it as filename
          # otherwise should be a dictionary
          csize = dict([x.strip().split() for x in open(os.expanduser(fname))])
        if type(csize) != dict:
          # if it is not a dictionary
          raise TypeError("A chromosome table should be provided either as dictionary or file")
        for (chrom, size) in csize.items():
          self.addChromosome(chrom, int(size))
    elif fmt == "bam":
      gfh = pysam.Samfile(fname, "rb")	# we can use this handler right now
      for sd in gfh.header["SQ"]:
        self.addChromosome(sd["SN"], sd["LN"])
      # also create the iterator
      try:
        giterator = gfh.fetch()
      except ValueError:
        # a ValueError is raised if index is not present
        gfh.close()
        pysam.index(fname)
        gfh = pysam.Samfile(fname, "rb")
        giterator = gfh.fetch()
    else:
      raise TypeError("Format %s not supported, yet" % fmt)
    
    pstart = -1
    pend = -1
    pchrom = ''
    keepit = True
    removed = 0
    for n, x in enumerate(giterator):
      if fmt == "bam":
        if x.mapq < mq: continue
        (chrom, start, end) = (gfh.getrname(x.rname), x.pos, x.pos + x.rlen)
      elif fmt == "bed":
        fields = x.strip().split()
        (chrom, start, end) = (fields[0], int(fields[1]), int(fields[2]))
      if rd and start == pstart and end == pend and chrom == pchrom:
        keepit = False
      if np.random.uniform() > ds:
        keepit = False
      if keepit:
        pstart = start
        pend = end
        pchrom = chrom
        self.chromosome(chrom)[start:end] += 1
        if n % 131072 == 0:
          self._flush()
      else:
        removed += 1
      keepit = True  
    gfh.close()
  def importProfileFile(self, fname, fmt = 'wig', csize = None):
    fmt = fmt.lower()
    if fmt == "bdg":
      # since we use the bx.wiggle module, we can treat bedgraph and wiggle
      # as they are the same format.
      fmt = "wig"
    elif fmt == "bigwig":
      fmt = "bw"
    if fmt == "wig":
      if not csize:
        raise IOError("chromosome sizes must be provided when %s format is specified" % fmt)
      else:
        giterator = open(fname, mode = 'r')
        # read from csize
        if type(csize) == str:
          # if csize is a string, use it as filename
          # otherwise should be a dictionary
          csize = dict([x.strip().split() for x in open(os.expanduser(fname))])
        if type(csize) != dict:
          # if it is not a dictionary
          raise TypeError("A chromosome table should be provided either as dictionary or file")
        for (chrom, size) in csize.items():
          self.addChromosome(chrom, int(size))
    elif fmt == "bw":
      for (chrom, size) in getChromosomeSizesFromBigWig(fname):
        self.addChromosome(chrom, size)
    
      giterator = subprocess.Popen(["bigWigToBedGraph", fname, "stdout"], shell = False, stdout = subprocess.PIPE).stdout 
    else:  
      raise TypeError("Format %s not supported, yet" % fmt)
    for (chrom, position, value) in bx.wiggle.Reader(giterator):
      self.chromosome(chrom)[position] = value
      if position % 131072 == 0:
        self._flush()


def getChromosomeSizesFromBigWig(bwname):
  csize = {}
  bw = open(os.path.expanduser(bwname), "rb")
  
  # read magic number to guess endianness
  magic = fh.read(4)
  if magic == '&\xfc\x8f\x88':
    endianness = '<'
  elif magic == '\x88\x8f\xfc&':
    endianness = '>'
  else:
    raise IOError("The file is not in bigwig format")
  # read the header
  (version, zoomLevels, chromosomeTreeOffset, 
  fullDataOffset, fullIndexOffset, fieldCount, definedFieldCount, 
  autoSqlOffset, totalSummaryOffset, uncompressBufSize, reserved) = struct.unpack(endianness + 'HHQQQHHQQIQ', fh.read(60))
  
  if version < 3:
    raise IOError("Bigwig files version <3 are not supported")
  
  # go to the data
  fh.seek(chromosomeTreeOffset)
  
  # read magic again
  magic = fh.read(4)
  if magic == '\x91\x8c\xcax':
    endianness = '<'
  elif magic == 'x\xca\x8c\x91':
    endianness = '>'
  else:
    raise ValueError("Wrong magic for this bigwig data file")
  
  (blockSize, keySize, valSize, itemCount, reserved) = struct.unpack(endianness + 'IIIQQ', fh.read(28))
  (isLeaf, reserved, count) = struct.unpack(endianness + 'BBH', fh.read(4))
  if isLeaf:
    for n in range(count):
      (key, chromId, chromSize) = struct.unpack(endianness + str(keySize) + 'sII', fh.read(keySize + 2 * 4))
      # we have chrom and size
      csize[key.replace('\x00', '')] = chromSize
    return csize  
#  (key, childOffset) = struct.unpack(endianness + str(keySize) + 'sQ', fh.read(keySize + 8))
####
  (dataCount) = struct.unpack("<I", fh.read(4))
  (chromId, chromStart, chromEnd, itemStep, itemSpan, type, reserved, itemCount) = struct.unpack(endianness + 'IIIIIBBH', fh.read(24))