Commits

Davide Cittaro committed 1c7fcc2

new directory structure

  • Parent commits 3728a7f

Files changed (3)

File examples/bam2h5.py

+import tables
+import pysam
+import sys
+import getopt
+import numpy as np
+import os
+import dspomics.ptpileup
+
+def errlog(*msgl):
+  msg = ' '.join([str(x) for x in msgl])
+  sys.stderr.write(msg + os.linesep)
+
+def main():
+  # some variables
+  samFiles = [None]
+  h5fname = "pileup.h5"
+  downsample = 1.0
+  mapQuality = 0
+  removeDuplicates = False
+
+  # get command line options
+  try:
+    opts, args = getopt.getopt(sys.argv[1:], "o:d:rq:")
+  except getopt.GetoptError as err:
+    errlog(err)
+    sys.exit(2)
+  
+  for o, a in opts:
+    if o == "-o":
+      h5fname = a
+    elif o == "-d":
+      try:
+        downsample = float(a)
+      except ValueError as err:
+        errlog(err)
+        sys.exit(2)
+      if downsample > 1 or downsample < 0:
+        downsample = 1.0
+    elif o == "-r":
+      removeDuplicates = True
+    elif o == "-q":
+      try:
+        mapQuality = int(a)
+      except ValueError as err:
+        errlog(err)
+        sys.exit(2)
+      if mapQuality < 0:
+        mapQuality = 0
+  samFiles = args
+  h5ob = dspomics.ptpileup.PTPileup(h5fname)
+  for fname in samFiles:
+    h5ob.importIntervalFile(fname, ds = downsample, mq = mapQuality, rd = removeDuplicates, fmt = "bam")
+  h5ob._close()
+  
+if __name__ == "__main__":
+  main()
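
For reference, a typical invocation of this script (file names below are illustrative, and lib/ is assumed to be on the PYTHONPATH so that dspomics can be imported) would be:

  python examples/bam2h5.py -o pileup.h5 -d 0.5 -q 20 -r sample1.bam sample2.bam

where -o sets the output HDF5 file, -d the downsampling ratio (0-1), -q the minimum mapping quality and -r enables duplicate removal; the remaining arguments are the BAM files to pile up.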

File lib/dspomics/__init__.py

Empty file added.

File lib/dspomics/ptpileup.py

+import tables
+import pysam
+import bx.wiggle
+import subprocess
+import numpy as np
+import os
+import struct
+
+class PTPileup():
+  """ The container for genomic data. This is a pytables file with one
+  root group and as many nodes as chromosomes.
+  """
+  def __init__(self, fname = 'pileup.h5', mode = 'w'):
+    filters = None
+    if mode == 'w':
+      # If this is a new file, write it with blosc filters
+      filters = tables.Filters(complevel=5, complib='blosc')
+    self._h5file = tables.openFile(fname, mode = mode, filters = filters)
+  def _close(self):
+    self._h5file.close()
+  def _flush(self):
+    self._h5file.flush()
+  def hasChromosome(self, cname):
+    if cname in self._h5file.root._v_children.keys():
+      return True
+    return False
+  def addChromosome(self, cname, csize):
+    if not self.hasChromosome(cname):
+      self._h5file.createCArray("/", cname, tables.Float64Atom(), (csize,))
+  def chromosome(self, cname):
+    try:
+      return self._h5file.root._f_getChild(cname)
+    except tables.exceptions.NoSuchNodeError:
+      return None
+  def sizeof(self, cname):
+    return self.chromosome(cname).__len__()
+  def sizeDict(self):
+    return dict([(x.name, len(x)) for x in self._h5file.listNodes("/", classname="CArray")])
+  def asArray(self, cname, start = None, end = None):
+    return self.chromosome(cname)[start:end]
+  def importIntervalFile(self, fname, ds = 1.0, mq = 0, rd = False, fmt = "bam", csize = None):
+    """Import interval files into a pytable container.
+    fname: interval file name
+    ds: downsample ratio
+    mq: min. quality mapping (used for BAM filtering)
+    rd: remove duplicates
+    fmt: file format (bam or bed)
+    cisze: when bed file is specified, a chromosome table should be provided as dictionary
+      {'chrA_name':chrA_size,
+       'chrB_name': chrB_size, ...}
+    """
+    fname = os.path.expanduser(fname)
+    fmt = fmt.lower()
+    if fmt == "sam":
+      fmt = "bam"
+    if ds > 1.0 or ds < 0.0:
+      ds = 1.0
+    if mq < 0:
+      mq = 0
+  
+    if fmt == "bed":
+      if not csize:
+        raise IOError("chromosome sizes must be provided when %s format is specified" % fmt)
+      else:
+        giterator = open(fname)	# open file handler for later 
+        # read from csize
+        if type(csize) == str:
+          # if csize is a string, treat it as the name of a file listing
+          # "chrom_name chrom_size" pairs; otherwise it should be a dictionary
+          csize = dict([x.strip().split() for x in open(os.path.expanduser(csize))])
+        if type(csize) != dict:
+          # if it is not a dictionary
+          raise TypeError("A chromosome table should be provided either as dictionary or file")
+        for (chrom, size) in csize.items():
+          self.addChromosome(chrom, int(size))
+    elif fmt == "bam":
+      gfh = pysam.Samfile(fname, "rb")	# we can use this handler right now
+      for sd in gfh.header["SQ"]:
+        self.addChromosome(sd["SN"], sd["LN"])
+      # also create the iterator
+      try:
+        giterator = gfh.fetch()
+      except ValueError:
+        # a ValueError is raised if index is not present
+        gfh.close()
+        pysam.index(fname)
+        gfh = pysam.Samfile(fname, "rb")
+        giterator = gfh.fetch()
+    else:
+      raise TypeError("Format %s not supported, yet" % fmt)
+    
+    pstart = -1
+    pend = -1
+    pchrom = ''
+    keepit = True
+    removed = 0
+    for n, x in enumerate(giterator):
+      if fmt == "bam":
+        if x.mapq < mq: continue
+        (chrom, start, end) = (gfh.getrname(x.rname), x.pos, x.pos + x.rlen)
+      elif fmt == "bed":
+        fields = x.strip().split()
+        (chrom, start, end) = (fields[0], int(fields[1]), int(fields[2]))
+      if rd and start == pstart and end == pend and chrom == pchrom:
+        keepit = False
+      if np.random.uniform() > ds:
+        keepit = False
+      if keepit:
+        pstart = start
+        pend = end
+        pchrom = chrom
+        self.chromosome(chrom)[start:end] += 1
+        if n % 131072 == 0:
+          self._flush()
+      else:
+        removed += 1
+      keepit = True  
+    gfh.close()
+  def importProfileFile(self, fname, fmt = 'wig', csize = None):
+    fmt = fmt.lower()
+    if fmt == "bdg":
+      # since we use the bx.wiggle module, we can treat bedgraph and wiggle
+      # as they are the same format.
+      fmt = "wig"
+    elif fmt == "bigwig":
+      fmt = "bw"
+    if fmt == "wig":
+      if not csize:
+        raise IOError("chromosome sizes must be provided when %s format is specified" % fmt)
+      else:
+        giterator = open(fname, mode = 'r')
+        # read from csize
+        if type(csize) == str:
+          # if csize is a string, treat it as the name of a file listing
+          # "chrom_name chrom_size" pairs; otherwise it should be a dictionary
+          csize = dict([x.strip().split() for x in open(os.path.expanduser(csize))])
+        if type(csize) != dict:
+          # if it is not a dictionary
+          raise TypeError("A chromosome table should be provided either as dictionary or file")
+        for (chrom, size) in csize.items():
+          self.addChromosome(chrom, int(size))
+    elif fmt == "bw":
+      for (chrom, size) in getChromosomeFromBigWig(fname).items():
+        self.addChromosome(chrom, size)
+    
+      giterator = subprocess.Popen(["bigWigToBedGraph", fname, "stdout"], shell = False, stdout = subprocess.PIPE).stdout 
+    else:  
+      raise TypeError("Format %s not supported, yet" % fmt)
+    for (chrom, position, value) in bx.wiggle.Reader(giterator):
+      self.chromosome(chrom)[position] = value
+      if position % 131072 == 0:
+        self._flush()
+
+
+def getChromosomeFromBigWig(bwname):
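+  """Read the chromosome names and sizes stored in the chromosome B+ tree of a
+  bigwig file; returns a dictionary {chrom_name: chrom_size}.
+  """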
+  csize = {}
+  fh = open(os.path.expanduser(bwname), "rb")
+  
+  # read magic number to guess endianness
+  magic = fh.read(4)
+  if magic == '&\xfc\x8f\x88':
+    endianness = '<'
+  elif magic == '\x88\x8f\xfc&':
+    endianness = '>'
+  else:
+    raise IOError("The file is not in bigwig format")
+  # read the header
+  (version, zoomLevels, chromosomeTreeOffset, 
+  fullDataOffset, fullIndexOffset, fieldCount, definedFieldCount, 
+  autoSqlOffset, totalSummaryOffset, uncompressBufSize, reserved) = struct.unpack(endianness + 'HHQQQHHQQIQ', fh.read(60))
+  
+  if version < 3:
+    raise IOError("Bigwig files version <3 are not supported")
+  
+  # go to the data
+  fh.seek(chromosomeTreeOffset)
+  
+  # read magic again
+  magic = fh.read(4)
+  if magic == '\x91\x8c\xcax':
+    endianness = '<'
+  elif magic == 'x\xca\x8c\x91':
+    endianness = '>'
+  else:
+    raise ValueError("Wrong magic for this bigwig data file")
+  
+  (blockSize, keySize, valSize, itemCount, reserved) = struct.unpack(endianness + 'IIIQQ', fh.read(28))
+  (isLeaf, reserved, count) = struct.unpack(endianness + 'BBH', fh.read(4))
+  for n in range(count):
+    (key, chromId, chromSize) = struct.unpack(endianness + str(keySize) + 'sII', fh.read(keySize + 2 * 4))
+    # we have chrom and size
+    csize[key.replace('\x00', '')] = chromSize
+  return csize  
+#  (key, childOffset) = struct.unpack(endianness + str(keySize) + 'sQ', fh.read(keySize + 8))
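
As a minimal usage sketch of the class above (the file and chromosome names are placeholders, not part of the commit):

  from dspomics.ptpileup import PTPileup

  pt = PTPileup("pileup.h5")                                      # create the HDF5 container
  pt.importIntervalFile("sample.bam", mq=20, rd=True, fmt="bam")  # pile up reads from a BAM file
  coverage = pt.asArray("chr1", 0, 1000)                          # per-base coverage as a numpy array
  pt._close()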