Commits

Davide Cittaro committed 677d52f

new directory structure

  • Participants
  • Parent commits 1c7fcc2

Comments (0)

Files changed (3)

File bam2h5.py

-import tables
-import pysam
-import sys
-import getopt
-import numpy as np
-import os
-import dspomics.ptpileup
-
def errlog(*msgl):
  """Write all arguments to stderr, space-separated, ending with the
  platform line separator. Non-string arguments are str()-converted."""
  text = ' '.join(map(str, msgl))
  sys.stderr.write(text + os.linesep)
-
def main():
  """Command-line entry point: pile one or more BAM files into an HDF5
  pileup container.

  Options:
    -o FILE   output h5 file name (default: pileup.h5)
    -d RATIO  downsample ratio; values outside [0, 1] reset to 1.0
    -r        remove duplicate reads
    -q INT    minimum mapping quality; negative values reset to 0
  Remaining positional arguments are the BAM files to import.
  """
  # defaults
  h5fname = "pileup.h5"
  downsample = 1.0
  mapQuality = 0
  removeDuplicates = False

  # get command line options
  try:
    opts, args = getopt.getopt(sys.argv[1:], "o:d:rq:")
  except getopt.GetoptError as err:  # `as` syntax: valid on py2.6+ and py3
    errlog(err)
    sys.exit(2)

  for o, a in opts:
    if o == "-o":
      h5fname = a
    elif o == "-d":
      try:
        downsample = float(a)
      except ValueError as err:
        errlog(err)
        sys.exit(2)
      if downsample > 1 or downsample < 0:
        downsample = 1.0
    elif o == "-r":
      removeDuplicates = True
    elif o == "-q":
      try:
        mapQuality = int(a)
      except ValueError as err:
        errlog(err)
        sys.exit(2)
      if mapQuality < 0:
        mapQuality = 0
  samFiles = args
  # bug fix: the module is imported as dspomics.ptpileup, but the original
  # referenced a non-existent `dspomic` name (NameError at runtime).
  h5ob = dspomics.ptpileup.PTPileup(h5fname)
  for fname in samFiles:
    h5ob.importIntervalFile(fname, ds = downsample, mq = mapQuality, rd = removeDuplicates, fmt = "bam")
  h5ob._close()
-  
# Run the importer only when executed as a script (not on import).
if __name__ == "__main__":
  main()

File dspomics/__init__.py

Empty file removed.

File dspomics/ptpileup.py

-import tables
-import pysam
-import bx.wiggle
-import subprocess
-import numpy as np
-import os
-import struct
-
class PTPileup():
  """ The container for genomic data. This is a pytables file with one
  root group and as many nodes as chromosomes.

  Each chromosome is stored as a float64 CArray with one value per base.
  """
  def __init__(self, fname = 'pileup.h5', mode = 'w'):
    """Open (or create) the HDF5 container.

    fname: HDF5 file name
    mode: PyTables open mode; newly created files ('w') use blosc compression
    """
    filters = None
    if mode == 'w':
      # If this is a new file, write it with blosc filters
      filters = tables.Filters(complevel=5, complib='blosc')
    self._h5file = tables.openFile(fname, mode = mode, filters = filters)
  def _close(self):
    """Close the underlying HDF5 file."""
    self._h5file.close()
  def _flush(self):
    """Flush pending writes to disk."""
    self._h5file.flush()
  def hasChromosome(self, cname):
    """Return True if a chromosome node named cname exists under root."""
    return cname in self._h5file.root._v_children
  def addChromosome(self, cname, csize):
    """Create a zero-filled float64 array of length csize for cname, if absent."""
    if not self.hasChromosome(cname):
      self._h5file.createCArray("/", cname, tables.Float64Atom(), (csize,))
  def chromosome(self, cname):
    """Return the CArray node for cname, or None when it does not exist."""
    try:
      return self._h5file.root._f_getChild(cname)
    except tables.exceptions.NoSuchNodeError:
      # bug fix: the original evaluated a bare `None` expression here;
      # make the fallback return explicit.
      return None
  def sizeof(self, cname):
    """Return the length (in bases) of chromosome cname."""
    return len(self.chromosome(cname))
  def sizeDict(self):
    # NOTE(review): despite the name, this maps chromosome name -> CArray
    # node (not its size); kept as-is for compatibility with callers.
    return dict([(x.name, x) for x in self._h5file.listNodes("/", classname="CArray")])
  def asArray(self, cname, start = None, end = None):
    """Return the [start:end] slice of chromosome cname."""
    return self.chromosome(cname)[start:end]
  def _readChromosomeTable(self, csize):
    """Normalize csize into a {name: size} dict.

    csize may already be a dict, or the path of a two-column
    whitespace-separated file of (name, size) pairs.
    Raises TypeError for anything else.
    """
    if isinstance(csize, str):
      # bug fix: the original read the interval file (fname) instead of
      # the chromosome-size file, and called the non-existent
      # os.expanduser (it lives in os.path).
      with open(os.path.expanduser(csize)) as cfh:
        csize = dict([x.strip().split() for x in cfh])
    if not isinstance(csize, dict):
      raise TypeError("A chromosome table should be provided either as dictionary or file")
    return csize
  def importIntervalFile(self, fname, ds = 1.0, mq = 0, rd = False, fmt = "bam", csize = None):
    """Import interval files into a pytable container.
    fname: interval file name
    ds: downsample ratio
    mq: min. quality mapping (used for BAM filtering)
    rd: remove duplicates
    fmt: file format (bam or bed)
    csize: when bed file is specified, a chromosome table should be provided as dictionary
      {'chrA_name':chrA_size,
       'chrB_name': chrB_size, ...}
    """
    fname = os.path.expanduser(fname)
    fmt = fmt.lower()
    if fmt == "sam":
      fmt = "bam"
    # clamp out-of-range parameters to safe defaults
    if ds > 1.0 or ds < 0.0:
      ds = 1.0
    if mq < 0:
      mq = 0

    if fmt == "bed":
      if not csize:
        raise IOError("chromosome sizes must be provided when %s format is specified" % fmt)
      csize = self._readChromosomeTable(csize)
      for (chrom, size) in csize.items():
        self.addChromosome(chrom, int(size))
      # bug fix: also bind the handle to gfh so the unconditional
      # gfh.close() below no longer raises NameError for bed input.
      gfh = giterator = open(fname)
    elif fmt == "bam":
      gfh = pysam.Samfile(fname, "rb")	# we can use this handler right now
      for sd in gfh.header["SQ"]:
        self.addChromosome(sd["SN"], sd["LN"])
      # also create the iterator
      try:
        giterator = gfh.fetch()
      except ValueError:
        # a ValueError is raised if index is not present: build it and retry
        gfh.close()
        pysam.index(fname)
        gfh = pysam.Samfile(fname, "rb")
        giterator = gfh.fetch()
    else:
      raise TypeError("Format %s not supported, yet" % fmt)

    pstart = -1
    pend = -1
    pchrom = ''
    keepit = True
    removed = 0
    for n, x in enumerate(giterator):
      if fmt == "bam":
        if x.mapq < mq: continue  # drop low-quality alignments
        (chrom, start, end) = (gfh.getrname(x.rname), x.pos, x.pos + x.rlen)
      elif fmt == "bed":
        fields = x.strip().split()
        (chrom, start, end) = (fields[0], int(fields[1]), int(fields[2]))
      # drop exact duplicates of the previously kept interval when requested
      if rd and start == pstart and end == pend and chrom == pchrom:
        keepit = False
      # random downsampling: keep each interval with probability ds
      if np.random.uniform() > ds:
        keepit = False
      if keepit:
        pstart = start
        pend = end
        pchrom = chrom
        self.chromosome(chrom)[start:end] += 1
        if n % 131072 == 0:
          self._flush()
      else:
        removed += 1
      keepit = True
    gfh.close()
  def importProfileFile(self, fname, fmt = 'wig', csize = None):
    """Import a per-base profile (wiggle/bedgraph/bigwig) into the container.

    fname: profile file name
    fmt: 'wig', 'bdg' (bedgraph) or 'bigwig'/'bw'
    csize: chromosome table (dict or file); required for wig/bdg input
    """
    fmt = fmt.lower()
    if fmt == "bdg":
      # since we use the bx.wiggle module, we can treat bedgraph and wiggle
      # as they are the same format.
      fmt = "wig"
    elif fmt == "bigwig":
      fmt = "bw"
    if fmt == "wig":
      if not csize:
        raise IOError("chromosome sizes must be provided when %s format is specified" % fmt)
      csize = self._readChromosomeTable(csize)
      for (chrom, size) in csize.items():
        self.addChromosome(chrom, int(size))
      giterator = open(fname, mode = 'r')
    elif fmt == "bw":
      # bug fix: getChromosomeFromBigWig returns a dict; iterating it
      # directly yields bare keys, so tuple unpacking failed. Use items().
      for (chrom, size) in getChromosomeFromBigWig(fname).items():
        self.addChromosome(chrom, size)
      # stream the bigwig as bedgraph text through the UCSC converter
      giterator = subprocess.Popen(["bigWigToBedGraph", fname, "stdout"], shell = False, stdout = subprocess.PIPE).stdout
    else:
      raise TypeError("Format %s not supported, yet" % fmt)
    for (chrom, position, value) in bx.wiggle.Reader(giterator):
      self.chromosome(chrom)[position] = value
      if position % 131072 == 0:
        self._flush()
    # close the file / pipe handle instead of leaking it
    giterator.close()
-
-
def getChromosomeFromBigWig(bwname):
  """Parse a bigwig file header and return a {chromosome_name: size} dict.

  Reads only the fixed 64-byte header and the root block of the
  chromosome B-tree. Endianness is detected from the magic numbers.

  Raises IOError for non-bigwig files or bigwig versions < 3, and
  ValueError for a corrupt chromosome-tree magic.

  NOTE(review): assumes the B-tree root is a leaf holding all itemCount
  entries; multi-node trees are not followed — confirm against large
  genomes.
  """
  csize = {}
  # bug fix: the original bound the handle to `bw` but read from an
  # undefined name `fh` (NameError); use a context manager so the file
  # is also closed on every exit path.
  with open(os.path.expanduser(bwname), "rb") as fh:
    # read magic number to guess endianness (b-literals behave the same
    # on python 2 and are required for correctness on python 3)
    magic = fh.read(4)
    if magic == b'&\xfc\x8f\x88':
      endianness = '<'
    elif magic == b'\x88\x8f\xfc&':
      endianness = '>'
    else:
      raise IOError("The file is not in bigwig format")
    # read the 60-byte fixed header
    (version, zoomLevels, chromosomeTreeOffset,
    fullDataOffset, fullIndexOffset, fieldCount, definedFieldCount,
    autoSqlOffset, totalSummaryOffset, uncompressBufSize, reserved) = struct.unpack(endianness + 'HHQQQHHQQIQ', fh.read(60))

    if version < 3:
      raise IOError("Bigwig files version <3 are not supported")

    # jump to the chromosome B-tree
    fh.seek(chromosomeTreeOffset)

    # read the tree magic (re-detect endianness for this block)
    magic = fh.read(4)
    if magic == b'\x91\x8c\xcax':
      endianness = '<'
    elif magic == b'x\xca\x8c\x91':
      endianness = '>'
    else:
      raise ValueError("Wrong magic for this bigwig data file")

    (blockSize, keySize, valSize, itemCount, reserved) = struct.unpack(endianness + 'IIIQQ', fh.read(28))
    (isLeaf, reserved, count) = struct.unpack(endianness + 'BBH', fh.read(4))
    for n in range(itemCount):
      (key, chromId, chromSize) = struct.unpack(endianness + str(keySize) + 'sII', fh.read(keySize + 2 * 4))
      # strip NUL padding from the fixed-width key; we have chrom and size
      csize[key.replace(b'\x00', b'')] = chromSize
  return csize