Commits

Davide Cittaro committed 77ea528

removed gilbert

Comments (0)

Files changed (1)

gilbert.py

-#!/usr/bin/env python2.7
-
-import sys
-import bx.bbi.bigwig_file
-import numpy as np
-import matplotlib
-matplotlib.use("Agg")
-import matplotlib.pyplot as plt
-import os
-import struct
-import argparse
-
-
-def terminate(msg=None, exit_status=1):
-  sys.stderr.write("%s\n" % msg)
-  sys.exit(exit_status)
-  
-def getChromosomeSizesFromBigWig(bwname):
-  csize = {}
-  fh = open(os.path.expanduser(bwname), "rb")
-  magic = fh.read(4)
-  if magic == '&\xfc\x8f\x88':
-    endianness = '<'
-  elif magic == '\x88\x8f\xfc&':
-    endianness = '>'
-  else:
-    raise IOError("The file is not in bigwig format")
-  (version,zoomLevels,chromosomeTreeOffset,fullDataOffset,fullIndexOffset,fieldCount,definedFieldCount,autoSqlOffset,totalSummaryOffset,uncompressBufSize,reserved)=struct.unpack(endianness+'HHQQQHHQQIQ',fh.read(60))
-  if version < 3:
-    raise IOError("Bigwig files version <3 are not supported")
-  fh.seek(chromosomeTreeOffset)
-  magic = fh.read(4)
-  if magic == '\x91\x8c\xcax':
-    endianness = '<'
-  elif magic == 'x\xca\x8c\x91':
-    endianness = '>'
-  else:
-    raise ValueError("Wrong magic for this bigwig data file")
-  (blockSize, keySize, valSize, itemCount, reserved) = struct.unpack(endianness + 'IIIQQ', fh.read(28))
-  (isLeaf, reserved, count) = struct.unpack(endianness + 'BBH', fh.read(4))
-  for n in range(count):
-    (key, chromId, chromSize) = struct.unpack(endianness + str(keySize) + 'sII', fh.read(keySize + 2 * 4))
-    csize[key.replace('\x00', '')] = chromSize
-  return csize
-
-# These functions were copied from simpleHilbertCurve (https://github.com/dentearl/simpleHilbertCurve.git)
-
-def d2xy(n, d):
-    """
-    take a d value in [0, n**2 - 1] and map it to
-    an x, y value (e.g. c, r).
-    """
-    assert(d <= n**2 - 1)
-    t = d
-    x = y = 0
-    s = 1
-    while (s < n):
-        rx = 1 & (t / 2)
-        ry = 1 & (t ^ rx)
-        x, y = rot(s, x, y, rx, ry)
-        x += s * rx
-        y += s * ry
-        t /= 4
-        s *= 2
-    return x, y
-
-def rot(n, x, y, rx, ry):
-    """
-    rotate/flip a quadrant appropriately
-    """
-    if ry == 0:
-        if rx == 1:
-            x = n - 1 - x
-            y = n - 1 - y
-        return y, x
-    return x, y
-
-
-def dump_png(hc, chromosome, chromosome_size, options):
-  n = 1 << options.level
-  if options.background:
-    colormap = plt.cm.seismic
-  else:
-    colormap = plt.cm.Reds
-  plt.matshow(hc, origin=0, cmap = colormap)
-  plt.colorbar()
-  for i in range(0, chromosome_size, 4000000):
-    # add crosshatches for chrom position
-    (y, x) = d2xy(n, (n**2 - 1) * i / chromosome_size)
-    plt.plot(x, y, 'k+')
-    plt.text(x, y, "%dM" % (i / 1000000))
-  plt.xticks([],[])
-  plt.yticks([],[])
-  plt.xlim(0, n)
-  plt.ylim(0, n)
-  plt.title("Hilbert curve for %s" % chromosome)
-  plt.savefig("%s_%s.png" % (options.name, chromosome))
-  plt.close()
-  
-def dump_opng(hc, bg_hc, chromosome, chromosome_size, options):
-  n = 1 << options.level
-  plt.matshow(hc, origin=0, cmap = plt.cm.Reds, alpha=0.5)
-  plt.matshow(bc_hc, origin=0, cmap = plt.cm.Blues, alpha=0.5)
-  for i in range(0, chromosome_size, 4000000):
-    # add crosshatches for chrom position
-    (y, x) = d2xy(n, (n**2 - 1) * i / chromosome_size)
-    plt.plot(x, y, 'k+')
-    plt.text(x, y, "%dM" % (i / 1000000))
-  plt.xticks([],[])
-  plt.yticks([],[])
-  plt.xlim(0, n)
-  plt.ylim(0, n)
-  plt.title("Hilbert curve for %s" % chromosome)
-  plt.savefig("%s_%s.png" % (options.name, chromosome))
-  plt.close()  
-
-def create_map(data, chromosome_size, options):
-  n = 1 << options.level
-  hc_map = np.zeros((n, n))
-  counts = np.zeros((n, n))
-  for i in xrange(0, chromosome_size):
-    if i % 1000000 == 0:
-      sys.stderr.write('.')
-    # I need to parallelize this (or cython?)
-    (y, x) = d2xy(n, (n**2 - 1) * i / chromosome_size)
-    if options.summarize == 'sum' or options.summarize == 'mean':
-      hc_map[x][y] += data[i]
-    elif options.summarize == 'max':  
-      hc_map[x][y] = max(data[i], hc_map[x][y])
-    if options.summarize == 'mean':
-      counts[x][y] += 1  
-
-  if options.summarize == 'mean':
-    hc_map = 1. * hc_map / counts
-
-  return hc_map  
-    
-def bw2hc_main():
-  # parse options
-  option_parser = argparse.ArgumentParser(
-  description="Convert genomic data to Hilbert Curves",
-  prog="gilbert.py",
-  epilog="For any question, write to cittaro.davide@hsr.it")
-  option_parser.add_argument("--version", action="version", version="%(prog)s 0.1")
-  option_parser.add_argument("-f","--file", help="File to convert", action='store', type=argparse.FileType('rb'), default=sys.stdin)
-  option_parser.add_argument("-b","--background", help="Background to subtract", action="store",type=argparse.FileType('rb'))
-  option_parser.add_argument("-o","--overlay", help="Overlay two datasets, don't subtract", action="store_true", default=False)
-  option_parser.add_argument("-t","--type", help="File type", action='store', choices=['bigwig', 'bed', 'bam'], default='bigwig')
-  option_parser.add_argument("-n","--name", help="Map prefix", action='store', default="Gilbert")
-  option_parser.add_argument("-l","--level", help="Map level (<10)", type=int, default=8)
-  option_parser.add_argument("-x","--exclude", help="Exclude this chromosome from the analysis", action='append')
-  option_parser.add_argument("-i","--include", help="Limit the analysis to this chromosome", action='append')
-  option_parser.add_argument("-L","--linearscale", help="Plot map in linear scale", action="store_true", default=False)
-  option_parser.add_argument("-r","--resolution", help="Resolution of resulting map", type=int, default=600)
-  option_parser.add_argument("-s","--summarize", help="Function to summarize data", choices=['sum', 'mean', 'max'], default='sum')
-  options = option_parser.parse_args()
-  
-  # initialize functions
-  n = 1 << options.level
-  
-  
-  if options.type == 'bigwig':
-    data_handler = bx.bbi.bigwig_file.BigWigFile(options.file)
-    chromosome_sizes = getChromosomeSizesFromBigWig(options.file.name)
-    if options.background:
-      bg_data_handler = bx.bbi.bigwig_file.BigWigFile(options.background)
-  else:
-    terminate("%s support not yet implemented" % options.type, 1)
-  
-  for chromosome in chromosome_sizes.keys():
-    # cycle on actual data data
-    if options.exclude and chromosome in options.exclude: continue
-    if options.include > 0 and not chromosome in options.include: continue
-    # random chrom are ecluded by default
-    if '_' in chromosome: continue
-
-    file_prefix = "%s_%s" % (options.name, chromosome)
-
-    chromosome_size = chromosome_sizes[chromosome]
-    sys.stderr.write("Processing %s [%d bp] " % (chromosome, chromosome_size))
-    
-    sys.stderr.write("Reading data ")
-    hilbert_map = np.zeros((n, n))
-    if options.type == 'bigwig':
-      data = data_handler.get_as_array(chromosome, 0, chromosome_size)
-      data[np.isnan(data)] = 0
-    else:  
-      terminate("%s support not yet implemented" % options.type, 1)
-
-    sys.stderr.write("Creating map ")
-    hc = create_map(data, chromosome_size, options)
-
-    if not options.linearscale:
-      hc = np.log1p(hc)
-    sys.stderr.write("\n")
-    sys.stderr.write("Dumping Map into numpy array %s.npy \n" % file_prefix)
-    np.save(file_prefix, hc)
-    
-    if options.background:
-      sys.stderr.write("Processing Background")
-      bg_data = bg_data_handler.get_as_array(chromosome, 0, chromosome_size)
-      bg_data[np.isnan(data)] = 0
-      bg_hc = create_map(bg_data, chromosome_size, options)
-      if not options.linearscale:
-        bg_hc = np.log1p(bg_hc)
-      sys.stderr.write("\n")
-      sys.stderr.write("Dumping Map into numpy array bg_%s.npy \n" % file_prefix)
-      np.save("bg_%s" % file_prefix, bg_hc)
-      if not options.overlay:
-        # subtract background
-        hc = hc - bg_hc
-      
-    # create the map
-    if not options.overlay:
-      dump_png(hc, chromosome, chromosome_size, options)
-    else:
-      dump_opng(hc, bg_hc, chromosome, chromosome_size, options)  
-
-    
-if __name__ == "__main__": bw2hc_main()
-