Commits

Davide Cittaro  committed e0087d5 Merge

merge

  • Participants
  • Parent commits f9fac45, b23ddd2

Comments (0)

Files changed (2)

File bighilbert.py

 import matplotlib.pyplot as plt
 import os
 import struct
+import argparse
+
 
 def getChromosomeSizesFromBigWig(bwname):
   csize = {}
 n = 1 << 10
 #imsize = 640
 
+v_d2xy = np.vectorize(d2xy)
+
 for chrom in csize.keys():
   if '_' in chrom: continue
   sys.stderr.write("Processing chromosome %s\n" % chrom)
   
   chrom_length = csize[chrom]
-  sys.stderr.write("Processing Sample1\n")
+  sys.stderr.write("mapping positions... ")
+  positions = np.cumsum(np.ones(chrom_length, dtype =np.int)) - 1
+  c_a, r_a = v_d2xy(n, (n**2 - 1) * positions / chrom_length)
+
+  sys.stderr.write("Processing Sample1... ")
   hc = np.zeros((n, n))
   data = Sample1.get_as_array(chrom, 0, chrom_length)
   data[np.isnan(data)] = 0
   for i in xrange(0, chrom_length):
-    c, r = d2xy(n, int(round((n**2 - 1) * i / chrom_length)))
-    hc[r][c] += data[i]
+    hc[r_a[i]][c_a[i]] += data[i]
   # dump hilbert curve in image and array
   np.save("Sample1_%s" % chrom, hc)
-  plt.imshow(np.log1p(hc), origin=0, alpha=0.3, cmap=plt.cm.Blues)  
+  s1 = np.log1p(hc)
 
-  sys.stderr.write("Processing Sample2\n")
+  sys.stderr.write("Processing Sample2... ")
   hc = np.zeros((n, n))
   data = Sample2.get_as_array(chrom, 0, chrom_length)
   data[np.isnan(data)] = 0
   for i in xrange(0, chrom_length):
-    c, r = d2xy(n, int(round((n**2 - 1) * i / chrom_length)))
-    hc[r][c] += data[i]
+    hc[r_a[i]][c_a[i]] += data[i]
   # dump hilbert curve in image and array
   np.save("Sample2_%s" % chrom, hc)
-  plt.imshow(np.log1p(hc), origin=0, alpha=0.3, cmap=plt.cm.Reds)  
+  s2 = np.log1p(hc)
+  plt.matshow(a - b, origin=0,cmap = plt.cm.seismic)
+  plt.colorbar()
+  for i in range(0, chrom_length, 4000000):
+    (y, x) = d2xy(n, (n**2 - 1) * i / chrom_length)
+    plt.plot(x, y, 'k+')
+    plt.text(x, y, "%dM" % (i / 1000000))
+  plt.xticks([],[])
+  plt.yticks([],[])
+  plt.xlim(0, n)
+  plt.ylim(0, n)
   plt.title("Hilbert curve for %s" % chrom)
   plt.savefig("HC_%s.png" % chrom)
   plt.close()
 
+def bw2hc_main():
+
+  # parse options
+  option_parser = argparse.ArgumentParser(
+  description="Convert genomic data to Hilbert Curves",
+  prog="gilbert.py",
+  epilog="For any question, write to cittaro.davide@hsr.it")
+  option_parser.add_argument("--version", action="version", version="%(prog)s 0.1")
+  option_parser.add_argument("-f", "--file", help="File to convert", action='store', type=file, default=sys.stdin)
+  option_parser.add_argumnet("-t", "--type", help="File type", action='store', choice=['bigwig', 'bed', 'bam'], default='bigwig')
+  option_parser.add_argument("-n","--name", help="Map prefix", action='store', default="Gilbert")
+  option_parser.add_argument("-x","--exclude", help="Chromosome to exclude", action='store')
+  options = option_parser.parse_args()
+  
+  for line in options.file:
+    print line
+    
+if __name__ == "__main__": bw2hc_main()
   
+#!/usr/bin/env python2.7
+
+import sys
+import bx.bbi.bigwig_file
+import numpy as np
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+import os
+import struct
+import argparse
+
+
+def terminate(msg=None, exit_status=1):
+  sys.stderr.write("%s\n" % msg)
+  sys.exit(exit_status)
+  
+def getChromosomeSizesFromBigWig(bwname):
+  csize = {}
+  fh = open(os.path.expanduser(bwname), "rb")
+  magic = fh.read(4)
+  if magic == '&\xfc\x8f\x88':
+    endianness = '<'
+  elif magic == '\x88\x8f\xfc&':
+    endianness = '>'
+  else:
+    raise IOError("The file is not in bigwig format")
+  (version,zoomLevels,chromosomeTreeOffset,fullDataOffset,fullIndexOffset,fieldCount,definedFieldCount,autoSqlOffset,totalSummaryOffset,uncompressBufSize,reserved)=struct.unpack(endianness+'HHQQQHHQQIQ',fh.read(60))
+  if version < 3:
+    raise IOError("Bigwig files version <3 are not supported")
+  fh.seek(chromosomeTreeOffset)
+  magic = fh.read(4)
+  if magic == '\x91\x8c\xcax':
+    endianness = '<'
+  elif magic == 'x\xca\x8c\x91':
+    endianness = '>'
+  else:
+    raise ValueError("Wrong magic for this bigwig data file")
+  (blockSize, keySize, valSize, itemCount, reserved) = struct.unpack(endianness + 'IIIQQ', fh.read(28))
+  (isLeaf, reserved, count) = struct.unpack(endianness + 'BBH', fh.read(4))
+  for n in range(count):
+    (key, chromId, chromSize) = struct.unpack(endianness + str(keySize) + 'sII', fh.read(keySize + 2 * 4))
+    csize[key.replace('\x00', '')] = chromSize
+  return csize
+
+# These functions were copied from simpleHilbertCurve (https://github.com/dentearl/simpleHilbertCurve.git)
+
+def d2xy(n, d):
+    """
+    take a d value in [0, n**2 - 1] and map it to
+    an x, y value (e.g. c, r).
+    """
+    assert(d <= n**2 - 1)
+    t = d
+    x = y = 0
+    s = 1
+    while (s < n):
+        rx = 1 & (t / 2)
+        ry = 1 & (t ^ rx)
+        x, y = rot(s, x, y, rx, ry)
+        x += s * rx
+        y += s * ry
+        t /= 4
+        s *= 2
+    return x, y
+
+def rot(n, x, y, rx, ry):
+    """
+    rotate/flip a quadrant appropriately
+    """
+    if ry == 0:
+        if rx == 1:
+            x = n - 1 - x
+            y = n - 1 - y
+        return y, x
+    return x, y
+
+
+def dump_png(hc, chromosome, chromosome_size, options):
+  n = 1 << options.level
+  if options.background:
+    colormap = plt.cm.seismic
+  else:
+    colormap = plt.cm.Reds
+  plt.matshow(hc, origin=0, cmap = colormap)
+  plt.colorbar()
+  for i in range(0, chromosome_size, 4000000):
+    # add crosshatches for chrom position
+    (y, x) = d2xy(n, (n**2 - 1) * i / chromosome_size)
+    plt.plot(x, y, 'k+')
+    plt.text(x, y, "%dM" % (i / 1000000))
+  plt.xticks([],[])
+  plt.yticks([],[])
+  plt.xlim(0, n)
+  plt.ylim(0, n)
+  plt.title("Hilbert curve for %s" % chromosome)
+  plt.savefig("%s_%s.png" % (options.name, chromosome))
+  plt.close()
+  
+def dump_opng(hc, bg_hc, chromosome, chromosome_size, options):
+  n = 1 << options.level
+  plt.matshow(hc, origin=0, cmap = plt.cm.Reds, alpha=0.5)
+  plt.matshow(bc_hc, origin=0, cmap = plt.cm.Blues, alpha=0.5)
+  for i in range(0, chromosome_size, 4000000):
+    # add crosshatches for chrom position
+    (y, x) = d2xy(n, (n**2 - 1) * i / chromosome_size)
+    plt.plot(x, y, 'k+')
+    plt.text(x, y, "%dM" % (i / 1000000))
+  plt.xticks([],[])
+  plt.yticks([],[])
+  plt.xlim(0, n)
+  plt.ylim(0, n)
+  plt.title("Hilbert curve for %s" % chromosome)
+  plt.savefig("%s_%s.png" % (options.name, chromosome))
+  plt.close()  
+
+def create_map(data, chromosome_size, options):
+  n = 1 << options.level
+  hc_map = np.zeros((n, n))
+  counts = np.zeros((n, n))
+  for i in xrange(0, chromosome_size):
+    if i % 1000000 == 0:
+      sys.stderr.write('.')
+    # I need to parallelize this (or cython?)
+    (y, x) = d2xy(n, (n**2 - 1) * i / chromosome_size)
+    if options.summarize == 'sum' or options.summarize == 'mean':
+      hc_map[x][y] += data[i]
+    elif options.summarize == 'max':  
+      hc_map[x][y] = max(data[i], hc_map[x][y])
+    if options.summarize == 'mean':
+      counts[x][y] += 1  
+
+  if options.summarize == 'mean':
+    hc_map = 1. * hc_map / counts
+
+  return hc_map  
+    
+def bw2hc_main():
+  # parse options
+  option_parser = argparse.ArgumentParser(
+  description="Convert genomic data to Hilbert Curves",
+  prog="gilbert.py",
+  epilog="For any question, write to cittaro.davide@hsr.it")
+  option_parser.add_argument("--version", action="version", version="%(prog)s 0.1")
+  option_parser.add_argument("-f","--file", help="File to convert", action='store', type=argparse.FileType('rb'), default=sys.stdin)
+  option_parser.add_argument("-b","--background", help="Background to subtract", action="store",type=argparse.FileType('rb'))
+  option_parser.add_argument("-o","--overlay", help="Overlay two datasets, don't subtract", action="store_true", default=False)
+  option_parser.add_argument("-t","--type", help="File type", action='store', choices=['bigwig', 'bed', 'bam'], default='bigwig')
+  option_parser.add_argument("-n","--name", help="Map prefix", action='store', default="Gilbert")
+  option_parser.add_argument("-l","--level", help="Map level (<10)", type=int, default=8)
+  option_parser.add_argument("-x","--exclude", help="Exclude this chromosome from the analysis", action='append')
+  option_parser.add_argument("-i","--include", help="Limit the analysis to this chromosome", action='append')
+  option_parser.add_argument("-L","--linearscale", help="Plot map in linear scale", action="store_true", default=False)
+  option_parser.add_argument("-r","--resolution", help="Resolution of resulting map", type=int, default=600)
+  option_parser.add_argument("-s","--summarize", help="Function to summarize data", choices=['sum', 'mean', 'max'], default='sum')
+  options = option_parser.parse_args()
+  
+  # initialize functions
+  n = 1 << options.level
+  
+  
+  if options.type == 'bigwig':
+    data_handler = bx.bbi.bigwig_file.BigWigFile(options.file)
+    chromosome_sizes = getChromosomeSizesFromBigWig(options.file.name)
+    if options.background:
+      bg_data_handler = bx.bbi.bigwig_file.BigWigFile(options.background)
+  else:
+    terminate("%s support not yet implemented" % options.type, 1)
+  
+  for chromosome in chromosome_sizes.keys():
+    # cycle on actual data data
+    if options.exclude and chromosome in options.exclude: continue
+    if options.include > 0 and not chromosome in options.include: continue
+    # random chrom are ecluded by default
+    if '_' in chromosome: continue
+
+    file_prefix = "%s_%s" % (options.name, chromosome)
+
+    chromosome_size = chromosome_sizes[chromosome]
+    sys.stderr.write("Processing %s [%d bp] " % (chromosome, chromosome_size))
+    
+    sys.stderr.write("Reading data ")
+    hilbert_map = np.zeros((n, n))
+    if options.type == 'bigwig':
+      data = data_handler.get_as_array(chromosome, 0, chromosome_size)
+      data[np.isnan(data)] = 0
+    else:  
+      terminate("%s support not yet implemented" % options.type, 1)
+
+    sys.stderr.write("Creating map ")
+    hc = create_map(data, chromosome_size, options)
+
+    if not options.linearscale:
+      hc = np.log1p(hc)
+    sys.stderr.write("\n")
+    sys.stderr.write("Dumping Map into numpy array %s.npy \n" % file_prefix)
+    np.save(file_prefix, hc)
+    
+    if options.background:
+      sys.stderr.write("Processing Background")
+      bg_data = bg_data_handler.get_as_array(chromosome, 0, chromosome_size)
+      bg_data[np.isnan(data)] = 0
+      bg_hc = create_map(bg_data, chromosome_size, options)
+      if not options.linearscale:
+        bg_hc = np.log1p(bg_hc)
+      sys.stderr.write("\n")
+      sys.stderr.write("Dumping Map into numpy array bg_%s.npy \n" % file_prefix)
+      np.save("bg_%s" % file_prefix, bg_hc)
+      if not options.overlay:
+        # subtract background
+        hc = hc - bg_hc
+      
+    # create the map
+    if not options.overlay:
+      dump_png(hc, chromosome, chromosome_size, options)
+    else:
+      dump_opng(hc, bg_hc, chromosome, chromosome_size, options)  
+
+    
+if __name__ == "__main__": bw2hc_main()
+