Source

utils / bighilbert.py

#!/usr/bin/env python2.7

import sys
import bx.bbi.bigwig_file
import numpy as np
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import os
import struct
import argparse


def getChromosomeSizesFromBigWig(bwname):
  csize = {}
  fh = open(os.path.expanduser(bwname), "rb")
  magic = fh.read(4)
  if magic == '&\xfc\x8f\x88':
    endianness = '<'
  elif magic == '\x88\x8f\xfc&':
    endianness = '>'
  else:
    raise IOError("The file is not in bigwig format")
  (version,zoomLevels,chromosomeTreeOffset,fullDataOffset,fullIndexOffset,fieldCount,definedFieldCount,autoSqlOffset,totalSummaryOffset,uncompressBufSize,reserved)=struct.unpack(endianness+'HHQQQHHQQIQ',fh.read(60))
  if version < 3:
    raise IOError("Bigwig files version <3 are not supported")
  fh.seek(chromosomeTreeOffset)
  magic = fh.read(4)
  if magic == '\x91\x8c\xcax':
    endianness = '<'
  elif magic == 'x\xca\x8c\x91':
    endianness = '>'
  else:
    raise ValueError("Wrong magic for this bigwig data file")
  (blockSize, keySize, valSize, itemCount, reserved) = struct.unpack(endianness + 'IIIQQ', fh.read(28))
  (isLeaf, reserved, count) = struct.unpack(endianness + 'BBH', fh.read(4))
  for n in range(count):
    (key, chromId, chromSize) = struct.unpack(endianness + str(keySize) + 'sII', fh.read(keySize + 2 * 4))
    csize[key.replace('\x00', '')] = chromSize
  return csize

# These functions were copied from simpleHilbertCurve (https://github.com/dentearl/simpleHilbertCurve.git)

def d2xy(n, d):
    """
    take a d value in [0, n**2 - 1] and map it to
    an x, y value (e.g. c, r).
    """
    assert(d <= n**2 - 1)
    t = d
    x = y = 0
    s = 1
    while (s < n):
        rx = 1 & (t / 2)
        ry = 1 & (t ^ rx)
        x, y = rot(s, x, y, rx, ry)
        x += s * rx
        y += s * ry
        t /= 4
        s *= 2
    return x, y

def rot(n, x, y, rx, ry):
    """
    rotate/flip a quadrant appropriately
    """
    if ry == 0:
        if rx == 1:
            x = n - 1 - x
            y = n - 1 - y
        return y, x
    return x, y

Sample1 = bx.bbi.bigwig_file.BigWigFile(open(sys.argv[1], 'rb'))
Sample2 = bx.bbi.bigwig_file.BigWigFile(open(sys.argv[2], 'rb'))
csize = getChromosomeSizesFromBigWig(sys.argv[1])
n = 1 << 10
#imsize = 640

v_d2xy = np.vectorize(d2xy)

for chrom in csize.keys():
  if '_' in chrom: continue
  sys.stderr.write("Processing chromosome %s\n" % chrom)
  
  chrom_length = csize[chrom]
  sys.stderr.write("mapping positions... ")
  positions = np.cumsum(np.ones(chrom_length, dtype =np.int)) - 1
  c_a, r_a = v_d2xy(n, (n**2 - 1) * positions / chrom_length)

  sys.stderr.write("Processing Sample1... ")
  hc = np.zeros((n, n))
  data = Sample1.get_as_array(chrom, 0, chrom_length)
  data[np.isnan(data)] = 0
  for i in xrange(0, chrom_length):
    hc[r_a[i]][c_a[i]] += data[i]
  # dump hilbert curve in image and array
  np.save("Sample1_%s" % chrom, hc)
  s1 = np.log1p(hc)

  sys.stderr.write("Processing Sample2... ")
  hc = np.zeros((n, n))
  data = Sample2.get_as_array(chrom, 0, chrom_length)
  data[np.isnan(data)] = 0
  for i in xrange(0, chrom_length):
    hc[r_a[i]][c_a[i]] += data[i]
  # dump hilbert curve in image and array
  np.save("Sample2_%s" % chrom, hc)
  s2 = np.log1p(hc)
  plt.matshow(a - b, origin=0,cmap = plt.cm.seismic)
  plt.colorbar()
  for i in range(0, chrom_length, 4000000):
    (y, x) = d2xy(n, (n**2 - 1) * i / chrom_length)
    plt.plot(x, y, 'k+')
    plt.text(x, y, "%dM" % (i / 1000000))
  plt.xticks([],[])
  plt.yticks([],[])
  plt.xlim(0, n)
  plt.ylim(0, n)
  plt.title("Hilbert curve for %s" % chrom)
  plt.savefig("HC_%s.png" % chrom)
  plt.close()

def bw2hc_main():

  # parse options
  option_parser = argparse.ArgumentParser(
  description="Convert genomic data to Hilbert Curves",
  prog="gilbert.py",
  epilog="For any question, write to cittaro.davide@hsr.it")
  option_parser.add_argument("--version", action="version", version="%(prog)s 0.1")
  option_parser.add_argument("-f", "--file", help="File to convert", action='store', type=file, default=sys.stdin)
  option_parser.add_argumnet("-t", "--type", help="File type", action='store', choice=['bigwig', 'bed', 'bam'], default='bigwig')
  option_parser.add_argument("-n","--name", help="Map prefix", action='store', default="Gilbert")
  option_parser.add_argument("-x","--exclude", help="Chromosome to exclude", action='store')
  options = option_parser.parse_args()
  
  for line in options.file:
    print line
    
if __name__ == "__main__": bw2hc_main()