Source

utils / gilbert.py

#!/usr/bin/env python2.7

import sys
import bx.bbi.bigwig_file
import numpy as np
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import os
import struct
import argparse


def terminate(msg=None, exit_status=1):
  sys.stderr.write("%s\n" % msg)
  sys.exit(exit_status)
  
def getChromosomeSizesFromBigWig(bwname):
  csize = {}
  fh = open(os.path.expanduser(bwname), "rb")
  magic = fh.read(4)
  if magic == '&\xfc\x8f\x88':
    endianness = '<'
  elif magic == '\x88\x8f\xfc&':
    endianness = '>'
  else:
    raise IOError("The file is not in bigwig format")
  (version,zoomLevels,chromosomeTreeOffset,fullDataOffset,fullIndexOffset,fieldCount,definedFieldCount,autoSqlOffset,totalSummaryOffset,uncompressBufSize,reserved)=struct.unpack(endianness+'HHQQQHHQQIQ',fh.read(60))
  if version < 3:
    raise IOError("Bigwig files version <3 are not supported")
  fh.seek(chromosomeTreeOffset)
  magic = fh.read(4)
  if magic == '\x91\x8c\xcax':
    endianness = '<'
  elif magic == 'x\xca\x8c\x91':
    endianness = '>'
  else:
    raise ValueError("Wrong magic for this bigwig data file")
  (blockSize, keySize, valSize, itemCount, reserved) = struct.unpack(endianness + 'IIIQQ', fh.read(28))
  (isLeaf, reserved, count) = struct.unpack(endianness + 'BBH', fh.read(4))
  for n in range(count):
    (key, chromId, chromSize) = struct.unpack(endianness + str(keySize) + 'sII', fh.read(keySize + 2 * 4))
    csize[key.replace('\x00', '')] = chromSize
  return csize

# These functions were copied from simpleHilbertCurve (https://github.com/dentearl/simpleHilbertCurve.git)

def d2xy(n, d):
    """
    take a d value in [0, n**2 - 1] and map it to
    an x, y value (e.g. c, r).
    """
    assert(d <= n**2 - 1)
    t = d
    x = y = 0
    s = 1
    while (s < n):
        rx = 1 & (t / 2)
        ry = 1 & (t ^ rx)
        x, y = rot(s, x, y, rx, ry)
        x += s * rx
        y += s * ry
        t /= 4
        s *= 2
    return x, y

def rot(n, x, y, rx, ry):
    """
    rotate/flip a quadrant appropriately
    """
    if ry == 0:
        if rx == 1:
            x = n - 1 - x
            y = n - 1 - y
        return y, x
    return x, y


def dump_png(hc, chromosome, chromosome_size, options):
  n = 1 << options.level
  if options.background:
    colormap = plt.cm.seismic
  else:
    colormap = plt.cm.Reds
  plt.matshow(hc, origin=0, cmap = colormap)
  plt.colorbar()
  for i in range(0, chromosome_size, 4000000):
    # add crosshatches for chrom position
    (y, x) = d2xy(n, (n**2 - 1) * i / chromosome_size)
    plt.plot(x, y, 'k+')
    plt.text(x, y, "%dM" % (i / 1000000))
  plt.xticks([],[])
  plt.yticks([],[])
  plt.xlim(0, n)
  plt.ylim(0, n)
  plt.title("Hilbert curve for %s" % chromosome)
  plt.savefig("%s_%s.png" % (options.name, chromosome))
  plt.close()
  
def dump_opng(hc, bg_hc, chromosome, chromosome_size, options):
  n = 1 << options.level
  plt.matshow(hc, origin=0, cmap = plt.cm.Reds, alpha=0.5)
  plt.matshow(bc_hc, origin=0, cmap = plt.cm.Blues, alpha=0.5)
  for i in range(0, chromosome_size, 4000000):
    # add crosshatches for chrom position
    (y, x) = d2xy(n, (n**2 - 1) * i / chromosome_size)
    plt.plot(x, y, 'k+')
    plt.text(x, y, "%dM" % (i / 1000000))
  plt.xticks([],[])
  plt.yticks([],[])
  plt.xlim(0, n)
  plt.ylim(0, n)
  plt.title("Hilbert curve for %s" % chromosome)
  plt.savefig("%s_%s.png" % (options.name, chromosome))
  plt.close()  

def create_map(data, chromosome_size, options):
  n = 1 << options.level
  hc_map = np.zeros((n, n))
  counts = np.zeros((n, n))
  for i in xrange(0, chromosome_size):
    if i % 1000000 == 0:
      sys.stderr.write('.')
    # I need to parallelize this (or cython?)
    (y, x) = d2xy(n, (n**2 - 1) * i / chromosome_size)
    if options.summarize == 'sum' or options.summarize == 'mean':
      hc_map[x][y] += data[i]
    elif options.summarize == 'max':  
      hc_map[x][y] = max(data[i], hc_map[x][y])
    if options.summarize == 'mean':
      counts[x][y] += 1  

  if options.summarize == 'mean':
    hc_map = 1. * hc_map / counts

  return hc_map  
    
def bw2hc_main():
  # parse options
  option_parser = argparse.ArgumentParser(
  description="Convert genomic data to Hilbert Curves",
  prog="gilbert.py",
  epilog="For any question, write to cittaro.davide@hsr.it")
  option_parser.add_argument("--version", action="version", version="%(prog)s 0.1")
  option_parser.add_argument("-f","--file", help="File to convert", action='store', type=argparse.FileType('rb'), default=sys.stdin)
  option_parser.add_argument("-b","--background", help="Background to subtract", action="store",type=argparse.FileType('rb'))
  option_parser.add_argument("-o","--overlay", help="Overlay two datasets, don't subtract", action="store_true", default=False)
  option_parser.add_argument("-t","--type", help="File type", action='store', choices=['bigwig', 'bed', 'bam'], default='bigwig')
  option_parser.add_argument("-n","--name", help="Map prefix", action='store', default="Gilbert")
  option_parser.add_argument("-l","--level", help="Map level (<10)", type=int, default=8)
  option_parser.add_argument("-x","--exclude", help="Exclude this chromosome from the analysis", action='append')
  option_parser.add_argument("-i","--include", help="Limit the analysis to this chromosome", action='append')
  option_parser.add_argument("-L","--linearscale", help="Plot map in linear scale", action="store_true", default=False)
  option_parser.add_argument("-r","--resolution", help="Resolution of resulting map", type=int, default=600)
  option_parser.add_argument("-s","--summarize", help="Function to summarize data", choices=['sum', 'mean', 'max'], default='sum')
  options = option_parser.parse_args()
  
  # initialize functions
  n = 1 << options.level
  
  
  if options.type == 'bigwig':
    data_handler = bx.bbi.bigwig_file.BigWigFile(options.file)
    chromosome_sizes = getChromosomeSizesFromBigWig(options.file.name)
    if options.background:
      bg_data_handler = bx.bbi.bigwig_file.BigWigFile(options.background)
  else:
    terminate("%s support not yet implemented" % options.type, 1)
  
  for chromosome in chromosome_sizes.keys():
    # cycle on actual data data
    if options.exclude and chromosome in options.exclude: continue
    if options.include > 0 and not chromosome in options.include: continue
    # random chrom are ecluded by default
    if '_' in chromosome: continue

    file_prefix = "%s_%s" % (options.name, chromosome)

    chromosome_size = chromosome_sizes[chromosome]
    sys.stderr.write("Processing %s [%d bp] " % (chromosome, chromosome_size))
    
    sys.stderr.write("Reading data ")
    hilbert_map = np.zeros((n, n))
    if options.type == 'bigwig':
      data = data_handler.get_as_array(chromosome, 0, chromosome_size)
      data[np.isnan(data)] = 0
    else:  
      terminate("%s support not yet implemented" % options.type, 1)

    sys.stderr.write("Creating map ")
    hc = create_map(data, chromosome_size, options)

    if not options.linearscale:
      hc = np.log1p(hc)
    sys.stderr.write("\n")
    sys.stderr.write("Dumping Map into numpy array %s.npy \n" % file_prefix)
    np.save(file_prefix, hc)
    
    if options.background:
      sys.stderr.write("Processing Background")
      bg_data = bg_data_handler.get_as_array(chromosome, 0, chromosome_size)
      bg_data[np.isnan(data)] = 0
      bg_hc = create_map(bg_data, chromosome_size, options)
      if not options.linearscale:
        bg_hc = np.log1p(bg_hc)
      sys.stderr.write("\n")
      sys.stderr.write("Dumping Map into numpy array bg_%s.npy \n" % file_prefix)
      np.save("bg_%s" % file_prefix, bg_hc)
      if not options.overlay:
        # subtract background
        hc = hc - bg_hc
      
    # create the map
    if not options.overlay:
      dump_png(hc, chromosome, chromosome_size, options)
    else:
      dump_opng(hc, bg_hc, chromosome, chromosome_size, options)  

    
if __name__ == "__main__": bw2hc_main()
  
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.