Commits

Davide Cittaro committed 702a23a

new tool for hilbert curve comparison

  • Participants
  • Parent commits 18efeb1

Comments (0)

Files changed (2)

File bighilbert.py

+#!/usr/bin/env python2.7
+
+import sys
+import bx.bbi.bigwig_file
+import numpy as np
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+
+
+def getChromosomeSizesFromBigWig(bwname):
+  csize = {}
+  fh = open(os.path.expanduser(bwname), "rb")
+  magic = fh.read(4)
+  if magic == '&\xfc\x8f\x88':
+    endianness = '<'
+  elif magic == '\x88\x8f\xfc&':
+    endianness = '>'
+  else:
+    raise IOError("The file is not in bigwig format")
+  (version,zoomLevels,chromosomeTreeOffset,fullDataOffset,fullIndexOffset,fieldCount,definedFieldCount,autoSqlOffset,totalSummaryOffset,uncompressBufSize,reserved)=struct.unpack(endianness+'HHQQQHHQQIQ',fh.read(60))
+  if version < 3:
+    raise IOError("Bigwig files version <3 are not supported")
+  fh.seek(chromosomeTreeOffset)
+  magic = fh.read(4)
+  if magic == '\x91\x8c\xcax':
+    endianness = '<'
+  elif magic == 'x\xca\x8c\x91':
+    endianness = '>'
+  else:
+    raise ValueError("Wrong magic for this bigwig data file")
+  (blockSize, keySize, valSize, itemCount, reserved) = struct.unpack(endianness + 'IIIQQ', fh.read(28))
+  (isLeaf, reserved, count) = struct.unpack(endianness + 'BBH', fh.read(4))
+  for n in range(count):
+    (key, chromId, chromSize) = struct.unpack(endianness + str(keySize) + 'sII', fh.read(keySize + 2 * 4))
+    csize[key.replace('\x00', '')] = chromSize
+  return csize
+
+# These functions were copied from simpleHilbertCurve (https://github.com/dentearl/simpleHilbertCurve.git)
+
+def d2xy(n, d):
+    """
+    take a d value in [0, n**2 - 1] and map it to
+    an x, y value (e.g. c, r).
+    """
+    assert(d <= n**2 - 1)
+    t = d
+    x = y = 0
+    s = 1
+    while (s < n):
+        rx = 1 & (t / 2)
+        ry = 1 & (t ^ rx)
+        x, y = rot(s, x, y, rx, ry)
+        x += s * rx
+        y += s * ry
+        t /= 4
+        s *= 2
+    return x, y
+
+def rot(n, x, y, rx, ry):
+    """
+    rotate/flip a quadrant appropriately
+    """
+    if ry == 0:
+        if rx == 1:
+            x = n - 1 - x
+            y = n - 1 - y
+        return y, x
+    return x, y
+
+Sample1 = bx.bbi.bigwig_file.BigWigFile(open(sys.argv[1], 'rb'))
+Sample2 = bx.bbi.bigwig_file.BigWigFile(open(sys.argv[2], 'rb'))
+csize = getChromosomeSizesFromBigWig(sys.argv[1])
+imsize = 640
+n = imsize - 1
+
+for chrom in csize.keys():
+  sys.stderr.write("Processing chromosome %s\n" % chrom)
+  
+  chrom_length = csize[chrom]
+  sys.stderr.write("Processing Sample1\n")
+  hc = np.zeros((imsize, imsize))
+  data = Sample1.get_as_array(chrom, 0, chrom_length)
+  data[np.isnan(data)] = 0
+  for i in xrange(0, len(chrom_length)):
+    c, r = d2xy(n, int(round((n**2 - 1) * i / chrom_length)))
+    hc[r][c] += data[i]
+  # dump hilbert curve in image and array
+  np.save("Sample1_%s" % chrom, hc)
+  plt.imshow(hc, origin=0, alpha=0.5, cmap=plt.cm.Blues)  
+
+  sys.stderr.write("Processing Sample2\n")
+  hc = np.zeros((imsize, imsize))
+  data = Sample2.get_as_array(chrom, 0, chrom_length)
+  data[np.isnan(data)] = 0
+  for i in xrange(0, len(chrom_length)):
+    c, r = d2xy(n, int(round((n**2 - 1) * i / chrom_length)))
+    hc[r][c] += data[i]
+  # dump hilbert curve in image and array
+  np.save("Sample2_%s" % chrom, hc)
+  plt.imshow(hc, origin=0, alpha=0.5, cmap=plt.cm.Reds)  
+  plt.title("Hilbert curve for %s" % chrom)
+  plt.savefig("HC_%s.png" % chrom)
+  plt.close()
+
+  
 import argparse
 import sys
 import vcf
+import gzip
 
 def terminate(msg=None, exit_status=1):
   sys.stderr.write("%s\n" % msg)
   else:
     return s  
 
+def read_pos_file(fh):
+  
+  terminate("Not yet implemented!")
+  # if only 3 columns are available, annotate positions, otherwise add all the annotations available
+  
+  tree_dict = {}
+  
+  header = fh.readline()
+  mark_names = header.lstrip('#').strip().split()[3:]
+  nf = len(mark_names)
+  if nf > 1 and not header.startswith('#'):
+    terminate("Multiple columns in position list, but no header provided!")
+  elif nf == 1:
+    1
+
 def read_gene_list(fh):
   
   # Read gene list, which is a tab separated file with gene in the first column.
   mark_names = []
   gene_list = {}
   
+  # check if this is gziped
+  if fh.name.endswith(".gz"):
+    n = fh.fname
+    fh.close()
+    fh = gzip.open(n)
+  
   # read first line
   header = fh.readline()
   mark_names = header.lstrip('#').strip().split()
   
   for line in fh:
     fields = line.strip().split()
+    if len(fields) == 0: continue
+    if line.startswith('#'): continue
     if nf == 1:
       gene_list[fields[0]] = None
     else:
    
   # bin	name	chrom	strand	txStart	txEnd	cdsStart	cdsEnd	exonCount	exonStarts	exonEnds	score	name2	cdsStartStat	cdsEndStat	exonFrames
 
+  # check if this is gziped
+  if fh.name.endswith(".gz"):
+    n = fh.fname
+    fh.close()
+    fh = gzip.open(n)
+
   tree_dict = {}
   for line in (fh):
     if line.startswith('#'): continue
 def annotate_vcf(fh, gene_props, prefix):
   
   vcf_file = vcf.Reader(fh)
-  
   properties = [prefix + x for x in gene_props['__properties__']]
   
   if len(properties) == 0:
   option_parser.add_argument("-g","--genelist", help="File containing gene list", action='store', type=file, required=True)
   option_parser.add_argument("-r","--refgene", help="RefGene SQL table from UCSC", action='store', type=file, required=True)
   option_parser.add_argument("-n","--name", help="Annotation name prefix", action='store', default="MARK")
+  option_parser.add_argument("-p","--positions", help="Annotation is given with position list (chr, start, end)", action='store_true', default=False)
   options = option_parser.parse_args()
   
   # read the genelist and retur
-  gene_list = read_gene_list(options.genelist)
+  if options.positions:
+    1
+  else:
+    gene_list = read_gene_list(options.genelist)
   
-  # read sql file and create the interval tree with gene properties
-  gene_properties = read_refgene(options.refgene, gene_list)
+    # read sql file and create the interval tree with gene properties
+    gene_properties = read_refgene(options.refgene, gene_list)
   
   # read the vcf and, at the same time, output annotated entries
   # I really would like to use PyVCF interface but at time of writing it had a bug that prevents