Source

bx-python / lib / bx / intervals / operations / find_clusters.py

Full commit
"""
Find clusters of intervals within a set of intervals.  A cluster is a
group (of size minregions) of intervals within a specific distance (of
mincols) of each other.

Returns Cluster objects, which have a chrom, start, end, and lines (a
list of linenumbers from the original file).  The original can then be
ran through with the linenumbers to extract clustered regions without
disturbing original order, or the clusters may themselves be written
as intervals.
"""

import random
import math

import traceback
import fileinput
from warnings import warn

from bx.intervals.cluster import *
from bx.intervals.io import *
from bx.intervals.operations import *


def find_clusters(reader, mincols=1, minregions=2):
    extra = dict()
    chroms = dict()
    linenum = -1
    for interval in reader:
        linenum += 1
        if not isinstance(interval, GenomicInterval):
            extra[linenum] = interval
        else:
            if interval.chrom not in chroms:
                chroms[interval.chrom] = ClusterTree( mincols, minregions )
            try:
                chroms[interval.chrom].insert( interval.start, interval.end, linenum )
            except OverflowError, e:
                try:
                    # This will work only if reader is a NiceReaderWrapper
                    reader.skipped += 1
                    if reader.skipped < 10:
                        reader.skipped_lines.append( ( reader.linenum, reader.current_line, str( e ) ) )
                except:
                    pass
                continue
    return chroms, extra


### DEPRECATED: Use the ClusterTree in bx.intervals.cluster for this.
### It does the same thing, but is a C implementation.
class ClusterNode( object ):
    def __init__( self, start, end, linenum, mincols, minregions ):
        # Python lacks the binomial distribution, so we convert a
        # uniform into a binomial because it naturally scales with
        # tree size.  Also, python's uniform is perfect since the
        # upper limit is not inclusive, which gives us undefined here.
        self.priority = math.ceil( (-1.0 / math.log(.5)) * math.log( -1.0 / (random.uniform(0,1) - 1)))
        self.start = start
        self.end = end
        self.left = None
        self.right = None
        self.lines = [linenum]
        self.mincols = mincols
        self.minregions = minregions
        
    def insert( self, start, end, linenum ):
        if start - self.mincols > self.end:
            # insert to right tree
            if self.right:
                self.right = self.right.insert( start, end, linenum )
            else:
                self.right = ClusterNode(start, end, linenum, self.mincols, self.minregions)
            # rebalance tree
            if self.priority < self.right.priority:
                return self.rotateleft()
        elif end + self.mincols < self.start:
            # insert to left tree
            if self.left:
                self.left = self.left.insert( start, end, linenum )
            else:
                self.left = ClusterNode(start, end, linenum, self.mincols, self.minregions)
            # rebalance tree
            if self.priority < self.left.priority:
                return self.rotateright()
        else:
            # insert here
            self.start = min(self.start, start)
            self.end = max(self.end, end)
            self.lines.append(linenum)
            # recursive call to push nodes up
            if self.left:
                self.left = self.left.push_up(self)
            if self.right:
                self.right = self.right.push_up(self)
        return self

    def rotateright( self ):
        root = self.left
        self.left = self.left.right
        root.right = self
        return root
        
    def rotateleft( self ):
        root = self.right
        self.right = self.right.left
        root.left = self
        return root
        
    def push_up( self, topnode ):
        # Note: this function does not affect heap property
        # Distance method removed for inline, faster?
        distance = max(self.start, topnode.start) - min(self.end, topnode.end)
        if distance <= self.mincols:
            topnode.start = min(self.start, topnode.start)
            topnode.end = max(self.end, topnode.end)
            for linenum in self.lines:
                topnode.lines.append(linenum)
            if self.right:
                return self.right.push_up( topnode )
            if self.left:
                return self.left.push_up( topnode )
            return None
        if self.end < topnode.start and self.right:
            self.right = self.right.push_up( topnode )
        if self.start > topnode.end and self.left:
            self.left = self.left.push_up( topnode )
        return self

    def getintervals( self ):
        if self.left:
            for start, end in self.left.getintervals(minregions):
                yield start, end
        if len(self.lines) >= minregions:
            yield self.start, self.end
        if self.right:
            for start, end in self.right.getintervals(minregions):
                yield start, end

    def getlines( self ):
        if self.left:
            for line in self.left.getlines():
                yield line
        if len(self.lines) >= minregions:
            for line in self.lines:
                yield line
        if self.right:
            for line in self.right.getlines():
                yield line
                
## def main():
##     f1 = fileinput.FileInput("big.bed")
##     g1 = GenomicIntervalReader(f1)
##     returntree, extra = find_clusters(g1, mincols=50)
##     print "All found"
##     for chrom, value in returntree.items():
##         for start, end in value.getregions():
##             print chrom+"\t"+str(start)+"\t"+str(end)
##         for line in value.getlines():
##             print "Line:\t"+str(line)

## main()