# HG changeset patch # User markotoplak # Date 1381831391 -7200 # Node ID 2ff9da5aed5303cc3541eba935a6c2b9633b427d # Parent cad8926447866dbd38af6b136fc67315adb644aa Attributes m and n in Hypergeometric test can now be swapped and results do not change. The implementation is more resistant to rounding errors. diff --git a/orangecontrib/bio/obiGO.py b/orangecontrib/bio/obiGO.py --- a/orangecontrib/bio/obiGO.py +++ b/orangecontrib/bio/obiGO.py @@ -920,6 +920,7 @@ ## print >> sys.stderr, term, sorted(genes) ## print >> sys.stderr, sorted(allAnnotatedGenes) ## return + if len(reference) > len(allAnnotatedGenes): mappedReferenceGenes = reference.intersection(allAnnotatedGenes) else: diff --git a/orangecontrib/bio/obiProb.py b/orangecontrib/bio/obiProb.py --- a/orangecontrib/bio/obiProb.py +++ b/orangecontrib/bio/obiProb.py @@ -86,12 +86,23 @@ raise def p_value(self, k, N, m, n): - subtract = n - k + 1 > k -## result = sum([math.exp(self._logbin(m, i) + self._logbin(N - m, n - i)) for i in (range(k) if subtract else range(k, n+1))]) - result = sum([self.__call__(i, N, m, n) for i in (range(k) if subtract else range(k, n+1))]) -## result /= math.exp(self._logbin(N, n)) - return max(1.0 - result if subtract else result, 0.0) + #From wolfram alpha: + #1-CDF[HypergeometricDistribution[80, 50, 2000], 20-1] = 1.75695e-16 + #1-CDF[HypergeometricDistribution[80, 50, 2000], 40-1] = 9.92008e-52 + if min(n,m) - k + 1 <= k: + #starting from k gives the shorter list of values + return sum(self.__call__(i, N, m, n) for i in range(k, min(n,m)+1)) + else: + value = 1.0 - sum(self.__call__(i, N, m, n) for i in (range(k))) + #if the value is small it is probably inexact due to the limited + #precision of floats, as for example (1-(1-10e-20)) -> 0 + #if so, compute the result without substraction + if value < 1e-3: #arbitary threshold + #print "INEXACT", value, sum(self.__call__(i, N, m, n) for i in range(k, min(n,m)+1)) + return sum(self.__call__(i, N, m, n) for i in range(k, min(n,m)+1)) + else: + return value ## to speed-up FDR, calculate ahead sum([1/i for i in range(1, m+1)]), for m in [1,100000]. For higher values of m use an approximation, with error less or equal to 4.99999157277e-006. (sum([1/i for i in range(1, m+1)]) ~ log(m) + 0.5772..., 0.5572 is an Euler-Mascheroni constant) c = [1.0]