Commits

Marko Toplak  committed 2ff9da5

Attributes m and n in Hypergeometric test can now be swapped and results do not change. The implementation is more resistant to rounding errors.

  • Participants
  • Parent commits cad8926

Comments (0)

Files changed (2)

File orangecontrib/bio/obiGO.py

 ##                print >> sys.stderr, term, sorted(genes)
 ##                print >> sys.stderr, sorted(allAnnotatedGenes)
 ##                return
+            
             if len(reference) > len(allAnnotatedGenes):
                 mappedReferenceGenes = reference.intersection(allAnnotatedGenes)
             else:

File orangecontrib/bio/obiProb.py

             raise
 
     def p_value(self, k, N, m, n):
-        subtract = n - k + 1 > k
-##        result = sum([math.exp(self._logbin(m, i) + self._logbin(N - m, n - i)) for i in (range(k) if subtract else range(k, n+1))])
-        result = sum([self.__call__(i, N, m, n) for i in (range(k) if subtract else range(k, n+1))])
-##        result /= math.exp(self._logbin(N, n))
-        return max(1.0 - result if subtract else result, 0.0)
+        #From wolfram alpha:
+        #1-CDF[HypergeometricDistribution[80, 50, 2000], 20-1] = 1.75695e-16
+        #1-CDF[HypergeometricDistribution[80, 50, 2000], 40-1] = 9.92008e-52
 
+        if min(n,m) - k + 1 <= k:
+            #starting from k gives the shorter list of values
+            return sum(self.__call__(i, N, m, n) for i in range(k, min(n,m)+1))
+        else:
+            value = 1.0 - sum(self.__call__(i, N, m, n) for i in (range(k)))
+            #if the value is small it is probably inexact due to the limited
+            #precision of floats, as for example  (1-(1-10e-20)) -> 0
+            #if so, compute the result without substraction
+            if value < 1e-3: #arbitary threshold
+                #print "INEXACT", value, sum(self.__call__(i, N, m, n) for i in range(k, min(n,m)+1))
+                return sum(self.__call__(i, N, m, n) for i in range(k, min(n,m)+1))
+            else:
+                return value
 
 ## to speed-up FDR, calculate ahead sum([1/i for i in range(1, m+1)]), for m in [1,100000]. For higher values of m use an approximation, with error less or equal to 4.99999157277e-006. (sum([1/i for i in range(1, m+1)])  ~ log(m) + 0.5772..., 0.5572 is an Euler-Mascheroni constant) 
 c = [1.0]