Commits

Chris Park  committed fe98a86

merge Hypergeometric clarifications

  • Participants
  • Parent commits 0fc635c

Comments (0)

Files changed (3)

File src/measure.cpp

 
 double CMeasureHypergeometric::Measure( const float* adX, size_t iM, const float* adY,
 	size_t iN, EMap eMap, const float* adWX, const float* adWY ) const {
-	size_t	i, iOne, iTwo, iBoth;
+	size_t	i, iOne, iTwo, iBoth, iTotalPresent;
 
 	if( iM != iN )
 		return CMeta::GetNaN( );
 
-	iOne = iTwo = iBoth = 0;
+	iOne = iTwo = iTotalPresent = iBoth = 0;
 	for( i = 0; i < iN; ++i ) {
 		if( CMeta::IsNaN( adX[ i ] ) || CMeta::IsNaN( adY[ i ] ) )
 			continue;
+		iTotalPresent ++;
 		if( adX[ i ] )
 			iOne++;
 		if( adY[ i ] ) {
 			if( adX[ i ] )
 				iBoth++; } }
 
-	return ( 1 - CStatistics::HypergeometricCDF( iBoth, iOne, iTwo, iN ) ); }
+	return ( 1 - CStatistics::HypergeometricCDF( iBoth, iOne, iTwo, iTotalPresent ) ); }
 
 double CMeasureInnerProduct::Measure( const float* adX, size_t iM, const float* adY,
 	size_t iN, EMap eMap, const float* adWX, const float* adWY ) const {

File src/statistics.cpp

  * \brief
  * Calculates the hypergeometric p-value given the sizes and overlap of two sets.
  * 
- * \param iHitsOne
- * Number of hits in the first (query) set.
- * 
- * \param iSizeOne
- * Size of the first (query) set.
- * 
- * \param iHitsTwo
- * Number of hits in the second (background) set.
- * 
- * \param iSizeTwo
- * Size of the second (background) set.
- * 
+ * \param iBoth
+ * Number of non-zero in both query sets together.
+ *
+ * \param iNonZeroInOne
+ * Number of non-zero in query one
+ *
+ * \param iTwo
+ * Number of non-zero in query two
+ *
+ * \param iN
+ * Total number of sites in the query.
+ *
  * \returns
- * sum(CStatistics::HypergeometricPDF (i, iSizeOne, iHitsTwo, iSizeTwo), i=iHitsOne..min(iHitsTwo, iSizeOne))
+ * sum(CStatistics::HypergeometricPDF (iBoth, iNonZeroInOne, iTwo, iN), i=iBoth..min(iNonZeroInTwo, iNonZeroInOne))
  */
-double CStatistics::HypergeometricCDF( size_t iHitsOne, size_t iSizeOne, size_t iHitsTwo,
-	size_t iSizeTwo ) {
+double CStatistics::HypergeometricCDF( size_t iBoth, size_t iNonZeroInOne, size_t iNonZeroInTwo,
+	size_t iN ) {
 	size_t	i, iHits;
 	double	dRet;
 
 	dRet = 0;
-	iHits = ( iHitsTwo < iSizeOne ) ? iHitsTwo : iSizeOne;
-	for( i = iHitsOne; i <= iHits; ++i )
-		dRet += HypergeometricPDF( i, iSizeOne, iHitsTwo, iSizeTwo );
+	iHits = ( iNonZeroInTwo < iNonZeroInOne ) ? iNonZeroInTwo : iNonZeroInOne;
+	for( i = iBoth; i <= iHits; ++i )
+		dRet += HypergeometricPDF( i, iNonZeroInOne, iNonZeroInTwo, iN );
 
 	return ( ( dRet > 1 ) ? 1 : dRet ); }
 
 /*!
  * \brief
+ * Calculates the two sided hypergeometric p-value given the sizes and overlap of two sets.
+ *
+ * \param iNonZeroInCommon
+ * Number of non-zero in both query sets together.
+ * 
+ * \param iNonZeroInOne
+ * Number of non-zero in query one
+ * 
+ * \param iNonZeroInTwo
+ * Number of non-zero in query two
+ * 
+ * \param iTotalNumValues
+ * Total number of sites in the query.
+ * 
+ * \returns
+ * sum(CStatistics::HypergeometricPDF (iBoth, iNonZeroInOne, iNonZeroInTwo, iN), i=iBoth..min(iNonZeroInTwo, iNonZeroInOne))
+ */
+double CStatistics::TwoSidedHypergeometricCDF( size_t iNonZeroInCommon, size_t iNonZeroInOne, size_t iNonZeroInTwo,
+	size_t iTotalNumValues ) {
+	size_t	i, iHits;
+	double	dRet;
+
+	int a = iTotalNumValues - iNonZeroInTwo + iNonZeroInCommon - iNonZeroInOne ;
+	int b = iNonZeroInOne - iNonZeroInCommon;
+	int c = iNonZeroInTwo - iNonZeroInCommon;
+	int d = iNonZeroInCommon;
+
+	dRet = 0;
+
+	if (a*d - b*c < 0) {
+		while (a >= 0 && d >= 0) {
+			dRet += HypergeometricPDF( d, b+d, c+d, a+b+c+d );
+			a--;d--;c++;b++;
+			cout << dRet << endl;
+		}
+	}
+	else {
+		while (c >= 0 && b >= 0) {
+			dRet += HypergeometricPDF( d, b+d, c+d, a+b+c+d );
+			a++;d++;c--;b--;
+		}
+	}
+	dRet *= 2;
+	return ( ( dRet > 1 ) ? 1 : dRet ); }
+
+
+/*!
+ * \brief
  * Calculate a precision/recall f-score.
  * 
  * \param iTruePositives

File src/statistics.h

 		const std::vector<bool>& vecfGenesOfInterest, bool fInvert = false );
 
 	// Probability distributions
-	static double HypergeometricCDF( size_t iHitsOne, size_t iSizeOne, size_t iHitsTwo, size_t iSizeTwo );
+	static double HypergeometricCDF( size_t iBoth, size_t iNonZeroInOne, size_t iNonZeroInTwo, size_t iN );
+	static double TwoSidedHypergeometricCDF( size_t iHitsOne, size_t iSizeOne, size_t iHitsTwo, size_t iSizeTwo );
 	static double SampleGammaStandard( double dShape );
 	static double SampleGammaLogStandard( double dXX );
 	static double SampleNormalStandard( );
 		d = ( iObservations - ( iSample * dProbability ) ) / sqrt( iSample * dProbability *
 			( 1 - dProbability ) );
 		return ( 1 - NormalCDF( d, 0, 1 ) ); }
-
+	
 	/*!
 	 * \brief
 	 * Calculate the hypergeometric probability distribution given the sizes and overlap of two sets.
 	 * 
-	 * \param iHitsOne
-	 * Number of hits in the first (query) set.
+	 * \param iNonZeroInCommon
+	 * Number of non zero values that both share.
 	 * 
-	 * \param iSizeOne
+	 * \param iNonZeroInOne
 	 * Size of the first (query) set.
 	 * 
-	 * \param iHitsTwo
+	 * \param iNonZeroInTwo
 	 * Number of hits in the second (background) set.
 	 * 
-	 * \param iSizeTwo
-	 * Size of the second (background) set.
+	 * \param iTotalNumValues
+	 * Total number of values that were compared.
 	 * 
 	 * \returns
-	 * choose(iHitsTwo, iHitsOne) * choose(iSizeTwo - iHitsTwo, iSizeOne - iHitsOne) /
-	 * choose(iSizeTwo, iSizeOne)
+	 * choose(iNonZeroInTwo, iNonZeroInCommon) * choose(iTotalNumValues - iNonZeroInTwo, iNonZeroInOne - iNonZeroInCommon) /
+	 * choose(iTotalNumValues, iNonZeroInOne)
 	 * 
 	 * \remarks
 	 * Calculated using the exponential of CStatistics::LogFact results for increased speed and precision.
 	 */
-	static double HypergeometricPDF( size_t iHitsOne, size_t iSizeOne, size_t iHitsTwo,
-		size_t iSizeTwo ) {
+	static double HypergeometricPDF( size_t iNonZeroInCommon, size_t iNonZeroInOne,
+			size_t iNonZeroInTwo, size_t iTotalNumValues ) {
 
-		return exp( LogFact( iSizeTwo - iHitsTwo ) + LogFact( iHitsTwo ) + LogFact( iSizeOne ) +
-			LogFact( iSizeTwo - iSizeOne ) - LogFact( iHitsOne ) -
-			LogFact( iHitsTwo - iHitsOne ) - LogFact( iSizeTwo - iHitsTwo + iHitsOne - iSizeOne ) -
-			LogFact( iSizeOne - iHitsOne ) - LogFact( iSizeTwo ) ); }
-
+		return exp( LogFact( iTotalNumValues - iNonZeroInTwo ) + LogFact( iNonZeroInTwo ) //right margin
+				+ LogFact( iNonZeroInOne ) + LogFact( iTotalNumValues - iNonZeroInOne ) // bottom margin
+				- LogFact( iTotalNumValues ) // total
+				- LogFact( iNonZeroInCommon ) //1,1
+				- LogFact( iNonZeroInTwo - iNonZeroInCommon ) //1,0
+				- LogFact( iTotalNumValues - iNonZeroInTwo + iNonZeroInCommon - iNonZeroInOne ) //0, 0
+				- LogFact( iNonZeroInOne - iNonZeroInCommon ) //0,1
+		);
+	}
+	
 	/*!
 	 * \brief
 	 * Calculate a p-value for the given T and degrees of freedom.