Commits

Jian Zhou  committed 2a24905

Weighted context: Added AUC calculation using weighted context for DChecker; Adjustments for reducing rounding error;

  • Participants
  • Parent commits a210239

Comments (0)

Files changed (5)

File src/statistics.cpp

 #include "statistics.h"
 #include "measure.h"
 #include "dat.h"
-
+#define WT_MULTIPLIER 50
 /*
  * Implementations thanks to:
  * Numerical Recipes in C
 
 	return ( dSum / iPos / iNeg ); }
 
+double CStatistics::WilcoxonRankSum( const CDat& DatData, const CDat& DatAnswers,
+	const vector<float>& vecGeneWeights, bool flipneg) {
+	size_t				i, j, k,m, iOne, iTwo, iNeg;
+	uint64_t			iPos;
+	float				d, dAnswer,w;
+	double				dSum;
+	std::vector<size_t>		veciGenes;
+	std::vector<float>		vecdValues, vecdRanks;
+	bool				fAnswer;
+
+	veciGenes.resize( DatAnswers.GetGenes( ) );
+	for( i = 0; i < veciGenes.size( ); ++i )
+		veciGenes[ i ] = DatData.GetGene( DatAnswers.GetGene( i ) );
+
+	for( i = 0; i < DatAnswers.GetGenes( ); ++i ) {
+		if( ( iOne = veciGenes[ i ] ) == -1 )
+			continue;
+		for( j = ( i + 1 ); j < DatAnswers.GetGenes( ); ++j ) {
+			if( ( ( iTwo = veciGenes[ j ] ) == -1 ) ||
+				CMeta::IsNaN( dAnswer = DatAnswers.Get( i, j ) ) ||
+				CMeta::IsNaN( d = DatData.Get( iOne, iTwo ) )||
+				CMeta::IsNaN( w = vecGeneWeights[i]*vecGeneWeights[j] ))
+				continue;
+			fAnswer = dAnswer > 0;
+//write sth here
+			if(fAnswer || !flipneg)
+				for( m = 0; m <(vecGeneWeights[i]*vecGeneWeights[j]*WT_MULTIPLIER-0.5); m++){
+					vecdValues.push_back( d );}  
+			else
+				for( m = 0; m <((1-vecGeneWeights[i]*vecGeneWeights[j])*WT_MULTIPLIER-0.5); m++){
+					vecdValues.push_back( d );} } }
+	{
+		std::vector<size_t>	veciIndices;
+		size_t				iIndex, iCount;
+
+		veciIndices.resize( vecdValues.size( ) );
+		for( i = 0; i < vecdValues.size( ); ++i )
+			veciIndices[ i ] = i;
+		std::sort( veciIndices.begin( ), veciIndices.end( ), SCompareRank<float>( vecdValues ) );
+		vecdRanks.resize( veciIndices.size( ) );
+		for( i = 0; i < vecdRanks.size( ); ++i ) {
+			iIndex = veciIndices[ i ];
+			if( !i || ( vecdValues[ iIndex ] != vecdValues[ veciIndices[ i - 1 ] ] ) ) {
+				for( iCount = 0,j = i; j < veciIndices.size( ); ++j ) {
+					if( vecdValues[ veciIndices[ j ] ] != vecdValues[ iIndex ] )
+						break;
+					iCount++; }
+				d = i + ( iCount - 1 ) / 2.0f; }
+			vecdRanks[ iIndex ] = d; }
+	}
+
+	for( dSum = 0,iPos = iNeg = i = k = 0; i < DatAnswers.GetGenes( ); ++i ) {
+		if( ( iOne = veciGenes[ i ] ) == -1 )
+			continue;
+		for( j = ( i + 1 ); j < DatAnswers.GetGenes( ); ++j ) {
+			if( ( ( iTwo = veciGenes[ j ] ) == -1 ) ||
+				CMeta::IsNaN( DatData.Get( iOne, iTwo ) ) ||
+				CMeta::IsNaN( dAnswer = DatAnswers.Get( i, j ) )||
+				CMeta::IsNaN( w = vecGeneWeights[i]*vecGeneWeights[j] ) )
+			    continue;
+			fAnswer = dAnswer > 0;
+
+				if( dAnswer > 0 ) {
+					for( m = 0; m <(vecGeneWeights[i]*vecGeneWeights[j]*WT_MULTIPLIER-0.5); m++){
+					iPos++;
+					dSum += vecdRanks[ k ];
+					k++;}}
+				else{
+					if(flipneg)
+						for( m = 0; m <((1-vecGeneWeights[i]*vecGeneWeights[j])*WT_MULTIPLIER-0.5); m++){
+							iNeg++;
+						k++;}
+					else
+						for( m = 0; m <(vecGeneWeights[i]*vecGeneWeights[j]*WT_MULTIPLIER-0.5); m++){
+							iNeg++;
+						k++;}
+				}
+		
+		}}
+	dSum -= ( iPos * ( iPos - 1 ) ) / 2;
+
+	return ( dSum / iPos / iNeg ); }
+
+double CStatistics::WilcoxonRankSum( const CDat& DatData, const CDat& DatAnswers,
+	const CDat& wDat, bool flipneg) {
+	size_t				i, j, k,m, iOne, iTwo, iNeg;
+	uint64_t			iPos;
+	float				d, dAnswer,w;
+	double				dSum;
+	std::vector<size_t>		veciGenes,vecfiGenes;
+	std::vector<float>		vecdValues, vecdRanks, vecdWeights;
+	bool				fAnswer;
+
+
+	veciGenes.resize( DatAnswers.GetGenes( ) );
+	for( i = 0; i < veciGenes.size( ); ++i )
+		veciGenes[ i ] = DatData.GetGene( DatAnswers.GetGene( i ) );
+
+	vecfiGenes.resize( DatAnswers.GetGenes( ) );
+	for( i = 0; i < vecfiGenes.size( ); ++i ){
+		vecfiGenes[ i ] = wDat.GetGene( DatAnswers.GetGene( i ));}
+
+	for( i = 0; i < DatAnswers.GetGenes( ); ++i ) {
+		if( ( iOne = veciGenes[ i ] ) == -1 || vecfiGenes[ i ] == -1 )
+			continue;
+		for( j = ( i + 1 ); j < DatAnswers.GetGenes( ); ++j ) {
+			if( ( ( iTwo = veciGenes[ j ] ) == -1 ||vecfiGenes[ j ] == -1 ) ||
+				CMeta::IsNaN( dAnswer = DatAnswers.Get( i, j ) ) ||
+				CMeta::IsNaN( d = DatData.Get( iOne, iTwo ) )||
+				CMeta::IsNaN( w = wDat.Get(vecfiGenes[i],vecfiGenes[j]) ))
+				continue;
+			fAnswer = dAnswer > 0;
+			
+			if(fAnswer || !flipneg)
+				for( m = 0; m <(w*WT_MULTIPLIER-0.5); m++){
+					vecdValues.push_back(d);}
+			else
+				for( m = 0; m <((1-w)*WT_MULTIPLIER-0.5); m++){
+					vecdValues.push_back(d);}
+
+			} }
+
+	{
+		std::vector<size_t>	veciIndices;
+		size_t				iIndex, iCount;
+
+		veciIndices.resize( vecdValues.size( ) );
+		for( i = 0; i < vecdValues.size( ); ++i )
+			veciIndices[ i ] = i;
+		std::sort( veciIndices.begin( ), veciIndices.end( ), SCompareRank<float>( vecdValues ) );
+		vecdRanks.resize( veciIndices.size( ) );
+		for( i = 0; i < vecdRanks.size( ); ++i ) {
+			iIndex = veciIndices[ i ];
+			if( !i || ( vecdValues[ iIndex ] != vecdValues[ veciIndices[ i - 1 ] ] ) ) {
+				for( iCount = 0,j = i; j < veciIndices.size( ); ++j ) {
+					if( vecdValues[ veciIndices[ j ] ] != vecdValues[ iIndex ] )
+						break;
+					iCount++; }
+				d = i + ( iCount - 1 ) / 2.0f; }
+			vecdRanks[ iIndex ] = d; }
+	}
+
+	for( dSum = 0,iPos = iNeg = i = k = 0; i < DatAnswers.GetGenes( ); ++i ) {
+		if( ( iOne = veciGenes[ i ] ) == -1 || vecfiGenes[ i ] == -1)
+			continue;
+		for( j = ( i + 1 ); j < DatAnswers.GetGenes( ); ++j ) {
+			if( ( ( iTwo = veciGenes[ j ] ) == -1 || vecfiGenes[ j ] == -1 ) ||
+				CMeta::IsNaN( DatData.Get( iOne, iTwo ) ) ||
+				CMeta::IsNaN( dAnswer = DatAnswers.Get( i, j ) )||
+				CMeta::IsNaN( w = wDat.Get(vecfiGenes[i],vecfiGenes[j]) ) )
+			    continue;
+			fAnswer = dAnswer > 0;
+				if( dAnswer > 0 ) {
+					for( m = 0; m <(wDat.Get(vecfiGenes[i],vecfiGenes[j])*WT_MULTIPLIER-0.5); m++){
+						iPos++;
+						dSum += vecdRanks[ k ]; 
+						k++;}}
+				else{
+					if(flipneg)
+						for( m = 0; m <((1-wDat.Get(vecfiGenes[i],vecfiGenes[j]))*WT_MULTIPLIER-0.5); m++){
+							iNeg++;
+							k++;}
+					else
+						for( m = 0; m <(wDat.Get(vecfiGenes[i],vecfiGenes[j])*WT_MULTIPLIER-0.5); m++){
+							iNeg++;
+							k++;}
+					 }
+		}}
+
+	dSum -= ( iPos * ( iPos - 1 ) ) / 2;
+
+	return ( dSum / iPos / iNeg ); }
+
 double bessi0( double dX ) {
 	double	dAbs, dRet, dY;
 

File src/statistics.h

 	
 	static double WilcoxonRankSum(const CDat& DatData, const CDat& DatAnswers, const std::vector<bool>& vecfGenesOfInterest, const std::vector<bool>& vecfUbik, bool fPosIn, bool fNegIn, bool fPosBridge, bool fNegBridge, bool fPosOut, bool fNegOut, bool fInvert = false);
 	static double WilcoxonRankSum(const CPCL& DatData, const CPCL& DatAnswers, const std::vector<bool>& vecfGenesOfInterest, const std::vector<bool>& vecfUbik, bool fPosIn, bool fNegIn, bool fPosBridge, bool fNegBridge, bool fPosOut, bool fNegOut, bool fInvert = false);
-	
+	static double WilcoxonRankSum( const CDat& DatData, const CDat& DatAnswers, const vector<float>& vecGeneWeights, bool flipneg);
+	static double WilcoxonRankSum( const CDat& DatData, const CDat& DatAnswers,  const CDat& wDat, bool flipneg);
 	// Probability distributions
 	static double HypergeometricCDF(size_t iBoth, size_t iNonZeroInOne,
 			size_t iNonZeroInTwo, size_t iN);

File tools/Counter/Counter.cpp

 
 
 #define WT_MULTIPLIER 50
+#define WT_MULTIPLIERf 50.0
 
 class CRegularize;
 
     vector<size_t>	veciGenes, vecfiGenes;
 	vector<float>	vecGeneWeights;
     psData = (SLearn*)pData;
+	float			w;
 
     if (psData->m_pUbikGenes->GetGenes( )) {
 		vecfUbik.resize( psData->m_pAnswers->GetGenes( ) );
 					//When contexts are weighted, add counts = WT_MULTIPLIER * weight1 * weight 2 
 					if(psData->m_pGenes->IsWeighted()){
 						if(iAnswer==1 || !psData->m_bFlipNeg)
-							for( k = 0; k <(vecGeneWeights[i]*vecGeneWeights[j]*WT_MULTIPLIER); k++){
+							for( k = 0; k <(vecGeneWeights[i]*vecGeneWeights[j]*WT_MULTIPLIER-0.5); k++){
 								psData->m_pMatCounts->Get( iVal, iAnswer )++;
 								}
 						else
-							for( k = 0; k <((1-vecGeneWeights[i]*vecGeneWeights[j])*WT_MULTIPLIER); k++){
+							for( k = 0; k <((1-vecGeneWeights[i]*vecGeneWeights[j])*WT_MULTIPLIER-0.5); k++){
 								psData->m_pMatCounts->Get( iVal, iAnswer )++;
 								}
 					}	
 					else if(psData->m_isDatWeighted){
+						if(CMeta::IsNaN(w = psData->m_pwDat->Get( vecfiGenes[i],vecfiGenes[j] )) || vecfiGenes[i] == -1 ||
+							vecfiGenes[j] == -1)
+							continue;
 						if(iAnswer==1 || !psData->m_bFlipNeg)
-							for( k = 0; k <(psData->m_pwDat->Get( vecfiGenes[i],vecfiGenes[j] ) *WT_MULTIPLIER); k++){
+							for( k = 0; k <(w *WT_MULTIPLIER-0.5); k++){
 								psData->m_pMatCounts->Get( iVal, iAnswer )++;
 								}
 						else
-							for( k = 0; k <((1-psData->m_pwDat->Get( vecfiGenes[i],vecfiGenes[j] )) *WT_MULTIPLIER); k++){
+							for( k = 0; k <((1-w) *WT_MULTIPLIER-0.5); k++){
 								psData->m_pMatCounts->Get( iVal, iAnswer )++;
 								}
 					}
 					else{
 					psData->m_pMatCounts->Get( iVal, iAnswer )++;
-					//FIXME: Regularization has not supportted weighted context
+					//FIXME: Regularization has not been supported for weighted context
 					psData->m_pRegularize->Add( psData->m_iDat, *psData->m_pDat, i, j, iVal );
 					}
 			}
 	if(psData->m_pGenes->IsWeighted()||psData->m_isDatWeighted){
 		for (i=0; i< psData->m_pMatCounts->GetRows();i++)
 			for(j=0; j<psData->m_pMatCounts->GetColumns();j++)
-				psData->m_pMatCounts->Get( i,j ) /= WT_MULTIPLIER;
+				psData->m_pMatCounts->Get( i,j ) = int(psData->m_pMatCounts->Get( i,j )/ WT_MULTIPLIERf + 0.5);
 	}
 
     return NULL;

File tools/DChecker/DChecker.cpp

 double AUCMod( const CDat&, const CDat&, const vector<bool>&, const vector<bool>&, const gengetopt_args_info&, bool, float );
 
 int main( int iArgs, char** aszArgs ) {
-    CDat		    Answers, Data;
+    CDat		    Answers, Data, wDat;
     gengetopt_args_info	    sArgs;
     size_t	    	    i, j, k, m, iOne, iTwo, iGenes, iPositives, iNegatives, iBins, iRand;
     vector<size_t>	    veciGenes, veciRec, veciRecTerm;
     int			    iMax;
     float		    dAnswer, dValue;
     vector<bool>    	    vecfHere, vecfUbik;
-    vector<float>	    vecdScores, vecdSSE, vecdBinValue;
+    vector<float>	    vecdScores, vecdSSE, vecdBinValue,vecGeneWeights;
     vector<size_t>	    veciPositives, veciNegatives, veciGenesTerm;
+	CGenome				Genome;
+	CGenes				wGenes(Genome);
     ofstream		    ofsm;
     ostream*		    postm;
     map<float,size_t>	    mapValues;
-    bool		    fMapAnswers;
-    CGenome		    Genome;
+    bool		    fMapAnswers,isDatWeighted=false;
     CGenes		    GenesTm( Genome ), GenesUbik( Genome );
 
     if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
     }
     if( sArgs.normalize_flag )
         Data.Normalize( CDat::ENormalizeMinMax );
+	
+	
+	if(sArgs.weights_arg){
+		ifstream	ifsm;
+		ifsm.open( sArgs.weights_arg );
+			if( !wGenes.OpenWeighted( ifsm ) ) {
+				if(!wDat.Open(sArgs.weights_arg, !!sArgs.memmap_flag )){
+					cerr << "Couldn't open: " << sArgs.inputs[ i ] << endl;
+					return 1;	}
+				else{
+					isDatWeighted = true;
+				}
+			}else{
+				vecGeneWeights.resize(Answers.GetGenes( ));
+				for( i = 0; i < vecGeneWeights.size( ); ++i ){
+					vecGeneWeights[ i ] = wGenes.GetGeneWeight(wGenes.GetGene( Answers.GetGene( i ) ));}
+			}
+		}	
 
     veciGenes.resize( Answers.GetGenes( ) );
     for( i = 0; i < Answers.GetGenes( ); ++i )
             else
                 postm = &cout;
 
-            if( !sArgs.sse_flag ) {
+            if( !sArgs.sse_flag  ) { //Weighted context is currently only supported for AUC calculation
                 *postm << "#	P	" << iPositives << endl;
                 *postm << "#	N	" << iNegatives << endl;
             }
+
+			if(!sArgs.weights_arg){
             *postm << "Cut	Genes	" << ( sArgs.sse_flag ? "Pairs	SSE" : "TP	FP	TN	FN	RC	PR	VALUE" ) << endl;
             for( i = 0; i < MatResults.GetRows( ); ++i ) {
                 *postm << ( iBins ? i : ( sArgs.min_arg + ( i * sArgs.delta_arg ) ) ) << '\t' <<
                 *postm << '\t' << vecdBinValue[ i ];
 
                 *postm << endl;
-            }
+			}}
+			else{
+				*postm << "AUC is calculated using weighted context."<<endl;
+				if(sArgs.flipneg_flag)
+					*postm << "Flipneg ON"<<endl;
+				else
+					*postm << "Flipneg OFF"<<endl;
+			}
+			
             if( !sArgs.sse_flag )
-                *postm << "#	AUC	" << ( sArgs.auc_arg ?
-                                           AUCMod( Data, Answers, vecfHere, vecfUbik, sArgs, !!sArgs.invert_flag, sArgs.auc_arg ) :
-                                           CStatistics::WilcoxonRankSum( Data, Answers, vecfHere, vecfUbik, !!sArgs.ctxtpos_flag, !!sArgs.ctxtneg_flag, !!sArgs.bridgepos_flag, !!sArgs.bridgeneg_flag, !!sArgs.outpos_flag, !!sArgs.outneg_flag, !!sArgs.invert_flag ) ) << endl;
+			{
+				if(sArgs.weights_arg){
+					if(isDatWeighted)
+					 *postm << "#	AUC	" <<  CStatistics::WilcoxonRankSum( Data, Answers, wDat,sArgs.flipneg_flag ) << endl;
+					else
+						*postm << "#	AUC	" <<  CStatistics::WilcoxonRankSum( Data, Answers, vecGeneWeights,sArgs.flipneg_flag ) << endl;
+				}
+				else{
+					if( sArgs.auc_arg)
+						  *postm << "#	AUC	" <<  AUCMod( Data, Answers, vecfHere, vecfUbik, sArgs, !!sArgs.invert_flag, sArgs.auc_arg ) << endl;
+					else{
+						*postm << "#	AUC	" <<  CStatistics::WilcoxonRankSum( Data, Answers, vecfHere, vecfUbik, !!sArgs.ctxtpos_flag, !!sArgs.ctxtneg_flag, !!sArgs.bridgepos_flag, !!sArgs.bridgeneg_flag, !!sArgs.outpos_flag, !!sArgs.outneg_flag, !!sArgs.invert_flag )  << endl;
 
+					}
+				}
+			}
+               
             if( sArgs.inputs_num )
                 ofsm.close( );
             else

File tools/DChecker/DChecker.ggo

                                                         flag    off
 option  "outneg"                U       "Use negative edges outside the context"
                                                         flag    off
+option	"weights"			W	"Weight file"
+							string	typestr="filename"
+option	"flipneg"           F       "Flip weights(one minus original) for negative standards"
+                                                        flag    on
 
 section "Preprocessing"
 option	"normalize"		n	"Normalize scores before processing"