Anonymous avatar Anonymous committed 1f6a970

Add covariate mutual information approximation to Counter
Approximates joint P(X1, X2) as sum(M, prod(X, P(X|M))P(M)) over covariate
In our case, covariate is gene membership in GO/KEGG/etc. terms
Works great! Regularizes Bayes nets at least as well as full MI
Fix symmetric CDat normalization to use normal CDF

Comments (0)

Files changed (7)

 			if( !CMeta::IsNaN( d = Get( i, j ) ) )
 				Set( i, j, 1.0f / ( 1 + exp( -d ) ) ); }
 
-void CDatImpl::NormalizeSigmoidSymmetric( ) {
-	size_t	i, j, iN, iNUp, iNDown;
-	float	d, dAveUp, dAveDown, dStdUp, dStdDown;
+void CDatImpl::NormalizeNormCDF( ) {
+	size_t	i, j, iN;
+	float	d;
 	double	dAve, dStd;
 
 	AveStd( dAve, dStd, iN );
-	dAveUp = dAveDown = dStdUp = dStdDown = 0;
-	for( iNUp = iNDown = i = 0; i < GetGenes( ); ++i )
+	for( i = 0; i < GetGenes( ); ++i )
 		for( j = ( i + 1 ); j < GetGenes( ); ++j )
-			if( !CMeta::IsNaN( d = Get( i, j ) ) ) {
-				if( d >= dAve ) {
-					iNUp++;
-					dAveUp += d;
-					dStdUp += d * d; }
-				else {
-					iNDown++;
-					dAveDown += d;
-					dStdDown += d * d; } }
-	if( iNUp ) {
-		dAveUp /= iNUp;
-		dStdUp = sqrt( ( dStdUp / ( max( 2, iNUp ) - 1 ) ) - ( dAveUp * dAveUp ) ); }
-	if( iNDown ) {
-		dAveDown /= iNDown;
-		dStdDown = sqrt( ( dStdDown / ( max( 2, iNDown ) - 1 ) ) - ( dAveDown * dAveDown ) ); }
-	for( iNUp = iNDown = i = 0; i < GetGenes( ); ++i )
-		for( j = ( i + 1 ); j < GetGenes( ); ++j )
-			if( !CMeta::IsNaN( d = Get( i, j ) ) ) {
-				d = (float)( ( d - dAve ) / ( ( d >= dAve ) ? dStdUp : dStdDown ) );
-				Set( i, j, 1.0f / ( 1 + exp( -d ) ) ); } }
+			if( !CMeta::IsNaN( d = Get( i, j ) ) )
+				Set( i, j, (float)CStatistics::NormalCDF( d, dAve, dStd ) ); }
 
 void CDatImpl::NormalizeMinmax( ) {
 	float	d, dMin, dMax;
 		 * Sigmoid transform scores to the range [0, 1].
 		 */
 		ENormalizeSigmoid	= ENormalizeZScore + 1,
-		ENormalizeSigmSymm	= ENormalizeSigmoid + 1
+		ENormalizeNormCDF	= ENormalizeSigmoid + 1
 	};
 
 	bool Open( const char* szFile, bool fMemmap = false, size_t iSkip = 2, bool fZScore = false,
 				NormalizeStdev( );
 				break;
 
-			case ENormalizeSigmSymm:
-				NormalizeSigmoidSymmetric( );
+			case ENormalizeNormCDF:
+				NormalizeNormCDF( );
 				break;
 
 			default:
 	void NormalizeMinmax( );
 	void NormalizeStdev( );
 	void NormalizeSigmoid( );
-	void NormalizeSigmoidSymmetric( );
+	void NormalizeNormCDF( );
 	void OpenHelper( const CGenes*, float );
 	void OpenHelper( const CGenes*, const CGenes*, float );
 	bool OpenHelper( );

tools/Counter/Counter.cpp

 		size_t						iDatasetOne, iDatasetTwo, iValueOne, iValueTwo, iGroup, iCountOne, iCountTwo;
 		const CFullMatrix<size_t>*	pMatOne;
 		const CFullMatrix<size_t>*	pMatTwo;
-		vector<float>				vecdOne, vecdTwo;
-		CFullMatrix<float>			MatJoint;
-		float						dCountOne, dCountTwo, dCountJoint, dOne, dTwo, dJoint, dMI;
+		vector<double>				vecdOne, vecdTwo, vecdGroups;
+		vector<size_t>				veciCounts;
+		CFullMatrix<double>			MatJoint;
+		double						dOne, dTwo, dJoint, dMI;
+
+		veciCounts.resize( m_vecsetiGenes.size( ) );
+		fill( veciCounts.begin( ), veciCounts.end( ), 0 );
+		for( iCountOne = iDatasetOne = 0; iDatasetOne < m_vecpMatCounts.size( ); ++iDatasetOne ) {
+			if( !( pMatOne = m_vecpMatCounts[iDatasetOne] ) )
+				continue;
+			for( iValueOne = 0; iValueOne < pMatOne->GetRows( ); ++iValueOne )
+				for( iGroup = 0; iGroup < pMatOne->GetColumns( ); ++iGroup ) {
+					iCountTwo = pMatOne->Get( iValueOne, iGroup );
+					veciCounts[iGroup] += iCountTwo;
+					iCountOne += iCountTwo; } }
+		vecdGroups.resize( veciCounts.size( ) );
+		for( iGroup = 0; iGroup < vecdGroups.size( ); ++iGroup )
+			vecdGroups[iGroup] = (double)veciCounts[iGroup] / iCountOne;
 
 		for( iDatasetOne = 0; iDatasetOne < m_vecpMatCounts.size( ); ++iDatasetOne ) {
 			if( !( pMatOne = m_vecpMatCounts[iDatasetOne] ) )
 				continue;
+/*
+for( iGroup = 0; iGroup < m_vecsetiGenes.size( ); ++iGroup ) {
+for( iCountOne = iValueOne = 0; iValueOne < pMatOne->GetRows( ); ++iValueOne )
+iCountOne += pMatOne->Get( iValueOne, iGroup );
+for( iValueOne = 0; iValueOne < pMatOne->GetRows( ); ++iValueOne )
+cerr << ( iValueOne ? "\t" : "" ) << ( (double)pMatOne->Get( iValueOne, iGroup ) / iCountOne );
+cerr << endl; }
+//*/
 			vecdOne.resize( pMatOne->GetRows( ) );
 			for( iDatasetTwo = iDatasetOne; iDatasetTwo < m_vecpMatCounts.size( ); ++iDatasetTwo ) {
 				if( !( pMatTwo = m_vecpMatCounts[iDatasetTwo] ) )
 				vecdTwo.resize( pMatTwo->GetRows( ) );
 				MatJoint.Initialize( vecdOne.size( ), vecdTwo.size( ) );
 
-				dCountOne = dCountTwo = dCountJoint = 0;
 				fill( vecdOne.begin( ), vecdOne.end( ), 0.0f );
 				fill( vecdTwo.begin( ), vecdTwo.end( ), 0.0f );
 				MatJoint.Clear( );
 					for( iCountTwo = iValueTwo = 0; iValueTwo < vecdTwo.size( ); ++iValueTwo )
 						iCountTwo += pMatTwo->Get( iValueTwo, iGroup );
 					for( iValueOne = 0; iValueOne < vecdOne.size( ); ++iValueOne ) {
-						dOne = (float)pMatOne->Get( iValueOne, iGroup ) / ( iCountOne ? iCountOne : 1 );
-						dCountOne += dOne;
-						vecdOne[iValueOne] += dOne;
+						dOne = (double)pMatOne->Get( iValueOne, iGroup ) / ( iCountOne ? iCountOne : 1 );
+						vecdOne[iValueOne] += dOne * vecdGroups[iGroup];
 						for( iValueTwo = 0; iValueTwo < vecdTwo.size( ); ++iValueTwo ) {
-							dTwo = (float)pMatTwo->Get( iValueTwo, iGroup ) / ( iCountTwo ? iCountTwo : 1 );
-							if( !iValueOne ) {
-								dCountTwo += dTwo;
-								vecdTwo[iValueTwo] += dTwo; }
-							dTwo *= dOne;
-							dCountJoint += dTwo;
-							MatJoint.Get( iValueOne, iValueTwo ) += dTwo; } } }
+							dTwo = (double)pMatTwo->Get( iValueTwo, iGroup ) / ( iCountTwo ? iCountTwo : 1 );
+							if( !iValueOne )
+								vecdTwo[iValueTwo] += dTwo * vecdGroups[iGroup];
+							MatJoint.Get( iValueOne, iValueTwo ) += dOne * dTwo * vecdGroups[iGroup]; } } }
 /*
+				if( iDatasetOne == iDatasetTwo ) {
+					MatJoint.Clear( );
+					for( iValueOne = 0; iValueOne < vecdOne.size( ); ++iValueOne )
+						MatJoint.Set( iValueOne, iValueOne, vecdOne[iValueOne] ); }
+//*/
+/*
+cerr << "One: " << vecstrNames[iDatasetOne] << endl;
 for( iValueOne = 0; iValueOne < vecdOne.size( ); ++iValueOne )
 cerr << ( iValueOne ? "\t" : "" ) << vecdOne[iValueOne];
 cerr << endl;
+cerr << "Two: " << vecstrNames[iDatasetTwo] << endl;
 for( iValueTwo = 0; iValueTwo < vecdTwo.size( ); ++iValueTwo )
 cerr << ( iValueTwo ? "\t" : "" ) << vecdTwo[iValueTwo];
 cerr << endl;
+cerr << "Joint:" << endl;
 for( iValueOne = 0; iValueOne < vecdOne.size( ); ++iValueOne ) {
 for( iValueTwo = 0; iValueTwo < vecdTwo.size( ); ++iValueTwo )
 cerr << ( iValueTwo ? "\t" : "" ) << MatJoint.Get( iValueOne, iValueTwo );
 //*/
 
 				for( dMI = 0,iValueOne = 0; iValueOne < vecdOne.size( ); ++iValueOne ) {
-					dOne = vecdOne[iValueOne] / ( dCountOne ? dCountOne : 1 );
+					dOne = vecdOne[iValueOne];
 					for( iValueTwo = 0; iValueTwo < vecdTwo.size( ); ++iValueTwo )
-						if( dJoint = MatJoint.Get( iValueOne, iValueTwo ) ) {
-							dJoint /= ( dCountJoint ? dCountJoint : 1 );
-							dMI += dJoint * ( dJoint ? log( dJoint * dCountTwo / dOne / vecdTwo[iValueTwo] ) : 0 ); } }
-				if( dCountOne || dCountTwo )
-					dMI -= ( vecdOne.size( ) - 1 ) * ( vecdTwo.size( ) - 1 ) / ( 2 * ( dCountOne + dCountTwo ) );
+						if( dJoint = MatJoint.Get( iValueOne, iValueTwo ) )
+							dMI += dJoint * ( dJoint ? log( dJoint / dOne / vecdTwo[iValueTwo] ) : 0 ); }
+//cerr << "MI: " << dMI << endl;
 				dMI = ( dMI < 0 ) ? 0 : ( dMI / log( 2.0f ) );
 
 				ostm << vecstrNames[iDatasetOne] << '\t' << vecstrNames[iDatasetTwo] << '\t' << dMI << endl; } } }

tools/Normalizer/Normalizer.cpp

 	{"globalz",	CDat::ENormalizeZScore},
 	{"0to1",	CDat::ENormalizeMinMax},
 	{"sigmoid",	CDat::ENormalizeSigmoid},
-	{"sigsym",	CDat::ENormalizeSigmSymm},
+	{"normcdf",	CDat::ENormalizeNormCDF},
 	{NULL,		CDat::ENormalizeNone}
 };
 

tools/Normalizer/Normalizer.ggo

 option	"itype"		t	"Data file type"
 						values="pcl","dat"	default="dat"
 option	"otype"		T	"Normalization type"
-						values="columnz","rowz","globalz","column0","0to1","colcenter","medmult","colfrac","sigmoid","sigsym"	default="globalz"
+						values="columnz","rowz","globalz","column0","0to1","colcenter","medmult","colfrac","sigmoid","normcdf"	default="globalz"
 
 section "Optional"
 option	"flip"		f	"Flip high/low scores"

tools/Normalizer/cmdline.c

   "  -i, --input=filename   Input/output PCL/DAT/DAB file",
   "  -o, --output=filename  Output PCL/DAB file",
   "  -t, --itype=STRING     Data file type  (possible values=\"pcl\", \"dat\" \n                           default=`dat')",
-  "  -T, --otype=STRING     Normalization type  (possible values=\"columnz\", \n                           \"rowz\", \"globalz\", \"column0\", \"0to1\", \n                           \"colcenter\", \"medmult\", \"colfrac\", \n                           \"sigmoid\", \"sigsym\" default=`globalz')",
+  "  -T, --otype=STRING     Normalization type  (possible values=\"columnz\", \n                           \"rowz\", \"globalz\", \"column0\", \"0to1\", \n                           \"colcenter\", \"medmult\", \"colfrac\", \n                           \"sigmoid\", \"normcdf\" default=`globalz')",
   "\nOptional:",
   "  -f, --flip             Flip high/low scores  (default=off)",
   "  -s, --skip=INT         Columns to skip in input PCL  (default=`2')",
 
 
 char *cmdline_parser_itype_values[] = {"pcl", "dat", 0} ;	/* Possible values for itype.  */
-char *cmdline_parser_otype_values[] = {"columnz", "rowz", "globalz", "column0", "0to1", "colcenter", "medmult", "colfrac", "sigmoid", "sigsym", 0} ;	/* Possible values for otype.  */
+char *cmdline_parser_otype_values[] = {"columnz", "rowz", "globalz", "column0", "0to1", "colcenter", "medmult", "colfrac", "sigmoid", "normcdf", 0} ;	/* Possible values for otype.  */
 
 static char *
 gengetopt_strdup (const char *s);
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.