Commits

chuttenh  committed d5698ad

[svn r403] Fix CClustKMeans to work with missing precomputed similarity values
Fix CPCL column normalization for zero stdev conditions
Add negative-only listing to Explainer

  • Participants
  • Parent commits d59e9f5

Comments (0)

Files changed (5)

File src/clustkmeans.cpp

 bool CClustKMeans::Cluster( const CDistanceMatrix& MatSimilarities, size_t iK,
 	vector<uint16_t>& vecsClusters ) {
 	size_t			i, j, iOne, iIteration, iChanged, iState;
-	float			d, dMax;
+	float			d, dMax, dMin;
 	CDataMatrix		MatPrev, MatNext;
 	vector<size_t>	veciPrev, veciNext;
 	uint16_t		sMax;
 	if( MatSimilarities.GetSize( ) < iK )
 		return false;
 
-	dMax = -FLT_MAX;
+	dMax = -( dMin = FLT_MAX );
 	for( i = 0; i < MatSimilarities.GetSize( ); ++i )
 		for( j = ( i + 1 ); j < MatSimilarities.GetSize( ); ++j )
-			if( !CMeta::IsNaN( d = MatSimilarities.Get( i, j ) ) && ( d > dMax ) )
-				dMax = d;
-	if( dMax == -FLT_MAX )
+			if( !CMeta::IsNaN( d = MatSimilarities.Get( i, j ) ) ) {
+				if( d > dMax )
+					dMax = d;
+				if( d < dMin )
+					dMin = d; }
+	if( dMin == FLT_MAX )
 		return false;
 	dMax++;
+	dMin--;
 	MatPrev.Initialize( MatSimilarities.GetSize( ), iK );
 	for( i = 0; i < MatPrev.GetColumns( ); ++i ) {
 		iOne = rand( ) % MatSimilarities.GetSize( );
 		for( j = 0; j < MatPrev.GetRows( ); ++j )
-			MatPrev.Set( j, i, ( j == iOne ) ? dMax : MatSimilarities.Get( iOne, j ) ); }
+			MatPrev.Set( j, i, GetClean( iOne, j, dMin, dMax, MatSimilarities ) ); }
 	MatNext.Initialize( MatPrev.GetRows( ), MatPrev.GetColumns( ) );
 	MatNext.Clear( );
 
 				if( vecsClusters[ i ] != iK )
 					veciNext[ vecsClusters[ i ] ]--;
 				veciNext[ sMax ]++;
-				for( j = 0; j < MatSimilarities.GetSize( ); ++j )
-					if( !CMeta::IsNaN( d = ( ( i == j ) ? dMax : MatSimilarities.Get( i, j ) ) ) ) {
-						if( vecsClusters[ i ] != iK )
-							MatNext.Get( j, vecsClusters[ i ] ) -= d;
-						MatNext.Get( j, sMax ) += d; }
+				for( j = 0; j < MatSimilarities.GetSize( ); ++j ) {
+					d = GetClean( i, j, dMin, dMax, MatSimilarities );
+					if( vecsClusters[ i ] != iK )
+						MatNext.Get( j, vecsClusters[ i ] ) -= d;
+					MatNext.Get( j, sMax ) += d; }
 				vecsClusters[ i ] = sMax; } }
 
 		for( i = 0; i < veciNext.size( ); ++i )
 				g_CatSleipnir.info( "CClustKMeans::Cluster( %d ) moving gene %d into empty cluster %d", iK,
 					iOne, i );
 				veciNext[ vecsClusters[ iOne ] ]--;
-				for( j = 0; j < MatNext.GetRows( ); ++j )
-					if( !CMeta::IsNaN( d = ( ( j == iOne ) ? dMax : MatSimilarities.Get( iOne, j ) ) ) ) {
-						MatNext.Get( j, vecsClusters[ iOne ] ) -= d;
-						MatNext.Set( j, i, d ); }
+				for( j = 0; j < MatNext.GetRows( ); ++j ) {
+					d = GetClean( iOne, j, dMin, dMax, MatSimilarities );
+					MatNext.Get( j, vecsClusters[ iOne ] ) -= d;
+					MatNext.Set( j, i, d ); }
 				veciNext[ i ]++; }
 
 // This calculates a simple hash for the current cluster assignments

File src/clustkmeansi.h

 #include <vector>
 
 #include "fullmatrix.h"
-#include "typesi.h"
+#include "halfmatrix.h"
+#include "meta.h"
 
 namespace Sleipnir {
 
 class CClustKMeansImpl {
 protected:
 	static void Randomize( CDataMatrix&, size_t, const CDataMatrix& );
+
+	static float GetClean( size_t iOne, size_t iTwo, float dMin, float dMax, const CDistanceMatrix& Mat ) {
+		float	dRet;
+
+		return ( ( iOne == iTwo ) ? dMax : ( CMeta::IsNaN( dRet = Mat.Get( iOne, iTwo ) ) ? dMin : dRet ) ); }
 };
 
 }

File src/pcl.cpp

 						dStd += d * d; }
 				if( iCount ) {
 					dAve /= iCount;
-					dStd = sqrt( ( dStd / iCount ) - ( dAve * dAve ) );
-					if( dStd )
-						for( j = 0; j < GetGenes( ); ++j )
-							if( !CMeta::IsNaN( d = Get( j, i ) ) )
-								Set( j, i, (float)( ( d - dAve ) / dStd ) ); } }
+					if( ( dStd = sqrt( ( dStd / iCount ) - ( dAve * dAve ) ) ) <= 0 )
+						dStd = 1;
+					for( j = 0; j < GetGenes( ); ++j )
+						if( !CMeta::IsNaN( d = Get( j, i ) ) )
+							Set( j, i, (float)( ( d - dAve ) / dStd ) ); } }
 			break;
 
 		case ENormalizeMinMax:

File tools/Explainer/Explainer.cpp

 				continue;
 			if( !sArgs.everything_flag && ( ( ( iTwo = veciGenes[ j ] ) == -1 ) ||
 				CMeta::IsNaN( dAnswer = Answers.Get( iOne, iTwo ) ) ||
-				( sArgs.positives_flag && ( dAnswer <= 0 ) ) ) )
+				( sArgs.positives_flag && ( dAnswer <= 0 ) ) ||
+				( sArgs.negatives_flag && ( dAnswer > 0 ) ) ) )
 				continue;
 			if( ( (float)rand( ) / RAND_MAX ) > sArgs.fraction_arg )
 				continue;

File tools/Explainer/Explainer.ggo

 							int	default="-1"
 option	"positives"		p	"Include only positive pairs"
 							flag	off
+option	"negatives"		P	"Include only negative pairs"
+							flag	off
 option	"everything"	e	"Include pairs without answers"
 							flag	off
 option	"unknowns"		u	"Treatment of unknown genes"