1. libsleipnir
  2. sleipnir

Commits

chuttenh  committed 8511b95

[svn r420] Add minimum effect size for COALESCE conditions and motifs
This results in both different behavior and reorganized code
Improve (maybe) COALESCE handling of all missing values
Conditions with absolutely no data are ignored
Improve COALESCE default command line parameters
Only clusters that are complete subsets will merge
Genes will be kept in they're in >=1/2 of the merged clusters
Improve COALESCE wiggle track input (arbitrarily many unflagged on CLI)
Improve COALESCE behavior for unrecognized genes during postprocessing
Improve log4cpp stub
Fix Answerer negatives in main documentation example (thanks Jim Costello!)
Fix CStatistics::ZScore calculation bug for identical means
Add CStatistics::RootMeanSquareError
Add error logging for CClustKMeans with missing similarity measures

  • Participants
  • Parent commits c7c7d37
  • Branches sleipnir

Comments (0)

Files changed (14)

File proj/vs2008/COALESCE/COALESCE.vcproj

View file
  • Ignore whitespace
 		<File
 			RelativePath="..\..\..\tools\COALESCE\COALESCE.ggo"
 			>
+			<FileConfiguration
+				Name="Debug|Win32"
+				>
+				<Tool
+					Name="GenGetOpts"
+					UnnamedInputs="true"
+				/>
+			</FileConfiguration>
+			<FileConfiguration
+				Name="Release|Win32"
+				>
+				<Tool
+					Name="GenGetOpts"
+					UnnamedInputs="true"
+				/>
+			</FileConfiguration>
 		</File>
 	</Files>
 	<Globals>

File src/clustkmeans.cpp

View file
  • Ignore whitespace
 			for( sMax = j = 0; j < MatMeans.GetRows( ); ++j ) {
 				d = (float)pMeasure->Measure( MatData.Get( i ), MatData.GetColumns( ), MatMeans.Get( j ),
 					MatMeans.GetColumns( ), IMeasure::EMapNone, pMatWeights ? pMatWeights->Get( i ) : NULL );
-				if( CMeta::IsNaN( d ) )
-					return false;
+				if( CMeta::IsNaN( d ) ) {
+					g_CatSleipnir.error( "CClustKMeans::Cluster( %d ) got invalid measure for genes: %d, %d",
+						iK, i, j );
+					return false; }
 				if( d > dMax ) {
 					dMax = d;
 					sMax = j; } }

File src/coalesce.cpp

View file
  • Ignore whitespace
 			PCL.GetGenes( ), PCL.GetExperiments( ), iSequences );
 		for( i = 0; i < PCL.GetExperiments( ); ++i )
 			ossm << ( i ? "\t" : "" ) << PCL.GetExperiment( i );
-		g_CatSleipnir.notice( ossm.str( ) );
-		g_CatSleipnir.notice( "k %d, P gene %g, p condition %g, p motif %g, p correlation %g", GetK( ),
-			GetProbabilityGene( ), GetPValueCondition( ), GetPValueMotif( ), GetPValueCorrelation( ) );
+		g_CatSleipnir.notice( ossm.str( ).c_str( ) );
+		g_CatSleipnir.notice( "k %d, P gene %g, p condition %g, z condition %g, p motif %g, z motif %g, p correlation %g", GetK( ),
+			GetProbabilityGene( ), GetPValueCondition( ), GetZScoreCondition( ), GetPValueMotif( ),
+			GetZScoreMotif( ), GetPValueCorrelation( ) );
 		g_CatSleipnir.notice( "p merge %g, cutoff merge %g, penalty gap %g, penalty mismatch %g",
 			GetPValueMerge( ), GetCutoffMerge( ), GetMotifs( ) ? GetMotifs( )->GetPenaltyGap( ) : 0,
 			GetMotifs( ) ? GetMotifs( )->GetPenaltyMismatch( ) : 0 );
 			ossm << "CCoalesce::Cluster( ) initialized:";
 			for( iterGene = Cluster.GetGenes( ).begin( ); iterGene != Cluster.GetGenes( ).end( ); ++iterGene )
 				ossm << ' ' << PCL.GetGene( *iterGene );
-			g_CatSleipnir.debug( ossm.str( ) ); }
+			g_CatSleipnir.debug( ossm.str( ).c_str( ) ); }
 		if( Cluster.GetGenes( ).size( ) < GetSizeMinimum( ) )
 			continue;
 		while( !( Cluster.IsConverged( ) || Cluster.IsEmpty( ) ) ) {
 			Cluster.CalculateHistograms( GeneScores, HistsCluster, &HistsPot );
 			Cluster.Snapshot( GeneScores, HistsCluster );
 			Pot.Snapshot( GeneScores, HistsPot );
-			if( !Cluster.SelectConditions( PCLCopy, Pot, GetThreads( ), GetPValueCondition( ) ) )
+			if( !Cluster.SelectConditions( PCLCopy, Pot, GetThreads( ), GetPValueCondition( ),
+				GetZScoreCondition( ) ) )
 				return false;
 			if( Cluster.IsEmpty( ) ) {
 				g_CatSleipnir.notice( "CCoalesce::Cluster( ) selected no conditions" );
 			if( ( Cluster.GetGenes( ).size( ) >= GetSizeMinimum( ) ) &&
 				!( CombineMotifs( FASTA, veciPCL2FASTA, sModifiers, Cluster, GetThreads( ), GeneScores,
 				HistsCluster, HistsPot ) && Cluster.SelectMotifs( HistsCluster, HistsPot, GetPValueMotif( ),
-				GetSizeMaximum( ), GetThreads( ), GetMotifs( ) ) ) )
+				GetZScoreMotif( ),GetSizeMaximum( ), GetThreads( ), GetMotifs( ) ) ) )
 				return false;
 			if( !Cluster.SelectGenes( PCLCopy, GeneScores, HistsCluster, HistsPot, GetSizeMinimum( ),
 				GetThreads( ), Pot, GetProbabilityGene( ), GetMotifs( ) ) )
 			g_CatSleipnir.notice( "CCoalesce::Cluster( ) finalizing cluster" );
 			setpriiSeeds.clear( );
 			if( Cluster.GetMotifs( ).size( ) >= GetSizeMaximum( ) ) {
-				if( !Cluster.SelectMotifs( HistsCluster, HistsPot, GetPValueMotif( ), -1, GetThreads( ),
-					GetMotifs( ) ) )
+				if( !Cluster.SelectMotifs( HistsCluster, HistsPot, GetPValueMotif( ), GetZScoreMotif( ), -1,
+					GetThreads( ), GetMotifs( ) ) )
 					return false;
 				g_CatSleipnir.notice( "CCoalesce::Cluster( ) finalized %d motifs",
 					Cluster.GetMotifs( ).size( ) ); }

File src/coalesce.h

View file
  • Ignore whitespace
 
 		return m_iBins; }
 
+	float GetZScoreCondition( ) const {
+
+		return m_dZScoreCondition; }
+
+	void SetZScoreCondition( float dZScore ) {
+
+		m_dZScoreCondition = dZScore; }
+
 	float GetPValueCondition( ) const {
 
 		return m_dPValueCondition; }
 
 		m_dPValueCondition = dPValue; }
 
+	float GetZScoreMotif( ) const {
+
+		return m_dZScoreMotif; }
+
+	void SetZScoreMotif( float dZScore ) {
+
+		m_dZScoreMotif = dZScore; }
+
 	float GetPValueMotif( ) const {
 
 		return m_dPValueMotif; }

File src/coalescecluster.cpp

View file
  • Ignore whitespace
 			for( iGene = 0; iGene < veciPot.size( ); ++iGene )
 				if( !CMeta::IsNaN( d = PCL.Get( veciPot[ iGene ], iCondition ) ) )
 					adPot[ iPot++ ] = d;
-			if( !( iCluster || iPot ) )
+			if( !( iCluster && iPot ) )
 				continue;
-			if( !( iCluster && iPot ) ) {
-				dZ = FLT_MAX;
-				dP = 0; }
-			else {
-				dZ = CStatistics::ZScore( adCluster, adCluster + iCluster, adPot, adPot + iPot );
-				dP = CStatistics::ZTest( dZ, min( iCluster, iPot ) ) * psData->m_pvecsDatasets->size( ); }
-			if( dP < psData->m_dPValue ) {
-				g_CatSleipnir.debug( "CCoalesceClusterImpl::ThreadSelectCondition( %g ) selected condition %d at %g, z=%g",
-					psData->m_dPValue, iCondition, dP, dZ );
-				(*psData->m_pvecfSignificant)[ iDataset ] = true;
-				sDataset.m_dZ = (float)min( dZ, (double)FLT_MAX ); }
-			else
-				g_CatSleipnir.debug( "CCoalesceClusterImpl::ThreadSelectCondition( %g ) rejected condition %d at %g, z=%g",
-					psData->m_dPValue, iCondition, dP, dZ ); }
+			dZ = CStatistics::ZScore( adCluster, adCluster + iCluster, adPot, adPot + iPot );
+			sDataset.m_dP = CStatistics::ZTest( dZ, min( iCluster, iPot ) );
+			sDataset.m_dZ = (float)min( dZ, (double)FLT_MAX ); }
 		else {
 			vecdDatasetCluster.resize( sDataset.GetConditions( ) );
 			fill( vecdDatasetCluster.begin( ), vecdDatasetCluster.end( ), 0.0f );
 				sDataset.m_psDataset->m_MatSigmaChol, min( iCluster, iPot ) );
 			if( dP > 0.5 )
 				dP = 1 - dP;
-			dP *= 2 * psData->m_pvecsDatasets->size( );
-			if( dP < psData->m_dPValue ) {
-				dZ = 0;
-				for( iCondition = 0; iCondition < sDataset.GetConditions( ); ++iCondition )
-					if( sDataset.m_psDataset->m_vecdStdevs[ iCondition ] ) {
-						d = ( vecdDatasetCluster[ iCondition ] - vecdDatasetPot[ iCondition ] ) /
-							sDataset.m_psDataset->m_vecdStdevs[ iCondition ];
-						dZ += d * d; }
-				dZ = sqrt( dZ );
-				g_CatSleipnir.debug( "CCoalesceClusterImpl::ThreadSelectCondition( %g ) selected dataset %d at %g, z=%g",
-					psData->m_dPValue, iDataset, dP, dZ );
-				(*psData->m_pvecfSignificant)[ iDataset ] = true;
-				sDataset.m_dZ = (float)min( dZ, (double)FLT_MAX ); } } }
+
+			sDataset.m_dP = 2 * dP;
+			dZ = 0;
+			for( iCondition = 0; iCondition < sDataset.GetConditions( ); ++iCondition )
+				if( sDataset.m_psDataset->m_vecdStdevs[ iCondition ] ) {
+					d = ( vecdDatasetCluster[ iCondition ] - vecdDatasetPot[ iCondition ] ) /
+						sDataset.m_psDataset->m_vecdStdevs[ iCondition ];
+					dZ += d * d; }
+			dZ = sqrt( dZ );
+			sDataset.m_dZ = (float)min( dZ, (double)FLT_MAX ); } }
 	delete[] adCluster;
 	delete[] adPot;
 
 	return NULL; }
 
 bool CCoalesceCluster::SelectConditions( const CPCL& PCL, const CCoalesceCluster& Pot, size_t iThreads,
-	float dPValue ) {
-	vector<bool>					vecfSignificant;
+	float dPValue, float dZScore ) {
 	vector<pthread_t>				vecpthdThreads;
 	vector<SThreadSelectCondition>	vecsThreads;
 	size_t							i;
 	copy( GetGenes( ).begin( ), GetGenes( ).end( ), veciCluster.begin( ) );
 	veciPot.resize( Pot.GetGenes( ).size( ) );
 	copy( Pot.GetGenes( ).begin( ), Pot.GetGenes( ).end( ), veciPot.begin( ) );
-	vecfSignificant.resize( m_vecsDatasets.size( ) );
 	vecpthdThreads.resize( iThreads );
 	vecsThreads.resize( vecpthdThreads.size( ) );
 	for( i = 0; i < vecsThreads.size( ); ++i ) {
 		vecsThreads[ i ].m_pveciPot = &veciPot;
 		vecsThreads[ i ].m_pvecsDatasets = &m_vecsDatasets;
 		vecsThreads[ i ].m_pPCL = &PCL;
-		vecsThreads[ i ].m_dPValue = dPValue;
-		vecsThreads[ i ].m_pvecfSignificant = &vecfSignificant;
 		if( pthread_create( &vecpthdThreads[ i ], NULL, ThreadSelectCondition, &vecsThreads[ i ] ) ) {
 			g_CatSleipnir.error( "CCoalesceCluster::SelectConditions( %d, %g ) could not select conditions",
-				iThreads, dPValue );
+				iThreads, dZScore );
 			return false; } }
 	for( i = 0; i < vecpthdThreads.size( ); ++i )
 		pthread_join( vecpthdThreads[ i ], NULL );
-	for( i = 0; i < vecfSignificant.size( ); ++i )
-		if( vecfSignificant[ i ] )
-			m_setiDatasets.insert( i );
+
+	dPValue /= m_vecsDatasets.size( );
+	for( i = 0; i < m_vecsDatasets.size( ); ++i ) {
+		const SDataset&	sDataset	= m_vecsDatasets[ i ];
+
+		if( ( sDataset.m_dP < dPValue ) && ( fabs( sDataset.m_dZ ) > dZScore ) ) {
+			g_CatSleipnir.debug( "CCoalesceCluster::SelectConditions( %d, %g ) selected dataset %d at %g, z=%g",
+				iThreads, dZScore, i, sDataset.m_dP, sDataset.m_dZ );
+			m_setiDatasets.insert( i ); }
+		else
+			g_CatSleipnir.debug( "CCoalesceCluster::SelectConditions( %d, %g ) rejected dataset %d at %g, z=%g",
+				iThreads, dZScore, i, sDataset.m_dP, sDataset.m_dZ ); }
 
 	return true; }
 
 	for( iMotif = psData->m_iOffset; iMotif < ( psData->m_pveciMotifs ? psData->m_pveciMotifs->size( ) :
 		psData->m_pMotifs->GetMotifs( ) ); iMotif += psData->m_iStep )
 		if( !AddSignificant( *psData->m_pMotifs,  psData->m_pveciMotifs ? (*psData->m_pveciMotifs)[ iMotif ] :
-			iMotif, *psData->m_pHistsCluster, *psData->m_pHistsPot, psData->m_dPValue,
+			iMotif, *psData->m_pHistsCluster, *psData->m_pHistsPot, psData->m_dPValue, psData->m_dZScore,
 			psData->m_vecsMotifs ) )
 			break;
 
 	return NULL; }
 
 bool CCoalesceCluster::SelectMotifs( const CCoalesceGroupHistograms& HistsCluster,
-	const CCoalesceGroupHistograms& HistsPot, float dPValue, size_t iMaxMotifs, size_t iThreads,
-	const CCoalesceMotifLibrary* pMotifs ) {
+	const CCoalesceGroupHistograms& HistsPot, float dPValue, float dZScore, size_t iMaxMotifs,
+	size_t iThreads, const CCoalesceMotifLibrary* pMotifs ) {
 	uint32_t					i, j;
 	vector<pthread_t>			vecpthdThreads;
 	vector<SThreadSelectMotif>	vecsThreads;
 		vecsThreads[ i ].m_pHistsCluster = &HistsCluster;
 		vecsThreads[ i ].m_pHistsPot = &HistsPot;
 		vecsThreads[ i ].m_dPValue = dPValue;
+		vecsThreads[ i ].m_dZScore = dZScore;
 		vecsThreads[ i ].m_pveciMotifs = veciMotifs.empty( ) ? NULL : &veciMotifs;
 		if( pthread_create( &vecpthdThreads[ i ], NULL, ThreadSelectMotif, &vecsThreads[ i ] ) ) {
 			g_CatSleipnir.error( "CCoalesceCluster::SelectMotifs( %g, %d ) could not select motifs",
-				dPValue, iThreads );
+				dZScore, iThreads );
 			return false; } }
 	for( i = 0; i < vecpthdThreads.size( ); ++i ) {
 		pthread_join( vecpthdThreads[ i ], NULL );
 
 bool CCoalesceClusterImpl::AddSignificant( const CCoalesceMotifLibrary& Motifs, uint32_t iMotif,
 	const CCoalesceGroupHistograms& HistsCluster, const CCoalesceGroupHistograms& HistsPot, float dPValue,
-	vector<SMotifMatch>& vecsMotifs ) {
+	float dZScore, vector<SMotifMatch>& vecsMotifs ) {
 	size_t									iTypeCluster, iTypePot;
 	double									dP, dAveOne, dAverage, dZ;
 	CCoalesceSequencerBase::ESubsequence	eSubsequence;
 				!( HistSetCluster.GetTotal( ) && HistSetPot.GetTotal( ) ) )
 				continue;
 			dP = HistSetCluster.CohensD( iMotif, HistSetPot, dAveOne, dAverage, dZ );
-			if( dP < dPValue ) {
+			if( ( dP < dPValue ) && ( fabs( dZ ) > dZScore ) ) {
 				SMotifMatch	sMotif( iMotif, strTypeCluster, eSubsequence, (float)dZ, (float)( dAveOne -
 					dAverage ) );
 
 				if( g_CatSleipnir.isInfoEnabled( ) ) {
 					ostringstream	ossm;
 
-					ossm << "CCoalesceClusterImpl::AddSignificant( " << iMotif << ", " << dPValue <<
+					ossm << "CCoalesceClusterImpl::AddSignificant( " << iMotif << ", " << dZScore <<
 						" ) adding at " << dP << ":" << endl << sMotif.Save( &Motifs ) << endl <<
 						"Cluster	" << HistSetCluster.Save( iMotif ) << endl <<
 						"Pot	" << HistSetPot.Save( iMotif );
-					g_CatSleipnir.info( ossm.str( ) ); }
+					g_CatSleipnir.info( ossm.str( ).c_str( ) ); }
 				vecsMotifs.push_back( sMotif ); }
 			else if( g_CatSleipnir.isDebugEnabled( ) ) {
 				ostringstream	ossm;
 
-				ossm << "CCoalesceClusterImpl::AddSignificant( " << iMotif << ", " << dPValue <<
+				ossm << "CCoalesceClusterImpl::AddSignificant( " << iMotif << ", " << dZScore <<
 					" ) failed at " << dP << ":" << endl << SMotifMatch( iMotif, strTypeCluster, eSubsequence,
 					(float)dZ, (float)( dAveOne - dAverage ) ).Save( &Motifs ) << endl <<
 					"Cluster	" << HistSetCluster.Save( iMotif ) << endl <<
 					"Pot	" << HistSetPot.Save( iMotif );
-				g_CatSleipnir.debug( ossm.str( ) ); } } }
+				g_CatSleipnir.debug( ossm.str( ).c_str( ) ); } } }
 
 	return true; }
 
 		return -1; }
 	for( i = 1; i < vecstrLine.size( ); ++i ) {
 		if( ( j = PCL.GetGene( vecstrLine[ i ] ) ) == -1 ) {
-			g_CatSleipnir.error( "CCoalesceCluster::Open( ) unrecognized gene: %s",
+			g_CatSleipnir.warn( "CCoalesceCluster::Open( ) unrecognized gene: %s",
 				vecstrLine[ i ].c_str( ) );
-			return -1; }
+			continue; }
 		m_setiGenes.insert( j ); }
 
 	istm.getline( &strBuffer[ 0 ], strBuffer.size( ) );
 	CCoalesceClusterImpl::Snapshot( m_setsMotifs, m_vecsPrevMotifs );
 	CCoalesceClusterImpl::Snapshot( GetGenes( ), m_veciPrevGenes ); }
 
-size_t CCoalesceCluster::RemoveMotifs( const CCoalesceMotifLibrary& Motifs, float dZScore ) {
-	set<SMotifMatch>::const_iterator	iterMotif;
-	vector<const SMotifMatch*>			vecpsMotifs;
-	size_t								i;
-
-	for( iterMotif = m_setsMotifs.begin( ); iterMotif != m_setsMotifs.end( ); ++iterMotif )
-		if( fabs( iterMotif->m_dZ ) < dZScore ) {
-			g_CatSleipnir.info( "CCoalesceCluster::RemoveMotifs( %g ) removing: %s", dZScore,
-				iterMotif->Save( &Motifs ).c_str( ) );
-			vecpsMotifs.push_back( &*iterMotif ); }
-	for( i = 0; i < vecpsMotifs.size( ); ++i )
-		m_setsMotifs.erase( *vecpsMotifs[ i ] );
-
-	return vecpsMotifs.size( ); }
-
 bool CCoalesceCluster::LabelMotifs( const CCoalesceMotifLibrary& Motifs, float dPenaltyGap,
 	float dPenaltyMismatch, float dPValue ) {
 	set<SMotifMatch>::iterator	iterMotif;

File src/coalescecluster.h

View file
  • Ignore whitespace
 		std::set<std::pair<size_t, size_t> >& setpriiSeeds, size_t iPairs, float dPValue, size_t iThreads );
 	void Subtract( CPCL& PCL, const CCoalesceCluster& Pot ) const;
 	void Subtract( CCoalesceGeneScores& GeneScores ) const;
-	bool SelectConditions( const CPCL& PCL, const CCoalesceCluster& Pot, size_t iThreads, float dPValue );
+	bool SelectConditions( const CPCL& PCL, const CCoalesceCluster& Pot, size_t iThreads, float dPValue,
+		float dZScore );
 	bool SelectMotifs( const CCoalesceGroupHistograms& HistsCluster, const CCoalesceGroupHistograms& HistsPot,
-		float dPValue, size_t iMaxMotifs, size_t iThreads, const CCoalesceMotifLibrary* pMotifs = NULL );
+		float dPValue, float dZScore, size_t iMaxMotifs, size_t iThreads,
+		const CCoalesceMotifLibrary* pMotifs = NULL );
 	bool SelectGenes( const CPCL& PCL, const CCoalesceGeneScores& GeneScores,
 		const CCoalesceGroupHistograms& HistsCluster, const CCoalesceGroupHistograms& HistsPot,
 		size_t iMinimum, size_t iThreads, CCoalesceCluster& Pot, float dPValue,
 		float dCutoffPWMs = 0, float dPenaltyGap = 0, float dPenaltyMismatch = 0, bool fNoRCs = false ) const;
 	float GetSimilarity( const CCoalesceCluster& Cluster, size_t iGenes, size_t iDatasets ) const;
 	void Snapshot( const CCoalesceGeneScores& GeneScores, CCoalesceGroupHistograms& Histograms );
-	size_t RemoveMotifs( const CCoalesceMotifLibrary&, float dZScore );
 	bool LabelMotifs( const CCoalesceMotifLibrary&, float dPenaltyGap, float dPenaltyMismatch,
 		float dPValue );
 

File src/coalesceclusteri.h

View file
  • Ignore whitespace
 
 	struct SDataset {
 		const SCoalesceDataset*	m_psDataset;
+		double					m_dP;
 		float					m_dZ;
 		std::vector<float>		m_vecdCentroid;
 
 		const CCoalesceGroupHistograms*	m_pHistsCluster;
 		const CCoalesceGroupHistograms*	m_pHistsPot;
 		float							m_dPValue;
+		float							m_dZScore;
 		const std::vector<uint32_t>*	m_pveciMotifs;
 		std::vector<SMotifMatch>		m_vecsMotifs;
 	};
 		const std::vector<size_t>*	m_pveciPot;
 		std::vector<SDataset>*		m_pvecsDatasets;
 		const CPCL*					m_pPCL;
-		float						m_dPValue;
-		std::vector<bool>*			m_pvecfSignificant;
 	};
 
 	static void* ThreadCentroid( void* );
 	static void* ThreadSeedPair( void* );
 	static void* ThreadSelectCondition( void* );
 	static bool AddSignificant( const CCoalesceMotifLibrary&, uint32_t, const CCoalesceGroupHistograms&,
-		const CCoalesceGroupHistograms&, float, std::vector<SMotifMatch>& );
+		const CCoalesceGroupHistograms&, float, float, std::vector<SMotifMatch>& );
 	static size_t Open( const CHierarchy&, const std::vector<CCoalesceCluster>&,
 		const std::vector<std::string>&, std::map<size_t, size_t>&, std::map<size_t, size_t>&,
 		TVecMapStrSetSMotifs& );

File src/coalescei.h

View file
  • Ignore whitespace
 
 	static void* ThreadCombineMotif( void* );
 
-	CCoalesceImpl( ) : m_iK(7), m_dPValueCorrelation(0.05f), m_iBins(12), m_dPValueCondition(0.05f),
-		m_dProbabilityGene(0.95f), m_dPValueMotif(0.05f), m_pMotifs(NULL), m_fMotifs(false),
+	CCoalesceImpl( ) : m_iK(7), m_dPValueCorrelation(0.05f), m_iBins(12), m_dZScoreCondition(0.5f),
+		m_dProbabilityGene(0.95f), m_dZScoreMotif(0.5f), m_pMotifs(NULL), m_fMotifs(false),
 		m_iBasesPerMatch(5000), m_dPValueMerge(0.05f), m_dCutoffMerge(2.5f), m_iSizeMinimum(5),
-		m_iThreads(1), m_iSizeMerge(100), m_iSizeMaximum(1000) { }
+		m_iThreads(1), m_iSizeMerge(100), m_iSizeMaximum(1000), m_dPValueCondition(0.05f),
+		m_dPValueMotif(0.05f) { }
 	virtual ~CCoalesceImpl( );
 
 	void Clear( );
 	float							m_dPValueMerge;
 	float							m_dProbabilityGene;
 	float							m_dPValueCondition;
+	float							m_dZScoreCondition;
 	float							m_dPValueCorrelation;
 	float							m_dPValueMotif;
+	float							m_dZScoreMotif;
 	float							m_dCutoffMerge;
 	size_t							m_iNumberCorrelation;
 	size_t							m_iBins;

File src/coalescemotifs.cpp

View file
  • Ignore whitespace
-/*****************************************************************************
-* This file is provided under the Creative Commons Attribution 3.0 license.
-*
-* You are free to share, copy, distribute, transmit, or adapt this work
-* PROVIDED THAT you attribute the work to the authors listed below.
-* For more information, please see the following web page:
-* http://creativecommons.org/licenses/by/3.0/
-*
-* This file is a component of the Sleipnir library for functional genomics,
-* authored by:
-* Curtis Huttenhower (chuttenh@princeton.edu)
-* Mark Schroeder
-* Maria D. Chikina
-* Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
-*
-* If you use this library, the included executable tools, or any related
-* code in your work, please cite the following publication:
-* Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
-* Olga G. Troyanskaya.
-* "The Sleipnir library for computational functional genomics"
-*****************************************************************************/
-#include "stdafx.h"
-#include "coalescemotifs.h"
-#include "coalescestructsi.h"
-#include "pst.h"
-#include "measure.h"
-#include "statistics.h"
-
-namespace Sleipnir {
-
-const char	CCoalesceMotifLibraryImpl::c_szBases[]			= "ACGT";
-const char	CCoalesceMotifLibraryImpl::c_szComplements[]	= "TGCA";
-
-void CCoalesceMotifLibraryImpl::SKnowns::Match( const vector<float>& vecdMotif,
-	map<string, float>& mapstrdKnown ) const {
-	static const size_t					c_iOverlap	= 5;
-	map<string, TVecPr>::const_iterator	iterKnown;
-	size_t								i, j, k, iMin, iMax, iTests;
-	float								dP, dR, dMin;
-	const float*						adMotif;
-	const float*						adPWM;
-	const float*						adRC;
-
-	for( iterKnown = m_mapstrvecKnown.begin( ); iterKnown != m_mapstrvecKnown.end( ); ++iterKnown ) {
-		dMin = FLT_MAX;
-		for( iTests = i = 0; i < iterKnown->second.size( ); ++i ) {
-			const vector<float>&	vecdPWM	= iterKnown->second[ i ].first;
-			const vector<float>&	vecdRC	= iterKnown->second[ i ].second;
-
-			iMin = strlen( c_szBases ) * c_iOverlap;
-			iMax = vecdMotif.size( ) + vecdPWM.size( ) - iMin;
-			if( iMax < iMin )
-				continue;
-			for( j = iMin; j <= iMax; j += strlen( c_szBases ) ) {
-				iTests += 2;
-				if( j < vecdMotif.size( ) ) {
-					adMotif = &vecdMotif[ vecdMotif.size( ) - j - 1 ];
-					adPWM = &vecdPWM.front( );
-					adRC = &vecdRC.front( );
-					k = min( vecdPWM.size( ), j ); }
-				else {
-					adMotif = &vecdMotif.front( );
-					k = j - vecdMotif.size( );
-					adPWM = &vecdPWM[ k ];
-					adRC = &vecdRC[ k ];
-					k = min( vecdMotif.size( ), vecdPWM.size( ) - k ); }
-				if( ( dR = (float)max( CMeasurePearson::Pearson( adMotif, k, adPWM, k,
-					IMeasure::EMapNone ), CMeasurePearson::Pearson( adMotif, k, adRC, k,
-					IMeasure::EMapNone ) ) ) < 0 )
-					continue;
-				if( ( dP = (float)CStatistics::PValuePearson( dR, k ) ) < dMin )
-					dMin = dP; } }
-		if( dMin != FLT_MAX )
-			mapstrdKnown[ iterKnown->first ] = dMin * iTests; } }
-
-bool CCoalesceMotifLibrary::Open( istream& istm, vector<SMotifMatch>& vecsMotifs,
-	CCoalesceMotifLibrary* pMotifs ) {
-	string	strBuffer;
-	size_t	i;
-
-	strBuffer.resize( CFile::GetBufferSize( ) );
-	while( CFile::IsNewline( istm.peek( ) ) )
-		istm.getline( &strBuffer[ 0 ], strBuffer.size( ) );
-	while( !istm.eof( ) && ( istm.peek( ) != c_cCluster ) ) {
-		SMotifMatch	sMotif;
-
-		if( pMotifs ) {
-			if( !sMotif.Open( istm, *pMotifs ) )
-				break;
-			vecsMotifs.push_back( sMotif );
-			if( isdigit( istm.peek( ) ) )
-				for( i = 0; i < 4; ++i )
-					istm.getline( &strBuffer[ 0 ], strBuffer.size( ) ); }
-		else
-			istm.getline( &strBuffer[ 0 ], strBuffer.size( ) );
-		while( CFile::IsNewline( istm.peek( ) ) )
-			istm.getline( &strBuffer[ 0 ], strBuffer.size( ) ); }
-
-	return true; }
-
-CCoalesceMotifLibraryImpl::~CCoalesceMotifLibraryImpl( ) {
-	size_t	i;
-
-	for( i = 0; i < m_vecpPSTs.size( ); ++i )
-		delete m_vecpPSTs[ i ]; }
-
-/*!
- * \brief
- * Calculates the length-normalized match strength of the given motif against the appropriate number of
- * characters in the input sequence at the requested offset.
- * 
- * \param strSequence
- * Sequence against which motif is matched.
- * 
- * \param iMotif
- * ID of motif to be matched.
- * 
- * \param iOffset
- * Zero-based offset within strSequence at which the match is performed.
- * 
- * \param sModifiers
- * A modifier cache containing any prior weights to be incorporated into the match.
- * 
- * \returns
- * Length-normalized strength of motif match against the given sequence and offset.
- * 
- * \remarks
- * iMotif must represent a valid motif for the current library, and iOffset must fall within strSequence,
- * although motifs extending from a valid iOffset past the end of the sequence will be handled appropriately.
- * Only PST motifs are currently supported, as there should never be any need to match non-PST motifs at
- * runtime, but support for kmers and RCs could be added in a straightforward manner.
- * 
- * \see
- * GetMatches
- */
-float CCoalesceMotifLibrary::GetMatch( const std::string& strSequence, uint32_t iMotif, size_t iOffset,
-	SCoalesceModifierCache& sModifiers ) const {
-	size_t		i, iDepth;
-	float		d, dRet;
-	const CPST*	pPST;
-	string		strMotif, strRC;
-	EType		eType;
-
-	switch( eType = GetType( iMotif ) ) {
-		case ETypePST:
-			if( !( pPST = GetPST( iMotif ) ) ) {
-				g_CatSleipnir.error( "CCoalesceMotifLibrary::GetMatch( %s, %d, %d ) could not find PST",
-					strSequence.c_str( ), iMotif, iOffset );
-				return CMeta::GetNaN( ); }
-			break;
-
-		case ETypeKMer:
-			strMotif = GetMotif( iMotif );
-			break;
-
-		case ETypeRC:
-			strMotif = GetRCOne( iMotif );
-			strRC = GetReverseComplement( strMotif );
-			break; }
-
-	iDepth = ( eType == ETypePST ) ? pPST->GetDepth( ) : GetK( );
-	sModifiers.InitializeWeight( 0, 0 );
-	dRet = 0;
-	for( i = 1; i < iDepth; ++i ) {
-		sModifiers.AddWeight( 0, iOffset, i - 1 );
-		if( eType == ETypePST )
-			dRet += pPST->GetMatch( strSequence, iDepth - i ) * sModifiers.GetWeight( i ) / i; }
-	sModifiers.AddWeight( 0, iOffset, i - 1 );
-	for( i = 0; i < strSequence.length( ); ++i ) {
-		switch( eType ) {
-			case ETypePST:
-				d = pPST->GetMatch( strSequence.substr( i ) ) * sModifiers.GetWeight( iDepth ) / iDepth;
-				break;
-
-			case ETypeKMer:
-			case ETypeRC:
-				d = ( strSequence.compare( i, iDepth, strMotif ) &&
-					( strRC.empty( ) || strSequence.compare( i, iDepth, strRC ) ) ) ? 0 :
-					sModifiers.GetWeight( iDepth );
-				break; }
-		dRet += d;
-		sModifiers.AddWeight( iDepth, iOffset, i ); }
-	if( CMeta::IsNaN( dRet ) || ( dRet < 0 ) ) {
-		g_CatSleipnir.error( "CCoalesceMotifLibrary::GetMatch( %s, %d, %d ) found negative score: %g",
-			strSequence.c_str( ), iMotif, iOffset, dRet );
-		return CMeta::GetNaN( ); }
-
-	return dRet; }
-
-std::string CCoalesceMotifLibraryImpl::GetMotif( uint32_t iMotif ) const {
-	std::string	strKMer;
-
-// kmer
-	if( iMotif < GetKMers( ) )
-		return ID2KMer( iMotif, m_iK );
-// reverse complement
-	if( iMotif < GetBasePSTs( ) ) {
-		strKMer = GetRCOne( iMotif );
-		return ( strKMer + c_cSeparator + GetReverseComplement( strKMer ) ); }
-// pst
-	return GetPST( iMotif )->GetMotif( ); }
-
-CPST* CCoalesceMotifLibraryImpl::CreatePST( uint32_t& iMotif ) {
-	CPST*	pRet;
-
-	iMotif = GetBasePSTs( ) + GetPSTs( );
-	m_vecpPSTs.push_back( pRet = new CPST( strlen( c_szBases ) ) );
-	return pRet; }
-
-uint32_t CCoalesceMotifLibrary::Open( const std::string& strMotif ) {
-	uint32_t	iMotif;
-	CPST*		pPST;
-
-	if( strMotif.length( ) == GetK( ) && !IsIgnorableKMer( strMotif ) )
-		return KMer2ID( strMotif );
-	if( ( strMotif.length( ) == ( ( 2 * GetK( ) ) + 1 ) ) &&
-		!IsIgnorableKMer( strMotif.substr( 0, GetK( ) ) ) &&
-		( strMotif.substr( 0, GetK( ) ) == GetReverseComplement( strMotif.substr( GetK( ) + 1 ) ) ) &&
-		( ( iMotif = KMer2ID( strMotif.substr( 0, GetK( ) ) ) ) != -1 ) )
-		return ( GetBaseRCs( ) + m_veciKMer2RC[ iMotif ] );
-
-	pPST = CreatePST( iMotif );
-	if( !pPST->Open( strMotif ) ) {
-		delete pPST;
-		m_vecpPSTs.pop_back( );
-		return -1; }
-
-	return iMotif; }
-
-float CCoalesceMotifLibraryImpl::AlignKMers( const std::string& strOne, const std::string& strTwo,
-	float dCutoff ) const {
-	int	iOffset;
-
-	return Align( strOne, strTwo, dCutoff, iOffset ); }
-
-uint32_t CCoalesceMotifLibraryImpl::MergeKMers( const std::string& strOne, const std::string& strTwo,
-	float dCutoff, bool fAllowDuplicates ) {
-	int			iOffset;
-	float		dScore;
-	uint32_t	iRet;
-	CPST*		pPST;
-
-	if( ( ( dScore = Align( strOne, strTwo, dCutoff, iOffset ) ) > dCutoff ) ||
-		!( fAllowDuplicates || dScore ) )
-		return -1;
-
-	pPST = CreatePST( iRet );
-	pPST->Add( strOne, strTwo, iOffset );
-	if( g_CatSleipnir.isInfoEnabled( ) )
-		g_CatSleipnir.info( "CCoalesceMotifLibraryImpl::MergeKMers( %s, %s, %g ) merged at %g to %s",
-			strOne.c_str( ), strTwo.c_str( ), dCutoff, dScore, pPST->GetMotif( ).c_str( ) );
-	return iRet; }
-
-float CCoalesceMotifLibraryImpl::AlignKMerRC( const std::string& strKMer, uint32_t iRC, float dCutoff ) const {
-	string	strOne, strTwo;
-	float	dOne, dTwo;
-	int		iOne, iTwo;
-
-	strOne = GetRCOne( iRC );
-	strTwo = GetReverseComplement( strOne );
-	dOne = Align( strKMer, strOne, dCutoff, iOne );
-	dTwo = Align( strKMer, strTwo, dCutoff, iTwo );
-
-	return min( dOne, dTwo ); }
-
-uint32_t CCoalesceMotifLibraryImpl::MergeKMerRC( uint32_t iKMer, uint32_t iRC, float dCutoff,
-	bool fAllowDuplicates ) {
-	string		strKMer, strOne, strTwo;
-	float		dOne, dTwo, dMin;
-	int			iOne, iTwo;
-	uint32_t	iRet;
-	CPST*		pPST;
-
-	if( m_veciKMer2RC[ iKMer ] == iRC )
-		return -1;
-	strKMer = GetMotif( iKMer );
-	strOne = GetRCOne( iRC );
-	strTwo = GetReverseComplement( strOne );
-	dOne = Align( strKMer, strOne, dCutoff, iOne );
-	dTwo = Align( strKMer, strTwo, dCutoff, iTwo );
-	dMin = min( dOne, dTwo );
-	if( ( dMin > dCutoff ) || !( fAllowDuplicates || dMin ) )
-		return -1;
-
-	pPST = CreatePST( iRet );
-	if( dOne < dTwo ) {
-		pPST->Add( strKMer, strOne, iOne );
-		pPST->Add( strTwo ); }
-	else {
-		pPST->Add( strKMer, strTwo, iTwo );
-		pPST->Add( strOne ); }
-	if( g_CatSleipnir.isInfoEnabled( ) )
-		g_CatSleipnir.info( "CCoalesceMotifLibraryImpl::MergeKMerRC( %s, %s, %g ) merged at %g to %s",
-			strKMer.c_str( ), GetMotif( iRC ).c_str( ), dCutoff, min( dOne, dTwo ),
-			pPST->GetMotif( ).c_str( ) );
-	return iRet; }
-
-struct SCrossRCs {
-	string	m_strOne;
-	string	m_strTwo;
-	float	m_dScore;
-	int		m_iOffset;
-};
-
-float CCoalesceMotifLibraryImpl::AlignRCs( uint32_t iOne, uint32_t iTwo, float dCutoff ) const {
-	SCrossRCs	asCrosses[ 4 ];
-	size_t		i;
-	float		dMin;
-
-	asCrosses[ 0 ].m_strOne = asCrosses[ 1 ].m_strOne = GetRCOne( iOne );
-	asCrosses[ 0 ].m_strTwo = asCrosses[ 2 ].m_strTwo = GetRCOne( iTwo );
-	asCrosses[ 1 ].m_strTwo = asCrosses[ 3 ].m_strTwo = GetReverseComplement( asCrosses[ 0 ].m_strTwo );
-	asCrosses[ 2 ].m_strOne = asCrosses[ 3 ].m_strOne = GetReverseComplement( asCrosses[ 0 ].m_strOne );
-	dMin = FLT_MAX;
-	for( i = 0; i < ARRAYSIZE(asCrosses); ++i ) {
-		asCrosses[ i ].m_dScore = Align( asCrosses[ i ].m_strOne, asCrosses[ i ].m_strTwo, dCutoff,
-			asCrosses[ i ].m_iOffset );
-		if( asCrosses[ i ].m_dScore < dMin )
-			dMin = asCrosses[ i ].m_dScore; }
-
-	return dMin; }
-
-uint32_t CCoalesceMotifLibraryImpl::MergeRCs( uint32_t iOne, uint32_t iTwo, float dCutoff,
-	bool fAllowDuplicates ) {
-	SCrossRCs	asCrosses[ 4 ];
-	uint32_t	iRet;
-	CPST*		pPST;
-	size_t		i, iMin;
-	float		dMin;
-
-	asCrosses[ 0 ].m_strOne = asCrosses[ 1 ].m_strOne = GetRCOne( iOne );
-	asCrosses[ 0 ].m_strTwo = asCrosses[ 2 ].m_strTwo = GetRCOne( iTwo );
-	asCrosses[ 1 ].m_strTwo = asCrosses[ 3 ].m_strTwo = GetReverseComplement( asCrosses[ 0 ].m_strTwo );
-	asCrosses[ 2 ].m_strOne = asCrosses[ 3 ].m_strOne = GetReverseComplement( asCrosses[ 0 ].m_strOne );
-	dMin = FLT_MAX;
-	for( iMin = i = 0; i < ARRAYSIZE(asCrosses); ++i ) {
-		asCrosses[ i ].m_dScore = Align( asCrosses[ i ].m_strOne, asCrosses[ i ].m_strTwo, dCutoff,
-			asCrosses[ i ].m_iOffset );
-		if( asCrosses[ i ].m_dScore < dMin ) {
-			dMin = asCrosses[ i ].m_dScore;
-			iMin = i; } }
-	if( ( dMin > dCutoff ) || !( fAllowDuplicates || dMin ) )
-		return -1;
-
-	pPST = CreatePST( iRet );
-	pPST->Add( asCrosses[ iMin ].m_strOne, asCrosses[ iMin ].m_strTwo, asCrosses[ iMin ].m_iOffset );
-	{
-		CPST	PST( strlen( c_szBases ) );
-
-		PST.Add( asCrosses[ ( iMin + 1 ) % ARRAYSIZE(asCrosses) ].m_strTwo,
-			asCrosses[ ( iMin + 2 ) % ARRAYSIZE(asCrosses) ].m_strOne, asCrosses[ iMin ].m_iOffset );
-		pPST->Add( PST );
-	}
-	if( g_CatSleipnir.isInfoEnabled( ) )
-		g_CatSleipnir.info( "CCoalesceMotifLibraryImpl::MergeRCs( %s, %s, %g ) merged at %g to %s",
-			GetMotif( iOne ).c_str( ), GetMotif( iTwo ).c_str( ), dCutoff, dMin, pPST->GetMotif( ).c_str( ) );
-	return iRet; }
-
-float CCoalesceMotifLibraryImpl::AlignKMerPST( const std::string& strKMer, const CPST& PSTIn,
-	float dCutoff ) const {
-	int	iOffset;
-
-	return PSTIn.Align( strKMer, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iOffset ); }
-
-uint32_t CCoalesceMotifLibraryImpl::MergeKMerPST( const std::string& strKMer, const CPST& PSTIn,
-	float dCutoff, bool fAllowDuplicates ) {
-	int			iOffset;
-	float		dScore;
-	uint32_t	iRet;
-	CPST*		pPSTOut;
-
-	if( ( ( dScore = PSTIn.Align( strKMer, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iOffset ) ) >
-		dCutoff ) || !( fAllowDuplicates || dScore ) )
-		return -1;
-
-	pPSTOut = CreatePST( iRet );
-	pPSTOut->Add( strKMer, PSTIn, iOffset );
-	if( g_CatSleipnir.isInfoEnabled( ) ) {
-		ostringstream	ossm;
-
-		ossm << "CCoalesceMotifLibraryImpl::MergeKMerPST( " << strKMer << ", " << PSTIn.GetMotif( ) <<
-			", " << dCutoff << " ) merged at " << dScore << " to " << pPSTOut->GetMotif( );
-		g_CatSleipnir.info( ossm.str( ) ); }
-	return iRet; }
-
-float CCoalesceMotifLibraryImpl::AlignRCPST( uint32_t iRC, const CPST& PSTIn, float dCutoff ) const {
-	int		iOne, iTwo;
-	string	strOne, strTwo;
-	float	dOne, dTwo;
-
-	strOne = GetRCOne( iRC );
-	strTwo = GetReverseComplement( strOne );
-	dOne = PSTIn.Align( strOne, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iOne );
-	dTwo = PSTIn.Align( strTwo, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iTwo );
-
-	return min( dOne, dTwo ); }
-
-uint32_t CCoalesceMotifLibraryImpl::MergeRCPST( uint32_t iRC, const CPST& PSTIn, float dCutoff,
-	bool fAllowDuplicates ) {
-	int			iOne, iTwo;
-	uint32_t	iRet;
-	CPST*		pPSTOut;
-	string		strOne, strTwo;
-	float		dOne, dTwo, dMin;
-
-	strOne = GetRCOne( iRC );
-	strTwo = GetReverseComplement( strOne );
-	dOne = PSTIn.Align( strOne, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iOne );
-	dTwo = PSTIn.Align( strTwo, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iTwo );
-	dMin = min( dOne, dTwo );
-	if( ( dMin > dCutoff ) || !( fAllowDuplicates || dMin ) )
-		return -1;
-
-	pPSTOut = CreatePST( iRet );
-	if( dOne < dTwo ) {
-		pPSTOut->Add( strOne, PSTIn, iOne );
-		pPSTOut->Add( strTwo ); }
-	else {
-		pPSTOut->Add( strTwo, PSTIn, iTwo );
-		pPSTOut->Add( strOne ); }
-	if( g_CatSleipnir.isInfoEnabled( ) ) {
-		ostringstream	ossm;
-
-		ossm << "CCoalesceMotifLibraryImpl::MergeRCPST( " << GetMotif( iRC ) << ", " << PSTIn.GetMotif( ) <<
-			", " << dCutoff << " ) merged at " << min( dOne, dTwo ) << " to " << pPSTOut->GetMotif( );
-		g_CatSleipnir.info( ossm.str( ) ); }
-	return iRet; }
-
-float CCoalesceMotifLibraryImpl::AlignPSTs( const CPST& PSTOne, const CPST& PSTTwo, float dCutoff ) const {
-	int	iOffset;
-
-	return PSTOne.Align( PSTTwo, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iOffset ); }
-
-uint32_t CCoalesceMotifLibraryImpl::MergePSTs( const CPST& PSTOne, const CPST& PSTTwo, float dCutoff,
-	bool fAllowDuplicates ) {
-	int			iOffset;
-	uint32_t	iRet;
-	CPST*		pPSTOut;
-	float		dScore;
-
-	if( ( ( dScore = PSTOne.Align( PSTTwo, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iOffset ) ) >
-		dCutoff ) || !( fAllowDuplicates || dScore ) )
-		return -1;
-
-	pPSTOut = CreatePST( iRet );
-	if( iOffset < 0 ) {
-		pPSTOut->Add( PSTOne );
-		pPSTOut->Add( PSTTwo, -iOffset ); }
-	else {
-		pPSTOut->Add( PSTTwo );
-		pPSTOut->Add( PSTOne, iOffset ); }
-	if( g_CatSleipnir.isInfoEnabled( ) ) {
-		ostringstream	ossm;
-
-		ossm << "CCoalesceMotifLibraryImpl::MergePSTs( " << PSTOne.GetMotif( ) << ", " <<
-			PSTTwo.GetMotif( ) << ", " << dCutoff << " ) merged at " << dScore << " to " <<
-			pPSTOut->GetMotif( );
-		g_CatSleipnir.info( ossm.str( ) ); }
-
-	return iRet; }
-
-uint32_t CCoalesceMotifLibraryImpl::RemoveRCs( const CPST& PST, float dPenaltyGap, float dPenaltyMismatch ) {
-	CPST*		pPST;
-	uint32_t	iRet;
-
-	if( !( pPST = CreatePST( iRet ) ) )
-		return -1;
-	PST.RemoveRCs( dPenaltyGap, dPenaltyMismatch, *pPST );
-	return iRet; }
-
-string CCoalesceMotifLibrary::GetPWM( uint32_t iMotif, float dCutoffPWMs, float dPenaltyGap,
-	float dPenaltyMismatch, bool fNoRCs ) const {
-	CFullMatrix<uint16_t>	MatPWM;
-	size_t					i, j;
-	ostringstream			ossm;
-
-	if( !CCoalesceMotifLibraryImpl::GetPWM( iMotif, dCutoffPWMs, dPenaltyGap, dPenaltyMismatch, fNoRCs,
-		MatPWM ) )
-		return "";
-	for( i = 0; i < MatPWM.GetRows( ); ++i ) {
-		for( j = 0; j < MatPWM.GetColumns( ); ++j )
-			ossm << ( j ? "\t" : "" ) << MatPWM.Get( i, j );
-		ossm << endl; }
-
-	return ossm.str( ); }
-
-bool CCoalesceMotifLibraryImpl::GetPWM( uint32_t iMotif, float dCutoffPWMs, float dPenaltyGap,
-	float dPenaltyMismatch, bool fNoRCs, CFullMatrix<uint16_t>& MatPWM ) const {
-	string	strMotif;
-	float	d;
-
-	if( fNoRCs )
-		iMotif = ((CCoalesceMotifLibrary*)this)->RemoveRCs( iMotif, dPenaltyGap, dPenaltyMismatch );
-	switch( GetType( iMotif ) ) {
-		case ETypeKMer:
-			if( !CCoalesceMotifLibraryImpl::GetPWM( GetMotif( iMotif ), MatPWM ) )
-				return false;
-			break;
-
-		case ETypeRC:
-			if( !( CCoalesceMotifLibraryImpl::GetPWM( strMotif = GetRCOne( iMotif ), MatPWM ) &&
-				CCoalesceMotifLibraryImpl::GetPWM( GetReverseComplement( strMotif ), MatPWM ) ) )
-				return false;
-			break;
-
-		case ETypePST:
-			GetPST( iMotif )->GetPWM( MatPWM, c_szBases );
-			break; }
-
-	if( dCutoffPWMs ) {
-		d = GetInformation( MatPWM );
-		if( d < dCutoffPWMs ) {
-			if( g_CatSleipnir.isInfoEnabled( ) ) {
-				ostringstream	ossm;
-
-				ossm << "CCoalesceMotifLibraryImpl::GetPWM( " << iMotif << ", " << dCutoffPWMs << ", " <<
-					fNoRCs << " ) rejected (" << d << "):" << endl;
-				MatPWM.Save( ossm, false );
-				g_CatSleipnir.info( ossm.str( ) ); }
-			return false; }
-		if( g_CatSleipnir.isDebugEnabled( ) ) {
-			ostringstream	ossm;
-
-			ossm << "CCoalesceMotifLibraryImpl::GetPWM( " << iMotif << ", " << dCutoffPWMs << ", " <<
-				fNoRCs << " ) got information (" << d << "):" << endl;
-			MatPWM.Save( ossm, false );
-			g_CatSleipnir.debug( ossm.str( ) ); } }
-
-	return true; }
-
-bool CCoalesceMotifLibrary::Simplify( uint32_t iMotif ) const {
-
-	return ( ( GetType( iMotif ) == ETypePST ) ? CCoalesceMotifLibraryImpl::GetPST( iMotif )->Simplify( ) :
-		false ); }
-
-bool CCoalesceMotifLibrary::OpenKnown( istream& istm ) {
-	char*			szBuffer;
-	vector<string>	vecstrLine;
-
-	szBuffer = new char[ CFile::GetBufferSize( ) ];
-	while( !istm.eof( ) ) {
-		istm.getline( szBuffer, CFile::GetBufferSize( ) );
-		if( !szBuffer[ 0 ] )
-			continue;
-		vecstrLine.clear( );
-		CMeta::Tokenize( szBuffer, vecstrLine );
-		if( vecstrLine.empty( ) )
-			continue;
-		if( ( vecstrLine.size( ) - 1 ) % 4 ) {
-			g_CatSleipnir.warn( "CCoalesceMotifLibrary::OpenKnown( ) invalid line: %s" );
-			continue; }
-
-		m_sKnowns.Add( vecstrLine[ 0 ], vecstrLine ); }
-	delete[] szBuffer;
-
-	return true; }
-
-bool CCoalesceMotifLibrary::GetKnown( uint32_t iMotif, float dPenaltyGap, float dPenaltyMismatch,
-	vector<pair<string, float> >& vecprstrdKnown, float dPValue ) const {
-	size_t							i, j, k, iSum;
-	vector<float>					vecdMotif;
-	CFullMatrix<uint16_t>			MatPWM;
-	map<string, float>				mapstrdKnown;
-	map<string, float>::iterator	iterKnown;
-
-	if( !m_sKnowns.GetSize( ) )
-		return true;
-	dPValue /= m_sKnowns.GetSize( );
-	if( !CCoalesceMotifLibraryImpl::GetPWM( iMotif, 0, dPenaltyGap, dPenaltyMismatch, true, MatPWM ) )
-		return false;
-	vecdMotif.resize( MatPWM.GetRows( ) * MatPWM.GetColumns( ) );
-	for( k = i = 0; i < MatPWM.GetColumns( ); ++i ) {
-		for( iSum = j = 0; j < MatPWM.GetRows( ); ++j )
-			iSum += MatPWM.Get( j, i );
-		if( !iSum )
-			iSum = 1;
-		for( j = 0; j < MatPWM.GetRows( ); ++j )
-			vecdMotif[ k++ ] = (float)MatPWM.Get( j, i ) / iSum; }
-
-	m_sKnowns.Match( vecdMotif, mapstrdKnown );
-	for( iterKnown = mapstrdKnown.begin( ); iterKnown != mapstrdKnown.end( ); ++iterKnown )
-		if( iterKnown->second < dPValue )
-			vecprstrdKnown.push_back( *iterKnown );
-
-	return true; }
-}
+/*****************************************************************************
+* This file is provided under the Creative Commons Attribution 3.0 license.
+*
+* You are free to share, copy, distribute, transmit, or adapt this work
+* PROVIDED THAT you attribute the work to the authors listed below.
+* For more information, please see the following web page:
+* http://creativecommons.org/licenses/by/3.0/
+*
+* This file is a component of the Sleipnir library for functional genomics,
+* authored by:
+* Curtis Huttenhower (chuttenh@princeton.edu)
+* Mark Schroeder
+* Maria D. Chikina
+* Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
+*
+* If you use this library, the included executable tools, or any related
+* code in your work, please cite the following publication:
+* Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
+* Olga G. Troyanskaya.
+* "The Sleipnir library for computational functional genomics"
+*****************************************************************************/
+#include "stdafx.h"
+#include "coalescemotifs.h"
+#include "coalescestructsi.h"
+#include "pst.h"
+#include "measure.h"
+#include "statistics.h"
+
+namespace Sleipnir {
+
+const char	CCoalesceMotifLibraryImpl::c_szBases[]			= "ACGT";
+const char	CCoalesceMotifLibraryImpl::c_szComplements[]	= "TGCA";
+
+void CCoalesceMotifLibraryImpl::SKnowns::Match( const vector<float>& vecdMotif,
+	map<string, float>& mapstrdKnown ) const {
+	static const size_t					c_iOverlap	= 5;
+	map<string, TVecPr>::const_iterator	iterKnown;
+	size_t								i, j, k, iMin, iMax, iTests;
+	float								dP, dR, dMin;
+	const float*						adMotif;
+	const float*						adPWM;
+	const float*						adRC;
+
+	for( iterKnown = m_mapstrvecKnown.begin( ); iterKnown != m_mapstrvecKnown.end( ); ++iterKnown ) {
+		dMin = FLT_MAX;
+		for( iTests = i = 0; i < iterKnown->second.size( ); ++i ) {
+			const vector<float>&	vecdPWM	= iterKnown->second[ i ].first;
+			const vector<float>&	vecdRC	= iterKnown->second[ i ].second;
+
+			iMin = strlen( c_szBases ) * c_iOverlap;
+			iMax = vecdMotif.size( ) + vecdPWM.size( ) - iMin;
+			if( iMax < iMin )
+				continue;
+			for( j = iMin; j <= iMax; j += strlen( c_szBases ) ) {
+				iTests += 2;
+				if( j < vecdMotif.size( ) ) {
+					adMotif = &vecdMotif[ vecdMotif.size( ) - j - 1 ];
+					adPWM = &vecdPWM.front( );
+					adRC = &vecdRC.front( );
+					k = min( vecdPWM.size( ), j ); }
+				else {
+					adMotif = &vecdMotif.front( );
+					k = j - vecdMotif.size( );
+					adPWM = &vecdPWM[ k ];
+					adRC = &vecdRC[ k ];
+					k = min( vecdMotif.size( ), vecdPWM.size( ) - k ); }
+				if( ( dR = (float)max( CMeasurePearson::Pearson( adMotif, k, adPWM, k,
+					IMeasure::EMapNone ), CMeasurePearson::Pearson( adMotif, k, adRC, k,
+					IMeasure::EMapNone ) ) ) < 0 )
+					continue;
+				if( ( dP = (float)CStatistics::PValuePearson( dR, k ) ) < dMin )
+					dMin = dP; } }
+		if( dMin != FLT_MAX )
+			mapstrdKnown[ iterKnown->first ] = dMin * iTests; } }
+
+bool CCoalesceMotifLibrary::Open( istream& istm, vector<SMotifMatch>& vecsMotifs,
+	CCoalesceMotifLibrary* pMotifs ) {
+	string	strBuffer;
+	size_t	i;
+
+	strBuffer.resize( CFile::GetBufferSize( ) );
+	while( CFile::IsNewline( istm.peek( ) ) )
+		istm.getline( &strBuffer[ 0 ], strBuffer.size( ) );
+	while( !istm.eof( ) && ( istm.peek( ) != c_cCluster ) ) {
+		SMotifMatch	sMotif;
+
+		if( pMotifs ) {
+			if( !sMotif.Open( istm, *pMotifs ) )
+				break;
+			vecsMotifs.push_back( sMotif );
+			if( isdigit( istm.peek( ) ) )
+				for( i = 0; i < 4; ++i )
+					istm.getline( &strBuffer[ 0 ], strBuffer.size( ) ); }
+		else
+			istm.getline( &strBuffer[ 0 ], strBuffer.size( ) );
+		while( CFile::IsNewline( istm.peek( ) ) )
+			istm.getline( &strBuffer[ 0 ], strBuffer.size( ) ); }
+
+	return true; }
+
+CCoalesceMotifLibraryImpl::~CCoalesceMotifLibraryImpl( ) {
+	size_t	i;
+
+	for( i = 0; i < m_vecpPSTs.size( ); ++i )
+		delete m_vecpPSTs[ i ]; }
+
+/*!
+ * \brief
+ * Calculates the length-normalized match strength of the given motif against the appropriate number of
+ * characters in the input sequence at the requested offset.
+ * 
+ * \param strSequence
+ * Sequence against which motif is matched.
+ * 
+ * \param iMotif
+ * ID of motif to be matched.
+ * 
+ * \param iOffset
+ * Zero-based offset within strSequence at which the match is performed.
+ * 
+ * \param sModifiers
+ * A modifier cache containing any prior weights to be incorporated into the match.
+ * 
+ * \returns
+ * Length-normalized strength of motif match against the given sequence and offset.
+ * 
+ * \remarks
+ * iMotif must represent a valid motif for the current library, and iOffset must fall within strSequence,
+ * although motifs extending from a valid iOffset past the end of the sequence will be handled appropriately.
+ * Only PST motifs are currently supported, as there should never be any need to match non-PST motifs at
+ * runtime, but support for kmers and RCs could be added in a straightforward manner.
+ * 
+ * \see
+ * GetMatches
+ */
+float CCoalesceMotifLibrary::GetMatch( const std::string& strSequence, uint32_t iMotif, size_t iOffset,
+	SCoalesceModifierCache& sModifiers ) const {
+	size_t		i, iDepth;
+	float		d, dRet;
+	const CPST*	pPST;
+	string		strMotif, strRC;
+	EType		eType;
+
+	switch( eType = GetType( iMotif ) ) {
+		case ETypePST:
+			if( !( pPST = GetPST( iMotif ) ) ) {
+				g_CatSleipnir.error( "CCoalesceMotifLibrary::GetMatch( %s, %d, %d ) could not find PST",
+					strSequence.c_str( ), iMotif, iOffset );
+				return CMeta::GetNaN( ); }
+			break;
+
+		case ETypeKMer:
+			strMotif = GetMotif( iMotif );
+			break;
+
+		case ETypeRC:
+			strMotif = GetRCOne( iMotif );
+			strRC = GetReverseComplement( strMotif );
+			break; }
+
+	iDepth = ( eType == ETypePST ) ? pPST->GetDepth( ) : GetK( );
+	sModifiers.InitializeWeight( 0, 0 );
+	dRet = 0;
+	for( i = 1; i < iDepth; ++i ) {
+		sModifiers.AddWeight( 0, iOffset, i - 1 );
+		if( eType == ETypePST )
+			dRet += pPST->GetMatch( strSequence, iDepth - i ) * sModifiers.GetWeight( i ) / i; }
+	sModifiers.AddWeight( 0, iOffset, i - 1 );
+	for( i = 0; i < strSequence.length( ); ++i ) {
+		switch( eType ) {
+			case ETypePST:
+				d = pPST->GetMatch( strSequence.substr( i ) ) * sModifiers.GetWeight( iDepth ) / iDepth;
+				break;
+
+			case ETypeKMer:
+			case ETypeRC:
+				d = ( strSequence.compare( i, iDepth, strMotif ) &&
+					( strRC.empty( ) || strSequence.compare( i, iDepth, strRC ) ) ) ? 0 :
+					sModifiers.GetWeight( iDepth );
+				break; }
+		dRet += d;
+		sModifiers.AddWeight( iDepth, iOffset, i ); }
+	if( CMeta::IsNaN( dRet ) || ( dRet < 0 ) ) {
+		g_CatSleipnir.error( "CCoalesceMotifLibrary::GetMatch( %s, %d, %d ) found negative score: %g",
+			strSequence.c_str( ), iMotif, iOffset, dRet );
+		return CMeta::GetNaN( ); }
+
+	return dRet; }
+
+std::string CCoalesceMotifLibraryImpl::GetMotif( uint32_t iMotif ) const {
+	std::string	strKMer;
+
+// kmer
+	if( iMotif < GetKMers( ) )
+		return ID2KMer( iMotif, m_iK );
+// reverse complement
+	if( iMotif < GetBasePSTs( ) ) {
+		strKMer = GetRCOne( iMotif );
+		return ( strKMer + c_cSeparator + GetReverseComplement( strKMer ) ); }
+// pst
+	return GetPST( iMotif )->GetMotif( ); }
+
+CPST* CCoalesceMotifLibraryImpl::CreatePST( uint32_t& iMotif ) {
+	CPST*	pRet;
+
+	iMotif = GetBasePSTs( ) + GetPSTs( );
+	m_vecpPSTs.push_back( pRet = new CPST( strlen( c_szBases ) ) );
+	return pRet; }
+
+uint32_t CCoalesceMotifLibrary::Open( const std::string& strMotif ) {
+	uint32_t	iMotif;
+	CPST*		pPST;
+
+	if( strMotif.length( ) == GetK( ) && !IsIgnorableKMer( strMotif ) )
+		return KMer2ID( strMotif );
+	if( ( strMotif.length( ) == ( ( 2 * GetK( ) ) + 1 ) ) &&
+		!IsIgnorableKMer( strMotif.substr( 0, GetK( ) ) ) &&
+		( strMotif.substr( 0, GetK( ) ) == GetReverseComplement( strMotif.substr( GetK( ) + 1 ) ) ) &&
+		( ( iMotif = KMer2ID( strMotif.substr( 0, GetK( ) ) ) ) != -1 ) )
+		return ( GetBaseRCs( ) + m_veciKMer2RC[ iMotif ] );
+
+	pPST = CreatePST( iMotif );
+	if( !pPST->Open( strMotif ) ) {
+		delete pPST;
+		m_vecpPSTs.pop_back( );
+		return -1; }
+
+	return iMotif; }
+
+float CCoalesceMotifLibraryImpl::AlignKMers( const std::string& strOne, const std::string& strTwo,
+	float dCutoff ) const {
+	int	iOffset;
+
+	return Align( strOne, strTwo, dCutoff, iOffset ); }
+
+uint32_t CCoalesceMotifLibraryImpl::MergeKMers( const std::string& strOne, const std::string& strTwo,
+	float dCutoff, bool fAllowDuplicates ) {
+	int			iOffset;
+	float		dScore;
+	uint32_t	iRet;
+	CPST*		pPST;
+
+	if( ( ( dScore = Align( strOne, strTwo, dCutoff, iOffset ) ) > dCutoff ) ||
+		!( fAllowDuplicates || dScore ) )
+		return -1;
+
+	pPST = CreatePST( iRet );
+	pPST->Add( strOne, strTwo, iOffset );
+	if( g_CatSleipnir.isInfoEnabled( ) )
+		g_CatSleipnir.info( "CCoalesceMotifLibraryImpl::MergeKMers( %s, %s, %g ) merged at %g to %s",
+			strOne.c_str( ), strTwo.c_str( ), dCutoff, dScore, pPST->GetMotif( ).c_str( ) );
+	return iRet; }
+
+float CCoalesceMotifLibraryImpl::AlignKMerRC( const std::string& strKMer, uint32_t iRC, float dCutoff ) const {
+	string	strOne, strTwo;
+	float	dOne, dTwo;
+	int		iOne, iTwo;
+
+	strOne = GetRCOne( iRC );
+	strTwo = GetReverseComplement( strOne );
+	dOne = Align( strKMer, strOne, dCutoff, iOne );
+	dTwo = Align( strKMer, strTwo, dCutoff, iTwo );
+
+	return min( dOne, dTwo ); }
+
+uint32_t CCoalesceMotifLibraryImpl::MergeKMerRC( uint32_t iKMer, uint32_t iRC, float dCutoff,
+	bool fAllowDuplicates ) {
+	string		strKMer, strOne, strTwo;
+	float		dOne, dTwo, dMin;
+	int			iOne, iTwo;
+	uint32_t	iRet;
+	CPST*		pPST;
+
+	if( m_veciKMer2RC[ iKMer ] == iRC )
+		return -1;
+	strKMer = GetMotif( iKMer );
+	strOne = GetRCOne( iRC );
+	strTwo = GetReverseComplement( strOne );
+	dOne = Align( strKMer, strOne, dCutoff, iOne );
+	dTwo = Align( strKMer, strTwo, dCutoff, iTwo );
+	dMin = min( dOne, dTwo );
+	if( ( dMin > dCutoff ) || !( fAllowDuplicates || dMin ) )
+		return -1;
+
+	pPST = CreatePST( iRet );
+	if( dOne < dTwo ) {
+		pPST->Add( strKMer, strOne, iOne );
+		pPST->Add( strTwo ); }
+	else {
+		pPST->Add( strKMer, strTwo, iTwo );
+		pPST->Add( strOne ); }
+	if( g_CatSleipnir.isInfoEnabled( ) )
+		g_CatSleipnir.info( "CCoalesceMotifLibraryImpl::MergeKMerRC( %s, %s, %g ) merged at %g to %s",
+			strKMer.c_str( ), GetMotif( iRC ).c_str( ), dCutoff, min( dOne, dTwo ),
+			pPST->GetMotif( ).c_str( ) );
+	return iRet; }
+
+struct SCrossRCs {
+	string	m_strOne;
+	string	m_strTwo;
+	float	m_dScore;
+	int		m_iOffset;
+};
+
+float CCoalesceMotifLibraryImpl::AlignRCs( uint32_t iOne, uint32_t iTwo, float dCutoff ) const {
+	SCrossRCs	asCrosses[ 4 ];
+	size_t		i;
+	float		dMin;
+
+	asCrosses[ 0 ].m_strOne = asCrosses[ 1 ].m_strOne = GetRCOne( iOne );
+	asCrosses[ 0 ].m_strTwo = asCrosses[ 2 ].m_strTwo = GetRCOne( iTwo );
+	asCrosses[ 1 ].m_strTwo = asCrosses[ 3 ].m_strTwo = GetReverseComplement( asCrosses[ 0 ].m_strTwo );
+	asCrosses[ 2 ].m_strOne = asCrosses[ 3 ].m_strOne = GetReverseComplement( asCrosses[ 0 ].m_strOne );
+	dMin = FLT_MAX;
+	for( i = 0; i < ARRAYSIZE(asCrosses); ++i ) {
+		asCrosses[ i ].m_dScore = Align( asCrosses[ i ].m_strOne, asCrosses[ i ].m_strTwo, dCutoff,
+			asCrosses[ i ].m_iOffset );
+		if( asCrosses[ i ].m_dScore < dMin )
+			dMin = asCrosses[ i ].m_dScore; }
+
+	return dMin; }
+
+uint32_t CCoalesceMotifLibraryImpl::MergeRCs( uint32_t iOne, uint32_t iTwo, float dCutoff,
+	bool fAllowDuplicates ) {
+	SCrossRCs	asCrosses[ 4 ];
+	uint32_t	iRet;
+	CPST*		pPST;
+	size_t		i, iMin;
+	float		dMin;
+
+	asCrosses[ 0 ].m_strOne = asCrosses[ 1 ].m_strOne = GetRCOne( iOne );
+	asCrosses[ 0 ].m_strTwo = asCrosses[ 2 ].m_strTwo = GetRCOne( iTwo );
+	asCrosses[ 1 ].m_strTwo = asCrosses[ 3 ].m_strTwo = GetReverseComplement( asCrosses[ 0 ].m_strTwo );
+	asCrosses[ 2 ].m_strOne = asCrosses[ 3 ].m_strOne = GetReverseComplement( asCrosses[ 0 ].m_strOne );
+	dMin = FLT_MAX;
+	for( iMin = i = 0; i < ARRAYSIZE(asCrosses); ++i ) {
+		asCrosses[ i ].m_dScore = Align( asCrosses[ i ].m_strOne, asCrosses[ i ].m_strTwo, dCutoff,
+			asCrosses[ i ].m_iOffset );
+		if( asCrosses[ i ].m_dScore < dMin ) {
+			dMin = asCrosses[ i ].m_dScore;
+			iMin = i; } }
+	if( ( dMin > dCutoff ) || !( fAllowDuplicates || dMin ) )
+		return -1;
+
+	pPST = CreatePST( iRet );
+	pPST->Add( asCrosses[ iMin ].m_strOne, asCrosses[ iMin ].m_strTwo, asCrosses[ iMin ].m_iOffset );
+	{
+		CPST	PST( strlen( c_szBases ) );
+
+		PST.Add( asCrosses[ ( iMin + 1 ) % ARRAYSIZE(asCrosses) ].m_strTwo,
+			asCrosses[ ( iMin + 2 ) % ARRAYSIZE(asCrosses) ].m_strOne, asCrosses[ iMin ].m_iOffset );
+		pPST->Add( PST );
+	}
+	if( g_CatSleipnir.isInfoEnabled( ) )
+		g_CatSleipnir.info( "CCoalesceMotifLibraryImpl::MergeRCs( %s, %s, %g ) merged at %g to %s",
+			GetMotif( iOne ).c_str( ), GetMotif( iTwo ).c_str( ), dCutoff, dMin, pPST->GetMotif( ).c_str( ) );
+	return iRet; }
+
+float CCoalesceMotifLibraryImpl::AlignKMerPST( const std::string& strKMer, const CPST& PSTIn,
+	float dCutoff ) const {
+	int	iOffset;
+
+	return PSTIn.Align( strKMer, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iOffset ); }
+
+uint32_t CCoalesceMotifLibraryImpl::MergeKMerPST( const std::string& strKMer, const CPST& PSTIn,
+	float dCutoff, bool fAllowDuplicates ) {
+	int			iOffset;
+	float		dScore;
+	uint32_t	iRet;
+	CPST*		pPSTOut;
+
+	if( ( ( dScore = PSTIn.Align( strKMer, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iOffset ) ) >
+		dCutoff ) || !( fAllowDuplicates || dScore ) )
+		return -1;
+
+	pPSTOut = CreatePST( iRet );
+	pPSTOut->Add( strKMer, PSTIn, iOffset );
+	if( g_CatSleipnir.isInfoEnabled( ) ) {
+		ostringstream	ossm;
+
+		ossm << "CCoalesceMotifLibraryImpl::MergeKMerPST( " << strKMer << ", " << PSTIn.GetMotif( ) <<
+			", " << dCutoff << " ) merged at " << dScore << " to " << pPSTOut->GetMotif( );
+		g_CatSleipnir.info( ossm.str( ).c_str( ) ); }
+	return iRet; }
+
+float CCoalesceMotifLibraryImpl::AlignRCPST( uint32_t iRC, const CPST& PSTIn, float dCutoff ) const {
+	int		iOne, iTwo;
+	string	strOne, strTwo;
+	float	dOne, dTwo;
+
+	strOne = GetRCOne( iRC );
+	strTwo = GetReverseComplement( strOne );
+	dOne = PSTIn.Align( strOne, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iOne );
+	dTwo = PSTIn.Align( strTwo, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iTwo );
+
+	return min( dOne, dTwo ); }
+
+uint32_t CCoalesceMotifLibraryImpl::MergeRCPST( uint32_t iRC, const CPST& PSTIn, float dCutoff,
+	bool fAllowDuplicates ) {
+	int			iOne, iTwo;
+	uint32_t	iRet;
+	CPST*		pPSTOut;
+	string		strOne, strTwo;
+	float		dOne, dTwo, dMin;
+
+	strOne = GetRCOne( iRC );
+	strTwo = GetReverseComplement( strOne );
+	dOne = PSTIn.Align( strOne, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iOne );
+	dTwo = PSTIn.Align( strTwo, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iTwo );
+	dMin = min( dOne, dTwo );
+	if( ( dMin > dCutoff ) || !( fAllowDuplicates || dMin ) )
+		return -1;
+
+	pPSTOut = CreatePST( iRet );
+	if( dOne < dTwo ) {
+		pPSTOut->Add( strOne, PSTIn, iOne );
+		pPSTOut->Add( strTwo ); }
+	else {
+		pPSTOut->Add( strTwo, PSTIn, iTwo );
+		pPSTOut->Add( strOne ); }
+	if( g_CatSleipnir.isInfoEnabled( ) ) {
+		ostringstream	ossm;
+
+		ossm << "CCoalesceMotifLibraryImpl::MergeRCPST( " << GetMotif( iRC ) << ", " << PSTIn.GetMotif( ) <<
+			", " << dCutoff << " ) merged at " << min( dOne, dTwo ) << " to " << pPSTOut->GetMotif( );
+		g_CatSleipnir.info( ossm.str( ).c_str( ) ); }
+	return iRet; }
+
+float CCoalesceMotifLibraryImpl::AlignPSTs( const CPST& PSTOne, const CPST& PSTTwo, float dCutoff ) const {
+	int	iOffset;
+
+	return PSTOne.Align( PSTTwo, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iOffset ); }
+
+uint32_t CCoalesceMotifLibraryImpl::MergePSTs( const CPST& PSTOne, const CPST& PSTTwo, float dCutoff,
+	bool fAllowDuplicates ) {
+	int			iOffset;
+	uint32_t	iRet;
+	CPST*		pPSTOut;
+	float		dScore;
+
+	if( ( ( dScore = PSTOne.Align( PSTTwo, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iOffset ) ) >
+		dCutoff ) || !( fAllowDuplicates || dScore ) )
+		return -1;
+
+	pPSTOut = CreatePST( iRet );
+	if( iOffset < 0 ) {
+		pPSTOut->Add( PSTOne );
+		pPSTOut->Add( PSTTwo, -iOffset ); }
+	else {
+		pPSTOut->Add( PSTTwo );
+		pPSTOut->Add( PSTOne, iOffset ); }
+	if( g_CatSleipnir.isInfoEnabled( ) ) {
+		ostringstream	ossm;
+
+		ossm << "CCoalesceMotifLibraryImpl::MergePSTs( " << PSTOne.GetMotif( ) << ", " <<
+			PSTTwo.GetMotif( ) << ", " << dCutoff << " ) merged at " << dScore << " to " <<
+			pPSTOut->GetMotif( );
+		g_CatSleipnir.info( ossm.str( ).c_str( ) ); }
+
+	return iRet; }
+
+uint32_t CCoalesceMotifLibraryImpl::RemoveRCs( const CPST& PST, float dPenaltyGap, float dPenaltyMismatch ) {
+	CPST*		pPST;
+	uint32_t	iRet;
+
+	if( !( pPST = CreatePST( iRet ) ) )
+		return -1;
+	PST.RemoveRCs( dPenaltyGap, dPenaltyMismatch, *pPST );
+	return iRet; }
+
+string CCoalesceMotifLibrary::GetPWM( uint32_t iMotif, float dCutoffPWMs, float dPenaltyGap,
+	float dPenaltyMismatch, bool fNoRCs ) const {
+	CFullMatrix<uint16_t>	MatPWM;
+	size_t					i, j;
+	ostringstream			ossm;
+
+	if( !CCoalesceMotifLibraryImpl::GetPWM( iMotif, dCutoffPWMs, dPenaltyGap, dPenaltyMismatch, fNoRCs,
+		MatPWM ) )
+		return "";
+	for( i = 0; i < MatPWM.GetRows( ); ++i ) {
+		for( j = 0; j < MatPWM.GetColumns( ); ++j )
+			ossm << ( j ? "\t" : "" ) << MatPWM.Get( i, j );
+		ossm << endl; }
+
+	return ossm.str( ); }
+
+bool CCoalesceMotifLibraryImpl::GetPWM( uint32_t iMotif, float dCutoffPWMs, float dPenaltyGap,
+	float dPenaltyMismatch, bool fNoRCs, CFullMatrix<uint16_t>& MatPWM ) const {
+	string	strMotif;
+	float	d;
+
+	if( fNoRCs )
+		iMotif = ((CCoalesceMotifLibrary*)this)->RemoveRCs( iMotif, dPenaltyGap, dPenaltyMismatch );
+	switch( GetType( iMotif ) ) {
+		case ETypeKMer:
+			if( !CCoalesceMotifLibraryImpl::GetPWM( GetMotif( iMotif ), MatPWM ) )
+				return false;
+			break;
+
+		case ETypeRC:
+			if( !( CCoalesceMotifLibraryImpl::GetPWM( strMotif = GetRCOne( iMotif ), MatPWM ) &&
+				CCoalesceMotifLibraryImpl::GetPWM( GetReverseComplement( strMotif ), MatPWM ) ) )
+				return false;
+			break;
+
+		case ETypePST:
+			GetPST( iMotif )->GetPWM( MatPWM, c_szBases );
+			break; }
+
+	if( dCutoffPWMs ) {
+		d = GetInformation( MatPWM );
+		if( d < dCutoffPWMs ) {
+			if( g_CatSleipnir.isInfoEnabled( ) ) {
+				ostringstream	ossm;
+
+				ossm << "CCoalesceMotifLibraryImpl::GetPWM( " << iMotif << ", " << dCutoffPWMs << ", " <<
+					fNoRCs << " ) rejected (" << d << "):" << endl;
+				MatPWM.Save( ossm, false );
+				g_CatSleipnir.info( ossm.str( ).c_str( ) ); }
+			return false; }
+		if( g_CatSleipnir.isDebugEnabled( ) ) {
+			ostringstream	ossm;
+
+			ossm << "CCoalesceMotifLibraryImpl::GetPWM( " << iMotif << ", " << dCutoffPWMs << ", " <<
+				fNoRCs << " ) got information (" << d << "):" << endl;
+			MatPWM.Save( ossm, false );
+			g_CatSleipnir.debug( ossm.str( ).c_str( ) ); } }
+
+	return true; }
+
+bool CCoalesceMotifLibrary::Simplify( uint32_t iMotif ) const {
+
+	return ( ( GetType( iMotif ) == ETypePST ) ? CCoalesceMotifLibraryImpl::GetPST( iMotif )->Simplify( ) :
+		false ); }
+
+bool CCoalesceMotifLibrary::OpenKnown( istream& istm ) {
+	char*			szBuffer;
+	vector<string>	vecstrLine;
+
+	szBuffer = new char[ CFile::GetBufferSize( ) ];
+	while( !istm.eof( ) ) {
+		istm.getline( szBuffer, CFile::GetBufferSize( ) );
+		if( !szBuffer[ 0 ] )
+			continue;
+		vecstrLine.clear( );
+		CMeta::Tokenize( szBuffer, vecstrLine );
+		if( vecstrLine.empty( ) )
+			continue;
+		if( ( vecstrLine.size( ) - 1 ) % 4 ) {
+			g_CatSleipnir.warn( "CCoalesceMotifLibrary::OpenKnown( ) invalid line: %s" );
+			continue; }
+
+		m_sKnowns.Add( vecstrLine[ 0 ], vecstrLine ); }
+	delete[] szBuffer;
+
+	return true; }
+
+bool CCoalesceMotifLibrary::GetKnown( uint32_t iMotif, float dPenaltyGap, float dPenaltyMismatch,
+	vector<pair<string, float> >& vecprstrdKnown, float dPValue ) const {
+	size_t							i, j, k, iSum;
+	vector<float>					vecdMotif;
+	CFullMatrix<uint16_t>			MatPWM;
+	map<string, float>				mapstrdKnown;
+	map<string, float>::iterator	iterKnown;
+
+	if( !m_sKnowns.GetSize( ) )
+		return true;
+	dPValue /= m_sKnowns.GetSize( );
+	if( !CCoalesceMotifLibraryImpl::GetPWM( iMotif, 0, dPenaltyGap, dPenaltyMismatch, true, MatPWM ) )
+		return false;
+	vecdMotif.resize( MatPWM.GetRows( ) * MatPWM.GetColumns( ) );
+	for( k = i = 0; i < MatPWM.GetColumns( ); ++i ) {
+		for( iSum = j = 0; j < MatPWM.GetRows( ); ++j )
+			iSum += MatPWM.Get( j, i );
+		if( !iSum )
+			iSum = 1;
+		for( j = 0; j < MatPWM.GetRows( ); ++j )
+			vecdMotif[ k++ ] = (float)MatPWM.Get( j, i ) / iSum; }
+
+	m_sKnowns.Match( vecdMotif, mapstrdKnown );
+	for( iterKnown = mapstrdKnown.begin( ); iterKnown != mapstrdKnown.end( ); ++iterKnown )
+		if( iterKnown->second < dPValue )
+			vecprstrdKnown.push_back( *iterKnown );
+
+	return true; }
+}

File src/statistics.h

View file
  • Ignore whitespace
 		dStd = ( iOne || iTwo ) ? sqrt( ( ( dVarOne + dVarTwo ) / ( iOne + iTwo ) ) - ( dAve * dAve ) ) : 0;
 
 		return ( dStd ? ( ( dAveOne - dAve ) / dStd ) :
-			( ( dAveOne == dAveTwo ) ? 0 : DBL_MAX ) ); }
+			( ( dAveOne == dAve ) ? 0 : DBL_MAX ) ); }
 
 	static double ZTest( double dZScore, size_t iN ) {
 
 		return ( 1 - Normal01CDF( fabs( dZScore ) * sqrt( (double)iN ) ) ); }
 
+	template<class tType>
+	static double RootMeanSquareError( tType BeginOne, tType EndOne, tType BeginTwo, tType EndTwo ) {
+		tType	CurOne, CurTwo;
+		double	d, dRet;
+		size_t	iN;
+
+		for( dRet = 0,CurOne = BeginOne,CurTwo = BeginTwo; ( CurOne < EndOne ) && ( CurTwo < EndTwo );
+			++CurOne,++CurTwo ) {
+			d = *CurOne - *CurTwo;
+			dRet += d * d; }
+		iN = min( EndOne - BeginOne, EndTwo - BeginTwo );
+
+		return ( iN ? sqrt( dRet / iN ) : 0 ); }
+
 	// P-value tests
 	static double LjungBox( const float* adX, size_t iN, size_t iH );
 

File src/stdafx.cpp

View file
  • Ignore whitespace
  * \code
  * BNFunc -o gene_ontology.obo -a gene_assocation.sgd -i GO_functional_slim.txt -d positives
  * \endcode
- * - Using \ref Answerer, we can turn these gene sets into a gold standard DAB file:
+ * - Using \ref Answerer, we can turn these gene sets into a gold standard DAB file.  Note that we're using
+ *	genes \e coannotated to a GO term as positives and \e not coannotated to any GO term as negatives.  See
+ *	the \ref Answerer page for more information on the \c -l flag as well:
  * \code
- * Answerer -p positives -o answers.dab
+ * Answerer -p positives -n positives -l 0.05 -o answers.dab
  * \endcode
  * - You can automatically create and learn a Bayesian network that will integrate your four datasets in a
  *	context-specific manner.  Assuming you've generated the four files \c pza1_imputed.dab and so forth in the
  * 
  * \section sec_history Version History
  * 
+ * - <a href="sleipnir-2.0.tar.gz">2.0</a>, ***-***-09 <br>
+ * Fixed \ref Answerer usage in the documentation example - thanks to Jim Costello! <br>
+ * 
  * - <a href="sleipnir-1.1.tar.gz">1.1</a>, 06-02-08 <br>
  * Added \ref BNs2Txt and \ref Mat2Txt tools for dumping binary Bayesian classifiers and matrices. <br>
  * Improved Gene Ontology parser for compatibility with latest versions. <br>
 
 #ifdef USE_LOG4CPP_STUB
 Category	g_CatSleipnir;
+
+const char* Priority::c_aszPriorityLevels[]	= {"FATAL", "ALERT", "CRITICAL", "ERROR", "WARN",
+												"NOTICE", "INFO", "DEBUG", "NOTSET"};
 #else // USE_LOG4CPP_STUB
 const char	c_szSleipnir[]	= "Sleipnir";
 Category&	g_CatSleipnir	= Category::getInstance( c_szSleipnir );

File src/stdafx.h

View file
  • Ignore whitespace
 #else // _MSC_VER
 #include <dirent.h>
 #include <errno.h>
+#include <fcntl.h>
 #define __STDC_LIMIT_MACROS
 #include <netinet/in.h>
 #include <signal.h>
 
 #ifdef USE_LOG4CPP_STUB
 
+struct Priority {
+	enum PriorityLevel {
+		EMERG	= 0, 
+		FATAL	= 0,
+		ALERT	= 1,
+		CRIT	= 2,
+		ERROR	= 3, 
+		WARN	= 4,
+		NOTICE	= 5,
+		INFO	= 6,
+		DEBUG	= 7,
+		NOTSET	= 8
+	};
+
+	const static char* c_aszPriorityLevels[];
+};
+
 struct Category {
 
 	static void shutdown( ) { }
 
 		va_start( valArgs, szFormat );
 		log4cpp( "DEBUG", szFormat, valArgs ); }
+
+	void log( Priority::PriorityLevel ePriority, const char* szFormat, ... ) const {
+		va_list	valArgs;
+
+		va_start( valArgs, szFormat );
+		log4cpp( Priority::c_aszPriorityLevels[ ePriority ], szFormat, valArgs ); }
+
+	bool isDebugEnabled( ) const {
+
+		return true; }
+
+	bool isInfoEnabled( ) const {
+
+		return true; }
+
+	bool isNoticeEnabled( ) const {
+
+		return true; }
 };
 
 #endif // USE_LOG4CPP_STUB

File tools/COALESCE/COALESCE.cpp

View file
  • Ignore whitespace
 
 int main( int iArgs, char** aszArgs ) {
 	gengetopt_args_info			sArgs;
-	CFASTA						FASTA, FASTANucleosomes, FASTAConservation;
+	CFASTA						FASTA;
 	CPCL						PCL;
 	CCoalesce					Coalesce;
 	vector<CCoalesceCluster>	vecClusters;
 	size_t						i, j;
 	set<string>					setstrTypes;
+	vector<CFASTA*>				vecpFASTAs;
 
 #ifdef WIN32
 	pthread_win32_process_attach_np( );
 	Coalesce.SetMotifs( Motifs );
 	Coalesce.SetProbabilityGene( (float)sArgs.prob_gene_arg );
 	Coalesce.SetPValueCondition( (float)sArgs.pvalue_cond_arg );
+	Coalesce.SetZScoreCondition( (float)sArgs.zscore_cond_arg );
 	Coalesce.SetPValueMotif( (float)sArgs.pvalue_motif_arg );
+	Coalesce.SetZScoreMotif( (float)sArgs.zscore_motif_arg );
 	Coalesce.SetPValueCorrelation( (float)sArgs.pvalue_correl_arg );
 	Coalesce.SetNumberCorrelation( sArgs.number_correl_arg );
 	Coalesce.SetPValueMerge( (float)sArgs.pvalue_merge_arg );
 		Coalesce.SetDirectoryIntermediate( sArgs.output_arg );
 	if( sArgs.cache_arg )
 		Coalesce.SetSequenceCache( sArgs.cache_arg );
-	if( sArgs.nucleosomes_arg ) {
-		if( !FASTANucleosomes.Open( sArgs.nucleosomes_arg ) ) {
-			cerr << "Could not open: " << sArgs.nucleosomes_arg << endl;
+
+	vecpFASTAs.resize( sArgs.inputs_num );
+	for( i = 0; i < vecpFASTAs.size( ); ++i ) {
+		vecpFASTAs[ i ] = new CFASTA( );
+		if( !vecpFASTAs[ i ]->Open( sArgs.inputs[ i ] ) ) {
+			cerr << "Could not open: " << sArgs.inputs[ i ] << endl;
 			return 1; }
-		Coalesce.AddWiggle( FASTANucleosomes ); }
-	if( sArgs.conservation_arg ) {
-		if( !FASTAConservation.Open( sArgs.conservation_arg ) ) {
-			cerr << "Could not open: " << sArgs.conservation_arg << endl;
-			return 1; }
-		Coalesce.AddWiggle( FASTAConservation ); }
+		Coalesce.AddWiggle( *vecpFASTAs[ i ] ); }
+
 	if( sArgs.progressive_flag )
 		Coalesce.AddOutputIntermediate( cout );
 	if( !Coalesce.Cluster( PCL, FASTA, vecClusters ) ) {
 		cerr << "Clustering failed" << endl;
 		return 1; }
 
+	for( i = 0; i < vecpFASTAs.size( ); ++i )
+		delete vecpFASTAs[ i ];
 	if( !sArgs.progressive_flag )
 		for( i = 0; i < vecClusters.size( ); ++i )
 			vecClusters[ i ].Save( cout, i, PCL, &Motifs );
 			cerr << "Cluster too small: " << Cluster.GetGenes( ).size( ) << endl;
 			return true; }
 
-		Cluster.RemoveMotifs( Motifs, (float)sArgs.min_zscore_arg );
 		if( !Cluster.LabelMotifs( Motifs, (float)sArgs.penalty_gap_arg, (float)sArgs.penalty_mismatch_arg,
 			(float)sArgs.known_cutoff_arg ) )
 			return false;

File tools/COALESCE/COALESCE.ggo

View file
  • Ignore whitespace
 section "Algorithm Parameters"
 option	"prob_gene"		p	"Probability threshhold for gene inclusion"
 							double	default="0.95"
-option	"pvalue_cond"	P	"P-value threshhold for condition inclusion"
+option	"pvalue_cond"	c	"P-value threshhold for condition inclusion"
 							double	default="0.05"
 option	"pvalue_motif"	m	"P-value threshhold for motif inclusion"
 							double	default="0.05"
+option	"zscore_cond"	C	"Z-score threshhold for condition inclusion"
+							double	default="0.5"
+option	"zscore_motif"	M	"Z-score threshhold for motif inclusion"
+							double	default="0.5"
 
 section "Sequence Parameters"
 option	"k"				k	"Sequence kmer length"
 							double	default="2.1"
 
 section "Performance Parameters"
-option	"pvalue_correl"	c	"P-value threshhold for significant correlation"
+option	"pvalue_correl"	n	"P-value threshhold for significant correlation"
 							double	default="0.05"
-option	"number_correl"	C	"Maximum number of pairs to sample for significant correlation"
+option	"number_correl"	N	"Maximum number of pairs to sample for significant correlation"
 							int	default="100000"
 option	"sequences"		q	"Sequence types to use (comma separated)"
 							string
 option	"size_maximum"	Z	"Maximum motif count to consider a cluster saturated"
 							int	default="1000"
 
-section "Additional Data"
-option	"nucleosomes"	n	"Nucleosome position file (WIG)"
-							string	typestr="filename"
-option	"conservation"	a	"Evolutionary conservation file (WIG)"
-							string	typestr="filename"
-
 section "Postprocessing Parameters"
 option	"postprocess"	j	"Input directory of clusters to postprocess"
 							string	typestr="directory"
 option	"known_cutoff"	F	"P-value cutoff for known motif labeling"
 							double	default="0.05"
 option	"cutoff_postprocess"	J	"Similarity cutoff for cluster merging"
-							double	default="0.75"
+							double	default="1"
 option	"fraction_postprocess"	L	"Overlap fraction for postprocessing gene/condition inclusion"
 							double	default="0.5"
 option	"cutoff_trim"	T	"Cocluster stdev cutoff for cluster trimming"
 							flag	on
 option	"min_info"		u	"Uninformative motif threshhold (bits)"
 							double	default="0.3"
-option	"min_zscore"	U	"Minimum motif z-score magnitude"
-							double	default="0.2"
 option	"max_motifs"	x	"Maximum motifs to merge exactly"
 							int	default="2500"