Commits

Anonymous committed 3411794

[svn r415] Fix deadly COALESCE bug in CCoalesceGeneScores::Grow
Previous m_iMotifs _not_ necessarily equal to previous m_iCapacity
For example, in cases of skipped/unused motif IDs
This led to complete screwage (OBOs) of motif counts during Grow
Fix deadly COALESCE bug in CCoalesceGeneScores::Add
Genes with sequences length <k were detected as having no scores
This made them sequenceless and behave incorrectly for later PST matching
Add COALESCE test code for archival purposes (to be removed imminently)
Add known motif parsing/tagging to COALESCE
Open in standard format through COALESCE/CCoalesceMotifLibrary
Detect using significant PWM correlation in CCoalesceCluster::LabelMotifs
Store in ID/score pairs in SMotifMatch
Add input/output in Open/Save calls
Fix incorrect error detection in CCoalesceClusterImpl::OpenMotifs
Caused premature parse failure during postprocessing with motifs
Improve member counting in CCoalesceGroupHistogramSet
Member count is stored as length of m_vecTotal
All members included, even those with only zero scores
But unknown member queries behave correctly wrt totals/zeros

Add rudimentary known motif scanning/counting to PCLPlotter
Add byte-accurate message logging to BNServer
Add kmer/rc matching to CCoalesceMotifLibrary::GetMatch
Not needed during normal usage, useful for testing

Add cluster postprocessing from .out file(s) to COALESCE
Largely untested!
Update CPCL::OpenExperiments to allow zero experiments
Keep an eye on this for interactions with DAB misopenings
Improve GetPWM organization in CCoalesceMotifLibrary
Improve (remove) motif counting in CCoalesceGroupHistograms
Improve thread usage for centroids in CCoalesceCluster::SelectGenes
Fix empty merge idiocy in SMotifMatch::Open (during postprocessing)
Remove some unnecessary STL namespaceage from .cpp files
Improve header usage in Sleipnir proper
Add CPCL::GetExperiment( const string& ) (unoptimized)

  • Participants
  • Parent commits 4553ed7

Comments (0)

Files changed (23)

File src/coalesce.cpp

 	for( iSubsequence = ESubsequenceBegin; iSubsequence < vecvecdCounts.size( ); ++iSubsequence ) {
 		const vector<float>&	vecdCounts	= vecvecdCounts[ iSubsequence ];
 
-		if( iLength = veciLengths[ iSubsequence ] )
+		if( iLength = veciLengths[ iSubsequence ] ) {
+			Set( iType, (ESubsequence)iSubsequence, iGene, 0, 0, Motifs.GetMotifs( ) );
 			for( iMotif = 0; iMotif < vecdCounts.size( ); ++iMotif )
 				if( d = vecdCounts[ iMotif ] )
 					Set( iType, (ESubsequence)iSubsequence, iGene, iMotif, d / iLength,
-						vecdCounts.size( ) ); }
+						Motifs.GetMotifs( ) ); } }
 
 	return true; }
 
 				vecdEdges.resize( m_iBins );
 				for( int i = 0; (size_t)i < vecdEdges.size( ); ++i )
 					vecdEdges[ i ] = ( i * m_dStep ) - ( m_dStep / 2 );
-				Histograms.Initialize( GetMotifs( ), vecdEdges ); }
+				Histograms.Initialize( GeneScores.GetMotifs( ), vecdEdges ); }
 // unlock
 			if( iMotifThem == -1 ) {
 				for( iMotifUs = 0; iMotifUs < GeneScores.GetMotifs( ); ++iMotifUs )
 			if( !Histograms.GetTotal( ) )
 				continue;
 			ostm << GetType( iType ) << '\t' << GetSubsequence( (ESubsequence)iSubsequence ) << endl;
-			for( iMotif = 0; iMotif < ( pMotifs ? pMotifs->GetMotifs( ) : GetMotifs( ) ); ++iMotif ) {
+			for( iMotif = 0; iMotif < ( pMotifs ? pMotifs->GetMotifs( ) : Histograms.GetMembers( ) ); ++iMotif ) {
 				if( Histograms.Get( iMotif, 0.0f ) == Histograms.GetTotal( ) )
 					continue;
 				if( pMotifs )
 		const CCoalesceHistogramSet<>&	HistOne	= Get( iTypeOne, sOne.m_eSubsequence );
 		const CCoalesceHistogramSet<>&	HistTwo	= Get( iTypeTwo, sTwo.m_eSubsequence );
 
-		dP = 1 - HistOne.ZScore( sOne.m_iMotif, HistTwo, sTwo.m_iMotif, false, dAveOne, dAverage, dZ );
+		dP = 1 - HistOne.CohensD( sOne.m_iMotif, HistTwo, sTwo.m_iMotif, false, dAveOne, dAverage, dZ );
 		if( g_CatSleipnir.isDebugEnabled( ) )
 			g_CatSleipnir.debug( "CCoalesceGroupHistograms::IsSimilar( %s, %s, %g ) got p-value %g",
 				sOne.Save( pMotifs ).c_str( ), sTwo.Save( pMotifs ).c_str( ), dPValue, dP );
 			GetSizeMaximum( ) ); }
 	for( dFailure = 1; dFailure > c_dEpsilon; dFailure *= GetPValueCorrelation( ) ) {
 		CCoalesceCluster			Cluster, Pot;
-		CCoalesceGroupHistograms	HistsCluster( GetMotifCount( ), GetBins( ), 1.0f / GetBasesPerMatch( ) );
-		CCoalesceGroupHistograms	HistsPot( GetMotifCount( ), GetBins( ), 1.0f / GetBasesPerMatch( ) );
+		CCoalesceGroupHistograms	HistsCluster( GetBins( ), 1.0f / GetBasesPerMatch( ) );
+		CCoalesceGroupHistograms	HistsPot( GetBins( ), 1.0f / GetBasesPerMatch( ) );
 
 		Pot.SetGenes( PCLCopy.GetGenes( ) );
 		Pot.CalculateHistograms( GeneScores, HistsPot, NULL );
 
 	return true; }
 
+bool CCoalesce::Cluster2( const CPCL& PCL, const CFASTA& FASTA, vector<CCoalesceCluster>& vecClusters ) {
+	static const float			c_dEpsilon	= 1e-10f;
+	size_t						i;
+	vector<size_t>				veciPCL2FASTA;
+	CCoalesceGeneScores			GeneScores;
+	set<pair<size_t, size_t> >	setpriiSeeds;
+	SCoalesceModifiers			sModifiers;
+	float						dFailure;
+
+	if( !( InitializeDatasets( PCL ) && InitializeGeneScores( PCL, FASTA, veciPCL2FASTA, sModifiers,
+		GeneScores ) ) )
+		return false;
+	if( g_CatSleipnir.isNoticeEnabled( ) ) {
+		size_t			iSequences;
+		ostringstream	ossm;
+
+		for( iSequences = i = 0; i < veciPCL2FASTA.size( ); ++i )
+			if( veciPCL2FASTA[ i ] != -1 )
+				iSequences++;
+		g_CatSleipnir.notice( "CCoalesce::Cluster( ) running with %d genes, %d conditions, and %d sequences",
+			PCL.GetGenes( ), PCL.GetExperiments( ), iSequences );
+		for( i = 0; i < PCL.GetExperiments( ); ++i )
+			ossm << ( i ? "\t" : "" ) << PCL.GetExperiment( i );
+		g_CatSleipnir.notice( ossm.str( ) );
+		g_CatSleipnir.notice( "k %d, P gene %g, p condition %g, p motif %g, p correlation %g", GetK( ),
+			GetProbabilityGene( ), GetPValueCondition( ), GetPValueMotif( ), GetPValueCorrelation( ) );
+		g_CatSleipnir.notice( "p merge %g, cutoff merge %g, penalty gap %g, penalty mismatch %g",
+			GetPValueMerge( ), GetCutoffMerge( ), GetMotifs( ) ? GetMotifs( )->GetPenaltyGap( ) : 0,
+			GetMotifs( ) ? GetMotifs( )->GetPenaltyMismatch( ) : 0 );
+		g_CatSleipnir.notice( "correlation pairs %d, bases %d, min size %d, merge size %d, max size %d",
+			GetNumberCorrelation( ), GetBasesPerMatch( ), GetSizeMinimum( ), GetSizeMerge( ),
+			GetSizeMaximum( ) ); }
+	for( dFailure = 1; dFailure > c_dEpsilon; dFailure *= GetPValueCorrelation( ) ) {
+		CCoalesceCluster			Cluster, Pot;
+		CCoalesceGroupHistograms	HistsCluster( GetBins( ), 1.0f / GetBasesPerMatch( ) );
+		CCoalesceGroupHistograms	HistsPot( GetBins( ), 1.0f / GetBasesPerMatch( ) );
+
+		Pot.SetGenes( PCL.GetGenes( ) );
+		Pot.CalculateHistograms( GeneScores, HistsPot, NULL );
+		if( !Cluster.Initialize( PCL, Pot, m_vecsDatasets, setpriiSeeds, GetNumberCorrelation( ),
+			GetPValueCorrelation( ), GetThreads( ) ) )
+			continue;
+		g_CatSleipnir.notice( "CCoalesce::Cluster( ) initialized %d genes", Cluster.GetGenes( ).size( ) );
+		if( g_CatSleipnir.isDebugEnabled( ) ) {
+			ostringstream				ossm;
+			set<size_t>::const_iterator	iterGene;
+
+			ossm << "CCoalesce::Cluster( ) initialized:";
+			for( iterGene = Cluster.GetGenes( ).begin( ); iterGene != Cluster.GetGenes( ).end( ); ++iterGene )
+				ossm << ' ' << PCL.GetGene( *iterGene );
+			g_CatSleipnir.debug( ossm.str( ) ); }
+		if( Cluster.GetGenes( ).size( ) < GetSizeMinimum( ) )
+			continue;
+		while( !( Cluster.IsConverged( ) || Cluster.IsEmpty( ) ) ) {
+			Cluster.CalculateHistograms( GeneScores, HistsCluster, &HistsPot );
+			Cluster.Snapshot( GeneScores, HistsCluster );
+			Pot.Snapshot( GeneScores, HistsPot );
+			if( ( Cluster.GetGenes( ).size( ) >= GetSizeMinimum( ) ) &&
+				!( CombineMotifs( FASTA, veciPCL2FASTA, sModifiers, Cluster, GetThreads( ), GeneScores,
+				HistsCluster, HistsPot ) && Cluster.SelectMotifs( HistsCluster, HistsPot, GetPValueMotif( ),
+				GetSizeMaximum( ), GetThreads( ), GetMotifs( ) ) ) )
+				return false;
+			if( !Cluster.SelectGenes( PCL, GeneScores, HistsCluster, HistsPot, GetSizeMinimum( ),
+				GetThreads( ), Pot, GetProbabilityGene( ), GetMotifs( ) ) )
+				return false;
+			g_CatSleipnir.notice( "CCoalesce::Cluster( ) processed %d genes, %d datasets, %d motifs",
+				Cluster.GetGenes( ).size( ), Cluster.GetDatasets( ).size( ), Cluster.GetMotifs( ).size( ) ); }
+		if( Cluster.IsConverged( ) && ( Cluster.GetGenes( ).size( ) >= GetSizeMinimum( ) ) ) {
+			g_CatSleipnir.notice( "CCoalesce::Cluster( ) finalizing cluster" );
+			setpriiSeeds.clear( );
+			vecClusters.push_back( Cluster );
+			Cluster.Subtract( GeneScores );
+			dFailure = 1; }
+		else
+			g_CatSleipnir.notice( "CCoalesce::Cluster( ) discarding cluster (failure %g)", dFailure ); }
+
+	return true; }
+
 }

File src/coalesce.h

 class CCoalesce : CCoalesceImpl {
 public:
 	bool Cluster( const CPCL& PCL, const CFASTA& FASTA, std::vector<CCoalesceCluster>& vecClusters );
+	bool Cluster2( const CPCL& PCL, const CFASTA& FASTA, std::vector<CCoalesceCluster>& vecClusters );
 
 	void SetPValueCorrelation( float dPValue ) {
 

File src/coalescecluster.cpp

 
 namespace Sleipnir {
 
-const char	CCoalesceClusterImpl::c_szMotifs[]			= "_motifs.txt";
+const char	CCoalesceClusterImpl::c_szMotifs[]		= "_motifs.txt";
+const char	CCoalesceClusterImpl::c_szGenes[]		= "Genes";
+const char	CCoalesceClusterImpl::c_szConditions[]	= "Conditions";
 
 bool CCoalesceCluster::Initialize( const CPCL& PCL, CCoalesceCluster& Pot,
 	const std::vector<SCoalesceDataset>& vecsDatasets, std::set<std::pair<size_t, size_t> >& setpriiSeeds,
 	set<SMotifMatch>::const_iterator	iterMotif;
 	vector<float>						vecdStdevs;
 
-	vecpthdThreads.resize( max( 2, iThreads ) );
+	vecpthdThreads.resize( iThreads );
 	{
 		SThreadCentroid	sUs( this, PCL ), sThem( &Pot, PCL );
 
-		if( pthread_create( &vecpthdThreads[ 0 ], NULL, ThreadCentroid, &sUs ) ||
-			pthread_create( &vecpthdThreads[ 1 ], NULL, ThreadCentroid, &sThem ) ) {
+		if( iThreads > 1 ) {
+			if( pthread_create( &vecpthdThreads[ 0 ], NULL, ThreadCentroid, &sUs ) ||
+				pthread_create( &vecpthdThreads[ 1 ], NULL, ThreadCentroid, &sThem ) ) {
+				g_CatSleipnir.error( "CCoalesceCluster::SelectGenes( %d, %g ) could not calculate centroids",
+					iThreads, dProbability );
+				return false; }
+			for( i = 0; i < 2; ++i )
+				pthread_join( vecpthdThreads[ i ], NULL ); }
+		else if( ThreadCentroid( &sUs ) || ThreadCentroid( &sThem ) ) {
 			g_CatSleipnir.error( "CCoalesceCluster::SelectGenes( %d, %g ) could not calculate centroids",
 				iThreads, dProbability );
 			return false; }
-		for( i = 0; i < 2; ++i )
-			pthread_join( vecpthdThreads[ i ], NULL );
 	}
 	iCluster = GetGenes( ).size( );
 	iPot = Pot.GetGenes( ).size( );
 				CMeta::IsNaN( dCluster = m_vecdCentroid[ iCondition ] ) ||
 				CMeta::IsNaN( dPot = Pot.m_vecdCentroid[ iCondition ] ) )
 				continue;
-			
+
 			if( fClustered )
 				dCluster = ( ( dCluster * GetGenes( ).size( ) ) - dGene ) /
 					( GetGenes( ).size( ) - 1 );
 					sCluster--;
 					iCluster--; }
 				else
-					g_CatSleipnir.warn( "CCoalesceClusterImpl::CalculateProbabilityMotifs( %d, %d, %g, %g ) no motifs of %d in cluster: %g, %s",
+					g_CatSleipnir.warn( "CCoalesceClusterImpl::CalculateProbabilityMotifs( %d, %d, %g, %g ) no motifs of %d in cluster: %g, %s\n%s",
 						iGene, fClustered, dLogPIn, dLogPOut, iCluster, dGene,
-						iterMotif->Save( NULL ).c_str( ) ); }
+						iterMotif->Save( NULL ).c_str( ), HistSetCluster.Save( iterMotif->m_iMotif ).c_str( ) ); }
 			else if( sPot ) {
 				sPot--;
 				iPot--; }
 			else
-				g_CatSleipnir.warn( "CCoalesceClusterImpl::CalculateProbabilityMotifs( %d, %d, %g, %g ) no motifs of %d in pot: %g, %s",
-					iGene, fClustered, dLogPIn, dLogPOut, iPot, dGene, iterMotif->Save( NULL ).c_str( ) );
+				g_CatSleipnir.warn( "CCoalesceClusterImpl::CalculateProbabilityMotifs( %d, %d, %g, %g ) no motifs of %d in pot: %g, %s\n%s",
+					iGene, fClustered, dLogPIn, dLogPOut, iPot, dGene, iterMotif->Save( NULL ).c_str( ),
+					HistSetPot.Save( iterMotif->m_iMotif ).c_str( ) );
 			dPCluster = (double)( sCluster + 1 ) / ( iCluster + HistSetCluster.GetEdges( ) );
 			dPPot = (double)( sPot + 1 ) / ( iPot + HistSetPot.GetEdges( ) );
 		}
 	string								strMotif;
 
 	ostm << "Cluster\t" << iID << endl;
-	ostm << "Genes";
+	ostm << c_szGenes;
 	for( iterID = GetGenes( ).begin( ); iterID != GetGenes( ).end( ); ++iterID )
 		ostm << '\t' << PCL.GetGene( *iterID );
-	ostm << endl << "Conditions";
-	for( iterID = m_setiDatasets.begin( ); iterID != m_setiDatasets.end( ); ++iterID ) 
+	ostm << endl << c_szConditions;
+	for( iterID = m_setiDatasets.begin( ); iterID != m_setiDatasets.end( ); ++iterID )
 		for( i = 0; i < GetConditions( *iterID ); ++i )
 			ostm << '\t' << PCL.GetExperiment( GetCondition( *iterID, i ) );
 	ostm << endl << "Motifs" << endl;
 
 	return PCLCluster.GetExperiments( ); }
 
+size_t CCoalesceCluster::Open( istream& istm, const CPCL& PCL, CCoalesceMotifLibrary* pMotifs ) {
+	string				strBuffer;
+	vector<string>		vecstrLine;
+	size_t				i, j;
+	vector<SMotifMatch>	vecsMotifs;
+
+	Clear( );
+	strBuffer.resize( CFile::GetBufferSize( ) );
+	istm.getline( &strBuffer[ 0 ], strBuffer.size( ) );
+
+	istm.getline( &strBuffer[ 0 ], strBuffer.size( ) );
+	CMeta::Tokenize( strBuffer.c_str( ), vecstrLine );
+	if( vecstrLine.empty( ) || ( vecstrLine[ 0 ] != c_szGenes ) ) {
+		g_CatSleipnir.error( "CCoalesceCluster::Open( ) invalid line: %s", strBuffer.c_str( ) );
+		return -1; }
+	for( i = 1; i < vecstrLine.size( ); ++i ) {
+		if( ( j = PCL.GetGene( vecstrLine[ i ] ) ) == -1 ) {
+			g_CatSleipnir.error( "CCoalesceCluster::Open( ) unrecognized gene: %s",
+				vecstrLine[ i ].c_str( ) );
+			return -1; }
+		m_setiGenes.insert( j ); }
+
+	istm.getline( &strBuffer[ 0 ], strBuffer.size( ) );
+	vecstrLine.clear( );
+	CMeta::Tokenize( strBuffer.c_str( ), vecstrLine );
+	if( vecstrLine.empty( ) || ( vecstrLine[ 0 ] != c_szConditions ) ) {
+		g_CatSleipnir.error( "CCoalesceCluster::Open( ) invalid line: %s", strBuffer.c_str( ) );
+		return -1; }
+	for( i = 1; i < vecstrLine.size( ); ++i ) {
+		if( ( j = PCL.GetExperiment( vecstrLine[ i ] ) ) == -1 ) {
+			g_CatSleipnir.error( "CCoalesceCluster::Open( ) unrecognized condition: %s",
+				vecstrLine[ i ].c_str( ) );
+			return -1; }
+		m_setiDatasets.insert( j ); }
+
+	istm.getline( &strBuffer[ 0 ], CFile::GetBufferSize( ) );
+	if( !CCoalesceMotifLibrary::Open( istm, vecsMotifs, pMotifs ) )
+		return -1;
+	for( i = 0; i < vecsMotifs.size( ); ++i )
+		m_setsMotifs.insert( vecsMotifs[ i ] );
+
+	return m_setiDatasets.size( ); }
+
 bool CCoalesceCluster::Open( const CHierarchy& Hierarchy, const vector<CCoalesceCluster>& vecClusters,
 	const vector<string>& vecstrClusters, float dFraction, float dCutoff, size_t iCutoff,
 	CCoalesceMotifLibrary* pMotifs ) {
 		size_t		iCount;
 
 		sMotif.m_dZ = 0;
-		if( !sMotif.Open( Hier, vecsMotifs, Motifs, iCount = 0 ) )
+		if( sMotif.Open( Hier, vecsMotifs, Motifs, iCount = 0 ) == -1 )
 			return false;
 		sMotif.m_dZ /= iCount;
 		setsMotifs.insert( sMotif );
 	CCoalesceClusterImpl::Snapshot( GetGenes( ), m_veciPrevGenes ); }
 
 size_t CCoalesceCluster::RemoveMotifs( const CCoalesceMotifLibrary& Motifs, float dZScore ) {
-	std::set<SMotifMatch>::const_iterator	iterMotif;
-	std::vector<const SMotifMatch*>			vecpsMotifs;
-	size_t									i;
+	set<SMotifMatch>::const_iterator	iterMotif;
+	vector<const SMotifMatch*>			vecpsMotifs;
+	size_t								i;
 
 	for( iterMotif = m_setsMotifs.begin( ); iterMotif != m_setsMotifs.end( ); ++iterMotif )
 		if( fabs( iterMotif->m_dZ ) < dZScore ) {
 
 	return vecpsMotifs.size( ); }
 
+bool CCoalesceCluster::LabelMotifs( const CCoalesceMotifLibrary& Motifs, float dPenaltyGap,
+	float dPenaltyMismatch, float dPValue ) {
+	set<SMotifMatch>::iterator	iterMotif;
+
+	if( !Motifs.GetKnowns( ) )
+		return true;
+	for( iterMotif = m_setsMotifs.begin( ); iterMotif != m_setsMotifs.end( ); ++iterMotif ) {
+		SMotifMatch&	sMotif	= (SMotifMatch&)*iterMotif;
+// For some obscure reason, Linux won't let me use this as a non-const iterator...
+		if( !sMotif.Label( Motifs, dPenaltyGap, dPenaltyMismatch, dPValue ) )
+			return false; }
+
+	return true; }
+
 }

File src/coalescecluster.h

 		CCoalesceGroupHistograms& HistogramsCluster, CCoalesceGroupHistograms* pHistogramsPot ) const;
 	size_t Open( const std::string& strPCL, size_t iSkip, const CPCL& PCL,
 		CCoalesceMotifLibrary* pMotifs = NULL );
+	size_t Open( std::istream& istm, const CPCL& PCL, CCoalesceMotifLibrary* pMotifs = NULL );
 	bool Open( const CHierarchy& Hierarchy, const std::vector<CCoalesceCluster>& vecClusters,
 		const std::vector<std::string>& vecstrClusters, float dFraction, float dCutoff, size_t iCutoff,
 		CCoalesceMotifLibrary* pMotifs = NULL );
 	float GetSimilarity( const CCoalesceCluster& Cluster, size_t iGenes, size_t iDatasets ) const;
 	void Snapshot( const CCoalesceGeneScores& GeneScores, CCoalesceGroupHistograms& Histograms );
 	size_t RemoveMotifs( const CCoalesceMotifLibrary&, float dZScore );
+	bool LabelMotifs( const CCoalesceMotifLibrary&, float dPenaltyGap, float dPenaltyMismatch,
+		float dPValue );
 
 	bool IsConverged( ) {
 

File src/coalesceclusteri.h

 protected:
 	typedef std::vector<std::map<std::string, std::set<SMotifMatch> > >	TVecMapStrSetSMotifs;
 
-	static const char	c_cStar	= '*';
+	static const char	c_cStar		= '*';
 	static const char	c_szMotifs[];
+	static const char	c_szConditions[];
+	static const char	c_szGenes[];
 
 	struct SDataset {
 		const SCoalesceDataset*	m_psDataset;
 
 		return sMotif.GetHash( ); }
 
+public:
 	void Add( size_t, CCoalesceCluster& );
+protected:
 	bool AddCorrelatedGenes( const CPCL&, CCoalesceCluster&, float );
 	bool AddSeedPair( const CPCL&, CCoalesceCluster&, std::set<std::pair<size_t, size_t> >&, float, float,
 		size_t );

File src/coalescei.h

 template<class tValue = float, class tCount = unsigned short>
 class CCoalesceHistogramSet {
 public:
-	CCoalesceHistogramSet( ) : m_iMembers(0) { }
-
 	double ZScore( size_t iMember, const CCoalesceHistogramSet& HistSet, double& dAveOne, double& dAverage,
 		double& dZ, bool fCount = true ) const {
 
 
 	void Initialize( size_t iMembers, const std::vector<tValue>& vecEdges ) {
 
-		m_iMembers = iMembers;
+		SetMembers( iMembers );
 		m_vecEdges.resize( vecEdges.size( ) );
 		std::copy( vecEdges.begin( ), vecEdges.end( ), m_vecEdges.begin( ) );
 		m_iZero = GetBin( 0 ); }
 		Initialize( iMembers, vecEdges ); }
 
 	bool Add( size_t iMember, tValue Value, tCount Count ) {
-		size_t	i;
+		size_t	iBin;
 
 // lock
-		if( m_vecEdges.empty( ) || ( ( i = GetBin( Value ) ) == m_iZero ) )
+		if( m_vecEdges.empty( ) )
 // unlock
 			return false;
 
-		m_iMembers = max( GetMembers( ), iMember + 1 );
-		m_vecCounts.resize( GetOffset( GetMembers( ) ) );
-		m_vecCounts[ GetOffset( iMember ) + i ] += Count;
-		m_vecTotal.resize( GetMembers( ) );
-		m_vecTotal[ iMember ] += Count;
+		SetMembers( max( GetMembers( ), iMember + 1 ) );
+		if( ( iBin = GetBin( Value ) ) != m_iZero ) {
+			m_vecCounts[ GetOffset( iMember ) + iBin ] += Count;
+			m_vecTotal[ iMember ] += Count; }
 // unlock
 
 		return true; }
 
 	tCount Get( size_t iMember, size_t iBin ) const {
 
-		if( ( iMember >= GetMembers( ) ) || ( iBin >= GetEdges( ) ) )
+		if( iBin >= GetEdges( ) )
 			return 0;
+		if( iBin == m_iZero )
+			return ( GetTotal( ) - ( ( iMember < GetMembers( ) ) ? m_vecTotal[ iMember ] : 0 ) );
 
-		return ( ( iBin == m_iZero ) ? ( GetTotal( ) - m_vecTotal[ iMember ] ) :
-			m_vecCounts[ GetOffset( iMember ) + iBin ] ); }
+		return ( ( iMember < GetMembers( ) ) ? m_vecCounts[ GetOffset( iMember ) + iBin ] : 0 ); }
 
 	tCount Get( size_t iMember, tValue Value ) const {
 
 
 		return min( i, GetEdges( ) - 1 ); }
 
-/*
-	const tCount* Get( size_t iMember ) const {
-		const tCount*	pRet;
-
-		if( !GetEdges( ) || ( iMember >= GetMembers( ) ) )
-			return NULL;
-
-		pRet = &m_vecCounts[ GetOffset( iMember ) ];
-		*(tCount*)pRet = Get( iMember, (size_t)0 );
-		return pRet; }
-*/
-
 	size_t GetMembers( ) const {
 
-		return m_iMembers; }
+		return m_vecTotal.size( ); }
 
 	size_t GetEdges( ) const {
 
 
 		return m_vecEdges[ iBin ]; }
 
-/*
-	bool Add( const CCoalesceHistogramSet& Histograms ) {
-		size_t	i, j;
-
-		if( ( Histograms.GetMembers( ) != GetMembers( ) ) ||
-			( Histograms.GetEdges( ).size( ) != GetEdges( ).size( ) ) )
-			return false;
-
-		m_Total += Histograms.GetTotal( );
-		for( i = 0; i < GetMembers( ); ++i )
-			for( j = 1; j < GetEdges( ).size( ); ++j )
-				if( !Add( i, GetEdges( )[ j ], Histograms.Get( i, j ) ) )
-					return false;
-
-		return true; }
-*/
-
 	std::string Save( size_t iMember ) const {
 		std::ostringstream	ossm;
 		size_t				i;
 		return m_vecEdges; }
 
 protected:
+	void SetMembers( size_t iMembers ) {
+
+		m_vecTotal.resize( iMembers );
+		m_vecCounts.resize( GetOffset( GetMembers( ) ) ); }
+
 	size_t GetOffset( size_t iMember ) const {
 
 		return ( iMember * GetEdges( ) ); }
 		dAve = (double)Ave / GetTotal( );
 		dVar = ( (double)Var / GetTotal( ) ) - ( dAve * dAve ); }
 
-	size_t				m_iMembers;
 	size_t				m_iZero;
 	tCount				m_Total;
 	std::vector<tCount>	m_vecTotal;
 		pthread_mutex_unlock( &m_mutx ); }
 
 	void Grow( uint32_t iMotif, uint32_t iMotifs ) {
-		size_t	iType, iSubtype, iGene, iTarget;
+		size_t	iType, iSubtype, iGene, iTarget, iCapacity;
 		float*	adTotal;
 		float*	ad;
 		float*	adFrom;
 			if( iTarget > m_iMotifs )
 				m_iMotifs = iTarget;
 			return; }
+		iCapacity = m_iCapacity;
 		m_iCapacity = iTarget + c_iLookahead;
 		for( iType = 0; iType < GetTypes( ); ++iType )
 			for( iSubtype = 0; iSubtype < GetSubsequences( iType ); ++iSubtype ) {
 					continue;
 				ad = new float[ m_iGenes * m_iCapacity ];
 				for( iGene = 0,adFrom = adTotal,adTo = ad; iGene < m_iGenes;
-					++iGene,adFrom += m_iMotifs,adTo += m_iCapacity ) {
+					++iGene,adFrom += iCapacity,adTo += m_iCapacity ) {
 					memcpy( adTo, adFrom, m_iMotifs * sizeof(*adTo) );
 					memset( adTo + m_iMotifs, 0, ( m_iCapacity - m_iMotifs ) * sizeof(*adTo) ); }
 				delete[] adTotal;
 // One histogram per motif atom
 class CCoalesceGroupHistograms : public CCoalesceSequencer<CCoalesceHistogramSet<> > {
 public:
-	CCoalesceGroupHistograms( uint32_t iMotifs, size_t iBins, float dStep ) : m_iMotifs(iMotifs),
-		m_iBins(iBins), m_dStep(dStep) { }
+	CCoalesceGroupHistograms( size_t iBins, float dStep ) : m_iBins(iBins), m_dStep(dStep) { }
 
 	bool Add( const CCoalesceGeneScores&, size_t, bool, uint32_t = -1 );
 	void Save( std::ostream&, const CCoalesceMotifLibrary* ) const;
 	bool IsSimilar( const CCoalesceMotifLibrary*, const SMotifMatch&, const SMotifMatch&, float ) const;
 
-	uint32_t GetMotifs( ) const {
-
-		return m_iMotifs; }
-
 	void SetTotal( const CCoalesceGeneScores& GeneScores, const std::set<size_t>& setiGenes ) {
 		std::set<size_t>::const_iterator	iterGene;
 		size_t								i, iTypeUs, iTypeThem, iSubsequence;
 
 				if( !HistsCur.GetTotal( ) )
 					continue;
-				for( iMotif = 0; iMotif < GetMotifs( ); ++iMotif )
+				for( iMotif = 0; iMotif < HistsCur.GetMembers( ); ++iMotif )
 					for( iEdge = 0; iEdge < HistsCur.GetEdges( ); ++iEdge )
 						if( HistsCur.Get( iMotif, iEdge ) != HistsTotal.Get( iMotif, iEdge ) ) {
 							std::cerr << "INVALID" << '\t' << GetType( iType ) << '\t' << iSubsequence <<

File src/coalescemotifs.cpp

 #include "coalescemotifs.h"
 #include "coalescestructsi.h"
 #include "pst.h"
+#include "measure.h"
+#include "statistics.h"
 
 namespace Sleipnir {
 
 const char	CCoalesceMotifLibraryImpl::c_szBases[]			= "ACGT";
 const char	CCoalesceMotifLibraryImpl::c_szComplements[]	= "TGCA";
 
+void CCoalesceMotifLibraryImpl::SKnowns::Match( const vector<float>& vecdMotif,
+	map<string, float>& mapstrdKnown ) const {
+	static const size_t					c_iOverlap	= 5;
+	map<string, TVecPr>::const_iterator	iterKnown;
+	size_t								i, j, k, iMin, iMax, iTests;
+	float								dP, dR, dMin;
+	const float*						adMotif;
+	const float*						adPWM;
+	const float*						adRC;
+
+	for( iterKnown = m_mapstrvecKnown.begin( ); iterKnown != m_mapstrvecKnown.end( ); ++iterKnown ) {
+		dMin = FLT_MAX;
+		for( iTests = i = 0; i < iterKnown->second.size( ); ++i ) {
+			const vector<float>&	vecdPWM	= iterKnown->second[ i ].first;
+			const vector<float>&	vecdRC	= iterKnown->second[ i ].second;
+
+			iMin = strlen( c_szBases ) * c_iOverlap;
+			iMax = vecdMotif.size( ) + vecdPWM.size( ) - iMin;
+			if( iMax < iMin )
+				continue;
+			for( j = iMin; j <= iMax; j += strlen( c_szBases ) ) {
+				iTests += 2;
+				if( j < vecdMotif.size( ) ) {
+					adMotif = &vecdMotif[ vecdMotif.size( ) - j - 1 ];
+					adPWM = &vecdPWM.front( );
+					adRC = &vecdRC.front( );
+					k = min( vecdPWM.size( ), j ); }
+				else {
+					adMotif = &vecdMotif.front( );
+					k = j - vecdMotif.size( );
+					adPWM = &vecdPWM[ k ];
+					adRC = &vecdRC[ k ];
+					k = min( vecdMotif.size( ), vecdPWM.size( ) - k ); }
+				if( ( dR = (float)max( CMeasurePearson::Pearson( adMotif, k, adPWM, k,
+					IMeasure::EMapNone ), CMeasurePearson::Pearson( adMotif, k, adRC, k,
+					IMeasure::EMapNone ) ) ) < 0 )
+					continue;
+				if( ( dP = (float)CStatistics::PValuePearson( dR, k ) ) < dMin )
+					dMin = dP; } }
+		if( dMin != FLT_MAX )
+			mapstrdKnown[ iterKnown->first ] = dMin * iTests; } }
+
+bool CCoalesceMotifLibrary::Open( istream& istm, vector<SMotifMatch>& vecsMotifs,
+	CCoalesceMotifLibrary* pMotifs ) {
+	string	strBuffer;
+	size_t	i;
+
+	strBuffer.resize( CFile::GetBufferSize( ) );
+	while( CFile::IsNewline( istm.peek( ) ) )
+		istm.getline( &strBuffer[ 0 ], strBuffer.size( ) );
+	while( !istm.eof( ) && ( istm.peek( ) != c_cCluster ) ) {
+		SMotifMatch	sMotif;
+
+		if( pMotifs ) {
+			if( !sMotif.Open( istm, *pMotifs ) )
+				break;
+			vecsMotifs.push_back( sMotif );
+			if( isdigit( istm.peek( ) ) )
+				for( i = 0; i < 4; ++i )
+					istm.getline( &strBuffer[ 0 ], strBuffer.size( ) ); }
+		else
+			istm.getline( &strBuffer[ 0 ], strBuffer.size( ) );
+		while( CFile::IsNewline( istm.peek( ) ) )
+			istm.getline( &strBuffer[ 0 ], strBuffer.size( ) ); }
+
+	return true; }
+
 CCoalesceMotifLibraryImpl::~CCoalesceMotifLibraryImpl( ) {
 	size_t	i;
 
  * \returns
  * Length-normalized strength of motif match against the given sequence and offset.
  * 
- * Write detailed description for GetMatch here.
- * 
  * \remarks
  * iMotif must represent a valid motif for the current library, and iOffset must fall within strSequence,
  * although motifs extending from a valid iOffset past the end of the sequence will be handled appropriately.
  */
 float CCoalesceMotifLibrary::GetMatch( const std::string& strSequence, uint32_t iMotif, size_t iOffset,
 	SCoalesceModifierCache& sModifiers ) const {
-	size_t		i;
-	float		dRet;
+	size_t		i, iDepth;
+	float		d, dRet;
 	const CPST*	pPST;
+	string		strMotif, strRC;
+	EType		eType;
 
-// TODO: this could in theory be implemented
-	if( ( GetType( iMotif ) != ETypePST ) || !( pPST = GetPST( iMotif ) ) ) {
-		g_CatSleipnir.error( "CCoalesceMotifLibrary::GetMatch( %s, %d, %d ) attempted to match a non-PST motif",
-			strSequence.c_str( ), iMotif, iOffset );
-		return CMeta::GetNaN( ); }
+	switch( eType = GetType( iMotif ) ) {
+		case ETypePST:
+			if( !( pPST = GetPST( iMotif ) ) ) {
+				g_CatSleipnir.error( "CCoalesceMotifLibrary::GetMatch( %s, %d, %d ) could not find PST",
+					strSequence.c_str( ), iMotif, iOffset );
+				return CMeta::GetNaN( ); }
+			break;
 
+		case ETypeKMer:
+			strMotif = GetMotif( iMotif );
+			break;
+
+		case ETypeRC:
+			strMotif = GetRCOne( iMotif );
+			strRC = GetReverseComplement( strMotif );
+			break; }
+
+	iDepth = ( eType == ETypePST ) ? pPST->GetDepth( ) : GetK( );
 	sModifiers.InitializeWeight( 0, 0 );
 	dRet = 0;
-	for( i = 1; i < pPST->GetDepth( ); ++i ) {
+	for( i = 1; i < iDepth; ++i ) {
 		sModifiers.AddWeight( 0, iOffset, i - 1 );
-		dRet += pPST->GetMatch( strSequence, pPST->GetDepth( ) - i ) * sModifiers.GetWeight( i ) / i; }
+		if( eType == ETypePST )
+			dRet += pPST->GetMatch( strSequence, iDepth - i ) * sModifiers.GetWeight( i ) / i; }
 	sModifiers.AddWeight( 0, iOffset, i - 1 );
 	for( i = 0; i < strSequence.length( ); ++i ) {
-		dRet += pPST->GetMatch( strSequence.substr( i ) ) * sModifiers.GetWeight( pPST->GetDepth( ) ) /
-			pPST->GetDepth( );
-		sModifiers.AddWeight( pPST->GetDepth( ), iOffset, i ); }
-	if( dRet < 0 ) {
+		switch( eType ) {
+			case ETypePST:
+				d = pPST->GetMatch( strSequence.substr( i ) ) * sModifiers.GetWeight( iDepth ) / iDepth;
+				break;
+
+			case ETypeKMer:
+			case ETypeRC:
+				d = ( strSequence.compare( i, iDepth, strMotif ) &&
+					( strRC.empty( ) || strSequence.compare( i, iDepth, strRC ) ) ) ? 0 :
+					sModifiers.GetWeight( iDepth );
+				break; }
+		dRet += d;
+		sModifiers.AddWeight( iDepth, iOffset, i ); }
+	if( CMeta::IsNaN( dRet ) || ( dRet < 0 ) ) {
 		g_CatSleipnir.error( "CCoalesceMotifLibrary::GetMatch( %s, %d, %d ) found negative score: %g",
 			strSequence.c_str( ), iMotif, iOffset, dRet );
 		return CMeta::GetNaN( ); }
 
 	pPST = CreatePST( iRet );
 	pPST->Add( asCrosses[ iMin ].m_strOne, asCrosses[ iMin ].m_strTwo, asCrosses[ iMin ].m_iOffset );
-	pPST->Add( asCrosses[ ( iMin + 2 ) % ARRAYSIZE(asCrosses) ].m_strOne );
-	pPST->Add( asCrosses[ ( iMin + 1 ) % ARRAYSIZE(asCrosses) ].m_strTwo );
+	{
+		CPST	PST( strlen( c_szBases ) );
+
+		PST.Add( asCrosses[ ( iMin + 1 ) % ARRAYSIZE(asCrosses) ].m_strTwo,
+			asCrosses[ ( iMin + 2 ) % ARRAYSIZE(asCrosses) ].m_strOne, asCrosses[ iMin ].m_iOffset );
+		pPST->Add( PST );
+	}
 	if( g_CatSleipnir.isInfoEnabled( ) )
 		g_CatSleipnir.info( "CCoalesceMotifLibraryImpl::MergeRCs( %s, %s, %g ) merged at %g to %s",
 			GetMotif( iOne ).c_str( ), GetMotif( iTwo ).c_str( ), dCutoff, dMin, pPST->GetMotif( ).c_str( ) );
 string CCoalesceMotifLibrary::GetPWM( uint32_t iMotif, float dCutoffPWMs, float dPenaltyGap,
 	float dPenaltyMismatch, bool fNoRCs ) const {
 	CFullMatrix<uint16_t>	MatPWM;
-	string					strMotif;
 	size_t					i, j;
 	ostringstream			ossm;
-	float					d;
+
+	if( !CCoalesceMotifLibraryImpl::GetPWM( iMotif, dCutoffPWMs, dPenaltyGap, dPenaltyMismatch, fNoRCs,
+		MatPWM ) )
+		return "";
+	for( i = 0; i < MatPWM.GetRows( ); ++i ) {
+		for( j = 0; j < MatPWM.GetColumns( ); ++j )
+			ossm << ( j ? "\t" : "" ) << MatPWM.Get( i, j );
+		ossm << endl; }
+
+	return ossm.str( ); }
+
+bool CCoalesceMotifLibraryImpl::GetPWM( uint32_t iMotif, float dCutoffPWMs, float dPenaltyGap,
+	float dPenaltyMismatch, bool fNoRCs, CFullMatrix<uint16_t>& MatPWM ) const {
+	string	strMotif;
+	float	d;
 
 	if( fNoRCs )
 		iMotif = ((CCoalesceMotifLibrary*)this)->RemoveRCs( iMotif, dPenaltyGap, dPenaltyMismatch );
 	switch( GetType( iMotif ) ) {
 		case ETypeKMer:
 			if( !CCoalesceMotifLibraryImpl::GetPWM( GetMotif( iMotif ), MatPWM ) )
-				return "";
+				return false;
 			break;
 
 		case ETypeRC:
 			if( !( CCoalesceMotifLibraryImpl::GetPWM( strMotif = GetRCOne( iMotif ), MatPWM ) &&
 				CCoalesceMotifLibraryImpl::GetPWM( GetReverseComplement( strMotif ), MatPWM ) ) )
-				return "";
+				return false;
 			break;
 
 		case ETypePST:
 			if( g_CatSleipnir.isInfoEnabled( ) ) {
 				ostringstream	ossm;
 
-				ossm << "CCoalesceMotifLibrary::GetPWM( " << iMotif << ", " << dCutoffPWMs << ", " <<
+				ossm << "CCoalesceMotifLibraryImpl::GetPWM( " << iMotif << ", " << dCutoffPWMs << ", " <<
 					fNoRCs << " ) rejected (" << d << "):" << endl;
 				MatPWM.Save( ossm, false );
 				g_CatSleipnir.info( ossm.str( ) ); }
-			return ""; }
+			return false; }
 		if( g_CatSleipnir.isDebugEnabled( ) ) {
 			ostringstream	ossm;
 
-			ossm << "CCoalesceMotifLibrary::GetPWM( " << iMotif << ", " << dCutoffPWMs << ", " <<
+			ossm << "CCoalesceMotifLibraryImpl::GetPWM( " << iMotif << ", " << dCutoffPWMs << ", " <<
 				fNoRCs << " ) got information (" << d << "):" << endl;
 			MatPWM.Save( ossm, false );
 			g_CatSleipnir.debug( ossm.str( ) ); } }
-	for( i = 0; i < MatPWM.GetRows( ); ++i ) {
-		for( j = 0; j < MatPWM.GetColumns( ); ++j )
-			ossm << ( j ? "\t" : "" ) << MatPWM.Get( i, j );
-		ossm << endl; }
 
-	return ossm.str( ); }
+	return true; }
 
 bool CCoalesceMotifLibrary::Simplify( uint32_t iMotif ) const {
 
 	return ( ( GetType( iMotif ) == ETypePST ) ? CCoalesceMotifLibraryImpl::GetPST( iMotif )->Simplify( ) :
 		false ); }
 
+bool CCoalesceMotifLibrary::OpenKnown( istream& istm ) {
+	char*			szBuffer;
+	vector<string>	vecstrLine;
+
+	szBuffer = new char[ CFile::GetBufferSize( ) ];
+	while( !istm.eof( ) ) {
+		istm.getline( szBuffer, CFile::GetBufferSize( ) );
+		if( !szBuffer[ 0 ] )
+			continue;
+		vecstrLine.clear( );
+		CMeta::Tokenize( szBuffer, vecstrLine );
+		if( vecstrLine.empty( ) )
+			continue;
+		if( ( vecstrLine.size( ) - 1 ) % 4 ) {
+			g_CatSleipnir.warn( "CCoalesceMotifLibrary::OpenKnown( ) invalid line: %s" );
+			continue; }
+
+		m_sKnowns.Add( vecstrLine[ 0 ], vecstrLine ); }
+	delete[] szBuffer;
+
+	return true; }
+
+bool CCoalesceMotifLibrary::GetKnown( uint32_t iMotif, float dPenaltyGap, float dPenaltyMismatch,
+	vector<pair<string, float> >& vecprstrdKnown, float dPValue ) const {
+	size_t							i, j, k, iSum;
+	vector<float>					vecdMotif;
+	CFullMatrix<uint16_t>			MatPWM;
+	map<string, float>				mapstrdKnown;
+	map<string, float>::iterator	iterKnown;
+
+	if( !m_sKnowns.GetSize( ) )
+		return true;
+	dPValue /= m_sKnowns.GetSize( );
+	if( !CCoalesceMotifLibraryImpl::GetPWM( iMotif, 0, dPenaltyGap, dPenaltyMismatch, true, MatPWM ) )
+		return false;
+	vecdMotif.resize( MatPWM.GetRows( ) * MatPWM.GetColumns( ) );
+	for( k = i = 0; i < MatPWM.GetColumns( ); ++i ) {
+		for( iSum = j = 0; j < MatPWM.GetRows( ); ++j )
+			iSum += MatPWM.Get( j, i );
+		if( !iSum )
+			iSum = 1;
+		for( j = 0; j < MatPWM.GetRows( ); ++j )
+			vecdMotif[ k++ ] = (float)MatPWM.Get( j, i ) / iSum; }
+
+	m_sKnowns.Match( vecdMotif, mapstrdKnown );
+	for( iterKnown = mapstrdKnown.begin( ); iterKnown != mapstrdKnown.end( ); ++iterKnown )
+		if( iterKnown->second < dPValue )
+			vecprstrdKnown.push_back( *iterKnown );
+
+	return true; }
 }

File src/coalescemotifs.h

 namespace Sleipnir {
 
 struct SCoalesceModifierCache;
+struct SMotifMatch;
 
 /*!
  * \brief
  */
 class CCoalesceMotifLibrary : CCoalesceMotifLibraryImpl {
 public:
+	static bool Open( std::istream& istm, std::vector<SMotifMatch>& vecsMotifs,
+		CCoalesceMotifLibrary* pMotifs = NULL );
+
 	static std::string GetReverseComplement( const std::string& strKMer ) {
 
 		return CCoalesceMotifLibraryImpl::GetReverseComplement( strKMer ); }
 	float GetMatch( const std::string& strSequence, uint32_t iMotif, size_t iOffset,
 		SCoalesceModifierCache& sModifiers ) const;
 	uint32_t Open( const std::string& strMotif );
+	bool OpenKnown( std::istream& istm );
 	std::string GetPWM( uint32_t iMotif, float dCutoffPWMs, float dPenaltyGap, float dPenaltyMismatch,
 		bool fNoRCs ) const;
 	bool Simplify( uint32_t iMotif ) const;
+	bool GetKnown( uint32_t iMotif, float dPenaltyGap, float dPenaltyMismatch,
+		std::vector<std::pair<std::string, float> >& vecprstrdKnown, float dPValue = 1 ) const;
+
+	size_t GetKnowns( ) const {
+
+		return m_sKnowns.GetSize( ); }
 
 	/*!
 	 * \brief
 
 		return iMotif; }
 
-	/*!
-	 * \brief
-	 * Returns a motif ID representing the "merger" of the input motif (which can be of any type) with an
-	 * empty PST.
-	 * 
-	 * \param iMotif
-	 * Motif to be "merged" into a PST.
-	 * 
-	 * \returns
-	 * -1 if the motif cannot be merged; the ID of the new PST otherwise.
-	 */
-	uint32_t Merge( uint32_t iMotif ) {
-		uint32_t	iRet;
-		CPST*		pPST;
-
-		if( !( pPST = CreatePST( iRet ) ) )
-			return -1;
-		switch( GetType( iMotif ) ) {
-			case ETypeRC:
-				return MergeRCPST( iMotif, *pPST, FLT_MAX, true );
-
-			case ETypePST:
-				return MergePSTs( *GetPST( iMotif ), *pPST, FLT_MAX, true ); }
-
-		return MergeKMerPST( GetMotif( iMotif ), *pPST, FLT_MAX, true ); }
-
 	float Align( uint32_t iOne, uint32_t iTwo, float dCutoff ) {
 
 		switch( GetType( iOne ) ) {

File src/coalescemotifsi.h

 #ifndef COALESCEMOTIFSI_H
 #define COALESCEMOTIFSI_H
 
+#include <string.h>
+#include <map>
+
 #include "fullmatrix.h"
 
 namespace Sleipnir {
 	static const char	c_szComplements[];
 	static const size_t	c_iShift		= 2; // ceil( log2( strlen( c_szBases ) ) )
 	static const char	c_cSeparator	= '|';
+	static const char	c_cCluster		= 'C';
 
 	typedef std::map<std::pair<uint32_t, uint32_t>, uint32_t>	TMapPrIII;
 
 		ETypePST
 	};
 
+	struct SKnowns {
+		typedef std::vector<float>			TVecD;
+		typedef std::pair<TVecD, TVecD>		TPrVecDVecD;
+		typedef std::vector<TPrVecDVecD>	TVecPr;
+
+		void Match( const std::vector<float>&, std::map<std::string, float>& ) const;
+
+		size_t GetSize( ) const {
+
+			return m_mapstrvecKnown.size( ); }
+
+		void Add( const std::string& strID, const std::vector<std::string>& vecstrLine ) {
+			size_t	i, iFromPos, iFromBase, iToPos, iToBase;
+			TVecPr&	vecprMotif	= m_mapstrvecKnown[ strID ];
+
+			vecprMotif.push_back( TPrVecDVecD( ) );
+			{
+				TPrVecDVecD&	prvecdvecdMotif	= vecprMotif.back( );
+
+				prvecdvecdMotif.first.resize( vecstrLine.size( ) - 1 );
+				for( i = 1; i < vecstrLine.size( ); ++i )
+					prvecdvecdMotif.first[ i - 1 ] = (float)atof( vecstrLine[ i ].c_str( ) );
+
+				prvecdvecdMotif.second.resize( prvecdvecdMotif.first.size( ) );
+				i = prvecdvecdMotif.second.size( ) / 4;
+				for( iFromPos = 0; iFromPos < i; ++iFromPos ) {
+					iToPos = i - iFromPos - 1;
+					for( iFromBase = 0; iFromBase < 4; ++iFromBase ) {
+						iToBase = 3 - iFromBase;
+						prvecdvecdMotif.second[ ( 4 * iToPos ) + iToBase ] =
+							prvecdvecdMotif.first[ ( 4 * iFromPos ) + iFromBase ]; } }
+			} }
+
+		std::map<std::string, TVecPr>	m_mapstrvecKnown;
+	};
+
 	static size_t CountKMers( size_t iK ) {
 
 		return ( 1 << ( iK << 1 ) ); }
 	float AlignRCPST( uint32_t, const CPST&, float ) const;
 	float AlignPSTs( const CPST&, const CPST&, float ) const;
 	uint32_t RemoveRCs( const CPST&, float, float );
+	bool GetPWM( uint32_t, float, float, float, bool, CFullMatrix<uint16_t>& ) const;
 
 	EType GetType( uint32_t iMotif ) const {
 
 	std::vector<uint32_t>	m_veciRC2KMer;
 	std::vector<CPST*>		m_vecpPSTs;
 	TMapPrIII				m_mappriiiMerged;
+	SKnowns					m_sKnowns;
 };
 
 }

File src/coalescestructs.cpp

 bool SMotifMatch::Open( istream& istm, CCoalesceMotifLibrary& Motifs ) {
 	string			strLine;
 	vector<string>	vecstrLine;
+	size_t			i;
 
+	m_vecprstrdKnown.clear( );
 	strLine.resize( CFile::GetBufferSize( ) );
 	istm.getline( &strLine[ 0 ], strLine.size( ) - 1 );
 	CMeta::Tokenize( strLine.c_str( ), vecstrLine );
-	if( vecstrLine.size( ) != 3 ) {
+	if( vecstrLine.size( ) < 3 ) {
 		g_CatSleipnir.error( "SMotifMatch::Open( ) invalid line: %s", strLine.c_str( ) );
 		return false; }
 	m_strType = vecstrLine[ 0 ];
 		g_CatSleipnir.error( "SMotifMatch::Open( ) invalid subsequence: %s", vecstrLine[ 1 ].c_str( ) );
 		return false; }
 	m_dZ = (float)atof( vecstrLine[ 2 ].c_str( ) );
+
+	if( vecstrLine.size( ) > 3 ) {
+		vector<string>	vecstrKnowns;
+
+		CMeta::Tokenize( vecstrLine[ 3 ].c_str( ), vecstrKnowns, "|" );
+		for( i = 0; i < vecstrKnowns.size( ); ++i ) {
+			vector<string>	vecstrKnown;
+
+			CMeta::Tokenize( vecstrKnowns[ i ].c_str( ), vecstrKnown, ":" );
+			if( vecstrKnown.size( ) != 2 ) {
+				g_CatSleipnir.error( "SMotifMatch::Open( ) invalid known: %s", vecstrKnowns[ i ].c_str( ) );
+				return false; }
+			m_vecprstrdKnown.push_back( pair<string, float>( vecstrKnown[ 0 ],
+				(float)atof( vecstrKnown[ 1 ].c_str( ) ) ) ); } }
+
 	istm.getline( &strLine[ 0 ], strLine.size( ) - 1 );
 	if( ( m_iMotif = Motifs.Open( strLine.c_str( ) ) ) == -1 ) {
 		g_CatSleipnir.error( "SMotifMatch::Open( ) invalid motif: %s", strLine.c_str( ) );
 		m_strType = sMotif.m_strType;
 		m_dZ += sMotif.m_dZ;
 		iCount++;
-		return ( m_iMotif = Motifs.Merge( sMotif.m_iMotif ) ); }
+		return ( m_iMotif = sMotif.m_iMotif ); }
 
 	return ( ( ( ( iLeft = Open( Hier.Get( false ), vecsMotifs, Motifs, iCount ) ) == -1 ) ||
 		( ( iRight = Open( Hier.Get( true ), vecsMotifs, Motifs, iCount ) ) == -1 ) ) ? -1 :
 	float dPenaltyGap, float dPenaltyMismatch, bool fNoRCs ) const {
 	ostringstream	ossm;
 	string			strPWM;
+	size_t			i;
 
-	ossm << m_strType << '\t' << CCoalesceSequencerBase::GetSubsequence( m_eSubsequence ) << '\t' << m_dZ <<
-		endl;
+	ossm << m_strType << '\t' << CCoalesceSequencerBase::GetSubsequence( m_eSubsequence ) << '\t' << m_dZ;
+	for( i = 0; i < m_vecprstrdKnown.size( ); ++i ) {
+		const pair<string, float>&	prstrdKnown	= m_vecprstrdKnown[ i ];
+
+		ossm << ( i ? "|" : "\t" ) << prstrdKnown.first << ':' << prstrdKnown.second; }
+	ossm << endl;
 	if( pMotifs ) {
 		ossm << pMotifs->GetMotif( m_iMotif );
 		if( fPWM ) {
 
 	return ossm.str( ); }
 
+bool SMotifMatch::Label( const CCoalesceMotifLibrary& Motifs, float dPenaltyGap, float dPenaltyMismatch,
+	float dPValue ) {
+
+	m_vecprstrdKnown.clear( );
+	return Motifs.GetKnown( m_iMotif, dPenaltyGap, dPenaltyMismatch, m_vecprstrdKnown, dPValue ); }
+
 }

File src/coalescestructsi.h

 	uint32_t Open( const SMotifMatch&, const SMotifMatch&, CCoalesceMotifLibrary& );
 	std::string Save( const CCoalesceMotifLibrary*, bool = false, float = 0, float = 0, float = 0,
 		bool = false ) const;
+	bool Label( const CCoalesceMotifLibrary&, float, float, float );
 
 	bool operator==( const SMotifMatch& sMotif ) const {
 
 
 		return ( iMotif ^ iType ^ iSubsequence ); }
 
-	uint32_t								m_iMotif;
-	std::string								m_strType;
-	CCoalesceSequencerBase::ESubsequence	m_eSubsequence;
-	float									m_dZ;
-	float									m_dAverage;
+	uint32_t									m_iMotif;
+	std::string									m_strType;
+	CCoalesceSequencerBase::ESubsequence		m_eSubsequence;
+	float										m_dZ;
+	float										m_dAverage;
+	std::vector<std::pair<std::string, float> >	m_vecprstrdKnown;
 };
 
 struct SCoalesceDataset {

File src/fasta.cpp

 
 	return true; }
 
-bool CFASTAImpl::Get( size_t iGene, std::vector<SFASTASequence>& vecsSequences, size_t iOffset,
-	const std::string& strSequence, SFASTASequence& sSequence ) const {
+bool CFASTAImpl::Get( size_t iGene, vector<SFASTASequence>& vecsSequences, size_t iOffset,
+	const string& strSequence, SFASTASequence& sSequence ) const {
 	size_t	iBegin, iEnd;
 
 	if( strSequence.empty( ) ) {
 
 		return c_iBufferSize; }
 
+	static bool IsNewline( char c ) {
+
+		return ( ( c == '\n' ) || ( c == '\r' ) ); }
+
 	/*!
 	 * \brief
 	 * Return the next tab-delimited token from the given string.
 		istm.read( (char*)&iLength, sizeof(iLength) );
 		str.resize( iLength );
 		istm.read( &str[ 0 ], iLength ); }
-
-	static bool IsNewline( char c ) {
-
-		return ( ( c == '\n' ) || ( c == '\r' ) ); }
 };
 
 }

File src/halfmatrix.h

 #ifndef HALFMATRIX_H
 #define HALFMATRIX_H
 
+#include <string.h>
+
 #include "halfmatrixi.h"
 
 namespace Sleipnir {
 			m_vecstrFeatures.push_back( strToken );
 		else
 			m_vecstrExperiments.push_back( strToken );
-	if( m_vecstrExperiments.empty( ) )
-		g_CatSleipnir.error( "CPCLImpl::OpenExperiments( %d ) found no experiments", iFeatures );
 
-	return !m_vecstrExperiments.empty( ); }
+	return true; }
 
 bool CPCLImpl::OpenGene( std::istream& istmInput, std::vector<float>& vecdData, char* acLine, size_t iLine ) {
 	const char*	pc;
 
 		return m_vecstrExperiments[ iExperiment ]; }
 
+	size_t GetExperiment( const std::string& strExperiment ) const {
+		size_t	i;
+
+		for( i = 0; i < m_vecstrExperiments.size( ); ++i )
+			if( m_vecstrExperiments[ i ] == strExperiment )
+				return i;
+
+		return -1; }
+
 	/*!
 	 * \brief
 	 * Set the experiment label at the given PCL column.

File src/stdafx.h

 
 #include <assert.h>
 #include <float.h>
+#include <limits.h>
 #include <math.h>
 #include <pthread.h>
-//#include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
 #ifdef _MSC_VER
 #include <fcntl.h>
 #include <io.h>

File tools/BNServer/BNServer.cpp

 	delete this; }
 
 bool CBNServer::ProcessMessage( const vector<unsigned char>& vecbMessage ) {
-	size_t	iProcessed, iOffset;
+	size_t	i, iProcessed, iOffset;
 
 	for( iOffset = 0; iOffset < vecbMessage.size( ); iOffset += ( iProcessed + 1 ) ) {
+		cerr << "LOG	" << time( NULL ) << '\t' << m_strConnection << '\t' << hex;
+		for( i = 0; i < vecbMessage.size( ); ++i )
+			cerr << setfill( '0' ) << setw( 2 ) << (unsigned int)vecbMessage[ i ];
+		cerr << dec << endl;
 		if( vecbMessage[ iOffset ] >= ARRAYSIZE(c_apfnProcessors) ) {
 			cerr << m_strConnection << " unknown opcode: " << (int)vecbMessage[ iOffset ] << endl;
 			return false; }
 
 		for( j = 0; j < Genes.GetGenes( ); ++j )
 			if( pOnto->IsAnnotated( vecsTerms[ i ].m_iID, Genes.GetGene( j ) ) && ( veciMapping[ j ] != -1 ) )
-				veciGenes.push_back( veciMapping[ j ] );
+				veciGenes.push_back( (uint32_t)veciMapping[ j ] );
 		iSize = (uint32_t)veciGenes.size( );
 		send( m_iSocket, (const char*)&iSize, sizeof(iSize), 0 );
 		for( j = 0; j < veciGenes.size( ); ++j ) {

File tools/BNServer/stdafx.h

 #define __STDC_LIMIT_MACROS
 
 #include <fstream>
+#include <iomanip>
 #include <queue>
 #include <sstream>
 using namespace std;

File tools/COALESCE/COALESCE.cpp

 };
 
 int main_postprocess( const gengetopt_args_info&, CCoalesceMotifLibrary& );
+int main_test( const gengetopt_args_info&, const CPCL&, const CFASTA&, CCoalesceMotifLibrary& );
+int main_test2( const gengetopt_args_info&, const CPCL&, const CFASTA&, CCoalesceMotifLibrary& );
 bool recluster( const gengetopt_args_info&, size_t, CCoalesceMotifLibrary&, const CHierarchy&,
 	const vector<CCoalesceCluster>&, const vector<string>&, const CPCL&, size_t& );
 bool trim( const gengetopt_args_info&, const CPCL&, vector<CCoalesceCluster>& );
 		cerr << "Could not open: " << sArgs.fasta_arg << endl;
 		return 1; }
 
+/*
+size_t					iGene	= 4;
+uint32_t				iOne;
+float					d;
+vector<SFASTASequence>	vecsSequences;
+SCoalesceModifiers		sMods;
+SCoalesceModifierCache	sCache( sMods );
+vector<float>			vecdScores;
+vector<size_t>			veciLengths;
+CCoalesceGeneScores		GeneScores;
+float*					ad;
+
+GeneScores.SetGenes( PCL.GetGenes( ) );
+iOne = Motifs.Open( "C:1 G:3 A:3 T:3 G:3 A:3 G:3 (A:1)|(C:1)" );
+FASTA.Get( FASTA.GetGene( PCL.GetGene( iGene ) ), vecsSequences );
+for( i = 0; i < vecsSequences.size( ); ++i )
+	if( vecsSequences[ i ].m_strType == "5" ) {
+		d = Motifs.GetMatch( vecsSequences[ i ].m_vecstrSequences[ 0 ], iOne, 0, sCache );
+		cerr << ( d / vecsSequences[ i ].m_vecstrSequences[ 0 ].length( ) ) << endl;
+
+		GeneScores.Add( iGene, Motifs, vecsSequences[ i ], sCache, iOne, vecdScores, veciLengths );
+		ad = GeneScores.Get( GeneScores.GetType( vecsSequences[ i ].m_strType ),
+			CCoalesceSequencerBase::ESubsequenceTotal, iGene );
+		cerr << ad[ iOne ] << endl;
+		return 0; }
+//*/
+//return main_test2( sArgs, PCL, FASTA, Motifs );
+
 	if( sArgs.datasets_arg ) {
 		static const size_t	c_iBuffer	= 131072;
 		ifstream			ifsm;
 	CHierarchy*					pHier;
 	vector<string>				vecstrClusters;
 	CPCL						PCL;
+	ifstream					ifsm;
 
 	if( sArgs.input_arg ) {
 		if( !PCL.Open( sArgs.input_arg, sArgs.skip_arg ) ) {
 	else if( !PCL.Open( cin, sArgs.skip_arg ) ) {
 		cerr << "Could not open: stdin" << endl;
 		return 1; }
+	if( sArgs.known_motifs_arg ) {
+		ifsm.open( sArgs.known_motifs_arg );
+		if( !Motifs.OpenKnown( ifsm ) ) {
+			cerr << "Could not open: " << sArgs.known_motifs_arg << endl;
+			return 1; }
+		ifsm.close( ); }
 	fFailed = false;
 	strDir = sArgs.postprocess_arg;
 	FOR_EACH_DIRECTORY_FILE(strDir, strFile)
 		vecstrClusters.push_back( strBase ); }
 	if( fFailed )
 		vecClustersFrom.pop_back( );
+	if( vecClustersFrom.empty( ) ) {
+		char	szBuffer[ 16 ];
+
+		fFailed = false;
+		ifsm.clear( );
+		ifsm.open( sArgs.postprocess_arg );
+		while( !ifsm.eof( ) ) {
+			if( !fFailed )
+				vecClustersFrom.push_back( CCoalesceCluster( ) );
+			if( fFailed = ( ( iDatasets = vecClustersFrom.back( ).Open( ifsm, PCL, &Motifs ) ) == -1 ) )
+				continue;
+			if( !( vecstrClusters.size( ) % 50 ) )
+				cerr << "Opened cluster " << vecstrClusters.size( ) << "..." << endl;
+			sprintf_s( szBuffer, "c%06d", vecstrClusters.size( ) );
+			vecstrClusters.push_back( szBuffer ); }
+		if( fFailed )
+			vecClustersFrom.pop_back( ); }
 
 	cerr << "Trimming clusters..." << endl;
 	if( !trim( sArgs, PCL, vecClustersFrom ) )
 			return true; }
 
 		Cluster.RemoveMotifs( Motifs, (float)sArgs.min_zscore_arg );
-/*
-		if( sArgs.known_motifs_arg ) {
-			if( !Motifs.OpenKnown( sArgs.known_motifs_arg ) ) {
-				cerr << "Could not open: " << sArgs.known_motifs_arg << endl;
-				return 1; }
-			Cluster.LabelMotifs( Motifs, sArgs.known_cutoff_arg ); }
-*/
+		if( !Cluster.LabelMotifs( Motifs, (float)sArgs.penalty_gap_arg, (float)sArgs.penalty_mismatch_arg,
+			(float)sArgs.known_cutoff_arg ) )
+			return false;
 		if( sArgs.output_arg )
 			Cluster.Save( sArgs.output_arg, iID, PCL, &Motifs );
 		Cluster.Save( cout, iID, PCL, &Motifs, (float)sArgs.min_info_arg, (float)sArgs.penalty_gap_arg,
 		Cluster.RemoveGenes( veciRemove ); }
 
 	return true; }
+
+struct STestFASTA {
+	string			m_strSequence;
+	const CFASTA*	m_pFASTA;
+	size_t			m_iGene;
+};
+
+void* ThreadTestFASTA( void* pData ) {
+	STestFASTA*				psData;
+	vector<SFASTASequence>	vecsSequences;
+
+	psData = (STestFASTA*)pData;
+	psData->m_pFASTA->Get( psData->m_iGene, vecsSequences );
+	if( !( vecsSequences.empty( ) || vecsSequences[ 0 ].m_vecstrSequences.empty( ) ) )
+		psData->m_strSequence = vecsSequences[ 0 ].m_vecstrSequences[ 0 ];
+
+	return NULL; }
+
+struct SThreadCombineMotif {
+	size_t							m_iOffset;
+	size_t							m_iStep;
+	const std::vector<size_t>*		m_pveciPCL2FASTA;
+	CCoalesceGeneScores*			m_pGeneScores;
+	const CCoalesceMotifLibrary*	m_pMotifs;
+	uint32_t						m_iMotif;
+	const CFASTA*					m_pFASTA;
+	const SCoalesceModifiers*		m_psModifiers;
+};
+
+void* ThreadCombineMotif( void* pData ) {
+	SThreadCombineMotif*	psData;
+	size_t					i, j;
+	vector<float>			vecdScores;
+	vector<size_t>			veciLengths;
+
+	psData = (SThreadCombineMotif*)pData;
+	SCoalesceModifierCache	sModifiers( *psData->m_psModifiers );
+
+	for( i = psData->m_iOffset; i < psData->m_pveciPCL2FASTA->size( ); i += psData->m_iStep )
+		if( (*psData->m_pveciPCL2FASTA)[ i ] != -1 ) {
+			vector<SFASTASequence>	vecsSequences;
+
+			if( psData->m_pFASTA->Get( (*psData->m_pveciPCL2FASTA)[ i ], vecsSequences ) ) {
+				sModifiers.Get( i );
+				for( j = 0; j < vecsSequences.size( ); ++j )
+					if( !psData->m_pGeneScores->Add( i, *psData->m_pMotifs, vecsSequences[ j ],
+						sModifiers, psData->m_iMotif, vecdScores, veciLengths ) )
+						break; } }
+
+	return NULL; }
+
+int main_test( const gengetopt_args_info& sArgs, const CPCL& PCL, const CFASTA& FASTA, CCoalesceMotifLibrary& Motifs ) {
+	CCoalesceGeneScores			GeneScores;
+	vector<size_t>				veciPCL2FASTA;
+	SCoalesceModifiers			sMods;
+	vector<vector<float> >		vecvecdCounts;
+	vector<size_t>				veciLengths;
+	SCoalesceModifierCache		sModifiers( sMods );
+	size_t						i, j, k;
+	uint32_t					iMotifOne, iMotifTwo;
+	vector<uint32_t>			veciMotifs;
+	vector<float>				vecdScores;
+	float						d;
+	CCoalesceCluster			Cluster, Pot;
+	CCoalesceGroupHistograms	HistsCluster( 12, 1.0f / sArgs.bases_arg );
+	CCoalesceGroupHistograms	HistsPot( 12, 1.0f / sArgs.bases_arg );
+	CCoalesceSequencerBase::ESubsequence	eSubsequence;
+	unsigned short				s;
+	vector<pthread_t>			vecpthdThreads;
+	vector<SThreadCombineMotif>	vecsThreads;
+
+/*
+	vector<STestFASTA>			vecsThreads;
+
+	vecpthdThreads.resize( sArgs.threads_arg );
+	vecsThreads.resize( vecpthdThreads.size( ) );
+	for( i = 0; i < 100; ++i ) {
+		iFASTA = rand( ) % FASTA.GetGenes( );
+		cerr << i << '\t' << FASTA.GetGene( iFASTA ) << endl;
+		for( j = 0; j < vecpthdThreads.size( ); ++j ) {
+			vecsThreads[ j ].m_iGene = iFASTA;
+			vecsThreads[ j ].m_pFASTA = &FASTA;
+			vecsThreads[ j ].m_strSequence = "";
+			pthread_create( &vecpthdThreads[ j ], NULL, ThreadTestFASTA, &vecsThreads[ j ] ); }
+		for( j = 0; j < vecpthdThreads.size( ); ++j ) {
+			pthread_join( vecpthdThreads[ j ], NULL );
+			if( j && ( vecsThreads[ j ].m_strSequence != vecsThreads[ 0 ].m_strSequence ) )
+				cerr << "Thread " << j << " failed:" << endl << vecsThreads[ 0 ].m_strSequence << endl <<
+					vecsThreads[ j ].m_strSequence << endl; }
+		cerr << vecsThreads[ 0 ].m_strSequence << endl; }
+	return 0;
+//*/
+
+	veciPCL2FASTA.resize( PCL.GetGenes( ) );
+	for( i = 0; i < veciPCL2FASTA.size( ); ++i )
+		veciPCL2FASTA[ i ] = FASTA.GetGene( PCL.GetGene( i ) );
+	sMods.Initialize( PCL );
+
+	GeneScores.SetGenes( PCL.GetGenes( ) );
+	for( i = 0; i < veciPCL2FASTA.size( ); ++i )
+		if( veciPCL2FASTA[ i ] != -1 ) {
+			vector<SFASTASequence>	vecsSequences;
+
+			if( FASTA.Get( veciPCL2FASTA[ i ], vecsSequences ) ) {
+				sModifiers.Get( i );
+				for( j = 0; j < vecsSequences.size( ); ++j )
+					if( !GeneScores.Add( i, Motifs, vecsSequences[ j ], sModifiers, vecvecdCounts,
+						veciLengths ) )
+						return 1; } }
+
+	Pot.SetGenes( PCL.GetGenes( ) );
+	Pot.CalculateHistograms( GeneScores, HistsPot, NULL );
+	for( i = 0; i < PCL.GetGenes( ); ++i )
+		if( ( (float)rand( ) / RAND_MAX ) < ( 1.0 / 100 ) )
+			Cluster.Add( i, Pot );
+
+	veciMotifs.resize( 20 );
+	vecpthdThreads.resize( sArgs.threads_arg );
+	vecsThreads.resize( vecpthdThreads.size( ) );
+	for( i = 0; i < 100; ++i ) {
+		Cluster.CalculateHistograms( GeneScores, HistsCluster, &HistsPot );
+		Cluster.Snapshot( GeneScores, HistsCluster );
+		Pot.Snapshot( GeneScores, HistsPot );
+		cerr << "Cluster	" << Cluster.GetGenes( ).size( ) << endl << "Pot	" <<
+			Pot.GetGenes( ).size( ) << endl;
+
+		for( j = 0; j < veciMotifs.size( ); ++j ) {
+			iMotifOne = rand( ) % Motifs.GetMotifs( );
+			iMotifTwo = rand( ) % Motifs.GetMotifs( );
+			veciMotifs[ j ] = Motifs.Merge( iMotifOne, iMotifTwo, FLT_MAX, false );
+			cerr << "Merged:" << endl << Motifs.GetMotif( iMotifOne ) << endl <<
+				Motifs.GetMotif( iMotifTwo ) << endl << Motifs.GetMotif( veciMotifs[ j ] ) << endl;
+
+			for( k = 0; k < vecsThreads.size( ); ++k ) {
+				vecsThreads[ k ].m_iOffset = k;
+				vecsThreads[ k ].m_iStep = vecsThreads.size( );
+				vecsThreads[ k ].m_pveciPCL2FASTA = &veciPCL2FASTA;
+				vecsThreads[ k ].m_pGeneScores = &GeneScores;
+				vecsThreads[ k ].m_pMotifs = &Motifs;
+				vecsThreads[ k ].m_iMotif = veciMotifs[ j ];
+				vecsThreads[ k ].m_pFASTA = &FASTA;
+				vecsThreads[ k ].m_psModifiers = &sMods;
+				if( pthread_create( &vecpthdThreads[ k ], NULL, ThreadCombineMotif, &vecsThreads[ k ] ) ) {
+					cerr << "CCoalesceImpl::CombineMotifs( " << sArgs.threads_arg <<
+						" ) could not combine motif: " << Motifs.GetMotif( veciMotifs[ j ] ).c_str( ) << endl;
+					return 1; } }
+			for( k = 0; k < vecpthdThreads.size( ); ++k )
+				pthread_join( vecpthdThreads[ k ], NULL );
+			for( k = 0; k < PCL.GetGenes( ); ++k )
+				if( veciPCL2FASTA[ k ] != -1 ) {
+					if( Cluster.IsGene( k ) )
+						HistsCluster.Add( GeneScores, k, false, veciMotifs[ j ] );
+					else
+						HistsPot.Add( GeneScores, k, false, veciMotifs[ j ] ); } }
+/*
+		for( j = 0; j < veciPCL2FASTA.size( ); ++j )
+			if( ( iFASTA = veciPCL2FASTA[ j ] ) != -1 ) {
+				vector<SFASTASequence>	vecsSequences;
+				float					dFive;
+
+				if( FASTA.Get( iFASTA, vecsSequences ) ) {
+					sModifiers.Get( j );
+					dFive = 0;
+					for( k = 0; k < vecsSequences.size( ); ++k ) {
+						if( !GeneScores.Add( j, Motifs, vecsSequences[ k ],
+							sModifiers, iMotifThree, vecdScores, veciLengths ) ) {
+							cerr << "FAILURE!" << endl;
+							return 1; }
+						if( vecsSequences[ k ].m_strType == "5" )
+							dFive = Motifs.GetMatch( vecsSequences[ k ].m_vecstrSequences[ 0 ],
+								iMotifThree, 0, sModifiers ) / vecsSequences[ k ].m_vecstrSequences[ 0 ].length( ); }
+
+					cerr << "Gene	" << j << '\t' << PCL.GetGene( j ) << endl;
+					if( dFive != ( d = GeneScores.Get( GeneScores.GetType( "5" ), CCoalesceSequencerBase::ESubsequenceTotal, j, iMotifThree ) ) ) {
+						cerr << "C	" << dFive << '\t' << d << endl;
+						return 1; }
+					for( k = 0; k < GeneScores.GetTypes( ); ++k )
+						for( eSubsequence = CCoalesceSequencerBase::ESubsequenceBegin;
+							(size_t)eSubsequence < GeneScores.GetSubsequences( k );
+							eSubsequence = (CCoalesceSequencerBase::ESubsequence)( (size_t)eSubsequence + 1 ) ) {
+							const float*	ad	= GeneScores.Get( k, eSubsequence, j );
+
+							d = GeneScores.Get( k, eSubsequence, j, iMotifThree );
+							if( ad ) {
+								if( ad[ iMotifThree ] != d ) {
+									cerr << "A	" << GeneScores.GetType( k ) << '\t' << eSubsequence << '\t' << ad[ iMotifThree ] <<
+										'\t' << GeneScores.Get( k, eSubsequence, j, iMotifThree ) << endl;
+									return 1; } }
+							else if( d != 0 ) {
+								cerr << "B	" << GeneScores.GetType( k ) << '\t' << eSubsequence << '\t' <<
+									'\t' << GeneScores.Get( k, eSubsequence, j, iMotifThree ) << endl;
+								return 1; } }
+					if( Cluster.IsGene( j ) )
+						HistsCluster.Add( GeneScores, j, false, iMotifThree );
+					else
+						HistsPot.Add( GeneScores, j, false, iMotifThree ); } }
+//*/
+
+/*
+		for( j = 0; j < PCL.GetGenes( ); ++j )
+			if( veciPCL2FASTA[ j ] != -1 ) {
+				vector<SFASTASequence>	vecsSequences;
+
+				FASTA.Get( veciPCL2FASTA[ j ], vecsSequences );
+				for( k = 0; k < vecsSequences.size( ); ++k ) {
+					const SFASTASequence&	sSequence	= vecsSequences[ k ];
+					float*					ad;
+
+					if( sSequence.m_vecstrSequences.size( ) != 1 )
+						continue;
+					d = Motifs.GetMatch( sSequence.m_vecstrSequences[ 0 ], iMotifThree, 0,
+						sModifiers ) / sSequence.m_vecstrSequences[ 0 ].length( );
+					if( !( ad = GeneScores.Get( GeneScores.GetType( sSequence.m_strType ),
+						CCoalesceSequencerBase::ESubsequenceTotal, j ) ) || ( d != ad[ iMotifThree ] ) ) {
+						cerr << j << '\t' << PCL.GetGene( j ) << '\t' << sSequence.m_strType << '\t' <<
+							( ad ? ad[ iMotifThree ] : -1 ) << '\t' << d << endl;
+						return 1; } } }
+//*/
+
+		for( j = 0; j < PCL.GetGenes( ); ++j )
+			for( k = 0; k < GeneScores.GetTypes( ); ++k )
+				for( eSubsequence = CCoalesceSequencerBase::ESubsequenceBegin;
+					(size_t)eSubsequence < GeneScores.GetSubsequences( k );
+					eSubsequence = (CCoalesceSequencerBase::ESubsequence)( (size_t)eSubsequence + 1 ) ) {
+					if( !GeneScores.Get( k, eSubsequence, j ) )
+						continue;
+					for( size_t m = 0; m < veciMotifs.size( ); ++m ) {
+						d = GeneScores.Get( k, eSubsequence, j, veciMotifs[ m ] );
+						s = Cluster.IsGene( j ) ?
+							HistsCluster.Get( k, eSubsequence ).Get( veciMotifs[ m ], d ) :
+							HistsPot.Get( k, eSubsequence ).Get( veciMotifs[ m ], d );
+						if( !s ) {
+							cerr << "D	" << j << '\t' << PCL.GetGene( j ) << '\t' <<
+								GeneScores.GetType( k ) << '\t' << eSubsequence << '\t' << d << '\t' <<
+								Cluster.IsGene( j ) << endl;
+							cerr << Motifs.GetMotif( veciMotifs[ m ] ) << endl;
+							cerr << HistsCluster.Get( k, eSubsequence ).Save( veciMotifs[ m ] ) << endl;
+							cerr << HistsPot.Get( k, eSubsequence ).Save( veciMotifs[ m ] ) << endl;
+							return 1; } } }
+
+		for( j = 0; j < PCL.GetGenes( ); ++j ) {
+			if( ( (float)rand( ) / RAND_MAX ) > ( 1.0 / 100 ) )
+				continue;
+			if( Cluster.IsGene( j ) )
+				Pot.Add( j, Cluster );
+			else
+				Cluster.Add( j, Pot ); } }
+
+	return 0; }
+
+int main_test2( const gengetopt_args_info& sArgs, const CPCL& PCL, const CFASTA& FASTA, CCoalesceMotifLibrary& Motifs ) {
+	CCoalesceGeneScores			GeneScores;
+	size_t						i, j, iIter, iOrig;
+	uint32_t					iOne, iTwo, iThree;
+	SCoalesceModifiers			sMods;
+	vector<vector<float> >		vecvecdCounts, vecvecdOrig;
+	vector<size_t>				veciLengths;
+	vector<float>				vecdScores;
+	SCoalesceModifierCache		sModifiers( sMods );
+	vector<size_t>				veciPCL2FASTA;
+	float						d;
+
+	veciPCL2FASTA.resize( PCL.GetGenes( ) );
+	for( i = 0; i < veciPCL2FASTA.size( ); ++i )
+		veciPCL2FASTA[ i ] = FASTA.GetGene( PCL.GetGene( i ) );
+	sMods.Initialize( PCL );
+
+	GeneScores.SetGenes( PCL.GetGenes( ) );
+	for( i = 0; i < veciPCL2FASTA.size( ); ++i )
+		if( veciPCL2FASTA[ i ] != -1 ) {
+			vector<SFASTASequence>	vecsSequences;
+
+			if( FASTA.Get( veciPCL2FASTA[ i ], vecsSequences ) ) {
+				sModifiers.Get( i );
+				for( j = 0; j < vecsSequences.size( ); ++j )
+					if( !GeneScores.Add( i, Motifs, vecsSequences[ j ], sModifiers, vecvecdCounts,
+						veciLengths ) )
+						return 1; } }
+
+	vecvecdOrig.resize( PCL.GetGenes( ) );
+	for( i = 0; i < vecvecdOrig.size( ); ++i ) {
+		vecvecdOrig[ i ].resize( iOrig = Motifs.GetMotifs( ) );
+		for( j = 0; j < vecvecdOrig[ i ].size( ); ++j )
+			vecvecdOrig[ i ][ j ] = GeneScores.Get( 0, CCoalesceSequencerBase::ESubsequenceTotal, i,
+				(uint32_t)j ); }
+
+	for( iIter = 0; iIter < 1000; ++iIter ) {
+		iOne = rand( ) % Motifs.GetMotifs( );
+		iTwo = rand( ) % Motifs.GetMotifs( );
+		iThree = Motifs.Merge( iOne, iTwo, FLT_MAX, false );
+
+		for( i = 0; i < veciPCL2FASTA.size( ); ++i )
+			if( veciPCL2FASTA[ i ] != -1 ) {
+				vector<SFASTASequence>	vecsSequences;
+
+				if( FASTA.Get( veciPCL2FASTA[ i ], vecsSequences ) ) {
+					sModifiers.Get( i );
+					for( j = 0; j < vecsSequences.size( ); ++j )
+						if( !GeneScores.Add( i, Motifs, vecsSequences[ j ], sModifiers, iThree, vecdScores,
+							veciLengths ) ) {
+							cerr << "FAILURE" << endl;
+							return 1; } } }
+
+		for( i = 0; i < PCL.GetGenes( ); ++i )
+			for( j = 0; j < vecvecdOrig[ i ].size( ); ++j )
+				if( vecvecdOrig[ i ][ j ] != ( d = GeneScores.Get( 0,
+					CCoalesceSequencerBase::ESubsequenceTotal, i, (uint32_t)j ) ) ) {
+					cerr << i << '\t' << PCL.GetGene( i ) << '\t' << j << '\t' << vecvecdOrig[ i ][ j ] <<
+						'\t' << d << endl;
+					cerr << Motifs.GetMotif( (uint32_t)j ) << endl;
+					return 1; } }
+
+	return 0; }

File tools/PCLPlotter/PCLPlotter.cpp

 		if( !FASTA.Open( sArgs.fasta_arg ) ) {
 			cerr << "Could not open: " << sArgs.fasta_arg << endl;
 			return 1; }
+
+		if( sArgs.motifs_arg ) {
+			ifstream				ifsm;
+			vector<SMotifMatch>		vecsMotifs;
+			CCoalesceMotifLibrary	Motifs( sArgs.k_arg );
+			SCoalesceModifiers		sModifiers;
+			SCoalesceModifierCache	sCache( sModifiers );
+			CPCL					PCLScores;
+			vector<string>			vecstrMotifs;
+
+			ifsm.open( sArgs.motifs_arg );
+			if( !CCoalesceMotifLibrary::Open( ifsm, vecsMotifs, &Motifs ) ) {
+				cerr << "Could not open: " << sArgs.motifs_arg << endl;
+				return 1; }
+			ifsm.close( );
+
+			vecstrMotifs.resize( vecsMotifs.size( ) );
+			for( i = 0; i < vecstrMotifs.size( ); ++i )
+				vecstrMotifs[ i ] = Motifs.GetMotif( vecsMotifs[ i ].m_iMotif );
+			PCLScores.Open( PCL.GetGeneNames( ), vecstrMotifs, vector<string>( ) );
+			PCLScores.Clear( );
+
+			for( iRow = 0; iRow < PCL.GetGenes( ); ++iRow ) {
+				vector<SFASTASequence>	vecsSequences;
+
+				if( !( iRow % 100 ) )
+					cerr << "Processing " << iRow << '/' << PCL.GetGenes( ) << endl;
+				if( ( i = FASTA.GetGene( PCL.GetGene( iRow ) ) ) == -1 ) {
+					for( i = 0; i < PCLScores.GetExperiments( ); ++i )
+						PCLScores.Set( iRow, i, CMeta::GetNaN( ) );
+					continue; }
+				if( !FASTA.Get( i, vecsSequences ) )
+					return 1;
+				for( iColumn = 0; iColumn < vecsMotifs.size( ); ++iColumn ) {
+					const SMotifMatch&	sMotif	= vecsMotifs[ iColumn ];
+
+					for( iLength = i = 0; i < vecsSequences.size( ); ++i ) {
+						const SFASTASequence&	sSequence	= vecsSequences[ i ];
+
+						if( sMotif.m_strType != sSequence.m_strType )
+							continue;
+						for( j = 0; j < sSequence.m_vecstrSequences.size( ); ++j ) {
+							const string&	strSequence	= sSequence.m_vecstrSequences[ j ];
+
+							if( ( sMotif.m_eSubsequence != CCoalesceSequencerBase::ESubsequenceTotal ) &&
+								( sMotif.m_eSubsequence != ( ( sSequence.m_fIntronFirst == !( j % 2 ) ) ?
+								CCoalesceSequencerBase::ESubsequenceIntrons :
+								CCoalesceSequencerBase::ESubsequenceExons ) ) )
+								continue;
+							iLength += strSequence.size( );
+							PCLScores.Get( iRow, iColumn ) += Motifs.GetMatch( strSequence,
+								sMotif.m_iMotif, 0, sCache ); } }
+					if( iLength )
+						PCLScores.Get( iRow, iColumn ) /= iLength; } }
+
+			PCLScores.Save( cout );
+			return 0; }
+
 		for( i = 0; i < FASTA.GetGenes( ); ++i )
 			if( PCL.GetGene( FASTA.GetGene( i ) ) != -1 ) {
 				setiGenes.insert( i );
 	for( i = 0; i < PCL.GetExperiments( ); ++i )
 		cout << ( i ? "\t" : "" ) << vecdSumSqsOut[ i ];
 	cout << endl;
-	
+
 
 	return 0; }

File tools/PCLPlotter/stdafx.h

 #include <fstream>
 using namespace std;
 
+#include "coalescemotifs.h"
+#include "coalescestructsi.h"
 #include "dat.h"
 #include "fasta.h"
 #include "genome.h"