1. libsleipnir
  2. sleipnir

Commits

chuttenh  committed 445e553

Add gene and term filtering to Combiner
Add manual cluster seeding to COALESCE
Fix typo in CPCL MedianMultiples

  • Participants
  • Parent commits 14e69a2
  • Branches sleipnir

Comments (0)

Files changed (11)

File src/coalesce.cpp

View file
  • Ignore whitespace
 
 // CCoalesce
 
+void CCoalesce::SetSeed( const CPCL& PCL ) {
+	size_t	i;
+
+	m_vecdSeed.resize( PCL.GetExperiments( ) );
+	for( i = 0; i < m_vecdSeed.size( ); ++i )
+		m_vecdSeed[i] = PCL.Get( 0, i ); }
+
 void CCoalesceImpl::Normalize( CPCL& PCL ) {
 	static const double	c_dLog2		= log( 2.0 );
 	vector<float>		vecdMedian;
 
 		Pot.SetGenes( PCLCopy.GetGenes( ) );
 		Pot.CalculateHistograms( GeneScores, HistsPot, NULL );
-		if( !Cluster.Initialize( PCLCopy, Pot, m_vecsDatasets, setpriiSeeds, GetNumberCorrelation( ),
-			GetPValueCorrelation( ), GetProbabilityGene( ), GetThreads( ) ) )
+		if( !Cluster.Initialize( PCLCopy, Pot, m_vecsDatasets, setpriiSeeds, m_vecdSeed,
+			GetNumberCorrelation( ), GetPValueCorrelation( ), GetProbabilityGene( ), GetThreads( ) ) )
 			continue;
+		m_vecdSeed.clear( );
 		g_CatSleipnir( ).notice( "CCoalesce::Cluster( ) initialized %d genes", Cluster.GetGenes( ).size( ) );
 		if( g_CatSleipnir( ).isDebugEnabled( ) ) {
 			ostringstream				ossm;

File src/coalesce.h

View file
  • Ignore whitespace
 public:
 	bool Cluster( const CPCL& PCL, const CFASTA& FASTA, std::vector<CCoalesceCluster>& vecClusters );
 
+	void SetSeed( const CPCL& PCL );
+
 	/*!
 	 * \brief
 	 * Sets the correlation p-value threshhold for genes to be included in a cluster during initialization.
 	bool GetNormalize( ) const {
 
 		return m_fNormalize; }
+
+	void ClearSeed( ) {
+
+		m_vecdSeed.clear( ); }
 };
 
 }

File src/coalescecluster.cpp

View file
  • Ignore whitespace
  */
 bool CCoalesceCluster::Initialize( const CPCL& PCL, CCoalesceCluster& Pot,
 	const std::vector<SCoalesceDataset>& vecsDatasets, std::set<std::pair<size_t, size_t> >& setpriiSeeds,
-	size_t iPairs, float dPValue, float dProbability, size_t iThreads ) {
+	const vector<float>& vecdSeed, size_t iPairs, float dPValue, float dProbability, size_t iThreads ) {
 	size_t	i;
 	float	dFraction;
 	CPCL	PCLCopy;
 
 	PCLCopy.Open( PCL );
 	PCLCopy.Normalize( CPCL::ENormalizeColumn );
-	return ( AddSeedPair( PCLCopy, Pot, setpriiSeeds, dFraction, dPValue, iThreads ) &&
-		AddCorrelatedGenes( PCLCopy, Pot, dPValue ) ); }
+	return ( ( vecdSeed.size( ) || AddSeedPair( PCLCopy, Pot, setpriiSeeds, dFraction, dPValue, iThreads ) ) &&
+		AddCorrelatedGenes( PCLCopy, Pot, vecdSeed, dPValue ) ); }
 
 void CCoalesceClusterImpl::Add( size_t iGene, CCoalesceCluster& Pot ) {
 
 		dFraction, dPValue, PCL.GetGene( iOne ).c_str( ), PCL.GetGene( iTwo ).c_str( ), dMaxCorr, dMinP );
 	return false; }
 
-bool CCoalesceClusterImpl::AddCorrelatedGenes( const CPCL& PCL, CCoalesceCluster& Pot, float dPValue ) {
+bool CCoalesceClusterImpl::AddCorrelatedGenes( const CPCL& PCL, CCoalesceCluster& Pot, const vector<float>& vecdSeed, float dPValue ) {
 	size_t	iGene, iN;
 	double	dR;
 
 	CalculateCentroid( PCL );
+	if( vecdSeed.size( ) ) {
+		if( vecdSeed.size( ) == m_vecdCentroid.size( ) )
+			copy( vecdSeed.begin( ), vecdSeed.end( ), m_vecdCentroid.begin( ) );
+		else
+			g_CatSleipnir( ).error( "CCoalesceClusterImpl::AddCorrelatedGenes( %g ) invalid seed provided, ignoring", dPValue ); }
 	for( iGene = 0; iGene < PCL.GetGenes( ); ++iGene )
 		if( !IsGene( iGene ) &&
 			( ( dR = CMeasurePearson::Pearson( &m_vecdCentroid.front( ), PCL.GetExperiments( ),
 		vecsThreads[ i ].m_pPot = &Pot;
 		vecsThreads[ i ].m_pveciDatasets = &veciDatasets;
 		vecsThreads[ i ].m_pvecdStdevs = &vecdStdevs;
-		vecsThreads[ i ].m_dBeta = m_setsMotifs.size( ) ? ( (float)m_setsMotifs.size( ) / ( m_setiDatasets.size( ) + m_setsMotifs.size( ) ) ) : 0.5;
+		vecsThreads[ i ].m_dBeta = m_setsMotifs.size( ) ? ( (float)m_setsMotifs.size( ) / ( m_setiDatasets.size( ) + m_setsMotifs.size( ) ) ) : 0.5f;
 		vecsThreads[ i ].m_iMinimum = iMinimum;
 		vecsThreads[ i ].m_dProbability = dProbability;
 		if( pthread_create( &vecpthdThreads[ i ], NULL, ThreadSignificantGene, &vecsThreads[ i ] ) ) {

File src/coalescecluster.h

View file
  • Ignore whitespace
 class CCoalesceCluster : public CCoalesceClusterImpl {
 public:
 	bool Initialize( const CPCL& PCL, CCoalesceCluster& Pot, const std::vector<SCoalesceDataset>& vecsDatasets,
-		std::set<std::pair<size_t, size_t> >& setpriiSeeds, size_t iPairs, float dPValue, float dProbability,
-		size_t iThreads );
+		std::set<std::pair<size_t, size_t> >& setpriiSeeds, const std::vector<float>& vecdSeed, size_t iPairs, float dPValue,
+		float dProbability, size_t iThreads );
 	void Subtract( CPCL& PCL, const CCoalesceCluster& Pot ) const;
 	void Subtract( CCoalesceGeneScores& GeneScores ) const;
 	bool SelectConditions( const CPCL& PCL, const CCoalesceCluster& Pot, size_t iThreads, float dPValue,

File src/coalesceclusteri.h

View file
  • Ignore whitespace
 		return sMotif.GetHash( ); }
 
 	void Add( size_t, CCoalesceCluster& );
-	bool AddCorrelatedGenes( const CPCL&, CCoalesceCluster&, float );
+	bool AddCorrelatedGenes( const CPCL&, CCoalesceCluster&, const std::vector<float>&, float );
 	bool AddSeedPair( const CPCL&, CCoalesceCluster&, std::set<std::pair<size_t, size_t> >&, float, float,
 		size_t );
 	void CalculateCentroid( const CPCL& );

File src/coalescei.h

View file
  • Ignore whitespace
 	std::vector<SCoalesceDataset>	m_vecsDatasets;
 	std::vector<const CFASTA*>		m_vecpWiggles;
 	std::vector<std::ostream*>		m_vecpostm;
+	std::vector<float>				m_vecdSeed;
 };
 
 }

File src/pcli.h

View file
  • Ignore whitespace
 
 		vecdTmp.resize( vecdValues.size( ) );
 		for( dSum = 0,i = 0; i < vecdTmp.size( ); ++i )
-			for( j = ( max( i, c_iRadius ) - c_iRadius ); j < max( vecdTmp.size( ), i + c_iRadius ); ++j ) {
+			for( j = ( max( i, c_iRadius ) - c_iRadius ); j < min( vecdTmp.size( ), i + c_iRadius ); ++j ) {
 				k = max( i, j ) - min( i, j );
 				vecdTmp[i] += ( d = ( vecdValues[j] / ( 1 + pow( (float)k, dPower ) ) ) );
 				dSum += d; }

File tools/COALESCE/COALESCE.cpp

View file
  • Ignore whitespace
 int main( int iArgs, char** aszArgs ) {
 	gengetopt_args_info			sArgs;
 	CFASTA						FASTA;
-	CPCL						PCL;
+	CPCL						PCL, PCLSeed;
 	CCoalesce					Coalesce;
 	vector<CCoalesceCluster>	vecClusters;
 	size_t						i, j;
 		cerr << "Could not open: " << sArgs.fasta_arg << endl;
 		return 1; }
 
+	if( sArgs.seed_arg && !PCLSeed.Open( sArgs.seed_arg, sArgs.skip_arg ) ) {
+		cerr << "Could not open: " << sArgs.seed_arg << endl;
+		return 1; }
 	if( sArgs.datasets_arg ) {
 		static const size_t	c_iBuffer	= 131072;
 		ifstream			ifsm;
 	Coalesce.SetSizeMaximum( sArgs.size_maximum_arg );
 	Coalesce.SetNormalize( !!sArgs.normalize_flag );
 	Coalesce.SetThreads( sArgs.threads_arg );
+	Coalesce.SetSeed( PCLSeed );
 	if( sArgs.output_arg )
 		Coalesce.SetDirectoryIntermediate( sArgs.output_arg );
 

File tools/COALESCE/COALESCE.ggo

View file
  • Ignore whitespace
 							flag	off
 option	"progressive"	O	"Generate output progressively"
 							flag	on
+option	"seed"			D	"Expression pattern with which to seed first cluster"
+							string	typestr="filename"
 
 section "Optional"
 option	"threads"		t	"Maximum number of concurrent threads"

File tools/Combiner/Combiner.cpp

View file
  • Ignore whitespace
 	return 0; }
 
 int MainDATs( const gengetopt_args_info& sArgs ) {
-	CDataset			Dataset;
-	CDat				DatOut, DatCur;
-	CHalfMatrix<float>	MatCounts;
-	size_t				i, j, k, iOne, iTwo;
-	vector<size_t>		veciGenes;
-	float				d, dWeight, dWeights;
-	vector<string>		vecstrFiles;
-	CPCL				PCLWeights( false );
+	CDataset				Dataset;
+	CDat					DatOut, DatCur;
+	CHalfMatrix<float>		MatCounts;
+	size_t					i, j, k, iOne, iTwo, iA, iB;
+	vector<vector<size_t> >	vecveciGenes;
+	float					d, dWeight, dWeights;
+	vector<string>			vecstrFiles, vecstrTerms;
+	CPCL					PCLWeights( false );
+	CGenome					Genome;
+	CGenes					GenesIn( Genome );
+	vector<CGenes*>			vecpTerms;
+	vector<set<size_t> >	vecsetiGenes;
 
 	if( !sArgs.inputs_num )
 		return 1;
 	if( sArgs.weights_arg && !PCLWeights.Open( sArgs.weights_arg, 0 ) ) {
 		cerr << "Could not open: " << sArgs.weights_arg << endl;
 		return 1; }
+	if( sArgs.genes_arg && !GenesIn.Open( sArgs.genes_arg ) ) {
+		cerr << "Could not open: " << sArgs.genes_arg << endl;
+		return 1; }
+	if( sArgs.terms_arg ) {
+		static const size_t	c_iBuffer	= 131072;
+		ifstream	ifsm;
+		char		acBuffer[ c_iBuffer ];
 
-	DatOut.Open( Dataset.GetGeneNames( ), false, sArgs.memmap_flag ? sArgs.output_arg : NULL );
+		ifsm.open( sArgs.terms_arg );
+		while( !ifsm.eof( ) ) {
+			vector<string>	vecstrLine;
+
+			ifsm.getline( acBuffer, c_iBuffer - 1 );
+			CMeta::Tokenize( acBuffer, vecstrLine );
+			if( vecstrLine.empty( ) )
+				continue;
+			vecstrTerms.push_back( vecstrLine[0] );
+			vecstrLine.erase( vecstrLine.begin( ) );
+			vecpTerms.push_back( new CGenes( Genome ) );
+			vecpTerms.back( )->Open( vecstrLine ); }
+		if( vecpTerms.empty( ) || !vecpTerms[0]->GetGenes( ) ) {
+			cerr << "Could not open: " << sArgs.terms_arg << endl;
+			return 1; } }
+
+	DatOut.Open( vecstrTerms.empty( ) ? Dataset.GetGeneNames( ) : vecstrTerms, false, sArgs.memmap_flag ? sArgs.output_arg : NULL );
 	if( !strcmp( c_szMax, sArgs.method_arg ) )
 		d = -FLT_MAX;
 	else if( !strcmp( c_szMin, sArgs.method_arg ) )
 	if( fabs( d ) < 2 ) {
 		MatCounts.Initialize( DatOut.GetGenes( ) );
 		MatCounts.Clear( ); }
-	veciGenes.resize( DatOut.GetGenes( ) );
+	vecveciGenes.resize( DatOut.GetGenes( ) );
+	vecsetiGenes.resize( DatOut.GetGenes( ) );
 	for( dWeights = 0,i = 0; i < sArgs.inputs_num; ++i ) {
-		if( !DatCur.Open( sArgs.inputs[ i ], !!sArgs.memmap_flag && !sArgs.normalize_flag ) ) {
+		if( !DatCur.Open( sArgs.inputs[ i ], !!sArgs.memmap_flag && !sArgs.normalize_flag && !GenesIn.GetGenes( ) ) ) {
 			cerr << "Couldn't open: " << sArgs.inputs[ i ] << endl;
 			return 1; }
 		if( PCLWeights.GetGenes( ) ) {
 		cerr << "Opened: " << sArgs.inputs[ i ] << endl;
 		if( sArgs.normalize_flag )
 			DatCur.Normalize( CDat::ENormalizeZScore );
-		for( j = 0; j < veciGenes.size( ); ++j )
-			veciGenes[ j ] = DatCur.GetGene( DatOut.GetGene( j ) );
-		for( j = 0; j < veciGenes.size( ); ++j ) {
-			if( ( iOne = veciGenes[ j ] ) == -1 )
-				continue;
-			for( k = ( j + 1 ); k < veciGenes.size( ); ++k ) {
-				if( ( ( iTwo = veciGenes[ k ] ) == -1 ) || CMeta::IsNaN( d = DatCur.Get( iOne, iTwo ) ) )
+		if( GenesIn.GetGenes( ) )
+			DatCur.FilterGenes( GenesIn, CDat::EFilterInclude );
+		for( j = 0; j < vecveciGenes.size( ); ++j ) {
+			vecveciGenes[j].clear( );
+			vecsetiGenes[j].clear( );
+			if( vecstrTerms.empty( ) )
+				vecveciGenes[j].push_back( DatCur.GetGene( DatOut.GetGene( j ) ) );
+			else {
+				for( k = 0; k < vecpTerms[j]->GetGenes( ); ++k )
+					vecveciGenes[j].push_back( DatCur.GetGene( vecpTerms[j]->GetGene( k ).GetName( ) ) );
+				for( k = 0; k < vecveciGenes[j].size( ); ++k )
+					vecsetiGenes[j].insert( vecveciGenes[j][k] ); } }
+		for( j = 0; j < vecveciGenes.size( ); ++j )
+			for( iA = 0; iA < vecveciGenes[j].size( ); ++iA ) {
+				if( ( iOne = vecveciGenes[j][iA] ) == -1 )
 					continue;
-				if( !strcmp( c_szMean, sArgs.method_arg ) ||
-					!strcmp( c_szSum, sArgs.method_arg ) ) {
-					DatOut.Get( j, k ) += dWeight * d;
-					MatCounts.Get( j, k ) += dWeight; }
-				else if( !strcmp( c_szGMean, sArgs.method_arg ) ) {
-					DatOut.Get( j, k ) *= pow( d, dWeight );
-					MatCounts.Get( j, k ) += dWeight; }
-				else if( !strcmp( c_szHMean, sArgs.method_arg ) ) {
-					DatOut.Get( j, k ) += dWeight / d;
-					MatCounts.Get( j, k ) += dWeight; }
-				else if( !strcmp( c_szMax, sArgs.method_arg ) ) {
-					if( d > DatOut.Get( j, k ) )
-						DatOut.Set( j, k, d ); }
-				else if( !strcmp( c_szMin, sArgs.method_arg ) ) {
-					if( d < DatOut.Get( j, k ) )
-						DatOut.Set( j, k, d ); } } } }
+				for( k = ( j + 1 ); k < vecveciGenes.size( ); ++k )
+					for( iB = 0; iB < vecveciGenes[k].size( ); ++iB ) {
+						if( ( ( iTwo = vecveciGenes[k][iB] ) == -1 ) ||
+							( vecsetiGenes[j].find( iTwo ) != vecsetiGenes[j].end( ) ) ||
+							CMeta::IsNaN( d = DatCur.Get( iOne, iTwo ) ) )
+							continue;
+						if( !strcmp( c_szMean, sArgs.method_arg ) ||
+							!strcmp( c_szSum, sArgs.method_arg ) ) {
+							DatOut.Get( j, k ) += dWeight * d;
+							MatCounts.Get( j, k ) += dWeight; }
+						else if( !strcmp( c_szGMean, sArgs.method_arg ) ) {
+							DatOut.Get( j, k ) *= pow( d, dWeight );
+							MatCounts.Get( j, k ) += dWeight; }
+						else if( !strcmp( c_szHMean, sArgs.method_arg ) ) {
+							DatOut.Get( j, k ) += dWeight / d;
+							MatCounts.Get( j, k ) += dWeight; }
+						else if( !strcmp( c_szMax, sArgs.method_arg ) ) {
+							if( d > DatOut.Get( j, k ) )
+								DatOut.Set( j, k, d ); }
+						else if( !strcmp( c_szMin, sArgs.method_arg ) ) {
+							if( d < DatOut.Get( j, k ) )
+								DatOut.Set( j, k, d ); } } } }
 	for( i = 0; i < DatOut.GetGenes( ); ++i )
 		for( j = ( i + 1 ); j < DatOut.GetGenes( ); ++j )
 			if( !strcmp( c_szMean, sArgs.method_arg ) )
 	if( !sArgs.memmap_flag )
 		DatOut.Save( sArgs.output_arg );
 
+	for( i = 0; i < vecpTerms.size( ); ++i )
+		delete vecpTerms[i];
 	return 0; }
 
 static int MainDABs( const gengetopt_args_info& sArgs ) {

File tools/Combiner/Combiner.ggo

View file
  • Ignore whitespace
 section "Modules"
 option	"jaccard"	j	"Minimum Jaccard index for module equivalence"
 						float	default="0.5"
-option	"intersection"	r	"Minimum intersection fractino for module inheritance"
+option	"intersection"	r	"Minimum intersection fraction for module inheritance"
 						double	default="0.666"
 
+section "Filtering"
+option	"genes"		g	"Process only genes from the given set"
+						string	typestr="filename"
+option	"terms"		e	"Produce DAT/DABs averaging within the provided terms"
+						string	typestr="filename"
+
 section "Optional"
 option	"skip"		k	"Columns to skip in input PCLs"
 						int	default="2"