Commits

Anonymous committed bfafef1

[svn r397] Improve RC removal during PST->PWM conversion
Now done by alignment rather than purine content

Comments (0)

Files changed (11)

src/coalescecluster.cpp

 	return true; }
 
 void CCoalesceCluster::Save( std::ostream& ostm, size_t iID, const CPCL& PCL,
-	const CCoalesceMotifLibrary* pMotifs, float dCutoffPWMs, bool fNoRCs ) const {
+	const CCoalesceMotifLibrary* pMotifs, float dCutoffPWMs, float dPenaltyGap, float dPenaltyMismatch,
+	bool fNoRCs ) const {
 	set<size_t>::const_iterator			iterID;
 	set<SMotifMatch>::const_iterator	iterMotif;
 	size_t								i;
 			ostm << '\t' << PCL.GetExperiment( GetCondition( *iterID, i ) );
 	ostm << endl << "Motifs" << endl;
 	for( iterMotif = GetMotifs( ).begin( ); iterMotif != GetMotifs( ).end( ); ++iterMotif )
-		if( !( strMotif = iterMotif->Save( pMotifs, true, dCutoffPWMs, fNoRCs ) ).empty( ) )
+		if( !( strMotif = iterMotif->Save( pMotifs, true, dCutoffPWMs, dPenaltyGap, dPenaltyMismatch,
+			fNoRCs ) ).empty( ) )
 			ostm << strMotif << endl; }
 
 size_t CCoalesceCluster::Open( const string& strPCL, size_t iSkip, const CPCL& PCL,

src/coalescecluster.h

 	bool Save( const std::string& strDirectory, size_t iID, const CPCL& PCL,
 		const CCoalesceMotifLibrary* pMotifs = NULL ) const;
 	void Save( std::ostream&, size_t iID, const CPCL& PCL, const CCoalesceMotifLibrary* pMotifs = NULL,
-		float dCutoffPWMs = 0, bool fNoRCs = false ) const;
+		float dCutoffPWMs = 0, float dPenaltyGap = 0, float dPenaltyMismatch = 0, bool fNoRCs = false ) const;
 	float GetSimilarity( const CCoalesceCluster& Cluster, size_t iGenes, size_t iDatasets ) const;
 	void Snapshot( const CCoalesceGeneScores& GeneScores, CCoalesceGroupHistograms& Histograms );
 

src/coalescemotifs.cpp

 
 	return iRet; }
 
-uint32_t CCoalesceMotifLibraryImpl::RemoveRCs( const CPST& PST ) {
+uint32_t CCoalesceMotifLibraryImpl::RemoveRCs( const CPST& PST, float dPenaltyGap, float dPenaltyMismatch ) {
 	CPST*								pPST;
 	uint32_t							iRet;
 	map<unsigned char, unsigned char>	mapccComplements;
 		mapccComplements[ c_szBases[ i ] ] = c_szComplements[ i ];
 	if( !( pPST = CreatePST( iRet ) ) )
 		return -1;
-	PST.RemoveRCs( mapccComplements, *pPST );
+	PST.RemoveRCs( mapccComplements, dPenaltyGap, dPenaltyMismatch, *pPST );
 	return iRet; }
 
-string CCoalesceMotifLibrary::GetPWM( uint32_t iMotif, float dCutoffPWMs, bool fNoRCs ) const {
+string CCoalesceMotifLibrary::GetPWM( uint32_t iMotif, float dCutoffPWMs, float dPenaltyGap,
+	float dPenaltyMismatch, bool fNoRCs ) const {
 	CFullMatrix<size_t>	MatPWM;
 	string				strMotif;
 	size_t				i, j;
 	float				d;
 
 	if( fNoRCs )
-		iMotif = ((CCoalesceMotifLibrary*)this)->RemoveRCs( iMotif );
+		iMotif = ((CCoalesceMotifLibrary*)this)->RemoveRCs( iMotif, dPenaltyGap, dPenaltyMismatch );
 	switch( GetType( iMotif ) ) {
 		case ETypeKMer:
 			if( !CCoalesceMotifLibraryImpl::GetPWM( GetMotif( iMotif ), MatPWM ) )

src/coalescemotifs.h

 
 		return CCoalesceMotifLibraryImpl::GetReverseComplement( strKMer ); }
 
-	static float GetPurines( const std::string& strSequence ) {
-		size_t	i, iCount;
-
-		for( iCount = i = 0; i < strSequence.size( ); ++i )
-			if( IsPurine( strSequence[ i ] ) )
-				iCount++;
-
-		return ( (float)iCount / strSequence.size( ) ); }
-
 	/*!
 	 * \brief
 	 * Initializes a new motif library based on kmers of the given length.
 	float GetMatch( const std::string& strSequence, uint32_t iMotif, size_t iOffset,
 		SCoalesceModifierCache& sModifiers ) const;
 	uint32_t Open( const std::string& strMotif );
-	std::string GetPWM( uint32_t iMotif, float dCutoffPWMs, bool fNoRCs ) const;
+	std::string GetPWM( uint32_t iMotif, float dCutoffPWMs, float dPenaltyGap, float dPenaltyMismatch,
+		bool fNoRCs ) const;
 
 	/*!
 	 * \brief
 
 		return MergeKMers( GetMotif( iOne ), GetMotif( iTwo ), dCutoff ); }
 
-	uint32_t RemoveRCs( uint32_t iMotif ) {
+	uint32_t RemoveRCs( uint32_t iMotif, float dPenaltyGap, float dPenaltyMismatch ) {
 
 		switch( GetType( iMotif ) ) {
 			case ETypePST:
-				return CCoalesceMotifLibraryImpl::RemoveRCs( *GetPST( iMotif ) );
+				return CCoalesceMotifLibraryImpl::RemoveRCs( *GetPST( iMotif ), dPenaltyGap,
+					dPenaltyMismatch );
 
 			case ETypeRC:
 				return (uint32_t)m_veciRC2KMer[ iMotif - GetBaseRCs( ) ]; }

src/coalescemotifsi.h

 		std::reverse( strReverse.begin( ), strReverse.end( ) );
 		return GetComplement( strReverse ); }
 
-	static bool IsPurine( unsigned char c ) {
-		static const char	c_acPurines[]	= "AG";
-
-		return !!strchr( c_acPurines, c ); }
-
 	static float GetInformation( const CFullMatrix<size_t>& MatPWM ) {
 		CDataMatrix			MatProbs;
 		size_t				iPos, iFrom, iTo;
 	float AlignRCs( uint32_t, uint32_t, float ) const;
 	float AlignRCPST( uint32_t, const CPST&, float ) const;
 	float AlignPSTs( const CPST&, const CPST&, float ) const;
-	uint32_t RemoveRCs( const CPST& );
+	uint32_t RemoveRCs( const CPST&, float, float );
 
 	EType GetType( uint32_t iMotif ) const {
 

src/coalescestructs.cpp

 		( m_iMotif = Motifs.Merge( iLeft, iRight, FLT_MAX ) ) ); }
 
 string SMotifMatch::Save( const CCoalesceMotifLibrary* pMotifs, bool fPWM, float dCutoffPWMs,
-	bool fNoRCs ) const {
+	float dPenaltyGap, float dPenaltyMismatch, bool fNoRCs ) const {
 	ostringstream	ossm;
 	string			strPWM;
 
 	if( pMotifs ) {
 		ossm << pMotifs->GetMotif( m_iMotif );
 		if( fPWM ) {
-			if( ( strPWM = pMotifs->GetPWM( m_iMotif, dCutoffPWMs, fNoRCs ) ).empty( ) )
+			if( ( strPWM = pMotifs->GetPWM( m_iMotif, dCutoffPWMs, dPenaltyGap, dPenaltyMismatch,
+				fNoRCs ) ).empty( ) )
 				return "";
 			ossm << endl << strPWM; } }
 	else

src/coalescestructsi.h

 
 	bool Open( std::istream&, CCoalesceMotifLibrary& );
 	uint32_t Open( const CHierarchy&, const std::vector<SMotifMatch>&, CCoalesceMotifLibrary&, size_t& );
-	std::string Save( const CCoalesceMotifLibrary*, bool = false, float = 0, bool = false ) const;
+	std::string Save( const CCoalesceMotifLibrary*, bool = false, float = 0, float = 0, float = 0,
+		bool = false ) const;
 
 	bool operator==( const SMotifMatch& sMotif ) const {
 
 
 namespace Sleipnir {
 
-void CPSTImpl::RemoveRCs( const map<unsigned char, unsigned char>& mapccComplements, const SNode& sNode,
-	size_t iOffset, string& strSeq, vector<SRC>& vecsOut ) {
-	size_t	i;
+struct SSortRCs {
+	bool operator()( const string& strOne, const string& strTwo ) const {
 
-	strSeq.push_back( sNode.m_cCharacter );
-	if( sNode.m_vecsChildren.size( ) == 0 )
-		vecsOut.push_back( ( CCoalesceMotifLibrary::GetPurines( strSeq ) < 0.5 ) ?
-			SRC( CCoalesceMotifLibrary::GetReverseComplement( strSeq ), iOffset ) : SRC( strSeq, 0 ) );
-	else {
-		iOffset--;
-		for( i = 0; i < sNode.m_vecsChildren.size( ); ++i )
-			RemoveRCs( mapccComplements, sNode.m_vecsChildren[ i ], iOffset, strSeq, vecsOut ); }
-	strSeq.resize( strSeq.size( ) - 1 ); }
+		return ( strOne.length( ) > strTwo.length( ) ); }
+};
+
+void CPST::RemoveRCs( const map<unsigned char, unsigned char>& mapccComplements, float dPenaltyGap,
+	float dPenaltyMismatch, CPST& PSTOut ) const {
+	size_t			i;
+	string			str;
+	vector<string>	vecstrAdd;
+	float			dOne, dTwo;
+	int				iOne, iTwo;
+
+	for( i = 0; i < m_sRoot.m_vecsChildren.size( ); ++i )
+		CPSTImpl::RemoveRCs( mapccComplements, m_sRoot.m_vecsChildren[ i ], str, vecstrAdd );
+	if( vecstrAdd.empty( ) )
+		return;
+
+	sort( vecstrAdd.begin( ), vecstrAdd.end( ), SSortRCs( ) );
+	PSTOut.Add( vecstrAdd[ 0 ], 0 );
+	for( i = 1; i < vecstrAdd.size( ); ++i ) {
+		dOne = PSTOut.Align( vecstrAdd[ i ], dPenaltyGap, dPenaltyMismatch, FLT_MAX, iOne );
+		dTwo = PSTOut.Align( str = CCoalesceMotifLibrary::GetReverseComplement( vecstrAdd[ i ] ),
+			dPenaltyGap, dPenaltyMismatch, FLT_MAX, iTwo );
+		if( dTwo < dOne )
+			PSTOut.Add( str, iTwo );
+		else
+			PSTOut.Add( vecstrAdd[ i ], iOne ); } }
 
 }
 	 */
 	CPST( size_t iArity ) : CPSTImpl(iArity) { }
 
+	void RemoveRCs( const std::map<unsigned char, unsigned char>& mapccComplements, float dPenaltyGap,
+		float dPenaltyMismatch, CPST& PSTOut ) const;
+
 	/*!
 	 * \brief
 	 * Add the given string and strings from the given PST to the current PST, with an optional character
 				( mapciChars.size( ) + j++ ) : iterChar->second;
 		CMeta::Permute( MatPWM.Get( ), veciOrder );
 		return true; }
-
-	void RemoveRCs( const std::map<unsigned char, unsigned char>& mapccComplements, CPST& PSTOut ) const {
-		size_t				i;
-		std::string			str;
-		std::vector<SRC>	vecsAdd;
-
-		for( i = 0; i < m_sRoot.m_vecsChildren.size( ); ++i )
-			CPSTImpl::RemoveRCs( mapccComplements, m_sRoot.m_vecsChildren[ i ], m_iDepth - 1, str,
-				vecsAdd );
-		std::sort( vecsAdd.begin( ), vecsAdd.end( ) );
-		for( i = 0; i < vecsAdd.size( ); ++i )
-			PSTOut.Add( vecsAdd[ i ].m_strSequence, vecsAdd[ i ].m_iOffset ); }
 };
 
 }
 		std::vector<SNode>	m_vecsChildren;
 	};
 
-	struct SRC {
-		SRC( const std::string& strSequence, size_t iOffset ) : m_strSequence(strSequence),
-			m_iOffset(iOffset) { }
+	static void RemoveRCs( const std::map<unsigned char, unsigned char>& mapccComplements, const SNode& sNode,
+		std::string& strSeq, std::vector<std::string>& vecstrOut ) {
+		size_t	i;
 
-		bool operator<( const SRC& sRC ) const {
-
-			return ( sRC.m_strSequence.length( ) < m_strSequence.length( ) ); }
-
-		std::string	m_strSequence;
-		size_t		m_iOffset;
-	};
-
-	static void RemoveRCs( const std::map<unsigned char, unsigned char>&, const SNode&, size_t, std::string&,
-		std::vector<SRC>& );
+		strSeq.push_back( sNode.m_cCharacter );
+		if( sNode.m_vecsChildren.empty( ) )
+			vecstrOut.push_back( strSeq );
+		else
+			for( i = 0; i < sNode.m_vecsChildren.size( ); ++i )
+				RemoveRCs( mapccComplements, sNode.m_vecsChildren[ i ], strSeq, vecstrOut );
+		strSeq.resize( strSeq.size( ) - 1 ); }
 
 	static std::string GetMotif( const SNode& sNode ) {
 		std::ostringstream	ossm;

tools/COALESCE/COALESCE.cpp

 		if( sArgs.output_arg )
 			vecClustersTo[ i ].Save( sArgs.output_arg, i, PCL, &Motifs );
 		vecClustersTo[ i ].Save( cout, i, PCL, &Motifs, (float)sArgs.min_info_arg,
-			!!sArgs.remove_rcs_flag ); }
+			(float)sArgs.penalty_gap_arg, (float)sArgs.penalty_mismatch_arg, !!sArgs.remove_rcs_flag ); }
 
 	return 0; }