Commits

Anonymous committed 1a27e9b

[svn r407] Fix a horrific bug in runtime COALESCE motif merging
Only the first subsequence score (and no subsequent ones) was being updated
Improve handling of redundant motif merges in COALESCE
Improve variance calculations in CStatistics
Add proper minimum edge weight filtering to DAT output in Dat2Graph

  • Participants
  • Parent commits 874eb39

Comments (0)

Files changed (7)

 			if( psData->m_pFASTA->Get( (*psData->m_pveciPCL2FASTA)[ i ], vecsSequences ) ) {
 				sModifiers.Get( i );
 				for( j = 0; j < vecsSequences.size( ); ++j )
-					if( psData->m_pGeneScores->Add( i, *psData->m_pMotifs, vecsSequences[ j ],
+					if( !psData->m_pGeneScores->Add( i, *psData->m_pMotifs, vecsSequences[ j ],
 						sModifiers, psData->m_iMotif, vecdScores, veciLengths ) )
 						break; } }
 

src/coalescemotifs.cpp

 	return Align( strOne, strTwo, dCutoff, iOffset ); }
 
 uint32_t CCoalesceMotifLibraryImpl::MergeKMers( const std::string& strOne, const std::string& strTwo,
-	float dCutoff ) {
+	float dCutoff, bool fAllowDuplicates ) {
 	int			iOffset;
 	float		dScore;
 	uint32_t	iRet;
 	CPST*		pPST;
 
-	if( ( dScore = Align( strOne, strTwo, dCutoff, iOffset ) ) > dCutoff )
+	if( ( ( dScore = Align( strOne, strTwo, dCutoff, iOffset ) ) > dCutoff ) ||
+		!( fAllowDuplicates || dScore ) )
 		return -1;
 
 	pPST = CreatePST( iRet );
 
 	return min( dOne, dTwo ); }
 
-uint32_t CCoalesceMotifLibraryImpl::MergeKMerRC( uint32_t iKMer, uint32_t iRC, float dCutoff ) {
+uint32_t CCoalesceMotifLibraryImpl::MergeKMerRC( uint32_t iKMer, uint32_t iRC, float dCutoff,
+	bool fAllowDuplicates ) {
 	string		strKMer, strOne, strTwo;
-	float		dOne, dTwo;
+	float		dOne, dTwo, dMin;
 	int			iOne, iTwo;
 	uint32_t	iRet;
 	CPST*		pPST;
 	strTwo = GetReverseComplement( strOne );
 	dOne = Align( strKMer, strOne, dCutoff, iOne );
 	dTwo = Align( strKMer, strTwo, dCutoff, iTwo );
-	if( ( dOne > dCutoff ) && ( dTwo > dCutoff ) )
+	dMin = min( dOne, dTwo );
+	if( ( dMin > dCutoff ) || !( fAllowDuplicates || dMin ) )
 		return -1;
 
 	pPST = CreatePST( iRet );
 
 	return dMin; }
 
-uint32_t CCoalesceMotifLibraryImpl::MergeRCs( uint32_t iOne, uint32_t iTwo, float dCutoff ) {
+uint32_t CCoalesceMotifLibraryImpl::MergeRCs( uint32_t iOne, uint32_t iTwo, float dCutoff,
+	bool fAllowDuplicates ) {
 	SCrossRCs	asCrosses[ 4 ];
 	uint32_t	iRet;
 	CPST*		pPST;
 		if( asCrosses[ i ].m_dScore < dMin ) {
 			dMin = asCrosses[ i ].m_dScore;
 			iMin = i; } }
-	if( dMin > dCutoff )
+	if( ( dMin > dCutoff ) || !( fAllowDuplicates || dMin ) )
 		return -1;
 
 	pPST = CreatePST( iRet );
 	return PSTIn.Align( strKMer, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iOffset ); }
 
 uint32_t CCoalesceMotifLibraryImpl::MergeKMerPST( const std::string& strKMer, const CPST& PSTIn,
-	float dCutoff ) {
+	float dCutoff, bool fAllowDuplicates ) {
 	int			iOffset;
 	float		dScore;
 	uint32_t	iRet;
 	CPST*		pPSTOut;
 
-	if( ( dScore = PSTIn.Align( strKMer, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iOffset ) ) > dCutoff )
+	if( ( ( dScore = PSTIn.Align( strKMer, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iOffset ) ) >
+		dCutoff ) || !( fAllowDuplicates || dScore ) )
 		return -1;
 
 	pPSTOut = CreatePST( iRet );
 
 	return min( dOne, dTwo ); }
 
-uint32_t CCoalesceMotifLibraryImpl::MergeRCPST( uint32_t iRC, const CPST& PSTIn, float dCutoff ) {
+uint32_t CCoalesceMotifLibraryImpl::MergeRCPST( uint32_t iRC, const CPST& PSTIn, float dCutoff,
+	bool fAllowDuplicates ) {
 	int			iOne, iTwo;
 	uint32_t	iRet;
 	CPST*		pPSTOut;
 	string		strOne, strTwo;
-	float		dOne, dTwo;
+	float		dOne, dTwo, dMin;
 
 	strOne = GetRCOne( iRC );
 	strTwo = GetReverseComplement( strOne );
 	dOne = PSTIn.Align( strOne, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iOne );
 	dTwo = PSTIn.Align( strTwo, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iTwo );
-	if( ( dOne > dCutoff ) && ( dTwo > dCutoff ) )
+	dMin = min( dOne, dTwo );
+	if( ( dMin > dCutoff ) || !( fAllowDuplicates || dMin ) )
 		return -1;
 
 	pPSTOut = CreatePST( iRet );
 
 	return PSTOne.Align( PSTTwo, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iOffset ); }
 
-uint32_t CCoalesceMotifLibraryImpl::MergePSTs( const CPST& PSTOne, const CPST& PSTTwo, float dCutoff ) {
+uint32_t CCoalesceMotifLibraryImpl::MergePSTs( const CPST& PSTOne, const CPST& PSTTwo, float dCutoff,
+	bool fAllowDuplicates ) {
 	int			iOffset;
 	uint32_t	iRet;
 	CPST*		pPSTOut;
 	float		dScore;
 
-	if( ( dScore = PSTOne.Align( PSTTwo, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iOffset ) ) > dCutoff )
+	if( ( ( dScore = PSTOne.Align( PSTTwo, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iOffset ) ) >
+		dCutoff ) || !( fAllowDuplicates || dScore ) )
 		return -1;
 
 	pPSTOut = CreatePST( iRet );

src/coalescemotifs.h

 			case ETypeRC:
 				switch( GetType( iTwo ) ) {
 					case ETypeKMer:
-						iRet = MergeKMerRC( iTwo, iOne, dCutoff );
+						iRet = MergeKMerRC( iTwo, iOne, dCutoff, fAllowDuplicates );
 						break;
 
 					case ETypeRC:
-						iRet = MergeRCs( iOne, iTwo, dCutoff );
+						iRet = MergeRCs( iOne, iTwo, dCutoff, fAllowDuplicates );
 						break;
 
 					case ETypePST:
-						iRet = MergeRCPST( iOne, *GetPST( iTwo ), dCutoff );
+						iRet = MergeRCPST( iOne, *GetPST( iTwo ), dCutoff, fAllowDuplicates );
 						break; }
 				break;
 
 			case ETypePST:
 				switch( GetType( iTwo ) ) {
 					case ETypeKMer:
-						iRet = MergeKMerPST( GetMotif( iTwo ), *GetPST( iOne ), dCutoff );
+						iRet = MergeKMerPST( GetMotif( iTwo ), *GetPST( iOne ), dCutoff, fAllowDuplicates );
 						break;
 
 					case ETypeRC:
-						iRet = MergeRCPST( iTwo, *GetPST( iOne ), dCutoff );
+						iRet = MergeRCPST( iTwo, *GetPST( iOne ), dCutoff, fAllowDuplicates );
 						break;
 
 					case ETypePST:
-						iRet = MergePSTs( *GetPST( iOne ), *GetPST( iTwo ), dCutoff );
+						iRet = MergePSTs( *GetPST( iOne ), *GetPST( iTwo ), dCutoff, fAllowDuplicates );
 						break; }
 				break;
 
 			case ETypeKMer:
 				switch( GetType( iTwo ) ) {
 					case ETypeRC:
-						iRet = MergeKMerRC( iOne, iTwo, dCutoff );
+						iRet = MergeKMerRC( iOne, iTwo, dCutoff, fAllowDuplicates );
 						break;
 
 					case ETypePST:
-						iRet = MergeKMerPST( GetMotif( iOne ), *GetPST( iTwo ), dCutoff );
+						iRet = MergeKMerPST( GetMotif( iOne ), *GetPST( iTwo ), dCutoff, fAllowDuplicates );
 						break;
 
 					case ETypeKMer:
-						iRet = MergeKMers( GetMotif( iOne ), GetMotif( iTwo ), dCutoff );
+						iRet = MergeKMers( GetMotif( iOne ), GetMotif( iTwo ), dCutoff, fAllowDuplicates );
 						break; }
 				break; }
 
 			return -1;
 		switch( GetType( iMotif ) ) {
 			case ETypeRC:
-				return MergeRCPST( iMotif, *pPST, FLT_MAX );
+				return MergeRCPST( iMotif, *pPST, FLT_MAX, true );
 
 			case ETypePST:
-				return MergePSTs( *GetPST( iMotif ), *pPST, FLT_MAX ); }
+				return MergePSTs( *GetPST( iMotif ), *pPST, FLT_MAX, true ); }
 
-		return MergeKMerPST( GetMotif( iMotif ), *pPST, FLT_MAX ); }
+		return MergeKMerPST( GetMotif( iMotif ), *pPST, FLT_MAX, true ); }
 
 	float Align( uint32_t iOne, uint32_t iTwo, float dCutoff ) {
 

src/coalescemotifsi.h

 
 	std::string GetMotif( uint32_t ) const;
 	CPST* CreatePST( uint32_t& );
-	uint32_t MergeKMers( const std::string&, const std::string&, float );
-	uint32_t MergeKMerRC( uint32_t, uint32_t, float );
-	uint32_t MergeKMerPST( const std::string&, const CPST&, float );
-	uint32_t MergeRCs( uint32_t, uint32_t, float );
-	uint32_t MergeRCPST( uint32_t, const CPST&, float );
-	uint32_t MergePSTs( const CPST&, const CPST&, float );
+	uint32_t MergeKMers( const std::string&, const std::string&, float, bool );
+	uint32_t MergeKMerRC( uint32_t, uint32_t, float, bool );
+	uint32_t MergeKMerPST( const std::string&, const CPST&, float, bool );
+	uint32_t MergeRCs( uint32_t, uint32_t, float, bool );
+	uint32_t MergeRCPST( uint32_t, const CPST&, float, bool );
+	uint32_t MergePSTs( const CPST&, const CPST&, float, bool );
 	float AlignKMers( const std::string&, const std::string&, float ) const;
 	float AlignKMerRC( const std::string&, uint32_t, float ) const;
 	float AlignKMerPST( const std::string&, const CPST&, float ) const;
 	 * Mean of the sample or population.
 	 * 
 	 * \returns
-	 * sum((x - dMean)^2) / (n - 1)
+	 * sum((x - dMean)^2) / n
 	 * 
 	 * \see
 	 * Average
 		size_t	iN;
 
 		Sums( Begin, End, NULL, &dRet, &iN );
-		return ( ( iN < 2 ) ? 0 : ( ( dRet / ( iN - 1 ) ) - ( dMean * dMean ) ) ); }
+		return ( iN ? max( ( dRet / iN ) - ( dMean * dMean ), 0 ) : 0 ); }
 
 	/*!
 	 * \brief
 		size_t	iN;
 
 		Sums( Begin, End, &dSum, &dRet, &iN );
-		if( iN < 2 )
+		if( !iN )
 			return 0;
 		dSum /= iN;
-		return ( ( dRet / ( iN - 1 ) ) - ( dSum * dSum ) ); }
+		return max( ( dRet / iN ) - ( dSum * dSum ), 0 ); }
 
 	/*!
 	 * \brief

tools/COALESCE/COALESCE.ggo

 option	"min_info"		u	"Uninformative motif threshhold (bits)"
 							double	default="0.3"
 option	"min_zscore"	U	"Minimum motif z-score magnitude"
-							double	default="0.25"
+							double	default="0.2"
 option	"max_motifs"	x	"Maximum motifs to merge exactly"
 							int	default="2500"
 

tools/Dat2Graph/Dat2Graph.cpp

 						iterGene->second += d; }
 		for( iterGene = mapGenes.begin( ); iterGene != mapGenes.end( ); ++iterGene )
 			cout << pDat->GetGene( iterGene->first ) << '\t' << iterGene->second << endl; }
-	else if( !strcmp( sArgs.format_arg, "dat" ) )
-		pDat->Save( cout, CDat::EFormatText );
+	else if( !strcmp( sArgs.format_arg, "dat" ) ) {
+		for( i = 0; i < pDat->GetGenes( ); ++i )
+			for( j = ( i + 1 ); j < pDat->GetGenes( ); ++j )
+				if( pDat->Get( i, j ) < dCutoff )
+					pDat->Set( i, j, CMeta::GetNaN( ) );
+		pDat->Save( cout, CDat::EFormatText ); }
 
 	return 0; }