Commits

chuttenh  committed 4a74800

[svn r424] Add all missing documentation (primarily COALESCE and SVDer)
Add off-by-default single channel normalization to COALESCE
Automatically detects and log/median transforms single channel data
It's slow and doesn't really improve results

  • Participants
  • Parent commits 3206b05

Comments (0)

Files changed (34)

File src/annotation.h

 	 * Requested child must be less than IOntology::GetChildren.
 	 */
 	virtual size_t GetChild( size_t iTerm, size_t iChild ) const = 0;
+	/*!
+	 * \brief
+	 * Retrieves the parent term IDs of the requested term.
+	 * 
+	 * \param iTerm
+	 * Index of ontology term.
+	 * 
+	 * \param setiParents
+	 * Output set of parent term IDs.
+	 * 
+	 * \returns
+	 * True on success, false otherwise.
+	 * 
+	 * \remarks
+	 * Operates recursively, returning all nodes between iTerm and the ontology root.
+	 * 
+	 * \see
+	 * GetParents | GetParent
+	 */
 	virtual bool GetParents( size_t iTerm, std::set<size_t>& setiParents ) const = 0;
+	/*!
+	 * \brief
+	 * Retrieves the descendant term IDs of the requested term.
+	 * 
+	 * \param iTerm
+	 * Index of ontology term.
+	 * 
+	 * \param setiChildren
+	 * Output set of descendant term IDs.
+	 * 
+	 * \returns
+	 * True on success, false otherwise.
+	 * 
+	 * \remarks
+	 * Operates recursively, returning all nodes between iTerm and the ontology leaves.
+	 * 
+	 * \see
+	 * GetChildren | GetChild
+	 */
 	virtual bool GetChildren( size_t iTerm, std::set<size_t>& setiChildren ) const = 0;
 	/*!
 	 * \brief
 
 		return m_vecstrSlims[ iSlim ]; }
 
+	/*!
+	 * \brief
+	 * Returns the number of ontology terms in the requested slim.
+	 * 
+	 * \param iSlim
+	 * ID of slim.
+	 * 
+	 * \returns
+	 * Number of ontology terms in the requested slim.
+	 * 
+	 * \see
+	 * GetNode
+	 */
 	size_t GetNodes( size_t iSlim ) const {
 
 		return m_vecveciTerms[ iSlim ].size( ); }
 
+	/*!
+	 * \brief
+	 * Returns the ontology term ID at the requested index in the requested slim.
+	 * 
+	 * \param iSlim
+	 * ID of slim.
+	 * 
+	 * \param iTerm
+	 * Index of ontology term to be returned.
+	 * 
+	 * \returns
+	 * Ontology term at the requested index within the given slim.
+	 * 
+	 * \remarks
+	 * No bounds checking is performed; iSlim must be less than GetSlims, and iTerm must be less than GetNodes.
+	 * 
+	 * \see
+	 * GetNodes
+	 */
 	size_t GetNode( size_t iSlim, size_t iTerm ) const {
 
 		return m_vecveciTerms[ iSlim ][ iTerm ]; }

File src/annotationi.h

-/*****************************************************************************
-* This file is provided under the Creative Commons Attribution 3.0 license.
-*
-* You are free to share, copy, distribute, transmit, or adapt this work
-* PROVIDED THAT you attribute the work to the authors listed below.
-* For more information, please see the following web page:
-* http://creativecommons.org/licenses/by/3.0/
-*
-* This file is a component of the Sleipnir library for functional genomics,
-* authored by:
-* Curtis Huttenhower (chuttenh@princeton.edu)
-* Mark Schroeder
-* Maria D. Chikina
-* Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
-*
-* If you use this library, the included executable tools, or any related
-* code in your work, please cite the following publication:
-* Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
-* Olga G. Troyanskaya.
-* "The Sleipnir library for computational functional genomics"
-*****************************************************************************/
-#ifndef ANNOTATIONI_H
-#define ANNOTATIONI_H
-
-#include <map>
-#include <iostream>
-#include <set>
-#include <stack>
-#include <string>
-#include <vector>
-
-#include "file.h"
-
-namespace Sleipnir {
-
-class CGene;
-class CGenes;
-class CGenome;
-class IOntology;
-
-class COntologyImpl {
-protected:
-	typedef std::map<std::string,size_t>	TMapStrI;
-	typedef std::set<const CGene*>			TSetPGenes;
-
-	struct SNode {
-		SNode( );
-
-		void Reset( );
-
-		std::string		m_strID;
-		std::string		m_strGloss;
-		size_t			m_iParents;
-		size_t*			m_aiParents;
-		size_t			m_iChildren;
-		size_t*			m_aiChildren;
-		size_t			m_iGenes;
-		const CGene**	m_apGenes;
-		size_t			m_iCacheGenes;
-		const CGene**	m_apCacheGenes;
-	};
-
-	struct SParser {
-		static const size_t	c_iBuffer	= 4096;
-
-		SParser( std::istream&, CGenome& );
-
-		bool GetLine( );
-		bool IsStart( const char* ) const;
-
-		std::istream&	m_istm;
-		CGenome&		m_Genome;
-		char			m_szLine[ c_iBuffer ];
-		std::string		m_strGloss;
-		size_t			m_iLine;
-	};
-
-	COntologyImpl( const std::string& strID ) : m_strID(strID), m_iNodes(0), m_aNodes(NULL) { }
-
-	~COntologyImpl( ) {
-
-		Reset( ); }
-
-	size_t GetNode( const std::string& ) const;
-	bool IsAnnotated( size_t, const CGene&, bool ) const;
-	const CGene& GetGene( size_t, size_t ) const;
-	void GetGeneNames( std::vector<std::string>& ) const;
-	void Reset( );
-	void CollectGenes( size_t, TSetPGenes& );
-	void TermFinder( const CGenes&, std::vector<STermFound>&, bool, bool, bool, float, const CGenes* ) const;
-
-	size_t GetNodes( ) const {
-
-		return m_iNodes; }
-
-	size_t GetParents( size_t iNode ) const {
-
-		return m_aNodes[ iNode ].m_iParents; }
-
-	size_t GetParent( size_t iNode, size_t iParent ) const {
-
-		return m_aNodes[ iNode ].m_aiParents[ iParent ]; }
-
-	size_t GetChildren( size_t iNode ) const {
-
-		return m_aNodes[ iNode ].m_iChildren; }
-
-	size_t GetChild( size_t iNode, size_t iChild ) const {
-
-		return m_aNodes[ iNode ].m_aiChildren[ iChild ]; }
-
-	size_t GetGenes( size_t iNode, bool fKids ) const {
-		size_t	iRet;
-
-		iRet = m_aNodes[ iNode ].m_iGenes;
-		if( fKids ) {
-			CollectGenes( iNode );
-			iRet += m_aNodes[ iNode ].m_iCacheGenes; }
-
-		return iRet; }
-
-	const std::string& GetID( ) const {
-
-		return m_strID; }
-
-	const std::string& GetID( size_t iNode ) const {
-
-		return m_aNodes[ iNode ].m_strID; }
-
-	const std::string& GetGloss( size_t iNode ) const {
-
-		return m_aNodes[ iNode ].m_strGloss; }
-
-	void CollectGenes( size_t iNode ) const {
-		TSetPGenes	setpGenes;
-
-		if( m_aNodes[ iNode ].m_iCacheGenes == -1 )
-			((COntologyImpl*)this)->CollectGenes( iNode, setpGenes ); }
-
+/*****************************************************************************
+* This file is provided under the Creative Commons Attribution 3.0 license.
+*
+* You are free to share, copy, distribute, transmit, or adapt this work
+* PROVIDED THAT you attribute the work to the authors listed below.
+* For more information, please see the following web page:
+* http://creativecommons.org/licenses/by/3.0/
+*
+* This file is a component of the Sleipnir library for functional genomics,
+* authored by:
+* Curtis Huttenhower (chuttenh@princeton.edu)
+* Mark Schroeder
+* Maria D. Chikina
+* Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
+*
+* If you use this library, the included executable tools, or any related
+* code in your work, please cite the following publication:
+* Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
+* Olga G. Troyanskaya.
+* "The Sleipnir library for computational functional genomics"
+*****************************************************************************/
+#ifndef ANNOTATIONI_H
+#define ANNOTATIONI_H
+
+#include <map>
+#include <iostream>
+#include <set>
+#include <stack>
+#include <string>
+#include <vector>
+
+#include "file.h"
+
+namespace Sleipnir {
+
+class CGene;
+class CGenes;
+class CGenome;
+class IOntology;
+
+class COntologyImpl {
+protected:
+	typedef std::map<std::string,size_t>	TMapStrI;
+	typedef std::set<const CGene*>			TSetPGenes;
+
+	struct SNode {
+		SNode( );
+
+		void Reset( );
+
+		std::string		m_strID;
+		std::string		m_strGloss;
+		size_t			m_iParents;
+		size_t*			m_aiParents;
+		size_t			m_iChildren;
+		size_t*			m_aiChildren;
+		size_t			m_iGenes;
+		const CGene**	m_apGenes;
+		size_t			m_iCacheGenes;
+		const CGene**	m_apCacheGenes;
+	};
+
+	struct SParser {
+		static const size_t	c_iBuffer	= 4096;
+
+		SParser( std::istream&, CGenome& );
+
+		bool GetLine( );
+		bool IsStart( const char* ) const;
+
+		std::istream&	m_istm;
+		CGenome&		m_Genome;
+		char			m_szLine[ c_iBuffer ];
+		std::string		m_strGloss;
+		size_t			m_iLine;
+	};
+
+	COntologyImpl( const std::string& strID ) : m_strID(strID), m_iNodes(0), m_aNodes(NULL) { }
+
+	~COntologyImpl( ) {
+
+		Reset( ); }
+
+	size_t GetNode( const std::string& ) const;
+	bool IsAnnotated( size_t, const CGene&, bool ) const;
+	const CGene& GetGene( size_t, size_t ) const;
+	void GetGeneNames( std::vector<std::string>& ) const;
+	void Reset( );
+	void CollectGenes( size_t, TSetPGenes& );
+	void TermFinder( const CGenes&, std::vector<STermFound>&, bool, bool, bool, float, const CGenes* ) const;
+
+	size_t GetNodes( ) const {
+
+		return m_iNodes; }
+
+	size_t GetParents( size_t iNode ) const {
+
+		return m_aNodes[ iNode ].m_iParents; }
+
+	size_t GetParent( size_t iNode, size_t iParent ) const {
+
+		return m_aNodes[ iNode ].m_aiParents[ iParent ]; }
+
+	size_t GetChildren( size_t iNode ) const {
+
+		return m_aNodes[ iNode ].m_iChildren; }
+
+	size_t GetChild( size_t iNode, size_t iChild ) const {
+
+		return m_aNodes[ iNode ].m_aiChildren[ iChild ]; }
+
+	size_t GetGenes( size_t iNode, bool fKids ) const {
+		size_t	iRet;
+
+		iRet = m_aNodes[ iNode ].m_iGenes;
+		if( fKids ) {
+			CollectGenes( iNode );
+			iRet += m_aNodes[ iNode ].m_iCacheGenes; }
+
+		return iRet; }
+
+	const std::string& GetID( ) const {
+
+		return m_strID; }
+
+	const std::string& GetID( size_t iNode ) const {
+
+		return m_aNodes[ iNode ].m_strID; }
+
+	const std::string& GetGloss( size_t iNode ) const {
+
+		return m_aNodes[ iNode ].m_strGloss; }
+
+	void CollectGenes( size_t iNode ) const {
+		TSetPGenes	setpGenes;
+
+		if( m_aNodes[ iNode ].m_iCacheGenes == -1 )
+			((COntologyImpl*)this)->CollectGenes( iNode, setpGenes ); }
+
 	bool GetChildren( size_t iNode, std::set<size_t>& setiChildren ) const {
 		size_t	i, iChild;
 
 			setiChildren.insert( iChild ); }
 
 		return true; }
-
+
 	bool GetParents( size_t iNode, std::set<size_t>& setiParents ) const {
 		size_t	i, iParent;
 
 			setiParents.insert( iParent ); }
 
 		return true; }
-
-	const IOntology*	m_pOntology;
-	std::string			m_strID;
-	size_t				m_iNodes;
-	TMapStrI			m_mapNodes;
-	SNode*				m_aNodes;
-};
-
-class COntologyKEGGImpl : protected COntologyImpl {
-protected:
-	static const char	c_szKEGG[];
-	static const char	c_szEntry[];
-	static const char	c_szName[];
-	static const char	c_szDefinition[];
-	static const char	c_szClass[];
-	static const char	c_szPath[];
-	static const char	c_szBR[];
-	static const char	c_szDBLinks[];
-	static const char	c_szGenes[];
-	static const char	c_szEnd[];
-	static const size_t	c_iKEGG		= 10000;
-
-	struct SParserKEGG : SParser {
-		SParserKEGG( std::istream&, CGenome&, const std::string&, bool fSynonyms );
-
-		void Reset( );
-
-		const std::string&					m_strOrganism;
-		bool								m_fOrganism;
-		bool								m_fPathing;
-		bool								m_fSynonyms;
-		std::vector<CGene*>					m_vecpGenes;
-		std::vector<std::string>			m_vecstrIDs;
-		std::map<std::string,std::string>	m_mapGlosses;
-	};
-
-	COntologyKEGGImpl( );
-
-	bool Open( SParserKEGG& );
-	bool OpenEntry( SParserKEGG& );
-	bool OpenName( SParserKEGG& );
-	bool OpenDefinition( SParserKEGG& );
-	bool OpenClass( SParserKEGG& );
-	bool OpenDBLinks( SParserKEGG& );
-	bool OpenGenes( SParserKEGG& );
-	bool OpenOrganism( SParserKEGG& );
-	char* OpenGene( SParserKEGG&, char* );
-	bool OpenEnd( SParserKEGG& );
-	bool OpenGloss( SParserKEGG& );
-};
-
-class COntologyGOImpl : protected COntologyImpl {
-protected:
-	static const char	c_szAltID[];
-	static const char	c_szGO[];
-	static const char	c_szGOC[];
-	static const char	c_szHUMAN[];
-	static const char	c_szID[];
-	static const char	c_szIsA[];
-	static const char	c_szIsObsolete[];
-	static const char	c_szName[];
-	static const char	c_szNamespace[];
-	static const char	c_szNOT[];
-	static const char	c_szPartOf[];
-	static const char	c_szRelationship[];
-	static const char	c_szSGD[];
-	static const char	c_szTerm[];
-
-	struct SParserGO : SParser {
-		typedef std::set<const CGene*>	TSetPGene;
-
-		SParserGO( std::istream&, CGenome&, bool = false, bool = false );
-
-		void Reset( );
-
-		const char*					m_szTarget;
-		std::vector<std::vector<std::string> >	m_vecvecstrParents;
-		bool						m_fObsolete;
-		bool						m_fDBIDs;
-		bool						m_fSynonyms;
-		std::string					m_strNamespace;
-		std::vector<std::string>	m_vecstrIDs;
-		std::vector<SNode>			m_vecNodes;
-		std::vector<TSetPGene>		m_vecsetpGenes;
-	};
-
-	COntologyGOImpl( );
-
-	bool OpenOntology( SParserGO& );
-	bool OpenHeader( SParserGO& );
-	bool OpenBlock( SParserGO& );
-	bool OpenTerm( SParserGO& );
-	bool OpenID( SParserGO& );
-	bool OpenName( SParserGO& );
-	bool OpenNamespace( SParserGO& );
-	bool OpenRelationship( SParserGO& );
-	bool OpenParent( SParserGO& );
-	bool OpenAltID( SParserGO& );
-	bool OpenObsolete( SParserGO& );
-	bool OpenGenes( SParserGO& );
-	bool OpenGene( SParserGO& );
-};
-
-class COntologyMIPSImpl : protected COntologyImpl {
-protected:
-	static const char	c_szMIPS[];
-
-	struct SParserMIPS : SParser {
-		SParserMIPS( std::istream&, CGenome& );
-
-		std::vector<size_t>						m_veciParents;
-		std::vector<std::string>				m_vecstrIDs;
-		std::vector<std::string>				m_vecstrGlosses;
-		std::stack<size_t>						m_stakiHier;
-		std::vector<std::vector<const CGene*> >	m_vecpGenes;
-	};
-
-	COntologyMIPSImpl( );
-
-	bool OpenOntology( SParserMIPS& );
-	bool OpenCategory( SParserMIPS& );
-	size_t OpenID( SParserMIPS& );
-	bool OpenGenes( SParserMIPS& );
-	bool OpenGene( SParserMIPS& );
-};
-
-class CSlimImpl : protected CFile {
-protected:
-	void Reset( const IOntology* );
-
-	std::vector<std::string>				m_vecstrSlims;
-	std::vector<std::vector<size_t> >		m_vecveciTerms;
-	std::vector<std::vector<const CGene*> >	m_vecvecpGenes;
-	const IOntology*						m_pOntology;
-};
-
-}
-
-#endif // ANNOTATIONI_H
+
+	const IOntology*	m_pOntology;
+	std::string			m_strID;
+	size_t				m_iNodes;
+	TMapStrI			m_mapNodes;
+	SNode*				m_aNodes;
+};
+
+class COntologyKEGGImpl : protected COntologyImpl {
+protected:
+	static const char	c_szKEGG[];
+	static const char	c_szEntry[];
+	static const char	c_szName[];
+	static const char	c_szDefinition[];
+	static const char	c_szClass[];
+	static const char	c_szPath[];
+	static const char	c_szBR[];
+	static const char	c_szDBLinks[];
+	static const char	c_szGenes[];
+	static const char	c_szEnd[];
+	static const size_t	c_iKEGG		= 10000;
+
+	struct SParserKEGG : SParser {
+		SParserKEGG( std::istream&, CGenome&, const std::string&, bool fSynonyms );
+
+		void Reset( );
+
+		const std::string&					m_strOrganism;
+		bool								m_fOrganism;
+		bool								m_fPathing;
+		bool								m_fSynonyms;
+		std::vector<CGene*>					m_vecpGenes;
+		std::vector<std::string>			m_vecstrIDs;
+		std::map<std::string,std::string>	m_mapGlosses;
+	};
+
+	COntologyKEGGImpl( );
+
+	bool Open( SParserKEGG& );
+	bool OpenEntry( SParserKEGG& );
+	bool OpenName( SParserKEGG& );
+	bool OpenDefinition( SParserKEGG& );
+	bool OpenClass( SParserKEGG& );
+	bool OpenDBLinks( SParserKEGG& );
+	bool OpenGenes( SParserKEGG& );
+	bool OpenOrganism( SParserKEGG& );
+	char* OpenGene( SParserKEGG&, char* );
+	bool OpenEnd( SParserKEGG& );
+	bool OpenGloss( SParserKEGG& );
+};
+
+class COntologyGOImpl : protected COntologyImpl {
+protected:
+	static const char	c_szAltID[];
+	static const char	c_szGO[];
+	static const char	c_szGOC[];
+	static const char	c_szHUMAN[];
+	static const char	c_szID[];
+	static const char	c_szIsA[];
+	static const char	c_szIsObsolete[];
+	static const char	c_szName[];
+	static const char	c_szNamespace[];
+	static const char	c_szNOT[];
+	static const char	c_szPartOf[];
+	static const char	c_szRelationship[];
+	static const char	c_szSGD[];
+	static const char	c_szTerm[];
+
+	struct SParserGO : SParser {
+		typedef std::set<const CGene*>	TSetPGene;
+
+		SParserGO( std::istream&, CGenome&, bool = false, bool = false );
+
+		void Reset( );
+
+		const char*					m_szTarget;
+		std::vector<std::vector<std::string> >	m_vecvecstrParents;
+		bool						m_fObsolete;
+		bool						m_fDBIDs;
+		bool						m_fSynonyms;
+		std::string					m_strNamespace;
+		std::vector<std::string>	m_vecstrIDs;
+		std::vector<SNode>			m_vecNodes;
+		std::vector<TSetPGene>		m_vecsetpGenes;
+	};
+
+	COntologyGOImpl( );
+
+	bool OpenOntology( SParserGO& );
+	bool OpenHeader( SParserGO& );
+	bool OpenBlock( SParserGO& );
+	bool OpenTerm( SParserGO& );
+	bool OpenID( SParserGO& );
+	bool OpenName( SParserGO& );
+	bool OpenNamespace( SParserGO& );
+	bool OpenRelationship( SParserGO& );
+	bool OpenParent( SParserGO& );
+	bool OpenAltID( SParserGO& );
+	bool OpenObsolete( SParserGO& );
+	bool OpenGenes( SParserGO& );
+	bool OpenGene( SParserGO& );
+};
+
+class COntologyMIPSImpl : protected COntologyImpl {
+protected:
+	static const char	c_szMIPS[];
+
+	struct SParserMIPS : SParser {
+		SParserMIPS( std::istream&, CGenome& );
+
+		std::vector<size_t>						m_veciParents;
+		std::vector<std::string>				m_vecstrIDs;
+		std::vector<std::string>				m_vecstrGlosses;
+		std::stack<size_t>						m_stakiHier;
+		std::vector<std::vector<const CGene*> >	m_vecpGenes;
+	};
+
+	COntologyMIPSImpl( );
+
+	bool OpenOntology( SParserMIPS& );
+	bool OpenCategory( SParserMIPS& );
+	size_t OpenID( SParserMIPS& );
+	bool OpenGenes( SParserMIPS& );
+	bool OpenGene( SParserMIPS& );
+};
+
+class CSlimImpl : protected CFile {
+protected:
+	void Reset( const IOntology* );
+
+	std::vector<std::string>				m_vecstrSlims;
+	std::vector<std::vector<size_t> >		m_vecveciTerms;
+	std::vector<std::vector<const CGene*> >	m_vecvecpGenes;
+	const IOntology*						m_pOntology;
+};
+
+}
+
+#endif // ANNOTATIONI_H

File src/clusthierarchical.cpp

 	if( m_pRight )
 		delete m_pRight; }
 
-bool CHierarchyImpl::Save( std::ostream& ostm, size_t iNode, const vector<string>* pvecstrGenes ) const {
+bool CHierarchyImpl::Save( std::ostream& ostm, size_t iNode,
+	const std::vector<std::string>* pvecstrGenes ) const {
 
 	if( IsGene( ) )
 		return false;
 	return ( ((const CHierarchyImpl*)m_pLeft)->Save( ostm, iNode, pvecstrGenes ) ||
 		((const CHierarchyImpl*)m_pRight)->Save( ostm, iNode, pvecstrGenes ) ); }
 
-string CHierarchyImpl::GetSave( const vector<string>* pvecstrGenes ) const {
+string CHierarchyImpl::GetSave( const std::vector<std::string>* pvecstrGenes ) const {
 	string	strRet;
 	char	achBuf[ 16 ];
 

File src/clusthierarchical.h

-/*****************************************************************************
-* This file is provided under the Creative Commons Attribution 3.0 license.
-*
-* You are free to share, copy, distribute, transmit, or adapt this work
-* PROVIDED THAT you attribute the work to the authors listed below.
-* For more information, please see the following web page:
-* http://creativecommons.org/licenses/by/3.0/
-*
-* This file is a component of the Sleipnir library for functional genomics,
-* authored by:
-* Curtis Huttenhower (chuttenh@princeton.edu)
-* Mark Schroeder
-* Maria D. Chikina
-* Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
-*
-* If you use this library, the included executable tools, or any related
-* code in your work, please cite the following publication:
-* Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
-* Olga G. Troyanskaya.
-* "The Sleipnir library for computational functional genomics"
-*****************************************************************************/
-#ifndef CLUSHIERARCHICAL_H
-#define CLUSHIERARCHICAL_H
-
-#include "clusthierarchicali.h"
-
-namespace Sleipnir {
-
-class CPCL;
-class IMeasure;
-
-/*!
- * \brief
- * Represents a simple node in a binary tree.
- * 
- * Generated by CClustHierarchical::Cluster, a CHierarchy is an extremely rudimentary representation of an
- * binary tree intended to be serialized to disk as the GTR file in a CDT/GTR pair.  Each node either zero
- * or two children, a unique integer identifier within the tree, and a similarity score indicating its height
- * within the tree.
- */
-class CHierarchy : public CHierarchyImpl {
-public:
-	CHierarchy( size_t iID, float dSimilarity, const CHierarchy* pLeft, const CHierarchy* pRight );
-
-	void GetGenes( std::vector<size_t>& veciGenes ) const;
-	float SortChildren( const std::vector<float>& vecdScores );
-
-	/*!
-	 * \brief
-	 * Save the hierarchy to the given stream in GTR format.
-	 * 
-	 * \param ostm
-	 * Output stream into which the hierarchy is saved.
-	 * 
-	 * \param iGenes
-	 * Total number of leaf nodes in the hierarchy.
-	 * 
-	 * \param pvecstrGenes
-	 * If non-NULL, vector of gene names to be emitted in place of GENE IDs.
-	 * 
-	 * \remarks
-	 * iGenes can be calculated from the hierarchy; it is included as an input solely for convenience
-	 * purposes, since the genes must be output in original order (not traversal order) to satisfy GTR
-	 * file formatting requirements.
-	 */
-	void Save( std::ostream& ostm, size_t iGenes,
-		const std::vector<std::string>* pvecstrGenes = NULL ) const {
-		size_t	i;
-
-		for( i = 0; ( i + 1 ) < iGenes; ++i )
-			CHierarchyImpl::Save( ostm, i, pvecstrGenes ); }
-
-	/*!
-	 * \brief
-	 * Safety method to delete a hierarchy.
-	 * 
-	 * \remarks
-	 * Included to avoid the necessity of directly deleting something allocated within a library method.
-	 */
-	void Destroy( ) {
-
-		delete this; }
-
-	/*!
-	 * \brief
-	 * Returns this node's height within the hierarchy.
-	 * 
-	 * \returns
-	 * The current node's height within the hierarchy.
-	 */
-	float GetSimilarity( ) const {
-
-		return m_dScore; }
-
-	/*!
-	 * \brief
-	 * Returns true if the current node is a leaf node (i.e. represents a gene in the hierarchy).
-	 * 
-	 * \returns
-	 * True if the current node is a leaf (has no children).
-	 */
-	bool IsGene( ) const {
-
-		return CHierarchyImpl::IsGene( ); }
-
-	/*!
-	 * \brief
-	 * Returns the current node's unique ID within the hierarchy.
-	 * 
-	 * \returns
-	 * The current node's ID.
-	 * 
-	 * \remarks
-	 * Leaf node IDs generally correspond to gene indices within the pre-clustered PCL; internal node IDs are
-	 * arbitrary unique values.
-	 */
-	size_t GetID( ) const {
-
-		return m_iID; }
-
-	/*!
-	 * \brief
-	 * Returns the current node's left or right child.
-	 * 
-	 * \param fRight
-	 * If true, return the right (second) child; otherwise, return the left (first).
-	 * 
-	 * \returns
-	 * One of the current node's two children.
-	 * 
-	 * \remarks
-	 * Do not call for leaf nodes.
-	 * 
-	 * \see
-	 * IsLeaf
-	 */
-	const CHierarchy& Get( bool fRight ) const {
-
-		return *( fRight ? m_pRight : m_pLeft ); }
-};
-
-/*!
- * \brief
- * Utility class containing static hierarchical clustering methods.
- */
-class CClustHierarchical : CClustHierarchicalImpl {
-public:
-	static CHierarchy* Cluster( const CDistanceMatrix& MatSimilarities );
-	static CHierarchy* Cluster( const CDistanceMatrix& MatSimilarities,
-		const std::vector<bool>& vecfIncluded );
-};
-
-}
-
-#endif // CLUSHIERARCHICAL_H
+/*****************************************************************************
+* This file is provided under the Creative Commons Attribution 3.0 license.
+*
+* You are free to share, copy, distribute, transmit, or adapt this work
+* PROVIDED THAT you attribute the work to the authors listed below.
+* For more information, please see the following web page:
+* http://creativecommons.org/licenses/by/3.0/
+*
+* This file is a component of the Sleipnir library for functional genomics,
+* authored by:
+* Curtis Huttenhower (chuttenh@princeton.edu)
+* Mark Schroeder
+* Maria D. Chikina
+* Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
+*
+* If you use this library, the included executable tools, or any related
+* code in your work, please cite the following publication:
+* Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
+* Olga G. Troyanskaya.
+* "The Sleipnir library for computational functional genomics"
+*****************************************************************************/
+#ifndef CLUSHIERARCHICAL_H
+#define CLUSHIERARCHICAL_H
+
+#include "clusthierarchicali.h"
+
+namespace Sleipnir {
+
+class CPCL;
+class IMeasure;
+
+/*!
+ * \brief
+ * Represents a simple node in a binary tree.
+ * 
+ * Generated by CClustHierarchical::Cluster, a CHierarchy is an extremely rudimentary representation of a
+ * binary tree intended to be serialized to disk as the GTR file in a CDT/GTR pair.  Each node has either zero
+ * or two children, a unique integer identifier within the tree, and a similarity score indicating its height
+ * within the tree.
+ */
+class CHierarchy : public CHierarchyImpl {
+public:
+	CHierarchy( size_t iID, float dSimilarity, const CHierarchy* pLeft, const CHierarchy* pRight );
+
+	void GetGenes( std::vector<size_t>& veciGenes ) const;
+	float SortChildren( const std::vector<float>& vecdScores );
+
+	/*!
+	 * \brief
+	 * Save the hierarchy to the given stream in GTR format.
+	 * 
+	 * \param ostm
+	 * Output stream into which the hierarchy is saved.
+	 * 
+	 * \param iGenes
+	 * Total number of leaf nodes in the hierarchy.
+	 * 
+	 * \param pvecstrGenes
+	 * If non-NULL, vector of gene names to be emitted in place of GENE IDs.
+	 * 
+	 * \remarks
+	 * iGenes can be calculated from the hierarchy; it is included as an input solely for convenience
+	 * purposes, since the genes must be output in original order (not traversal order) to satisfy GTR
+	 * file formatting requirements.
+	 */
+	void Save( std::ostream& ostm, size_t iGenes,
+		const std::vector<std::string>* pvecstrGenes = NULL ) const {
+		size_t	i;
+
+		for( i = 0; ( i + 1 ) < iGenes; ++i )
+			CHierarchyImpl::Save( ostm, i, pvecstrGenes ); }
+
+	/*!
+	 * \brief
+	 * Safety method to delete a hierarchy.
+	 * 
+	 * \remarks
+	 * Included to avoid the necessity of directly deleting something allocated within a library method.
+	 */
+	void Destroy( ) {
+
+		delete this; }
+
+	/*!
+	 * \brief
+	 * Returns this node's height within the hierarchy.
+	 * 
+	 * \returns
+	 * The current node's height within the hierarchy.
+	 */
+	float GetSimilarity( ) const {
+
+		return m_dScore; }
+
+	/*!
+	 * \brief
+	 * Returns true if the current node is a leaf node (i.e. represents a gene in the hierarchy).
+	 * 
+	 * \returns
+	 * True if the current node is a leaf (has no children).
+	 */
+	bool IsGene( ) const {
+
+		return CHierarchyImpl::IsGene( ); }
+
+	/*!
+	 * \brief
+	 * Returns the current node's unique ID within the hierarchy.
+	 * 
+	 * \returns
+	 * The current node's ID.
+	 * 
+	 * \remarks
+	 * Leaf node IDs generally correspond to gene indices within the pre-clustered PCL; internal node IDs are
+	 * arbitrary unique values.
+	 */
+	size_t GetID( ) const {
+
+		return m_iID; }
+
+	/*!
+	 * \brief
+	 * Returns the current node's left or right child.
+	 * 
+	 * \param fRight
+	 * If true, return the right (second) child; otherwise, return the left (first).
+	 * 
+	 * \returns
+	 * One of the current node's two children.
+	 * 
+	 * \remarks
+	 * Do not call for leaf nodes.
+	 * 
+	 * \see
+	 * IsGene
+	 */
+	const CHierarchy& Get( bool fRight ) const {
+
+		return *( fRight ? m_pRight : m_pLeft ); }
+
+	/*!
+	 * \brief
+	 * Returns the number of leaves under the current node.
+	 * 
+	 * \returns
+	 * Number of leaves under the current node.
+	 */
+	size_t GetWeight( ) const {
+
+		return m_iWeight; }
+};
+
+/*!
+ * \brief
+ * Utility class containing static hierarchical clustering methods.
+ */
+class CClustHierarchical : CClustHierarchicalImpl {
+public:
+	static CHierarchy* Cluster( const CDistanceMatrix& MatSimilarities );
+	static CHierarchy* Cluster( const CDistanceMatrix& MatSimilarities,
+		const std::vector<bool>& vecfIncluded );
+};
+
+}
+
+#endif // CLUSHIERARCHICAL_H

File src/clustkmeans.cpp

 
 	return true; }
 
+/*!
+ * \brief
+ * Cluster a set of elements into k groups using the given pairwise similarities.
+ * 
+ * \param MatSimilarities
+ * Matrix of precalculated pairwise similarities between elements to be clustered.
+ * 
+ * \param iK
+ * Number of clusters to generate.
+ * 
+ * \param vecsClusters
+ * Output cluster IDs for each gene.
+ * 
+ * \returns
+ * True if clustering succeeded.
+ * 
+ * Performs k-means clustering on the given data using the specified similarities and number of
+ * clusters.  The indices of each element's final cluster are indicated in the output vector.  During
+ * k-means clustering, K centers are initially chosen at random.  Each gene is assigned to the center
+ * most similar to it, and the centers are moved to the mean of their assigned genes.  This process is
+ * iterated until no gene assignments change.  This places each gene in exactly one cluster.
+ * 
+ * \remarks
+ * The size of MatSimilarities must be at least iK; on successful return, the size of vecsClusters will be equal
+ * to the size of MatSimilarities.
+ * 
+ * \see
+ * CClustHierarchical::Cluster
+ */
 bool CClustKMeans::Cluster( const CDistanceMatrix& MatSimilarities, size_t iK,
 	vector<uint16_t>& vecsClusters ) {
 	size_t			i, j, iOne, iIteration, iChanged, iState;

File src/coalesce.cpp

 #include "coalesce.h"
 #include "fasta.h"
 #include "pcl.h"
+#include "halfmatrix.h"
 
 namespace Sleipnir {
 
 			iMotif, vecdScores, veciLengths, iOffset, sModifiers ) )
 			return false;
 	for( iSubsequence = ESubsequenceBegin; iSubsequence < vecdScores.size( ); ++iSubsequence )
-		if( ( iLength = veciLengths[ iSubsequence ] ) && ( dScore = vecdScores[ iSubsequence ] ) )
-			Set( iType, (ESubsequence)iSubsequence, iGene, iMotif, dScore / iLength );
+		if( ( iLength = veciLengths[ iSubsequence ] ) && ( dScore = vecdScores[ iSubsequence ] ) ) {
+//			dScore *= m_vecvecdWeights[ iType ][ iGene ];
+			Set( iType, (ESubsequence)iSubsequence, iGene, iMotif, dScore / iLength ); }
 
 	return true; }
 
 		( adScores = Get( iType, sMotif.m_eSubsequence, iGene ) ) )
 		adScores[ sMotif.m_iMotif ] -= sMotif.m_dAverage; }
 
+bool CCoalesceGeneScores::CalculateWeights( ) {	// For each type, bins genes whose motif scores correlate highly and scales each gene's scores by 1 / (size of its bin); always returns true.
+	static const float		c_dCutoff		= 0.95f;	// gene pairs whose measure value falls below this are not merged
+	static const size_t		c_iSubsample	= 100;	// maximum number of motif scores sampled per gene pair
+	size_t					i, j, k, iBin, iType;
+	CMeasureQuickPearson	MeasurePearson;
+	uint32_t				iMotif;
+	vector<float>			vecdSampleOne, vecdSampleTwo;
+	vector<size_t>			veciBins, veciCounts;
+	float					dOne, dTwo, dMaxOne, dMaxTwo;
+
+	g_CatSleipnir.notice( "CCoalesceGeneScores::CalculateWeights( ) calculating %d types for %d genes",
+		GetTypes( ), m_iGenes );
+	veciBins.resize( m_iGenes );
+	veciCounts.resize( m_iGenes );
+	vecdSampleOne.resize( c_iSubsample );
+	vecdSampleTwo.resize( c_iSubsample );
+	m_vecvecdWeights.resize( GetTypes( ) );
+	for( iType = 0; iType < m_vecvecdWeights.size( ); ++iType ) {
+		vector<float>&	vecdWeights	= m_vecvecdWeights[ iType ];
+
+		vecdWeights.resize( m_iGenes );
+		for( i = 0; i < veciBins.size( ); ++i )
+			veciBins[ i ] = i;	// every gene starts in its own bin
+		for( i = 0; i < m_iGenes; ++i ) {
+			const float*	adOne	= Get( iType, ESubsequenceTotal, i );
+
+			if( !( i % 1000 ) )	// progress message every 1000 genes
+				g_CatSleipnir.info( "CCoalesceGeneScores::CalculateWeights( ) calculating type %d/%d, gene %d/%d",
+					iType, m_vecvecdWeights.size( ), i, m_iGenes );
+			if( !adOne )	// Get returned no score array for this gene; nothing to compare
+				continue;
+			for( j = ( i + 1 ); j < m_iGenes; ++j ) {
+				const float*	adTwo	= Get( iType, ESubsequenceTotal, j );
+
+				if( !adTwo )
+					continue;
+				dMaxOne = dMaxTwo = 0;
+				for( iMotif = k = 0; ( k < vecdSampleOne.size( ) ) && ( iMotif < m_iMotifs ); ++iMotif ) {	// stop once the sample buffers are full or the motifs run out
+					dOne = adOne[ iMotif ];
+					dTwo = adTwo[ iMotif ];
+					if( dOne || dTwo ) {	// sample only motifs where at least one of the two genes has a nonzero score
+						if( dOne > dMaxOne )
+							dMaxOne = dOne;
+						if( dTwo > dMaxTwo )
+							dMaxTwo = dTwo;
+						vecdSampleOne[ k ] = dOne;
+						vecdSampleTwo[ k++ ] = dTwo; } }
+				if( !k || !dMaxOne || !dMaxTwo || ( MeasurePearson.Measure( &vecdSampleOne[ 0 ], k,
+					&vecdSampleTwo[ 0 ], k, IMeasure::EMapNone, NULL, NULL ) < c_dCutoff ) )	// skip the pair if nothing was sampled, either maximum stayed at zero, or the measure is below the cutoff
+					continue;
+
+				iBin = veciBins[ j ];	// merge: every gene currently sharing j's bin is relabeled with i's bin
+				for( k = 0; k < veciBins.size( ); ++k )
+					if( veciBins[ k ] == iBin )
+						veciBins[ k ] = veciBins[ i ]; } }
+		fill( veciCounts.begin( ), veciCounts.end( ), 0 );	// recount bin membership from scratch for this type
+		for( i = 0; i < veciBins.size( ); ++i )
+			veciCounts[ veciBins[ i ] ]++;
+		for( i = 0; i < veciBins.size( ); ++i )
+			vecdWeights[ i ] = 1.0f / veciCounts[ veciBins[ i ] ];	// weight is the reciprocal of the gene's bin size
+
+		for( i = 0; i < m_iGenes; ++i ) {
+			if( vecdWeights[ i ] == 1 )	// gene is alone in its bin, so its scores are left unchanged
+				continue;
+			g_CatSleipnir.info( "CCoalesceGeneScores::CalculateWeights( ) weighting gene %d, type %d to %g",
+				i, iType, vecdWeights[ i ] );
+			for( j = ESubsequenceBegin; j < ESubsequenceEnd; ++j ) {	// scale every motif score in each subsequence for which Get returns an array
+				float*	adOne	= Get( iType, (ESubsequence)j, i );
+
+				if( adOne )
+					for( iMotif = 0; iMotif < GetMotifs( ); ++iMotif )
+						adOne[ iMotif ] *= vecdWeights[ i ]; } } }
+
+	return true; }
+
 // CCoalesceGroupHistograms
 
 bool CCoalesceGroupHistograms::Add( const CCoalesceGeneScores& GeneScores, size_t iGene, bool fSubtract,
 	if( !GeneScores.GetMotifs( ) )
 		Clear( );
 
-	return true; }
+// Disabled because it doesn't really work very well at all.
+	return true; } // GeneScores.CalculateWeights( ); }
 
+/*!
+ * \brief
+ * Executes the COALESCE regulatory module prediction algorithm on the given gene expression (and,
+ * optionally, sequence) data.
+ * 
+ * \param PCL
+ * PCL file containing genes and expression values with which clustering is performed.
+ * 
+ * \param FASTA
+ * FASTA file (possibly empty) containing gene sequences used for motif prediction during clustering.
+ * 
+ * \param vecClusters
+ * Output vector of regulatory modules predicted by COALESCE.
+ * 
+ * \returns
+ * True if clustering succeeded (possibly without predicting any modules), false otherwise.
+ * 
+ * Executes the COALESCE algorithm on the given data, predicting zero or more regulatory modules (expression
+ * biclusters plus putative sequence motifs).  Each predicted module consists of one or more genes, one or
+ * more conditions of the given PCL in which those genes are coregulated, and zero or more sequence motifs
+ * over- or under-enriched (and thus potentially causal) in the module's genes.  For more details, see
+ * CCoalesce and Huttenhower et al. 2009.
+ * 
+ * \remarks
+ * Cluster interacts heavily with CCoalesceCluster, which performs the major steps of condition, motif, and
+ * gene selection.  Cluster itself contains mainly the skeleton of the algorithm, including initialization
+ * and convergence detection.
+ * 
+ * \see
+ * CCoalesce | CCoalesceCluster
+ */
 bool CCoalesce::Cluster( const CPCL& PCL, const CFASTA& FASTA, vector<CCoalesceCluster>& vecClusters ) {
 	static const float			c_dEpsilon	= 1e-10f;
 	CPCL						PCLCopy;

File src/coalesce.h

 
 namespace Sleipnir {
 
+/*!
+ * \brief
+ * Performs regulatory module prediction (gene expression biclustering plus de novo sequence motif discovery)
+ * using the COALESCE algorithm of Huttenhower et al. 2009.
+ * 
+ * The COALESCE algorithm consumes gene expression data and, optionally, DNA sequences, to predict regulatory
+ * modules.  These consist of expression biclusters (subsets of genes and conditions) and putative regulatory
+ * motifs.  COALESCE predicts modules in a serial manner, seeding each module with a small number of
+ * correlated genes.  It then iterates between feature selection and Bayesian integration of the selected
+ * features to determine which genes should be in the module.  Feature selection chooses expression conditions
+ * in which the cluster's genes are differentially expressed (i.e. significantly different than the genomic
+ * background) and sequence motifs over- or under-enriched in sequences associated with the cluster's genes
+ * (also relative to genomic background).  Bayesian integration assumes that these features are independent
+ * (although prior knowledge of non-independent datasets can be provided and used to incorporate covariance
+ * information) and calculates the probability with which each gene in the genome is included in the developing
+ * module.  These two steps (feature selection and Bayesian integration) are iterated until the module has
+ * converged, at which point its average values (expression and motif frequencies) are subtracted from its
+ * genes' data, and COALESCE continues with the next module.  A variety of options and data can be used to
+ * modify this procedure, both at the level of the algorithm itself (e.g. the probability threshold at which
+ * genes are included in a module) and at the level of implementation optimizations (e.g. the granularity
+ * with which motif frequencies are discretized).
+ * 
+ * \remarks
+ * CCoalesce is tightly coupled to CCoalesceCluster, where many of the details of the COALESCE algorithm
+ * are implemented.  CCoalesce serves mainly to store configuration state information for clustering, to
+ * initialize associated data structures, and to provide the outermost skeleton of the algorithm.
+ * 
+ * \see
+ * CCoalesceCluster
+ */
 class CCoalesce : CCoalesceImpl {
 public:
 	bool Cluster( const CPCL& PCL, const CFASTA& FASTA, std::vector<CCoalesceCluster>& vecClusters );
 
+	/*!
+	 * \brief
+	 * Sets the correlation p-value threshold for genes to be included in a cluster during initialization.
+	 * 
+	 * \param dPValue
+	 * Correlation p-value threshold for gene inclusion during module initialization.
+	 * 
+	 * \see
+	 * GetPValueCorrelation
+	 */
 	void SetPValueCorrelation( float dPValue ) {
 
 		m_dPValueCorrelation = dPValue; }
 
+	/*!
+	 * \brief
+	 * Returns the correlation p-value threshold for genes to be included in a cluster during initialization.
+	 * 
+	 * \returns
+	 * P-value threshold for gene inclusion during module initialization.
+	 * 
+	 * \see
+	 * SetPValueCorrelation
+	 */
 	float GetPValueCorrelation( ) const {
 
 		return m_dPValueCorrelation; }
 
+	/*!
+	 * \brief
+	 * Sets the number of discretization bins used for calculating motif frequency histograms.
+	 * 
+	 * \param iBins
+	 * Number of bins used to discretize motif frequencies.
+	 * 
+	 * \see
+	 * GetBins
+	 */
 	void SetBins( size_t iBins ) {
 
 		m_iBins = iBins; }
 
+	/*!
+	 * \brief
+	 * Returns the number of discretization bins used for calculating motif frequency histograms.
+	 * 
+	 * \returns
+	 * Number of bins used to discretize motif frequencies.
+	 * 
+	 * \see
+	 * SetBins
+	 */
 	size_t GetBins( ) const {
 
 		return m_iBins; }
 
+	/*!
+	 * \brief
+	 * Returns the z-score effect size threshold for including significant expression conditions in a cluster.
+	 * 
+	 * \returns
+	 * Z-score threshold for inclusion of expression conditions in a cluster.
+	 * 
+	 * \see
+	 * SetZScoreCondition
+	 */
 	float GetZScoreCondition( ) const {
 
 		return m_dZScoreCondition; }
 
+	/*!
+	 * \brief
+	 * Sets the z-score effect size threshold for including significant expression conditions in a cluster.
+	 * 
+	 * \param dZScore
+	 * Z-score threshold for inclusion of expression conditions in a cluster.
+	 * 
+	 * \see
+	 * GetZScoreCondition
+	 */
 	void SetZScoreCondition( float dZScore ) {
 
 		m_dZScoreCondition = dZScore; }
 
+	/*!
+	 * \brief
+	 * Returns the p-value threshold for including significant expression conditions in a cluster.
+	 * 
+	 * \returns
+	 * P-value threshold for inclusion of expression conditions in a cluster.
+	 * 
+	 * \see
+	 * SetPValueCondition
+	 */
 	float GetPValueCondition( ) const {
 
 		return m_dPValueCondition; }
 
+	/*!
+	 * \brief
+	 * Sets the p-value threshold for including significant expression conditions in a cluster.
+	 * 
+	 * \param dPValue
+	 * P-value threshold for inclusion of expression conditions in a cluster.
+	 * 
+	 * \see
+	 * GetPValueCondition
+	 */
 	void SetPValueCondition( float dPValue ) {
 
 		m_dPValueCondition = dPValue; }
 
+	/*!
+	 * \brief
+	 * Returns the z-score effect size threshold for including significant sequence motifs in a cluster.
+	 * 
+	 * \returns
+	 * Z-score threshold for inclusion of motifs in a cluster.
+	 * 
+	 * \see
+	 * SetZScoreMotif
+	 */
 	float GetZScoreMotif( ) const {
 
 		return m_dZScoreMotif; }
 
+	/*!
+	 * \brief
+	 * Sets the z-score effect size threshold for including significant sequence motifs in a cluster.
+	 * 
+	 * \param dZScore
+	 * Z-score threshold for inclusion of motifs in a cluster.
+	 * 
+	 * \see
+	 * GetZScoreMotif
+	 */
 	void SetZScoreMotif( float dZScore ) {
 
 		m_dZScoreMotif = dZScore; }
 
+	/*!
+	 * \brief
+	 * Returns the p-value threshold for including significant sequence motifs in a cluster.
+	 * 
+	 * \returns
+	 * P-value threshold for inclusion of motifs in a cluster.
+	 * 
+	 * \see
+	 * SetPValueMotif
+	 */
 	float GetPValueMotif( ) const {
 
 		return m_dPValueMotif; }
 
+	/*!
+	 * \brief
+	 * Sets the p-value threshold for including significant sequence motifs in a cluster.
+	 * 
+	 * \param dPValue
+	 * P-value threshold for inclusion of motifs in a cluster.
+	 * 
+	 * \see
+	 * GetPValueMotif
+	 */
 	void SetPValueMotif( float dPValue ) {
 
 		m_dPValueMotif = dPValue; }
 
+	/*!
+	 * \brief
+	 * Returns the probability threshold for including genes in a cluster.
+	 * 
+	 * \returns
+	 * Probability threshold for inclusion of genes in a cluster.
+	 * 
+	 * \see
+	 * SetProbabilityGene
+	 */
 	float GetProbabilityGene( ) const {
 
 		return m_dProbabilityGene; }
 
+	/*!
+	 * \brief
+	 * Sets the probability threshold for including genes in a cluster.
+	 * 
+	 * \param dProbability
+	 * Probability threshold for inclusion of genes in a cluster.
+	 * 
+	 * \see
+	 * GetProbabilityGene
+	 */
 	void SetProbabilityGene( float dProbability ) {
 
 		m_dProbabilityGene = dProbability; }
 
+	/*!
+	 * \brief
+	 * Returns true if a module output directory has been set.
+	 * 
+	 * \returns
+	 * True if a module output directory has been set; false if modules are to be output only to standard out.
+	 * 
+	 * \see
+	 * GetDirectoryIntermediate | SetDirectoryIntermediate
+	 */
 	bool IsDirectoryIntermediate( ) const {
 
 		return !GetDirectoryIntermediate( ).empty( ); }
 
+	/*!
+	 * \brief
+	 * Returns the output directory for predicted modules.
+	 * 
+	 * \returns
+	 * Output directory in which predicted modules are saved.
+	 * 
+	 * \see
+	 * SetDirectoryIntermediate
+	 */
 	const std::string& GetDirectoryIntermediate( ) const {
 
 		return m_strDirectoryIntermediate; }
 
+	/*!
+	 * \brief
+	 * Sets the output directory for predicted modules.
+	 * 
+	 * \param strDirectoryIntermediate
+	 * Output directory in which predicted modules are saved.
+	 * 
+	 * \see
+	 * GetDirectoryIntermediate
+	 */
 	void SetDirectoryIntermediate( const std::string& strDirectoryIntermediate ) {
 
 		m_strDirectoryIntermediate = strDirectoryIntermediate; }
 
+	/*!
+	 * \brief
+	 * Sets the motif library used to manage gene sequences and motifs.
+	 * 
+	 * \param Motifs
+	 * Motif library used to manage gene sequences and motifs during clustering.
+	 * 
+	 * \see
+	 * GetMotifs
+	 */
 	void SetMotifs( CCoalesceMotifLibrary& Motifs ) {
 
 		if( m_fMotifs && m_pMotifs && ( m_pMotifs != &Motifs ) ) // replacing a different library while m_fMotifs is set (presumably an "owned by this object" flag — confirm in CCoalesceImpl)
 			delete m_pMotifs; // the previously stored library is destroyed before the new one is adopted
 		m_pMotifs = &Motifs; } // stores the caller's object by address. NOTE(review): m_fMotifs is not changed here; if it is an ownership flag, later cleanup could delete the caller's library — confirm
 
+	/*!
+	 * \brief
+	 * Returns the motif library used to manage gene sequences and motifs.
+	 * 
+	 * \returns
+	 * Motif library used to manage gene sequences and motifs during clustering; null if none has been set.
+	 * 
+	 * \see
+	 * SetMotifs
+	 */
 	const CCoalesceMotifLibrary* GetMotifs( ) const {
 
 		return m_pMotifs; }
 
+	/*!
+	 * \brief
+	 * Returns the length of k-mer motifs.
+	 * 
+	 * \returns
+	 * K-mer length of predicted motifs; also used as building blocks for more complex motifs.
+	 * 
+	 * \see
+	 * SetK
+	 */
 	size_t GetK( ) const {
 
 		return m_iK; }
 
+	/*!
+	 * \brief
+	 * Sets the length of k-mer motifs.
+	 * 
+	 * \param iK
+	 * K-mer length of predicted motifs; also used as building blocks for more complex motifs.
+	 * 
+	 * \see
+	 * GetK
+	 */
 	void SetK( size_t iK ) {
 
 		m_iK = iK; }
 
+	/*!
+	 * \brief
+	 * Returns the granularity in base pairs with which motif frequency histograms are calculated.
+	 * 
+	 * \returns
+	 * Number of base pairs per match used to calculate motif frequency histograms.
+	 * 
+	 * \see
+	 * SetBasesPerMatch
+	 */
 	size_t GetBasesPerMatch( ) const {
 
 		return m_iBasesPerMatch; }
 
+	/*!
+	 * \brief
+	 * Sets the granularity in base pairs with which motif frequency histograms are calculated.
+	 * 
+	 * \param iBasesPerMatch
+	 * Number of base pairs per match used to calculate motif frequency histograms.
+	 * 
+	 * \remarks
+	 * Each bin in a motif frequency histogram will be of width 1 / iBasesPerMatch.
+	 * 
+	 * \see
+	 * GetBasesPerMatch
+	 */
 	void SetBasesPerMatch( size_t iBasesPerMatch ) {
 
 		m_iBasesPerMatch = iBasesPerMatch; }
 
+	/*!
+	 * \brief
+	 * Returns the p-value threshold at which motifs are merged to build PSTs.
+	 * 
+	 * \returns
+	 * P-value threshold at which motifs are merged to build PSTs.
+	 * 
+	 * \see
+	 * SetPValueMerge
+	 */
 	float GetPValueMerge( ) const {
 
 		return m_dPValueMerge; }
 
+	/*!
+	 * \brief
+	 * Sets the p-value threshold at which motifs are merged to build PSTs.
+	 * 
+	 * \param dPValue
+	 * P-value threshold at which motifs are merged to build PSTs.
+	 * 
+	 * \see
+	 * GetPValueMerge
+	 */
 	void SetPValueMerge( float dPValue ) {
 
 		m_dPValueMerge = dPValue; }
 
+	/*!
+	 * \brief
+	 * Returns the edit distance threshold at which motifs are merged to build PSTs.
+	 * 
+	 * \returns
+	 * Edit distance threshold at which motifs are merged to build PSTs.
+	 * 
+	 * \see
+	 * SetCutoffMerge
+	 */
 	float GetCutoffMerge( ) const {
 
 		return m_dCutoffMerge; }
 
+	/*!
+	 * \brief
+	 * Sets the edit distance threshold at which motifs are merged to build PSTs.
+	 * 
+	 * \param dCutoff
+	 * Edit distance threshold at which motifs are merged to build PSTs.
+	 * 
+	 * \see
+	 * GetCutoffMerge
+	 */
 	void SetCutoffMerge( float dCutoff ) {
 
 		m_dCutoffMerge = dCutoff; }
 
+	/*!
+	 * \brief
+	 * Returns the minimum number of genes that must be present in a successful module.
+	 * 
+	 * \returns
+	 * Minimum number of genes present in a successful module.
+	 * 
+	 * \see
+	 * SetSizeMinimum
+	 */
 	size_t GetSizeMinimum( ) const {
 
 		return m_iSizeMinimum; }
 
+	/*!
+	 * \brief
+	 * Sets the minimum number of genes that must be present in a successful module.
+	 * 
+	 * \param iSizeGenes
+	 * Minimum number of genes present in a successful module.
+	 * 
+	 * \see
+	 * GetSizeMinimum
+	 */
 	void SetSizeMinimum( size_t iSizeGenes ) {
 
 		m_iSizeMinimum = iSizeGenes; }
 
+	/*!
+	 * \brief
+	 * Returns the maximum number of motifs that may be associated with a converging module.
+	 * 
+	 * \returns
+	 * Maximum number of motifs associated with a converging module.
+	 * 
+	 * \see
+	 * SetSizeMaximum
+	 */
 	size_t GetSizeMaximum( ) const {
 
 		return m_iSizeMaximum; }
 
+	/*!
+	 * \brief
+	 * Sets the maximum number of motifs that may be associated with a converging module.
+	 * 
+	 * \param iSizeMotifs
+	 * Maximum number of motifs associated with a converging module.
+	 * 
+	 * \remarks
+	 * Additional motifs may be associated with a module during a final pass after convergence.
+	 * 
+	 * \see
+	 * GetSizeMaximum
+	 */
 	void SetSizeMaximum( size_t iSizeMotifs ) {
 
 		m_iSizeMaximum = iSizeMotifs; }
 
+	/*!
+	 * \brief
+	 * Returns the maximum number of motifs that are considered for merging into PSTs during module convergence.
+	 * 
+	 * \returns
+	 * Maximum number of motifs considered for PST construction during module convergence.
+	 * 
+	 * \see
+	 * SetSizeMerge
+	 */
 	size_t GetSizeMerge( ) const {
 
 		return m_iSizeMerge; }
 
+	/*!
+	 * \brief
+	 * Sets the maximum number of motifs that are considered for merging into PSTs during module convergence.
+	 * 
+	 * \param iSizeMerge
+	 * Maximum number of motifs considered for PST construction during module convergence.
+	 * 
+	 * \remarks
+	 * Additional motifs may be merged during module postprocessing.
+	 * 
+	 * \see
+	 * GetSizeMerge
+	 */
 	void SetSizeMerge( size_t iSizeMerge ) {
 
 		m_iSizeMerge = iSizeMerge; }
 
+	/*!
+	 * \brief
+	 * Removes all currently set dataset blocks.
+	 * 
+	 * \see
+	 * AddDataset
+	 */
 	void ClearDatasets( ) {
 
 		m_vecsDatasets.clear( ); }
 
+	/*!
+	 * \brief
+	 * Adds a block of conditions known to form a non-independent dataset.
+	 * 
+	 * \param setiDataset
+	 * Set of condition indices forming a dataset.
+	 * 
+	 * \returns
+	 * True if the dataset was added successfully; false if it already existed in the current configuration.
+	 * 
+	 * Adds a dataset block to subsequent executions of COALESCE.  A dataset block consists of two or more
+	 * expression conditions known to be non-independent, e.g. multiple conditions belonging to the same
+	 * time course.  Such dataset blocks are treated as units for inclusion in/exclusion from predicted
+	 * modules, and their covariance is determined and incorporated into significance calculations for
+	 * differential expression.
+	 * 
+	 * \remarks
+	 * Condition indices must correspond to columns in a PCL file subsequently provided to a call to Cluster.
+	 * 
+	 * \see
+	 * ClearDatasets
+	 */
 	bool AddDataset( const std::set<size_t>& setiDataset ) {
 		size_t								i;
 		std::set<size_t>::const_iterator	iterExperiment;
 		m_vecsDatasets.push_back( SCoalesceDataset( setiDataset ) );
 		return true; }
 
+	/*!
+	 * \brief
+	 * Sets the maximum number of gene pairs subsampled for seed pair discovery during module initialization.
+	 * 
+	 * \param iPairs
+	 * Maximum number of gene pairs subsampled for module seeding.
+	 * 
+	 * \see
+	 * GetNumberCorrelation
+	 */
 	void SetNumberCorrelation( size_t iPairs ) {
 
 		m_iNumberCorrelation = iPairs; }
 
+	/*!
+	 * \brief
+	 * Returns the maximum number of gene pairs subsampled for seed pair discovery during module initialization.
+	 * 
+	 * \returns
+	 * Maximum number of gene pairs subsampled for module seeding.
+	 * 
+	 * \see
+	 * SetNumberCorrelation
+	 */
 	size_t GetNumberCorrelation( ) const {
 
 		return m_iNumberCorrelation; }
 
+	/*!
+	 * \brief
+	 * Sets the maximum number of simultaneous threads used for clustering.
+	 * 
+	 * \param iThreads
+	 * Maximum number of simultaneous threads used during clustering.
+	 * 
+	 * \see
+	 * GetThreads
+	 */
 	void SetThreads( size_t iThreads ) {
 
 		m_iThreads = iThreads; }
 
+	/*!
+	 * \brief
+	 * Returns the maximum number of simultaneous threads used for clustering.
+	 * 
+	 * \returns
+	 * Maximum number of simultaneous threads used during clustering.
+	 * 
+	 * \see
+	 * SetThreads
+	 */
 	size_t GetThreads( ) const {
 
 		return m_iThreads; }
 
+	/*!
+	 * \brief
+	 * Adds a wiggle track of supporting data to be used to weight sequence information.
+	 * 
+	 * \param FASTA
+	 * FASTA file containing pseudo-wiggle-track formatted per-base weights for gene sequences.
+	 * 
+	 * Adds a wiggle track of supporting information used to weight gene sequence positions during
+	 * COALESCE clustering.  A wiggle track as used by COALESCE is not precisely in the wiggle track
+	 * format as defined by the ENCODE project; instead, it is a FASTA file in which sequence base pairs
+	 * have been replaced by per-base-pair scores, one floating point value per line.  In COALESCE, one or
+	 * more wiggle tracks can be used to weight the individual base pairs used to determine motif
+	 * occurrence and frequencies.  Lower weights (down to zero) will downweight the base pairs at those
+	 * positions (and thus the effective frequencies of any motifs that occur there), and higher weights
+	 * will upweight them.  In the absence of wiggle tracks, the default weight of all base pairs is one.
+	 * 
+	 * \remarks
+	 * Contents of the provided pseudo-wiggle file must align with the FASTA file of gene sequences
+	 * provided to subsequent calls to Cluster.
+	 * 
+	 * \see
+	 * ClearWiggles | CFASTA
+	 */
 	void AddWiggle( const CFASTA& FASTA ) {
 
 		m_vecpWiggles.push_back( &FASTA ); }
 
+	/*!
+	 * \brief
+	 * Removes all currently active wiggle tracks.
+	 * 
+	 * \see
+	 * AddWiggle
+	 */
 	void ClearWiggles( ) {
 
 		m_vecpWiggles.clear( ); }
 
+	/*!
+	 * \brief
+	 * Adds an output stream to which module information is printed after convergence.
+	 * 
+	 * \param ostm
+	 * Output stream to which each module will be printed after it converges.
+	 * 
+	 * \remarks
+	 * Usually a single output stream, standard output, is sufficient; this is provided for useless
+	 * convenience.
+	 * 
+	 * \see
+	 * RemoveOutputIntermediate
+	 */
 	void AddOutputIntermediate( std::ostream& ostm ) {
 
 		m_vecpostm.push_back( &ostm ); }
 
+	/*!
+	 * \brief
+	 * Removes an output stream to which module information was printed after convergence.
+	 * 
+	 * \param ostm
+	 * Output stream to which modules were to be printed.
+	 * 
+	 * \remarks
+	 * Removal of an output stream not in the current set will be ignored.
+	 * 
+	 * \see
+	 * AddOutputIntermediate
+	 */
 	void RemoveOutputIntermediate( std::ostream& ostm ) {
 		std::vector<std::ostream*>::iterator	iter;
 
 		if( ( iter = std::find( m_vecpostm.begin( ), m_vecpostm.end( ), &ostm ) ) != m_vecpostm.end( ) )
 			m_vecpostm.erase( iter ); }
 
+	/*!
+	 * \brief
+	 * Removes all currently active intermediate output streams.
+	 * 
+	 * \see
+	 * AddOutputIntermediate | RemoveOutputIntermediate
+	 */
 	void ClearOutputIntermediate( ) {
 
 		m_vecpostm.clear( ); }
 
+	/*!
+	 * \brief
+	 * Sets the normalization behavior for automatically detected single channel expression conditions.
+	 * 
+	 * \param fNormalize
+	 * If true, single channel conditions are detected and normalized; otherwise, they are left unchanged.
+	 * 
+	 * \remarks
+	 * Single channel normalization is time-consuming and often degrades performance; it should usually
+	 * be left disabled.  However, it can find some interesting clusters given the right input data.
+	 * 
+	 * \see
+	 * GetNormalize
+	 */
 	void SetNormalize( bool fNormalize ) {
 
 		m_fNormalize = fNormalize; }
 
+	/*!
+	 * \brief
+	 * Returns true if automatic detection and normalization of single channel expression data is enabled.
+	 * 
+	 * \returns
+	 * True if single channel condition detection and normalization is enabled.
+	 * 
+	 * \see
+	 * SetNormalize
+	 */
 	bool GetNormalize( ) const {
 
 		return m_fNormalize; }

File src/coalescecluster.cpp

 const char	CCoalesceClusterImpl::c_szGenes[]		= "Genes";
 const char	CCoalesceClusterImpl::c_szConditions[]	= "Conditions";
 
+/*!
+ * \brief
+ * Randomly initializes a new cluster from the given PCL by selecting a pair of correlated genes and
+ * a surrounding seed of additional genes.
+ * 
+ * \param PCL
+ * Expression data from which cluster is seeded.
+ * 
+ * \param Pot
+ * Cluster initialized to the inverse of the current cluster, i.e. all genes not in the new cluster.
+ * 
+ * \param vecsDatasets
+ * Vector of dataset block structures to be used by the new cluster for subsequent selection of
+ * significant conditions.
+ * 
+ * \param setpriiSeeds
+ * Set of previously failed seed pairs to be excluded for cluster initialization.
+ * 
+ * \param iPairs
+ * Maximum number of gene pairs to be sampled for seed pair discovery.
+ * 
+ * \param dPValue
+ * P-value threshold for significant correlation.
+ * 
+ * \param iThreads
+ * Maximum number of simultaneous threads for gene pair sampling.
+ * 
+ * \returns
+ * True if the cluster was successfully initialized; false if no appropriate seed could be found.
+ * 
+ * A new cluster is initialized by selecting the most highly correlated gene pair from a random sample
+ * of the input PCL that is below the significance threshold.  The expression centroid of this pair is
+ * then calculated, and all other genes significantly correlated with this centroid are subsequently
+ * added.  Genes not selected for inclusion are added to the inverse cluster instead.  All significance
+ * tests are appropriately Bonferroni corrected for multiple hypothesis testing.
+ * 
+ * \remarks
+ * Initial seed pair is the most highly correlated and significant gene pair sampled from the input PCL;
+ * other genes significantly correlated with the resulting centroid are added subsequently to initialize
+ * the cluster.  The provided dataset blocks are used for all subsequent condition significance tests
+ * and covariance calculations in which the cluster is involved.
+ */
 bool CCoalesceCluster::Initialize( const CPCL& PCL, CCoalesceCluster& Pot,
 	const std::vector<SCoalesceDataset>& vecsDatasets, std::set<std::pair<size_t, size_t> >& setpriiSeeds,
 	size_t iPairs, float dPValue, size_t iThreads ) {
 
 	return true; }
 
+/*!
+ * \brief
+ * Updates the given motif score histograms based on the genes in the current cluster using the
+ * provided per-gene motif scores.
+ * 
+ * \param GeneScores
+ * Per-gene motif scores from which score histograms are calculated.
+ * 
+ * \param HistogramsCluster
+ * Motif score histograms based on genes in the current cluster.
+ * 
+ * \param pHistogramsPot
+ * If non-null, motif score histograms based on genes not in the current cluster.
+ * 
+ * COALESCE finds significant motifs for each cluster by determining which motifs have statistically
+ * different distributions in the cluster versus the genomic background.  To do this, one histogram of
+ * per-gene frequencies is constructed per motif; these histograms are in turn based on the frequencies
+ * with which each motif appears in each gene.  For example, suppose we have three genes \c G1 through \c G3
+ * and three motifs \c M1 through \c M3.  Based on each gene's sequence, we determine that the motifs have
+ * the following frequencies:
+ * \code
+ * Motif G1 G2 G3
+ * M1    1  1  2
+ * M2    2  0  2
+ * M3    1  2  0
+ * \endcode
+ * If only genes \c G1 and \c G2 are in the cluster, we build a frequency histogram as follows:
+ * \code
+ * Motif 0  1  2
+ * M1    0  2  0
+ * M2    1  0  1
+ * M3    0  1  1
+ * \endcode
+ * That is, in the cluster, there are zero genes within which \c M1 occurs zero times, two in which it
+ * occurs once, zero in which it occurs twice, and so forth.  Each row (i.e. each motif's total histogram)
+ * must sum to the number of genes in the cluster, and the resulting inverse histogram for genes not
+ * in the cluster (i.e. \c G3) is:
+ * \code
+ * Motif 0  1  2
+ * M1    0  0  1
+ * M2    0  0  1
+ * M3    1  0  0
+ * \endcode
+ * In COALESCE, additional complexity arises since motif frequencies are continuous (and must thus be
+ * discretized in the histograms) and are calculated on a per-subsequence-type basis (giving rise to
+ * multiple histograms per cluster).
+ * 
+ * \see
+ * Snapshot
+ */
 void CCoalesceCluster::CalculateHistograms( const CCoalesceGeneScores& GeneScores,
 	CCoalesceGroupHistograms& HistogramsCluster, CCoalesceGroupHistograms* pHistogramsPot ) const {
 	set<size_t>::const_iterator	iterGene;
 			if( pHistogramsPot )
 				pHistogramsPot->Add( GeneScores, m_veciPrevGenes[ i ], false ); } }
 
+/*!
+ * \brief
+ * Subtract the average expression value for each condition in the cluster from each gene in the cluster.
+ * 
+ * \param PCL
+ * Expression matrix from which cluster averages are subtracted.
+ * 
+ * \param Pot
+ * Inverse of genes in the cluster; used to determine the difference of the cluster's average from
+ * the existing per-condition average.
+ * 
+ * \remarks
+ * This effectively masks the average effect of each condition in the cluster from the contained genes'
+ * expression values so that the cluster won't be re-found later.  Actually subtracts the difference
+ * between the cluster average and the overall average from each condition, since the overall average
+ * need not be zero.
+ */
 void CCoalesceCluster::Subtract( CPCL& PCL, const CCoalesceCluster& Pot ) const {
 	set<size_t>::const_iterator	iterGene, iterDataset;
 	size_t						i, iCondition;
 							( GetGenes( ).size( ) + Pot.GetGenes( ).size( ) );
 					PCL.Get( *iterGene, iCondition ) -= d - dAve; } } }
 
+/*!
+ * \brief
+ * Subtract the average frequency of each motif in the cluster from the score for that motif in
+ * each gene in the cluster.
+ * 
+ * \param GeneScores
+ * Per-gene motif frequency scores from which the cluster averages are subtracted.
+ * 
+ * \remarks
+ * This effectively masks the average effect of each motif in the cluster from the contained genes'
+ * sequences so that they won't be re-found later.
+ */
 void CCoalesceCluster::Subtract( CCoalesceGeneScores& GeneScores ) const {
 	set<size_t>::const_iterator			iterGene;
 	set<SMotifMatch>::const_iterator	iterMotif;
 
 	return NULL; }
 
+/*!
+ * \brief
+ * Performs feature selection to include significant expression conditions in a converging cluster.
+ * 
+ * \param PCL
+ * Expression dataset from which significant datasets are selected.
+ * 
+ * \param Pot
+ * Inverse of current cluster used for genomic background calculations.
+ * 
+ * \param iThreads
+ * Maximum number of simultaneous threads for condition significance calculations.
+ * 
+ * \param dPValue
+ * P-value threshold for condition significance.
+ * 
+ * \param dZScore
+ * Z-score effect size threshold for condition significance.
+ * 
+ * \returns
+ * True if zero or more conditions were selected successfully, false otherwise.
+ * 
+ * Selects zero or more conditions in which the cluster's current gene set is differentially expressed.
+ * That is, in each selected condition, the average expression of genes in the cluster must differ from
+ * the genomic background with at least the given significance and effect size thresholds.  If dataset
+ * blocks were given at cluster initialization time, all conditions in the block are added (or not)
+ * simultaneously using a multivariate significance test.
+ * 
+ * \see
+ * SelectMotifs | SelectGenes
+ */
 bool CCoalesceCluster::SelectConditions( const CPCL& PCL, const CCoalesceCluster& Pot, size_t iThreads,
 	float dPValue, float dZScore ) {
 	vector<pthread_t>				vecpthdThreads;
 
 	return NULL; }
 
+/*!
+ * \brief
+ * Performs feature selection to include significant sequence motifs in a converging cluster.
+ * 
+ * \param HistsCluster
+ * Precalculated histogram of motif frequencies using genes in the cluster.
+ * 
+ * \param HistsPot
+ * Precalculated histogram of motif frequencies using genes not in the cluster.
+ * 
+ * \param dPValue
+ * P-value threshold for motif significance.
+ * 
+ * \param dZScore
+ * Z-score effect size threshold for motif significance.
+ * 
+ * \param iMaxMotifs
+ * Maximum number of motifs associated with a cluster; if more motifs are present, no new selection is
+ * performed.
+ * 
+ * \param iThreads
+ * Maximum number of simultaneous threads for motif significance calculations.
+ * 
+ * \param pMotifs
+ * If non-null, motif library with which motifs are managed.
+ * 
+ * \returns
+ * True if zero or more significant motifs were selected; false otherwise.
+ * 
+ * Selects zero or more motifs differentially over- or under-enriched in the genes currently in a
+ * converging cluster.  Motifs are selected on a per-sequence-subtype basis, so a motif may be enriched,
+ * for example, in an upstream but not downstream flank.  All motifs managed by the given library are tested
+ * for significance, including all k-mers, reverse complements, and any currently constructed PSTs.
+ * Significance testing is performed by z-scoring the within versus without frequency histograms, and all
+ * p-values are Bonferroni corrected.
+ * 
+ * \remarks