Commits

Anonymous committed d906d57

Add per-gene weights to HEFalMp/bioPIXIE queries in CDat
Improve CDat::SaveDOT edge coloration (by z-score rather than 0-1 scaled)
Improve COALESCE convergence when performing dataset blocking
Improve Dat2Graph query weighting and edge normalization
Add column sum normalization to CPCL and Normalizer
Fix Answerer documentation - thanks to Arjun Krishnan!
Fix Cliquer bug in edge exclusion for non-zero-one scaled networks
Fix Funcifier calculation to account for missing values, gene set sizes/weights

Comments (0)

Files changed (24)

 	CCoalesceGeneScores			GeneScores;
 	set<pair<size_t, size_t> >	setpriiSeeds;
 	SCoalesceModifiers			sModifiers;
-	float						dFailure;
+	float						dFailure, dProbability;
 
 	for( i = 0; i < m_vecpWiggles.size( ); ++i )
 		sModifiers.Add( m_vecpWiggles[ i ] );
 		g_CatSleipnir( ).notice( "CCoalesce::Cluster( ) running with %d genes, %d conditions, and %d sequences",
 			PCL.GetGenes( ), PCL.GetExperiments( ), iSequences );
 		for( i = 0; i < PCL.GetExperiments( ); ++i )
-			ossm << ( i ? "\t" : "" ) << PCL.GetExperiment( i );
-		g_CatSleipnir( ).notice( ossm.str( ).c_str( ) );
+			g_CatSleipnir( ).notice( PCL.GetExperiment( i ) );
+//			ossm << ( i ? "\t" : "" ) << PCL.GetExperiment( i );
+//		g_CatSleipnir( ).notice( ossm.str( ).c_str( ) );
 		g_CatSleipnir( ).notice( "k %d, P gene %g, p condition %g, z condition %g, p motif %g, z motif %g, p correlation %g", GetK( ),
 			GetProbabilityGene( ), GetPValueCondition( ), GetZScoreCondition( ), GetPValueMotif( ),
 			GetZScoreMotif( ), GetPValueCorrelation( ) );
 			g_CatSleipnir( ).debug( ossm.str( ).c_str( ) ); }
 		if( Cluster.GetGenes( ).size( ) < GetSizeMinimum( ) )
 			continue;
+		dProbability = 1 - GetProbabilityGene( );
 		while( !( Cluster.IsConverged( ) || Cluster.IsEmpty( ) ) ) {
 			Cluster.CalculateHistograms( GeneScores, HistsCluster, &HistsPot );
 			Cluster.Snapshot( GeneScores, HistsCluster );
 				GetZScoreMotif( ),GetSizeMaximum( ), GetThreads( ), GetMotifs( ) ) ) )
 				return false;
 			if( !Cluster.SelectGenes( PCLCopy, GeneScores, HistsCluster, HistsPot, GetSizeMinimum( ),
-				GetThreads( ), Pot, GetProbabilityGene( ), GetMotifs( ) ) )
+				GetThreads( ), Pot, 1 - dProbability, GetMotifs( ) ) )
 				return false;
+			dProbability *= GetProbabilityGene( );
 			g_CatSleipnir( ).notice( "CCoalesce::Cluster( ) processed %d genes, %d datasets, %d motifs",
 				Cluster.GetGenes( ).size( ), Cluster.GetDatasets( ).size( ), Cluster.GetMotifs( ).size( ) ); }
 		if( Cluster.IsConverged( ) && ( Cluster.GetGenes( ).size( ) >= GetSizeMinimum( ) ) ) {

src/coalescecluster.cpp

 		set<SMotifMatch>::const_iterator	iterMotif;
 		size_t								iType;
 
-		g_CatSleipnir( ).debug( "CCoalesceClusterImpl::IsSignificant( %s ) is %g beta %g, exp. p=%g vs. %g, seq. p=%g vs %g",
-			PCL.GetGene( iGene ).c_str( ), dP, dBeta, dLogPExpressionGivenIn, dLogPExpressionGivenOut,
+		g_CatSleipnir( ).debug( "CCoalesceClusterImpl::IsSignificant( %s ) is %g, prior %g beta %g, exp. p=%g vs. %g, seq. p=%g vs %g",
+			PCL.GetGene( iGene ).c_str( ), dP, d, dBeta, dLogPExpressionGivenIn, dLogPExpressionGivenOut,
 			dLogPMotifsGivenIn, dLogPMotifsGivenOut );
 		for( iterMotif = m_setsMotifs.begin( ); iterMotif != m_setsMotifs.end( ); ++iterMotif )
 			if( ( iType = GeneScores.GetType( iterMotif->m_strType ) ) != -1 )
 				g_CatSleipnir( ).debug( "%g	%s", GeneScores.Get( iType, iterMotif->m_eSubsequence, iGene,
 					iterMotif->m_iMotif ), iterMotif->Save( pMotifs ).c_str( ) ); }
 
-	return ( dP > dProbability ); }
+	return ( dP >= dProbability ); }
 
 bool CCoalesceClusterImpl::CalculateProbabilityExpression( size_t iGene, const CPCL& PCL,
 	const vector<float>& vecdStdevs, const CCoalesceCluster& Pot, const vector<size_t>& veciDatasets,
 			vecdGene.resize( sDataset.GetConditions( ) );
 			for( iCondition = 0; iCondition < sDataset.GetConditions( ); ++iCondition )
 				vecdGene[ iCondition ] = PCL.Get( iGene, sDataset.GetCondition( iCondition ) );
-			dPCluster = max( c_dEpsilonZero, CStatistics::MultivariateNormalPDF( vecdGene,
+			dPCluster = min( 1.0, max( c_dEpsilonZero, CStatistics::MultivariateNormalPDF( vecdGene,
 				sDataset.m_vecdCentroid, sDataset.m_psDataset->m_dSigmaDetSqrt,
-				sDataset.m_psDataset->m_MatSigmaInv ) );
-			dPPot = max( c_dEpsilonZero, CStatistics::MultivariateNormalPDF( vecdGene,
+				sDataset.m_psDataset->m_MatSigmaInv ) ) );
+			dPPot = min( 1.0, max( c_dEpsilonZero, CStatistics::MultivariateNormalPDF( vecdGene,
 				Pot.m_vecsDatasets[ veciDatasets[ iDataset ] ].m_vecdCentroid,
-				sDataset.m_psDataset->m_dSigmaDetSqrt, sDataset.m_psDataset->m_MatSigmaInv ) ); }
+				sDataset.m_psDataset->m_dSigmaDetSqrt, sDataset.m_psDataset->m_MatSigmaInv ) ) ); }
 		dPIn *= dPCluster;
 		dPOut *= dPPot;
 		if( ( dPIn < DBL_MIN ) || ( dPOut < DBL_MIN ) ) {
  * EFilterTerm and to some degree EFilterEdge don't make a lot of sense for CDats that do not represent
  * gold standards.
  */
-void CDat::FilterGenes( const CGenes& Genes, EFilter eFilter, size_t iLimit, float dEdgeAggressiveness ) {
+void CDat::FilterGenes( const CGenes& Genes, EFilter eFilter, size_t iLimit, float dEdgeAggressiveness,
+	const vector<float>* pvecdWeights ) {
 	size_t			i, j;
 	vector<bool>	vecfGenes;
 
 	switch( eFilter ) {
 		case EFilterPixie:
 		case EFilterHefalmp:
-			FilterGenesGraph( Genes, vecfGenes, iLimit, dEdgeAggressiveness, eFilter == EFilterHefalmp );
+			FilterGenesGraph( Genes, vecfGenes, iLimit, dEdgeAggressiveness, eFilter == EFilterHefalmp, pvecdWeights );
 			return; }
 
 	for( i = 0; i < GetGenes( ); ++i ) {
 };
 
 void CDatImpl::FilterGenesGraph( const CGenes& Genes, vector<bool>& vecfGenes, size_t iLimit,
-	float dEdgeAggressiveness, bool fHefalmp ) {
-	vector<float>				vecdNeighbors;
+	float dEdgeAggressiveness, bool fHefalmp, const vector<float>* pvecdWeights ) {
+	vector<float>				vecdNeighbors, vecdWeights;
 	size_t						i, j, iOne, iTwo, iMinOne, iMinTwo, iN;
 	vector<size_t>				veciGenes, veciFinal, veciDegree;
 	set<size_t>					setiN;
 	veciGenes.resize( Genes.GetGenes( ) );
 	for( i = 0; i < veciGenes.size( ); ++i )
 		veciGenes[ i ] = GetGene( Genes.GetGene( i ).GetName( ) );
+	if( !pvecdWeights || ( pvecdWeights->size( ) < veciGenes.size( ) ) ) {
+		vecdWeights.resize( veciGenes.size( ) );
+		fill( vecdWeights.begin( ), vecdWeights.end( ), 1.0f );
+		pvecdWeights = &vecdWeights; }
 
 	vecdNeighbors.resize( GetGenes( ) );
 	fill( vecdNeighbors.begin( ), vecdNeighbors.end( ), 0.0f );
 					continue;
 				if( !CMeta::IsNaN( d = Get( i, iOne ) ) ) {
 					iIn++;
-					dIn += d; } }
+					dIn += d * (*pvecdWeights)[ j ]; } }
 			for( iOut = j = 0; j < GetGenes( ); ++j )
 				if( !CMeta::IsNaN( d = Get( i, j ) ) ) {
 					iOut++;
 				if( vecfGenes[ j ] )
 					continue;
 				if( !CMeta::IsNaN( d = Get( iOne, j ) ) )
-					vecdNeighbors[ j ] += d; } }
+					vecdNeighbors[ j ] += d * (*pvecdWeights)[ i ]; } }
 	for( i = 0; i < vecdNeighbors.size( ); ++i )
 		if( ( d = vecdNeighbors[ i ] ) > 0 )
 			pqueNeighbors.push( SPixie( i, d ) );
  */
 void CDat::SaveDOT( std::ostream& ostm, float dCutoff, const CGenome* pGenome, bool fUnlabeled, bool fHashes,
 	const std::vector<float>* pvecdColors, const std::vector<float>* pvecdBorders ) const {
-	size_t			i, j;
-	float			d, dMin, dMax;
+	size_t			i, j, iCount;
+	float			d, dAve, dStd;
 	bool			fAll, fLabel;
 	vector<string>	vecstrNames;
 	vector<bool>	vecfGenes;
 			ostm << "];" << endl; } }
 
 	ostm << endl;
-	dMin = FLT_MAX;
-	dMax = -FLT_MAX;
+	dAve = dStd = 0;
+	for( iCount = i = 0; i < GetGenes( ); ++i )
+		for( j = ( i + 1 ); j < GetGenes( ); ++j )
+			if( !CMeta::IsNaN( d = Get( i, j ) ) && ( fAll || ( d >= dCutoff ) ) ) {
+				iCount++;
+				dAve += d;
+				dStd += d * d; }
+	if( iCount ) {
+		dAve /= iCount;
+		dStd = sqrt( max( 0.0f, ( dStd / iCount ) - ( dAve * dAve ) ) ); }
 	for( i = 0; i < GetGenes( ); ++i )
 		for( j = ( i + 1 ); j < GetGenes( ); ++j )
 			if( !CMeta::IsNaN( d = Get( i, j ) ) && ( fAll || ( d >= dCutoff ) ) ) {
-				if( d < dMin )
-					dMin = d;
-				if( d > dMax )
-					dMax = d; }
-	dMax -= dMin;
-	for( i = 0; i < GetGenes( ); ++i )
-		for( j = ( i + 1 ); j < GetGenes( ); ++j )
-			if( !CMeta::IsNaN( d = Get( i, j ) ) && ( fAll || ( d >= dCutoff ) ) )
+				d = 1.0 / ( 1 + exp( ( dAve - d ) / dStd ) );
 				ostm << vecstrNames[ i ] << " -- " << vecstrNames[ j ] << " [weight = " << d <<
-					", color = \"" << ( fHashes ? "#" : "" ) << CColor::Interpolate( ( d - dMin ) / dMax,
-					CColor::c_Green, CColor::c_Black, CColor::c_Red ).ToRGB( ) << "\"];" << endl;
+					", color = \"" << ( fHashes ? "#" : "" ) << CColor::Interpolate( d,
+					CColor::c_Green, CColor::c_Black, CColor::c_Red ).ToRGB( ) << "\"];" << endl; }
 
 	ostm << "}" << endl; }
 
-/*****************************************************************************
-* This file is provided under the Creative Commons Attribution 3.0 license.
-*
-* You are free to share, copy, distribute, transmit, or adapt this work
-* PROVIDED THAT you attribute the work to the authors listed below.
-* For more information, please see the following web page:
-* http://creativecommons.org/licenses/by/3.0/
-*
-* This file is a component of the Sleipnir library for functional genomics,
-* authored by:
-* Curtis Huttenhower (chuttenh@princeton.edu)
-* Mark Schroeder
-* Maria D. Chikina
-* Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
-*
-* If you use this library, the included executable tools, or any related
-* code in your work, please cite the following publication:
-* Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
-* Olga G. Troyanskaya.
-* "The Sleipnir library for computational functional genomics"
-*****************************************************************************/
-#ifndef DAT_H
-#define DAT_H
-
-#include <iostream>
-#include <string>
-#include <vector>
-
-#include "dati.h"
-
-namespace Sleipnir {
-
-class CGenes;
-class CGenome;
-
-/*!
- * \brief
- * Stores a continuously valued half matrix paired with a list of names for matrix elements.
- * 
- * Conceptually, a CDat stores a list of weighted pairs; this is equivalent to a weighted undirected
- * graph with labeled nodes, or a symmetric matrix with labels for each matrix element.  CDat entries are
- * stored as continuous values, although they can be discretized in various ways.  CDats can be constructed
- * in several ways, read from disk, persisted to disk in multiple file formats, or calculated from
- * existing gene sets, microarray data, or gold standards.  In practice, a CDat is simply a continuously
- * valued symmetric matrix (in which zero or more values may be missing) paired with a list of element
- * names (assumed to be genes), but this data structure is sufficiently flexible to represent nearly any
- * biological dataset.
- * 
- * CDats can be loaded (by Open) and/or stored (by Save) from/to disk in the following formats:
- * - DAT.  A tab-delimited text file in which each line contains two identifiers and a score:
- * \code
- * GENE1	GENE2	SCORE1
- * GENE1	GENE3	SCORE2
- * GENE2	GENE3	SCORE3
- * \endcode
- * Element pair order is irrelevant, missing values are allowed, and duplicates can be optionally ignored.
- * The DAT format is most suitable for human readability and manipulation by scripting languages; it is
- * much larger and slower to process than the other formats, however.
- * - DAB.  A binary file containing an integer size, a list of null-terminated element identifiers
- * (generally gene names), and the CDat's values in row-major order.  Missing values are stored as NaNs.
- * Should be generated by Save; usually the smallest and most rapidly parsed format, and the only one
- * amenable to memory mapping.
- * - DAS.  A sparse binary file containing an integer size, a list of null-terminated element identifiers
- * (generally gene names), and the CDat's non-missing values in row-major order, pairing column indices with
- * values.  Should be generated by Save.  Note that this sounds like it should save space for sparse
- * CDats, but because of the overhead of storing column indices, the matrix has to be awfully sparse before
- * it actually does.
- * - PCL.  A standard PCL file, which is loaded and converted to pairwise similarity scores using
- * z-transformed Pearson correlation as calculated by CMeasurePearNorm.  Can be converted once and cached
- * in memory or calculated on-the-fly; the former consumes more memory, the latter is (often) slower.
- * 
- * \see
- * CDataPair | CHalfMatrix
- */
-class CDat : protected CDatImpl {
-public:
-	/*!
-	 * \brief
-	 * Ways in which nodes/edges can be removed to filter a CDat.
-	 * 
-	 * \see
-	 * FilterGenes
-	 */
-	enum EFilter {
-		/*!
-		 * \brief
-		 * Remove any edge including a node outside the given set.
-		 */
-		EFilterInclude		= 0,
-		/*!
-		 * \brief
-		 * Remove any positive edge including a node outside the given set.
-		 */
-		EFilterTerm			= EFilterInclude + 1,
-		/*!
-		 * \brief
-		 * Remove any edge including a node in the given set.
-		 */
-		EFilterExclude		= EFilterTerm + 1,
-		/*!
-		 * \brief
-		 * Perform a bioPIXIE query using the given set and remove any edge not in the resulting subgraph.
-		 */
-		EFilterPixie		= EFilterExclude + 1,
-		/*!
-		 * \brief
-		 * Remove any edge not including a node in the given set.
-		 */
-		EFilterEdge			= EFilterPixie + 1,
-		/*!
-		 * \brief
-		 * Perform a HEFalMp query using the given set and remove any edge not in the resulting subgraph.
-		 */
-		EFilterHefalmp		= EFilterEdge + 1
-	};
-
-	/*!
-	 * \brief
-	 * Ways in which a CDat can be persisted to/from disk.
-	 * 
-	 * \see
-	 * Open | Save
-	 */
-	enum EFormat {
-		/*!
-		 * \brief
-		 * Binary format listing null-terminated element name strings followed by floating point values.
-		 */
-		EFormatBinary	= 0,
-		/*!
-		 * \brief
-		 * Text format listing element name pairs followed by numerical value strings.
-		 */
-		EFormatText		= EFormatBinary + 1,
-		/*!
-		 * \brief
-		 * PCL file from which pairwise scores are calculated using some similarity measure.
-		 */
-		EFormatPCL		= EFormatText + 1,
-		/*!
-		 * \brief
-		 * Binary format listing null-terminated element name strings followed by index/value pairs.
-		 */
-		EFormatSparse	= EFormatPCL + 1
-	};
-
-	/*!
-	 * \brief
-	 * Ways in which a CDat can have its edge values normalized.
-	 * 
-	 * \see
-	 * Normalize
-	 */
-	enum ENormalize {
-		ENormalizeNone		= 0,
-		/*!
-		 * \brief
-		 * Linearly transform the minimum score to 0 and the maximum to 1.
-		 */
-		ENormalizeMinMax	= ENormalizeNone + 1,
-		/*!
-		 * \brief
-		 * Z-score all edges (subtract mean, divide by standard deviation).
-		 */
-		ENormalizeZScore	= ENormalizeMinMax + 1,
-		/*!
-		 * \brief
-		 * Sigmoid transform scores to the range [0, 1].
-		 */
-		ENormalizeSigmoid	= ENormalizeZScore + 1
-	};
-
-	bool Open( const char* szFile, bool fMemmap = false, size_t iSkip = 2, bool fZScore = false,
-		bool fDuplicates = false );
-	bool Open( std::istream& istm, EFormat eFormat = EFormatBinary, float dDefault = HUGE_VAL,
-		bool fDuplicates = false, size_t iSkip = 2, bool fZScore = false );
-	bool Open( const CSlim& Slim );
-	bool Open( const CSlim& SlimPositives, const CSlim& SlimNonnegatives );
-	bool Open( const std::vector<std::string>& vecstrGenes, bool fClear = true, const char* szFile = NULL );
-	bool Open( const std::vector<std::string>& vecstrGenes, const CDistanceMatrix& MatValues );
-	bool Open( const std::vector<CGenes*>& vecpPositives, const std::vector<CGenes*>& vecpNonnegatives,
-		float dPValue, const CGenome& Genome );
-	bool Open( const CDat& DatKnown, const std::vector<CGenes*>& vecpOther, const CGenome& Genome,
-		bool fKnownNegatives );
-	bool Open( const CPCL& PCL, const IMeasure* pMeasure, bool fMeasureMemory );
-	bool Open( const CDat& Dat );
-
-	bool OpenGenes( std::istream& istm, bool fBinary, bool fPCL = false );
-	bool OpenGenes( const char* szFile, size_t iSkip = 2 );
-	void Save( std::ostream& ostm, EFormat eFormat = EFormatBinary ) const;
-	void Save( const char* szFile ) const;
-	void SaveDOT( std::ostream& ostm, float dCutoff = HUGE_VAL, const CGenome* pGenome = NULL,
-		bool fUnlabeled = false, bool fHashes = true, const std::vector<float>* pvecdColors = NULL,
-		const std::vector<float>* pvecdBorders = NULL ) const;
-	void SaveGDF( std::ostream& ostm, float dCutoff = HUGE_VAL ) const;
-	void SaveNET( std::ostream& ostm, float dCutoff = HUGE_VAL ) const;
-	void SaveMATISSE( std::ostream& ostm, float dCutoff = HUGE_VAL, const CGenome* pGenome = NULL ) const;
-	void Invert( );
-	void Rank( );
-	bool FilterGenes( const char* szGenes, EFilter eFilter, size_t iLimit = -1 );
-	void FilterGenes( const CGenes& Genes, EFilter eFilter, size_t iLimit = -1,
-		float dEdgeAggressiveness = 0.5 );
-
-	/*!
-	 * \brief
-	 * Normalize each finite value in the CDat by a specific function.
-	 * 
-	 * \param eNormalize
-	 * Method by which scores are normalized.
-	 * 
-	 * \remarks
-	 * Values are left unchanged if ( dMax == dMin ) or ( dStd == 0 ).
-	 * 
-	 * \see
-	 * ENormalize | Invert
-	 */
-	void Normalize( ENormalize eNormalize ) {
-
-		switch( eNormalize ) {
-			case ENormalizeMinMax:
-				NormalizeMinmax( );
-				break;
-
-			case ENormalizeZScore:
-				NormalizeStdev( );
-				break;
-
-			default:
-				NormalizeSigmoid( ); } }
-
-	/*!
-	 * \brief
-	 * Return the index of the given gene name, or -1 if it is not included in the CDat.
-	 * 
-	 * \param strGene
-	 * Gene name to retrieve.
-	 * 
-	 * \returns
-	 * Index of the requested gene name, or -1 if it is not in the CDat.
-	 * 
-	 * \see
-	 * GetGeneNames
-	 */
-	size_t GetGene( const std::string& strGene ) const {
-
-		return CDatImpl::GetGene( strGene ); }
-
-	/*!
-	 * \brief
-	 * Return the value at the requested CDat position.
-	 * 
-	 * \param iY
-	 * CDat row.
-	 * 
-	 * \param iX
-	 * CDat column.
-	 * 
-	 * \returns
-	 * Value at the requested CDat position.
-	 * 
-	 * \remarks
-	 * For efficiency, no bounds checking is performed.  The given row and column must be smaller than
-	 * GetGenes.  As a symmetric matrix, the value at position XY will always equal the value at position YX.
-	 * 
-	 * \see
-	 * Set
-	 */
-	float& Get( size_t iY, size_t iX ) const {
-
-		return CDatImpl::Get( iY, iX ); }
-
-	/*!
-	 * \brief
-	 * Returns the number of elements (genes) in the CDat.
-	 * 
-	 * \returns
-	 * Number of elements (genes) in the CDat.
-	 * 
-	 * \remarks
-	 * Since a symmetric matrix must be square, the number of rows equals the number of columns and is thus
-	 * referred to as the number of elements (genes).
-	 */
-	size_t GetGenes( ) const {
-
-		return CDatImpl::GetGenes( ); }
-
-	/*!
-	 * \brief
-	 * Returns the symmetric matrix containing the CDat's values.
-	 * 
-	 * \returns
-	 * Symmetric matrix containing the CDat's values.
-	 */
-	const CDistanceMatrix& Get( ) const {
-
-		return m_Data; }
-
-	/*!
-	 * \brief
-	 * Returns the symmetric matrix containing the CDat's values.
-	 * 
-	 * \returns
-	 * Symmetric matrix containing the CDat's values.
-	 */
-	CDistanceMatrix& Get( ) {
-
-		return m_Data; }
-
-	/*!
-	 * \brief
-	 * Set the value at the requested CDat position.
-	 * 
-	 * \param iY
-	 * CDat row.
-	 * 
-	 * \param iX
-	 * CDat column.
-	 * 
-	 * \param dValue
-	 * Value to store.
-	 * 
-	 * \returns
-	 * True if the value was stored successfully.
-	 * 
-	 * \remarks
-	 * For efficiency, no bounds checking is performed.  The given row and column must be smaller than
-	 * GetGenes.
-	 * 
-	 * \see
-	 * Get
-	 */
-	bool Set( size_t iY, size_t iX, float dValue ) {
-
-		return CDatImpl::Set( iY, iX, dValue ); }
-
-	/*!
-	 * \brief
-	 * Returns the gene name at the given CDat position.
-	 * 
-	 * \param iGene
-	 * Index of gene name to return.
-	 * 
-	 * \returns
-	 * Gene name at the requested index.
-	 * 
-	 * \remarks
-	 * For efficiency, no bounds checking is performed.  The given index must be smaller than GetGenes.
-	 */
-	std::string GetGene( size_t iGene ) const {
-
-		return CDatImpl::GetGene( iGene ); }
-
-	/*!
-	 * \brief
-	 * Returns the vector of gene names associated with this CDat.
-	 * 
-	 * \returns
-	 * Vector of this CDat's gene names.
-	 * 
-	 * \remarks
-	 * Returned vector size will be identical to GetGenes.
-	 */
-	const std::vector<std::string>& GetGeneNames( ) const {
-
-		return CDatImpl::GetGeneNames( ); }
-
-	/*!
-	 * \brief
-	 * Set an entire row of CDat values efficiently.
-	 * 
-	 * \param iY
-	 * CDat row.
-	 * 
-	 * \param adValues
-	 * Values to store.
-	 * 
-	 * \remarks
-	 * For efficiency, no bounds checking is performed.  The given row must be smaller than GetGenes, and the
-	 * given array must be non-null and have length exactly (size - iY - 1).
-	 * 
-	 * \see
-	 * Get
-	 */
-	void Set( size_t iY, const float* adValues ) {
-
-		m_Data.Set( iY, adValues ); }
-
-	/*!
-	 * \brief
-	 * Get an entire row of CDat values efficiently.
-	 * 
-	 * \param iY
-	 * CDat row.
-	 * 
-	 * \returns
-	 * Retrieved values.
-	 * 
-	 * \remarks
-	 * For efficiency, no bounds checking is performed.  The given row must be smaller than GetGenes and the
-	 * returned array will have length exactly (size - iY - 1).
-	 * 
-	 * \see
-	 * Set
-	 */
-	const float* Get( size_t iY ) const {
-
-		return m_Data.Get( iY ); }
-
-	/*!
-	 * \brief
-	 * Get an entire row of CDat values efficiently.
-	 * 
-	 * \param iY
-	 * CDat row.
-	 * 
-	 * \returns
-	 * Retrieved values.
-	 * 
-	 * \remarks
-	 * For efficiency, no bounds checking is performed.  The given row must be smaller than GetGenes and the
-	 * returned array will have length exactly (size - iY - 1).
-	 * 
-	 * \see
-	 * Set
-	 */
-	float* Get( size_t iY ) {
-
-		return m_Data.Get( iY ); }
-
-	/*!
-	 * \brief
-	 * Set the gene name at the given index.
-	 * 
-	 * \param iGene
-	 * Index of gene name to modify.
-	 * 
-	 * \param strGene
-	 * Gene name to store at the requested index.
-	 * 
-	 * \remarks
-	 * For efficiency, no bounds checking is performed.  The given index must be smaller than GetGenes.
-	 * 
-	 * \see
-	 * GetGene
-	 */
-	void SetGene( size_t iGene, const std::string& strGene ) {
-
-		if( m_pPCL )
-			m_pPCL->SetGene( iGene, strGene );
-		else
-			m_vecstrGenes[ iGene ] = strGene; }
-
+/*****************************************************************************
+* This file is provided under the Creative Commons Attribution 3.0 license.
+*
+* You are free to share, copy, distribute, transmit, or adapt this work
+* PROVIDED THAT you attribute the work to the authors listed below.
+* For more information, please see the following web page:
+* http://creativecommons.org/licenses/by/3.0/
+*
+* This file is a component of the Sleipnir library for functional genomics,
+* authored by:
+* Curtis Huttenhower (chuttenh@princeton.edu)
+* Mark Schroeder
+* Maria D. Chikina
+* Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
+*
+* If you use this library, the included executable tools, or any related
+* code in your work, please cite the following publication:
+* Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
+* Olga G. Troyanskaya.
+* "The Sleipnir library for computational functional genomics"
+*****************************************************************************/
+#ifndef DAT_H
+#define DAT_H
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "dati.h"
+
+namespace Sleipnir {
+
+class CGenes;
+class CGenome;
+
+/*!
+ * \brief
+ * Stores a continuously valued half matrix paired with a list of names for matrix elements.
+ * 
+ * Conceptually, a CDat stores a list of weighted pairs; this is equivalent to a weighted undirected
+ * graph with labeled nodes, or a symmetric matrix with labels for each matrix element.  CDat entries are
+ * stored as continuous values, although they can be discretized in various ways.  CDats can be constructed
+ * in several ways, read from disk, persisted to disk in multiple file formats, or calculated from
+ * existing gene sets, microarray data, or gold standards.  In practice, a CDat is simply a continuously
+ * valued symmetric matrix (in which zero or more values may be missing) paired with a list of element
+ * names (assumed to be genes), but this data structure is sufficiently flexible to represent nearly any
+ * biological dataset.
+ * 
+ * CDats can be loaded (by Open) and/or stored (by Save) from/to disk in the following formats:
+ * - DAT.  A tab-delimited text file in which each line contains two identifiers and a score:
+ * \code
+ * GENE1	GENE2	SCORE1
+ * GENE1	GENE3	SCORE2
+ * GENE2	GENE3	SCORE3
+ * \endcode
+ * Element pair order is irrelevant, missing values are allowed, and duplicates can be optionally ignored.
+ * The DAT format is most suitable for human readability and manipulation by scripting languages; it is
+ * much larger and slower to process than the other formats, however.
+ * - DAB.  A binary file containing an integer size, a list of null-terminated element identifiers
+ * (generally gene names), and the CDat's values in row-major order.  Missing values are stored as NaNs.
+ * Should be generated by Save; usually the smallest and most rapidly parsed format, and the only one
+ * amenable to memory mapping.
+ * - DAS.  A sparse binary file containing an integer size, a list of null-terminated element identifiers
+ * (generally gene names), and the CDat's non-missing values in row-major order, pairing column indices with
+ * values.  Should be generated by Save.  Note that this sounds like it should save space for sparse
+ * CDats, but because of the overhead of storing column indices, the matrix has to be awfully sparse before
+ * it actually does.
+ * - PCL.  A standard PCL file, which is loaded and converted to pairwise similarity scores using
+ * z-transformed Pearson correlation as calculated by CMeasurePearNorm.  Can be converted once and cached
+ * in memory or calculated on-the-fly; the former consumes more memory, the latter is (often) slower.
+ * 
+ * \see
+ * CDataPair | CHalfMatrix
+ */
+class CDat : protected CDatImpl {
+public:
+	/*!
+	 * \brief
+	 * Ways in which nodes/edges can be removed to filter a CDat.
+	 * 
+	 * \remarks
+	 * Enumerators are numbered consecutively starting at zero; each one is defined as the previous
+	 * enumerator plus one.  CDat::FilterGenes passes both EFilterPixie and EFilterHefalmp to
+	 * FilterGenesGraph and returns immediately afterwards; the two modes differ only in the boolean
+	 * HEFalMp flag given to that call.
+	 * 
+	 * \see
+	 * FilterGenes
+	 */
+	enum EFilter {
+		/*!
+		 * \brief
+		 * Remove any edge including a node outside the given set.
+		 */
+		EFilterInclude		= 0,
+		/*!
+		 * \brief
+		 * Remove any positive edge including a node outside the given set.
+		 */
+		EFilterTerm			= EFilterInclude + 1,
+		/*!
+		 * \brief
+		 * Remove any edge including a node in the given set.
+		 */
+		EFilterExclude		= EFilterTerm + 1,
+		/*!
+		 * \brief
+		 * Perform a bioPIXIE query using the given set and remove any edge not in the resulting subgraph.
+		 */
+		EFilterPixie		= EFilterExclude + 1,
+		/*!
+		 * \brief
+		 * Remove any edge not including a node in the given set.
+		 */
+		EFilterEdge			= EFilterPixie + 1,
+		/*!
+		 * \brief
+		 * Perform a HEFalMp query using the given set and remove any edge not in the resulting subgraph.
+		 */
+		EFilterHefalmp		= EFilterEdge + 1
+	};
+
+	/*!
+	 * \brief
+	 * Ways in which a CDat can be persisted to/from disk.
+	 * 
+	 * \remarks
+	 * Enumerators are numbered consecutively starting at zero.  EFormatBinary is the default format
+	 * argument of the stream-based Open and Save overloads declared in this class.
+	 * 
+	 * \see
+	 * Open | Save
+	 */
+	enum EFormat {
+		/*!
+		 * \brief
+		 * Binary format listing null-terminated element name strings followed by floating point values.
+		 */
+		EFormatBinary	= 0,
+		/*!
+		 * \brief
+		 * Text format listing element name pairs followed by numerical value strings.
+		 */
+		EFormatText		= EFormatBinary + 1,
+		/*!
+		 * \brief
+		 * PCL file from which pairwise scores are calculated using some similarity measure.
+		 */
+		EFormatPCL		= EFormatText + 1,
+		/*!
+		 * \brief
+		 * Binary format listing null-terminated element name strings followed by index/value pairs.
+		 */
+		EFormatSparse	= EFormatPCL + 1
+	};
+
+	/*!
+	 * \brief
+	 * Ways in which a CDat can have its edge values normalized.
+	 * 
+	 * \remarks
+	 * ENormalizeNone (value zero) carries no description of its own.  The inline Normalize method has
+	 * explicit cases only for ENormalizeMinMax and ENormalizeZScore, so ENormalizeNone is handled by its
+	 * default branch, which calls NormalizeSigmoid.
+	 * NOTE(review): confirm whether ENormalizeNone was meant to leave values untouched instead.
+	 * 
+	 * \see
+	 * Normalize
+	 */
+	enum ENormalize {
+		ENormalizeNone		= 0,
+		/*!
+		 * \brief
+		 * Linearly transform the minimum score to 0 and the maximum to 1.
+		 */
+		ENormalizeMinMax	= ENormalizeNone + 1,
+		/*!
+		 * \brief
+		 * Z-score all edges (subtract mean, divide by standard deviation).
+		 */
+		ENormalizeZScore	= ENormalizeMinMax + 1,
+		/*!
+		 * \brief
+		 * Sigmoid transform scores to the range [0, 1].
+		 */
+		ENormalizeSigmoid	= ENormalizeZScore + 1
+	};
+
+	bool Open( const char* szFile, bool fMemmap = false, size_t iSkip = 2, bool fZScore = false,
+		bool fDuplicates = false );
+	bool Open( std::istream& istm, EFormat eFormat = EFormatBinary, float dDefault = HUGE_VAL,
+		bool fDuplicates = false, size_t iSkip = 2, bool fZScore = false );
+	bool Open( const CSlim& Slim );
+	bool Open( const CSlim& SlimPositives, const CSlim& SlimNonnegatives );
+	bool Open( const std::vector<std::string>& vecstrGenes, bool fClear = true, const char* szFile = NULL );
+	bool Open( const std::vector<std::string>& vecstrGenes, const CDistanceMatrix& MatValues );
+	bool Open( const std::vector<CGenes*>& vecpPositives, const std::vector<CGenes*>& vecpNonnegatives,
+		float dPValue, const CGenome& Genome );
+	bool Open( const CDat& DatKnown, const std::vector<CGenes*>& vecpOther, const CGenome& Genome,
+		bool fKnownNegatives );
+	bool Open( const CPCL& PCL, const IMeasure* pMeasure, bool fMeasureMemory );
+	bool Open( const CDat& Dat );
+
+	bool OpenGenes( std::istream& istm, bool fBinary, bool fPCL = false );
+	bool OpenGenes( const char* szFile, size_t iSkip = 2 );
+	void Save( std::ostream& ostm, EFormat eFormat = EFormatBinary ) const;
+	void Save( const char* szFile ) const;
+	void SaveDOT( std::ostream& ostm, float dCutoff = HUGE_VAL, const CGenome* pGenome = NULL,
+		bool fUnlabeled = false, bool fHashes = true, const std::vector<float>* pvecdColors = NULL,
+		const std::vector<float>* pvecdBorders = NULL ) const;
+	void SaveGDF( std::ostream& ostm, float dCutoff = HUGE_VAL ) const;
+	void SaveNET( std::ostream& ostm, float dCutoff = HUGE_VAL ) const;
+	void SaveMATISSE( std::ostream& ostm, float dCutoff = HUGE_VAL, const CGenome* pGenome = NULL ) const;
+	void Invert( );
+	void Rank( );
+	bool FilterGenes( const char* szGenes, EFilter eFilter, size_t iLimit = -1 );
+	void FilterGenes( const CGenes& Genes, EFilter eFilter, size_t iLimit = -1,
+		float dEdgeAggressiveness = 0.5, const std::vector<float>* pvecdWeights = NULL );
+
+	/*!
+	 * \brief
+	 * Normalize each finite value in the CDat by a specific function.
+	 * 
+	 * \param eNormalize
+	 * Method by which scores are normalized.
+	 * 
+	 * \remarks
+	 * Values are left unchanged if ( dMax == dMin ) or ( dStd == 0 ).
+	 * 
+	 * Dispatch: ENormalizeMinMax calls NormalizeMinmax and ENormalizeZScore calls NormalizeStdev.  Every
+	 * other value reaches the default branch and calls NormalizeSigmoid; that includes ENormalizeSigmoid
+	 * and also ENormalizeNone, which therefore is not a no-op here.
+	 * NOTE(review): if ENormalizeNone is supposed to leave the data untouched, it needs its own case;
+	 * check existing callers before changing this, since some may rely on the current behavior.
+	 * 
+	 * \see
+	 * ENormalize | Invert
+	 */
+	void Normalize( ENormalize eNormalize ) {
+
+		switch( eNormalize ) {
+			case ENormalizeMinMax:
+				NormalizeMinmax( );
+				break;
+
+			case ENormalizeZScore:
+				NormalizeStdev( );
+				break;
+
+			default:
+				NormalizeSigmoid( ); } }
+
+	/*!
+	 * \brief
+	 * Return the index of the given gene name, or -1 if it is not included in the CDat.
+	 * 
+	 * \param strGene
+	 * Gene name to retrieve.
+	 * 
+	 * \returns
+	 * Index of the requested gene name, or -1 if it is not in the CDat.
+	 * 
+	 * \remarks
+	 * Forwards to CDatImpl::GetGene.  The return type is the unsigned size_t, so the documented -1 reaches
+	 * callers as the largest size_t value; compare against ( (size_t)-1 ) rather than testing for a
+	 * negative result.
+	 * 
+	 * \see
+	 * GetGeneNames
+	 */
+	size_t GetGene( const std::string& strGene ) const {
+
+		return CDatImpl::GetGene( strGene ); }
+
+	/*!
+	 * \brief
+	 * Return the value at the requested CDat position.
+	 * 
+	 * \param iY
+	 * CDat row.
+	 * 
+	 * \param iX
+	 * CDat column.
+	 * 
+	 * \returns
+	 * Value at the requested CDat position.
+	 * 
+	 * \remarks
+	 * For efficiency, no bounds checking is performed.  The given row and column must be smaller than
+	 * GetGenes.  As a symmetric matrix, the value at position XY will always equal the value at position YX.
+	 * 
+	 * Forwards to CDatImpl::Get.  Although this method is const, it returns a non-const float reference.
+	 * NOTE(review): presumably the reference can be written through even on a const CDat, and its lifetime
+	 * depends on how CDatImpl::Get produces it - confirm against CDatImpl before relying on either.
+	 * 
+	 * \see
+	 * Set
+	 */
+	float& Get( size_t iY, size_t iX ) const {
+
+		return CDatImpl::Get( iY, iX ); }
+
+	/*!
+	 * \brief
+	 * Returns the number of elements (genes) in the CDat.
+	 * 
+	 * \returns
+	 * Number of elements (genes) in the CDat.
+	 * 
+	 * \remarks
+	 * Since a symmetric matrix must be square, the number of rows equals the number of columns and is thus
+	 * referred to as the number of elements (genes).  This public wrapper simply forwards to
+	 * CDatImpl::GetGenes.
+	 */
+	size_t GetGenes( ) const {
+
+		return CDatImpl::GetGenes( ); }
+
+	/*!
+	 * \brief
+	 * Returns the symmetric matrix containing the CDat's values (read-only overload).
+	 * 
+	 * \returns
+	 * Const reference to the m_Data member, the symmetric matrix containing the CDat's values.
+	 * 
+	 * \remarks
+	 * No copy is made; the reference refers to the CDat's own m_Data member.
+	 */
+	const CDistanceMatrix& Get( ) const {
+
+		return m_Data; }
+
+	/*!
+	 * \brief
+	 * Returns the symmetric matrix containing the CDat's values (mutable overload).
+	 * 
+	 * \returns
+	 * Non-const reference to the m_Data member, the symmetric matrix containing the CDat's values.
+	 * 
+	 * \remarks
+	 * No copy is made; changes applied through the returned reference modify this CDat's m_Data member
+	 * directly.
+	 */
+	CDistanceMatrix& Get( ) {
+
+		return m_Data; }
+
+	/*!
+	 * \brief
+	 * Set the value at the requested CDat position.
+	 * 
+	 * \param iY
+	 * CDat row.
+	 * 
+	 * \param iX
+	 * CDat column.
+	 * 
+	 * \param dValue
+	 * Value to store.
+	 * 
+	 * \returns
+	 * True if the value was stored successfully.  The result is whatever CDatImpl::Set reports.
+	 * 
+	 * \remarks
+	 * For efficiency, no bounds checking is performed.  The given row and column must be smaller than
+	 * GetGenes.
+	 * 
+	 * \see
+	 * Get
+	 */
+	bool Set( size_t iY, size_t iX, float dValue ) {
+
+		return CDatImpl::Set( iY, iX, dValue ); }
+
+	/*!
+	 * \brief
+	 * Returns the gene name at the given CDat position.
+	 * 
+	 * \param iGene
+	 * Index of gene name to return.
+	 * 
+	 * \returns
+	 * Gene name at the requested index, returned by value.
+	 * 
+	 * \remarks
+	 * For efficiency, no bounds checking is performed.  The given index must be smaller than GetGenes.
+	 * The lookup is forwarded to CDatImpl::GetGene.
+	 * 
+	 * \see
+	 * SetGene | GetGeneNames
+	 */
+	std::string GetGene( size_t iGene ) const {
+
+		return CDatImpl::GetGene( iGene ); }
+
+	/*!
+	 * \brief
+	 * Returns the vector of gene names associated with this CDat.
+	 * 
+	 * \returns
+	 * Const reference to the vector of this CDat's gene names, as provided by CDatImpl::GetGeneNames.
+	 * 
+	 * \remarks
+	 * Returned vector size will be identical to GetGenes.
+	 */
+	const std::vector<std::string>& GetGeneNames( ) const {
+
+		return CDatImpl::GetGeneNames( ); }
+
+	/*!
+	 * \brief
+	 * Set an entire row of CDat values efficiently.
+	 * 
+	 * \param iY
+	 * CDat row.
+	 * 
+	 * \param adValues
+	 * Values to store.
+	 * 
+	 * \remarks
+	 * For efficiency, no bounds checking is performed.  The given row must be smaller than GetGenes, and the
+	 * given array must be non-null and have length exactly (size - iY - 1).  The row and array are passed
+	 * straight through to m_Data.Set.
+	 * 
+	 * \see
+	 * Get
+	 */
+	void Set( size_t iY, const float* adValues ) {
+
+		m_Data.Set( iY, adValues ); }
+
+	/*!
+	 * \brief
+	 * Get an entire row of CDat values efficiently (read-only overload).
+	 * 
+	 * \param iY
+	 * CDat row.
+	 * 
+	 * \returns
+	 * Pointer to the retrieved values, as returned by m_Data.Get; the values cannot be modified through it.
+	 * 
+	 * \remarks
+	 * For efficiency, no bounds checking is performed.  The given row must be smaller than GetGenes and the
+	 * returned array will have length exactly (size - iY - 1).
+	 * 
+	 * \see
+	 * Set
+	 */
+	const float* Get( size_t iY ) const {
+
+		return m_Data.Get( iY ); }
+
+	/*!
+	 * \brief
+	 * Get an entire row of CDat values efficiently (mutable overload).
+	 * 
+	 * \param iY
+	 * CDat row.
+	 * 
+	 * \returns
+	 * Non-const pointer to the retrieved values, as returned by m_Data.Get.
+	 * 
+	 * \remarks
+	 * For efficiency, no bounds checking is performed.  The given row must be smaller than GetGenes and the
+	 * returned array will have length exactly (size - iY - 1).
+	 * 
+	 * \see
+	 * Set
+	 */
+	float* Get( size_t iY ) {
+
+		return m_Data.Get( iY ); }
+
+	/*!
+	 * \brief
+	 * Set the gene name at the given index.
+	 * 
+	 * \param iGene
+	 * Index of gene name to modify.
+	 * 
+	 * \param strGene
+	 * Gene name to store at the requested index.
+	 * 
+	 * \remarks
+	 * For efficiency, no bounds checking is performed.  The given index must be smaller than GetGenes.
+	 * If m_pPCL is non-null the call is forwarded to m_pPCL->SetGene; otherwise the entry
+	 * m_vecstrGenes[ iGene ] is overwritten.
+	 * 
+	 * \see
+	 * GetGene
+	 */
+	void SetGene( size_t iGene, const std::string& strGene ) {
+
+		if( m_pPCL )
+			m_pPCL->SetGene( iGene, strGene );
+		else
+			m_vecstrGenes[ iGene ] = strGene; }
+
 	/*!
 	 * \brief
 	 * Randomizes the CDat's values by iterated swapping.
 	 */
-	void Randomize( ) {
-		size_t	i, j, iOne, iTwo;
-		float	dOne, dTwo;
-
-		for( i = 0; i < GetGenes( ); ++i )
-			for( j = ( i + 1 ); j < GetGenes( ); ++j ) {
-				if( CMeta::IsNaN( dOne = Get( i, j ) ) )
-					continue;
-				while( true ) {
-					iOne = rand( ) % GetGenes( );
-					iTwo = rand( ) % GetGenes( );
-					if( iOne > iTwo )
-						std::swap( iOne, iTwo );
-					if( ( ( iOne != i ) || ( iTwo != j ) ) && !CMeta::IsNaN( dTwo = Get( iOne, iTwo ) ) )
-						break; }
-				Set( i, j, dTwo );
-				Set( iOne, iTwo, dOne ); } }
-};
-
-}
-
-#endif // DAT_H
+	void Randomize( ) {	// swaps every non-missing value at (i, j), i < j, with a randomly drawn non-missing partner
+		size_t	i, j, iOne, iTwo;
+		float	dOne, dTwo;
+
+		for( i = 0; i < GetGenes( ); ++i )
+			for( j = ( i + 1 ); j < GetGenes( ); ++j ) {
+				if( CMeta::IsNaN( dOne = Get( i, j ) ) )	// missing (NaN) values are never moved
+					continue;
+				while( true ) {	// redraw until a different, non-missing position is found
+					iOne = rand( ) % GetGenes( );
+					iTwo = rand( ) % GetGenes( );
+					if( iOne > iTwo )	// order the drawn pair so that iOne <= iTwo
+						std::swap( iOne, iTwo );
+					if( ( ( iOne != i ) || ( iTwo != j ) ) && !CMeta::IsNaN( dTwo = Get( iOne, iTwo ) ) )
+						break; }	// NOTE(review): this is the loop's only exit, so it looks like it cannot terminate when (i, j) is the sole non-missing value - confirm
+				Set( i, j, dTwo );	// exchange the two values
+				Set( iOne, iTwo, dOne ); } }
+};
+
+}
+
+#endif // DAT_H
 	void OpenHelper( const CGenes*, const CGenes*, float );
 	bool OpenHelper( );
 	bool OpenMemmap( const unsigned char* );
-	void FilterGenesGraph( const CGenes&, std::vector<bool>&, size_t, float, bool );
+	void FilterGenesGraph( const CGenes&, std::vector<bool>&, size_t, float, bool, const std::vector<float>* );
 
 	float& Get( size_t iX, size_t iY ) const {
 		static float	s_dRet;
 
 		case ENormalizeColumn:
 		case ENormalizeColumnCenter:
+		case ENormalizeColumnFraction:
 			for( i = 0; i < GetExperiments( ); ++i ) {
 				dAve = dStd = 0;
 				for( iCount = j = 0; j < GetGenes( ); ++j )
 						dAve += d;
 						dStd += d * d; }
 				if( iCount ) {
-					dAve /= iCount;
-					dStd = ( dStd / iCount ) - ( dAve * dAve );
-					dStd = ( dStd <= 0 ) ? 1 : sqrt( dStd );
-					if( eNormalize == ENormalizeColumnCenter )
-						dStd = 1;
+					if( eNormalize != ENormalizeColumnFraction ) {
+						dAve /= iCount;
+						dStd = ( dStd / iCount ) - ( dAve * dAve );
+						dStd = ( dStd <= 0 ) ? 1 : sqrt( dStd );
+						if( eNormalize == ENormalizeColumnCenter )
+							dStd = 1; }
 					for( j = 0; j < GetGenes( ); ++j )
-						if( !CMeta::IsNaN( d = Get( j, i ) ) )
-							Set( j, i, (float)( ( d - dAve ) / dStd ) ); } }
+						if( !CMeta::IsNaN( d = Get( j, i ) ) ) {
+							if( eNormalize == ENormalizeColumnFraction )
+								d /= dAve;
+							else
+								d = ( d - dAve ) / dStd;
+							Set( j, i, (float)d ); } } }
 			break;
 
 		case ENormalizeMinMax:
 		 * \brief
 		 * Subtract the column average from every value.
 		 */
-		ENormalizeColumnCenter
+		ENormalizeColumnCenter,
+		/*!
+		 * \brief
+		 * Divide each entry by the column sum.
+		 */
+		ENormalizeColumnFraction
 	};
 
 	static int Distance( const char* szFile, size_t iSkip, const char* szSimilarityMeasure, bool fNormalize,
  * 
  * \section sec_history Version History
  * 
+ * - <a href="sleipnir-2.2.tar.gz">2.2</a>, *** <br>
+ * Fix confusing documentation in \ref Answerer - thanks to Arjun Krishnan!
+ * 
  * - <a href="sleipnir-2.1.tar.gz">2.1</a>, 12-20-09 <br>
  * Update includes for gcc 4.3 compatibility - thanks to Casey Greene! <br>
  * Add \c half2relative.rb and \c half2weights.rb scripts to \ref MIer - thanks to Arjun Krishnan! <br>

tools/Answerer/stdafx.cpp

  * Given sets of known related genes - pathways, complexes, GO terms, etc. - Answerer generates a gold
  * standard Sleipnir::CDat.  By considering every pair of genes coannotated to one of these sets to be
  * related, the gold standard answers will include a collection of known functionally related pairs.
- * If Answerer is only provided with positive gene sets, it splits the world of gene pairs up into two
- * values, positive (related) and negative (unrelated), modulo any uncertain pairs introduced by the
- * \c overlap option (see below).
+ * If Answerer is only provided with positive gene sets, it will only generate positive (related) pairs,
+ * modulo any uncertain pairs introduced by the \c overlap option (see below).
  * 
  * In addition to these positive gene sets, Answerer optionally also takes one or more negative sets.
  * These represent "minimally related" genes, such that gene pairs \e not coannotated to a negative set

tools/Cliquer/Cliquer.cpp

 int cliques( const gengetopt_args_info&, const CDat&, const CDat&, const vector<size_t>& );
 int heavy( const gengetopt_args_info&, CDat&, const CDat&, const vector<size_t>& );
 int heavy2( const gengetopt_args_info&, CDat&, const CDat&, const vector<size_t>& );
+int motifs( const gengetopt_args_info&, CDat& );
 bool connectivity( size_t, const vector<size_t>&, const vector<float>&, const vector<size_t>&,
 	float, size_t, float, size_t, const CDat&, float&, size_t&, float&, size_t& );
 void max_connectivity( const vector<bool>&, const vector<size_t>&, const vector<float>&,
 		for( i = 0; i < veciKnowns.size( ); ++i )
 			veciKnowns[ i ] = DatKnowns.GetGene( Dat.GetGene( i ) ); }
 
-	iRet = sArgs.heavy_arg ? heavy2( sArgs, Dat, DatKnowns, veciKnowns ) :
-		cliques( sArgs, Dat, DatKnowns, veciKnowns );
+	iRet = sArgs.motifs_arg ? motifs( sArgs, Dat ) : ( sArgs.heavy_arg ?
+		heavy2( sArgs, Dat, DatKnowns, veciKnowns ) :
+		cliques( sArgs, Dat, DatKnowns, veciKnowns ) );
 
 	return iRet; }
 
 			break; }
 
 	return 0; }
+
+int motifs( const gengetopt_args_info& sArgs, CDat& Dat ) {	// prints every gene triangle whose three edges pass the threshold and of which exactly one is non-negative; always returns 0
+	size_t			i, j, k, iOne, iTwo, iThree;
+	vector<size_t>	veciOne, veciTwo;	// NOTE(review): veciOne, veciTwo and vecfSign are declared but never used in this function
+	vector<bool>	vecfSign;
+	float			dOne, dTwo, dThree;
+
+	for( i = 0; i < Dat.GetGenes( ); ++i )
+		for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j )
+			if( !CMeta::IsNaN( dOne = Dat.Get( i, j ) ) && ( fabs( dOne ) >= sArgs.motifs_arg ) ) {	// edge i-j must be present with magnitude >= motifs_arg
+				iOne = ( dOne >= 0 ) ? 1 : 0;	// 1 for a non-negative edge, 0 for a negative one
+				for( k = ( j + 1 ); k < Dat.GetGenes( ); ++k )
+					if( !CMeta::IsNaN( dTwo = Dat.Get( j, k ) ) && ( fabs( dTwo ) >= sArgs.motifs_arg ) &&
+						!CMeta::IsNaN( dThree = Dat.Get( i, k ) ) && ( fabs( dThree ) >= sArgs.motifs_arg ) ) {	// edges j-k and i-k must pass the same test
+						iTwo = ( dTwo >= 0 ) ? 1 : 0;
+						iThree = ( dThree >= 0 ) ? 1 : 0;
+						if( ( iOne + iTwo + iThree ) == 1 ) {	// exactly one of the three edges is non-negative
+							cout << Dat.GetGene( i ) << '\t' << dOne << '\t' <<	// tab-separated: gene i, edge i-j, gene j, edge j-k, gene k, edge i-k
+								Dat.GetGene( j ) << '\t' << dTwo << '\t' <<
+								Dat.GetGene( k ) << '\t' << dThree << endl; } } }
+
+	return 0; }

tools/Cliquer/Cliquer.ggo

 							int	default="100"
 option	"size"			S	"Size of subgraphs to find"
 							int	default="3"
+option	"motifs"		f	"Extract programmatically defined network motifs"
+							double	default="0"
 
 section "Preprocessing"
 option	"knowns"		k	"Known interactions (DAT/DAB) to ignore"
 option	"normalize"		n	"Normalize input file"
 							flag	off
 option	"cutoff"		c	"Exclude edges below cutoff"
-							double
+							double	default="0"
 
 section "Optional"
 option	"memmap"		m	"Memory map input"

tools/Cliquer/cmdline.c

   "\nMiscellaneous:",
   "  -s, --subgraphs=INT       Number of subgraphs to output  (default=`100')",
   "  -S, --size=INT            Size of subgraphs to find  (default=`3')",
+  "  -f, --motifs=DOUBLE       Extract programmatically defined network motifs  \n                              (default=`0')",
   "\nPreprocessing:",
   "  -k, --knowns=filename     Known interactions (DAT/DAB) to ignore",
   "  -n, --normalize           Normalize input file  (default=off)",
-  "  -c, --cutoff=DOUBLE       Exclude edges below cutoff",
+  "  -c, --cutoff=DOUBLE       Exclude edges below cutoff  (default=`0')",
   "\nOptional:",
   "  -m, --memmap              Memory map input  (default=off)",
   "  -v, --verbosity=INT       Message verbosity  (default=`5')",
   args_info->specificity_given = 0 ;
   args_info->subgraphs_given = 0 ;
   args_info->size_given = 0 ;
+  args_info->motifs_given = 0 ;
   args_info->knowns_given = 0 ;
   args_info->normalize_given = 0 ;
   args_info->cutoff_given = 0 ;
   args_info->subgraphs_orig = NULL;
   args_info->size_arg = 3;
   args_info->size_orig = NULL;
+  args_info->motifs_arg = 0;
+  args_info->motifs_orig = NULL;
   args_info->knowns_arg = NULL;
   args_info->knowns_orig = NULL;
   args_info->normalize_flag = 0;
+  args_info->cutoff_arg = 0;
   args_info->cutoff_orig = NULL;
   args_info->memmap_flag = 0;
   args_info->verbosity_arg = 5;
   args_info->specificity_help = gengetopt_args_info_help[5] ;
   args_info->subgraphs_help = gengetopt_args_info_help[7] ;
   args_info->size_help = gengetopt_args_info_help[8] ;
-  args_info->knowns_help = gengetopt_args_info_help[10] ;
-  args_info->normalize_help = gengetopt_args_info_help[11] ;
-  args_info->cutoff_help = gengetopt_args_info_help[12] ;
-  args_info->memmap_help = gengetopt_args_info_help[14] ;
-  args_info->verbosity_help = gengetopt_args_info_help[15] ;
+  args_info->motifs_help = gengetopt_args_info_help[9] ;
+  args_info->knowns_help = gengetopt_args_info_help[11] ;
+  args_info->normalize_help = gengetopt_args_info_help[12] ;
+  args_info->cutoff_help = gengetopt_args_info_help[13] ;
+  args_info->memmap_help = gengetopt_args_info_help[15] ;
+  args_info->verbosity_help = gengetopt_args_info_help[16] ;
   
 }
 
   free_string_field (&(args_info->specificity_orig));
   free_string_field (&(args_info->subgraphs_orig));
   free_string_field (&(args_info->size_orig));
+  free_string_field (&(args_info->motifs_orig));
   free_string_field (&(args_info->knowns_arg));
   free_string_field (&(args_info->knowns_orig));
   free_string_field (&(args_info->cutoff_orig));
     write_into_file(outfile, "subgraphs", args_info->subgraphs_orig, 0);
   if (args_info->size_given)
     write_into_file(outfile, "size", args_info->size_orig, 0);
+  if (args_info->motifs_given)
+    write_into_file(outfile, "motifs", args_info->motifs_orig, 0);
   if (args_info->knowns_given)
     write_into_file(outfile, "knowns", args_info->knowns_orig, 0);
   if (args_info->normalize_given)
         { "specificity",	1, NULL, 'r' },
         { "subgraphs",	1, NULL, 's' },
         { "size",	1, NULL, 'S' },
+        { "motifs",	1, NULL, 'f' },
         { "knowns",	1, NULL, 'k' },
         { "normalize",	0, NULL, 'n' },
         { "cutoff",	1, NULL, 'c' },
         { NULL,	0, NULL, 0 }
       };
 
-      c = getopt_long (argc, argv, "hVi:w:r:s:S:k:nc:mv:", long_options, &option_index);
+      c = getopt_long (argc, argv, "hVi:w:r:s:S:f:k:nc:mv:", long_options, &option_index);
 
       if (c == -1) break;	/* Exit from `while (1)' loop.  */
 
             goto failure;
         
           break;
+        case 'f':	/* Extract programmatically defined network motifs.  */
+        
+        
+          if (update_arg( (void *)&(args_info->motifs_arg), 
+               &(args_info->motifs_orig), &(args_info->motifs_given),
+              &(local_args_info.motifs_given), optarg, 0, "0", ARG_DOUBLE,
+              check_ambiguity, override, 0, 0,
+              "motifs", 'f',
+              additional_error))
+            goto failure;
+        
+          break;
         case 'k':	/* Known interactions (DAT/DAB) to ignore.  */
         
         
         
           if (update_arg( (void *)&(args_info->cutoff_arg), 
                &(args_info->cutoff_orig), &(args_info->cutoff_given),
-              &(local_args_info.cutoff_given), optarg, 0, 0, ARG_DOUBLE,
+              &(local_args_info.cutoff_given), optarg, 0, "0", ARG_DOUBLE,
               check_ambiguity, override, 0, 0,
               "cutoff", 'c',
               additional_error))

tools/Cliquer/cmdline.h

   int size_arg;	/**< @brief Size of subgraphs to find (default='3').  */
   char * size_orig;	/**< @brief Size of subgraphs to find original value given at command line.  */
   const char *size_help; /**< @brief Size of subgraphs to find help description.  */
+  double motifs_arg;	/**< @brief Extract programmatically defined network motifs (default='0').  */
+  char * motifs_orig;	/**< @brief Extract programmatically defined network motifs original value given at command line.  */
+  const char *motifs_help; /**< @brief Extract programmatically defined network motifs help description.  */
   char * knowns_arg;	/**< @brief Known interactions (DAT/DAB) to ignore.  */
   char * knowns_orig;	/**< @brief Known interactions (DAT/DAB) to ignore original value given at command line.  */
   const char *knowns_help; /**< @brief Known interactions (DAT/DAB) to ignore help description.  */
   int normalize_flag;	/**< @brief Normalize input file (default=off).  */
   const char *normalize_help; /**< @brief Normalize input file help description.  */
-  double cutoff_arg;	/**< @brief Exclude edges below cutoff.  */
+  double cutoff_arg;	/**< @brief Exclude edges below cutoff (default='0').  */
   char * cutoff_orig;	/**< @brief Exclude edges below cutoff original value given at command line.  */
   const char *cutoff_help; /**< @brief Exclude edges below cutoff help description.  */
   int memmap_flag;	/**< @brief Memory map input (default=off).  */
   unsigned int specificity_given ;	/**< @brief Whether specificity was given.  */
   unsigned int subgraphs_given ;	/**< @brief Whether subgraphs was given.  */
   unsigned int size_given ;	/**< @brief Whether size was given.  */
+  unsigned int motifs_given ;	/**< @brief Whether motifs was given.  */
   unsigned int knowns_given ;	/**< @brief Whether knowns was given.  */
   unsigned int normalize_given ;	/**< @brief Whether normalize was given.  */
   unsigned int cutoff_given ;	/**< @brief Whether cutoff was given.  */

tools/Dat2Graph/Dat2Graph.cpp

 
 	return 0; }
 
-int main( int iArgs, char** aszArgs ) {
+int open_values( const char* szFile, vector<float>& vecdValues ) {	// appends one float per line of szFile to vecdValues; returns 0 on success, 1 if the file cannot be opened
 	static const size_t	c_iBuf	= 1024;
 	char				szBuf[ c_iBuf ];
+	ifstream			ifsm;
+
+	if( szFile ) {	// a null file name is not an error: nothing is read and 0 is returned
+		ifsm.open( szFile );
+		if( !ifsm.is_open( ) ) {
+			cerr << "Could not open: " << szFile << endl;
+			return 1; }
+		while( ifsm.peek( ) != EOF ) {
+			ifsm.getline( szBuf, c_iBuf - 1 );	// each line is read into the fixed-size szBuf; NOTE(review): lines longer than the buffer are not handled explicitly - confirm inputs are short
+			vecdValues.push_back( (float)atof( szBuf ) ); }	// every line read contributes one value, parsed with atof
+		ifsm.close( ); }
+
+	return 0; }
+
+int main( int iArgs, char** aszArgs ) {
 	gengetopt_args_info	sArgs;
 	ifstream			ifsm;
 	CDat				Dat, DatNew;
 	CDat*				pDat;
 	float				d, dCutoff;
 	CGenome				Genome;
-	CGenes				GenesIn( Genome ), GenesQr( Genome );
+	CGenes				GenesIn( Genome ), GenesEx( Genome ), GenesQr( Genome );
 	int					iRet;
-	size_t				i, j;
-	vector<float>		vecdColors, vecdBorders;
+	size_t				i, j, k;
+	vector<float>		vecdColors, vecdBorders, vecdWeights;
+	vector<size_t>		veciQuery;
 
 	if( cmdline_parser2( iArgs, aszArgs, &sArgs, 0, 1, 0 ) && ( sArgs.config_arg &&
 		cmdline_parser_configfile( sArgs.config_arg, &sArgs, 0, 0, 1 ) ) ) {
 			cerr << "Could not open: " << sArgs.features_arg << endl;
 			return 1; }
 		ifsm.close( ); }
-
-	if( sArgs.colors_arg ) {
-		ifsm.clear( );
-		ifsm.open( sArgs.colors_arg );
-		if( !ifsm.is_open( ) ) {
-			cerr << "Could not open: " << sArgs.colors_arg << endl;
-			return 1; }
-		while( ifsm.peek( ) != EOF ) {
-			ifsm.getline( szBuf, c_iBuf - 1 );
-			vecdColors.push_back( (float)atof( szBuf ) ); }
-		ifsm.close( ); }
-
-	if( sArgs.borders_arg ) {
-		ifsm.clear( );
-		ifsm.open( sArgs.borders_arg );
-		if( !ifsm.is_open( ) ) {
-			cerr << "Could not open: " << sArgs.borders_arg << endl;
-			return 1; }
-		while( ifsm.peek( ) != EOF ) {
-			ifsm.getline( szBuf, c_iBuf - 1 );
-			vecdBorders.push_back( (float)atof( szBuf ) ); }
-		ifsm.close( ); }
-
+	if( iRet = open_values( sArgs.colors_arg, vecdColors ) )
+		return iRet;
+	if( iRet = open_values( sArgs.borders_arg, vecdBorders ) )
+		return iRet;
+	if( iRet = open_values( sArgs.genew_arg, vecdWeights ) )
+		return iRet;
 	if( iRet = open_genes( sArgs.genes_arg, GenesIn ) )
 		return iRet;
+	if( iRet = open_genes( sArgs.genex_arg, GenesEx ) )
+		return iRet;
 	if( iRet = open_genes( sArgs.geneq_arg, GenesQr ) )
 		return iRet;
 
 		return 1; }
 	pDat = &Dat;
 
-	dCutoff = (float)( sArgs.cutoff_given ? sArgs.cutoff_arg : HUGE_VAL );
+	veciQuery.resize( pDat->GetGenes( ) );
+	for( i = 0; i < veciQuery.size( ); ++i )
+		veciQuery[ i ] = GenesQr.GetGene( pDat->GetGene( i ) );
+
+	dCutoff = (float)( sArgs.cutoff_given ? sArgs.cutoff_arg : -FLT_MAX );
 	if( GenesIn.GetGenes( ) ) {
 		vector<size_t>	veciGenes;
 
 				if( veciGenes[ j ] != -1 )
 					DatNew.Set( i, j, Dat.Get( veciGenes[ i ], veciGenes[ j ] ) ); }
 		pDat = &DatNew; }
+	if( GenesEx.GetGenes( ) )
+		pDat->FilterGenes( GenesEx, CDat::EFilterExclude );
 	if( sArgs.normalize_flag )
-		pDat->Normalize( CDat::ENormalizeMinMax );
+		pDat->Normalize( CDat::ENormalizeSigmoid );
 	if( GenesQr.GetGenes( ) ) {
 		if( sArgs.cutoff_given )
 			for( i = 0; i < pDat->GetGenes( ); ++i )
 			for( i = 0; i < vecdScores.size( ); ++i )
 				cout << pDat->GetGene( i ) << '\t' << vecdScores[ i ] << endl; }
 		else {
-			dCutoff = 0;
 			if( vecdColors.empty( ) ) {
 				vecdColors.resize( pDat->GetGenes( ) );
 				fill( vecdColors.begin( ), vecdColors.end( ), 0.5f );
 					if( ( j = pDat->GetGene( GenesQr.GetGene( i ).GetName( ) ) ) != -1 )
 						vecdColors[ j ] = 1; }
 			pDat->FilterGenes( GenesQr, sArgs.hefalmp_flag ? CDat::EFilterHefalmp : CDat::EFilterPixie,
-				sArgs.neighbors_arg, (float)sArgs.edges_arg ); } }
+				sArgs.neighbors_arg, (float)sArgs.edges_arg, vecdWeights.empty( ) ? NULL : &vecdWeights ); } }
 	if( sArgs.knowns_arg ) {
 		CDat			DatKnowns;
 		vector<size_t>	veciKnowns;
 	else if( !strcmp( sArgs.format_arg, "matisse" ) )
 		pDat->SaveMATISSE( cout, dCutoff, &Genome );
 	else if( !strcmp( sArgs.format_arg, "list" ) ) {
-		vector<bool>					vecfQuery;
 		map<size_t, float>				mapGenes;
 		map<size_t, float>::iterator	iterGene;
-		size_t							iGene;
+		float							dCur;
 
-		vecfQuery.resize( pDat->GetGenes( ) );
-		for( i = 0; i < vecfQuery.size( ); ++i )
-			vecfQuery[ i ] = GenesQr.IsGene( pDat->GetGene( i ) );
 		for( i = 0; i < pDat->GetGenes( ); ++i )
 			for( j = ( i + 1 ); j < pDat->GetGenes( ); ++j )
 				if( !CMeta::IsNaN( d = pDat->Get( i, j ) ) &&
-					( CMeta::IsNaN( dCutoff ) || ( d > dCutoff ) ) &&
-					( vecfQuery[ i ] != vecfQuery[ j ] ) ) {
-					iGene = vecfQuery[ i ] ? j : i;
-					if( ( iterGene = mapGenes.find( iGene ) ) == mapGenes.end( ) )
-						mapGenes[ iGene ] = d;
-					else
-						iterGene->second += d; }
+					( CMeta::IsNaN( dCutoff ) || ( d > dCutoff ) ) ) {
+					if( ( k = veciQuery[ i ] ) != -1 ) {
+						dCur = d * ( vecdWeights.empty( ) ? 1 : vecdWeights[ k ] );
+						if( ( iterGene = mapGenes.find( j ) ) == mapGenes.end( ) )
+							mapGenes[ j ] = dCur;
+						else
+							iterGene->second += dCur; }
+					if( ( k = veciQuery[ j ] ) != -1 ) {
+						dCur = d * ( vecdWeights.empty( ) ? 1 : vecdWeights[ k ] );
+						if( ( iterGene = mapGenes.find( i ) ) == mapGenes.end( ) )
+							mapGenes[ i ] = dCur;
+						else
+							iterGene->second += dCur; } }
 		for( iterGene = mapGenes.begin( ); iterGene != mapGenes.end( ); ++iterGene )
 			cout << pDat->GetGene( iterGene->first ) << '\t' << iterGene->second << endl; }
 	else if( !strcmp( sArgs.format_arg, "dat" ) ) {
 		for( i = 0; i < pDat->GetGenes( ); ++i )
 			for( j = ( i + 1 ); j < pDat->GetGenes( ); ++j )
-				if( pDat->Get( i, j ) < dCutoff )
+				if( ( d = pDat->Get( i, j ) ) < dCutoff )
 					pDat->Set( i, j, CMeta::GetNaN( ) );
 		pDat->Save( cout, CDat::EFormatText ); }
 

tools/Dat2Graph/Dat2Graph.ggo

 section "Graph Queries"
 option	"geneq"		q	"Query inclusion file"
 						string	typestr="filename"
+option	"genew"		Q	"Query weights file"
+						string	typestr="filename"
 option	"neighbors"	k	"Size of query neighborhood"
 						int	default="-1"
 option	"hefalmp"	a	"Perform HEFalMp query instead of bioPIXIE query"
 						double
 option	"genes"		g	"Gene inclusion file"
 						string	typestr="filename"
+option	"genex"		G	"Gene exclusion file"
+						string	typestr="filename"
 option	"knowns"	w	"Known interactions (DAT/DAB) to ignore"
 						string	typestr="filename"
 

tools/Dat2Graph/cmdline.c

   "  -t, --format=STRING      Output graph format  (possible values=\"dot\", \n                             \"gdf\", \"net\", \"matisse\", \"list\", \"dat\", \n                             \"correl\" default=`dot')",
   "\nGraph Queries:",
   "  -q, --geneq=filename     Query inclusion file",
+  "  -Q, --genew=filename     Query weights file",
   "  -k, --neighbors=INT      Size of query neighborhood  (default=`-1')",
   "  -a, --hefalmp            Perform HEFalMp query instead of bioPIXIE query  \n                             (default=on)",
   "  -d, --edges=DOUBLE       Aggressiveness of edge trimming after query  \n                             (default=`1')",
   "\nFiltering:",
   "  -e, --cutoff=DOUBLE      Minimum edge weight for output",
   "  -g, --genes=filename     Gene inclusion file",
+  "  -G, --genex=filename     Gene exclusion file",
   "  -w, --knowns=filename    Known interactions (DAT/DAB) to ignore",
   "\nAnnotation:",
   "  -f, --features=filename  SGD gene features",
   args_info->input_given = 0 ;
   args_info->format_given = 0 ;
   args_info->geneq_given = 0 ;
+  args_info->genew_given = 0 ;
   args_info->neighbors_given = 0 ;
   args_info->hefalmp_given = 0 ;
   args_info->edges_given = 0 ;
   args_info->cutoff_given = 0 ;
   args_info->genes_given = 0 ;
+  args_info->genex_given = 0 ;
   args_info->knowns_given = 0 ;
   args_info->features_given = 0 ;
   args_info->colors_given = 0 ;
   args_info->format_orig = NULL;
   args_info->geneq_arg = NULL;
   args_info->geneq_orig = NULL;
+  args_info->genew_arg = NULL;
+  args_info->genew_orig = NULL;
   args_info->neighbors_arg = -1;
   args_info->neighbors_orig = NULL;
   args_info->hefalmp_flag = 1;
   args_info->cutoff_orig = NULL;
   args_info->genes_arg = NULL;
   args_info->genes_orig = NULL;
+  args_info->genex_arg = NULL;
+  args_info->genex_orig = NULL;
   args_info->knowns_arg = NULL;
   args_info->knowns_orig = NULL;
   args_info->features_arg = NULL;
   args_info->input_help = gengetopt_args_info_help[3] ;
   args_info->format_help = gengetopt_args_info_help[4] ;
   args_info->geneq_help = gengetopt_args_info_help[6] ;
-  args_info->neighbors_help = gengetopt_args_info_help[7] ;
-  args_info->hefalmp_help = gengetopt_args_info_help[8] ;
-  args_info->edges_help = gengetopt_args_info_help[9] ;
-  args_info->cutoff_help = gengetopt_args_info_help[11] ;
-  args_info->genes_help = gengetopt_args_info_help[12] ;
-  args_info->knowns_help = gengetopt_args_info_help[13] ;
-  args_info->features_help = gengetopt_args_info_help[15] ;
-  args_info->colors_help = gengetopt_args_info_help[16] ;
-  args_info->borders_help = gengetopt_args_info_help[17] ;
-  args_info->normalize_help = gengetopt_args_info_help[19] ;
-  args_info->memmap_help = gengetopt_args_info_help[20] ;
-  args_info->config_help = gengetopt_args_info_help[21] ;
-  args_info->verbosity_help = gengetopt_args_info_help[22] ;
+  args_info->genew_help = gengetopt_args_info_help[7] ;
+  args_info->neighbors_help = gengetopt_args_info_help[8] ;
+  args_info->hefalmp_help = gengetopt_args_info_help[9] ;
+  args_info->edges_help = gengetopt_args_info_help[10] ;
+  args_info->cutoff_help = gengetopt_args_info_help[12] ;
+  args_info->genes_help = gengetopt_args_info_help[13] ;
+  args_info->genex_help = gengetopt_args_info_help[14] ;
+  args_info->knowns_help = gengetopt_args_info_help[15] ;
+  args_info->features_help = gengetopt_args_info_help[17] ;
+  args_info->colors_help = gengetopt_args_info_help[18] ;
+  args_info->borders_help = gengetopt_args_info_help[19] ;
+  args_info->normalize_help = gengetopt_args_info_help[21] ;
+  args_info->memmap_help = gengetopt_args_info_help[22] ;
+  args_info->config_help = gengetopt_args_info_help[23] ;
+  args_info->verbosity_help = gengetopt_args_info_help[24] ;
   
 }
 
   free_string_field (&(args_info->format_orig));
   free_string_field (&(args_info->geneq_arg));
   free_string_field (&(args_info->geneq_orig));
+  free_string_field (&(args_info->genew_arg));
+  free_string_field (&(args_info->genew_orig));
   free_string_field (&(args_info->neighbors_orig));
   free_string_field (&(args_info->edges_orig));
   free_string_field (&(args_info->cutoff_orig));
   free_string_field (&(args_info->genes_arg));
   free_string_field (&(args_info->genes_orig));
+  free_string_field (&(args_info->genex_arg));
+  free_string_field (&(args_info->genex_orig));
   free_string_field (&(args_info->knowns_arg));
   free_string_field (&(args_info->knowns_orig));
   free_string_field (&(args_info->features_arg));
     write_into_file(outfile, "format", args_info->format_orig, cmdline_parser_format_values);
   if (args_info->geneq_given)
     write_into_file(outfile, "geneq", args_info->geneq_orig, 0);
+  if (args_info->genew_given)
+    write_into_file(outfile, "genew", args_info->genew_orig, 0);
   if (args_info->neighbors_given)
     write_into_file(outfile, "neighbors", args_info->neighbors_orig, 0);
   if (args_info->hefalmp_given)
     write_into_file(outfile, "cutoff", args_info->cutoff_orig, 0);
   if (args_info->genes_given)
     write_into_file(outfile, "genes", args_info->genes_orig, 0);
+  if (args_info->genex_given)
+    write_into_file(outfile, "genex", args_info->genex_orig, 0);
   if (args_info->knowns_given)
     write_into_file(outfile, "knowns", args_info->knowns_orig, 0);
   if (args_info->features_given)
         { "input",	1, NULL, 'i' },
         { "format",	1, NULL, 't' },
         { "geneq",	1, NULL, 'q' },
+        { "genew",	1, NULL, 'Q' },
         { "neighbors",	1, NULL, 'k' },
         { "hefalmp",	0, NULL, 'a' },
         { "edges",	1, NULL, 'd' },
         { "cutoff",	1, NULL, 'e' },
         { "genes",	1, NULL, 'g' },
+        { "genex",	1, NULL, 'G' },
         { "knowns",	1, NULL, 'w' },
         { "features",	1, NULL, 'f' },
         { "colors",	1, NULL, 'l' },
         { NULL,	0, NULL, 0 }
       };
 
-      c = getopt_long (argc, argv, "hVi:t:q:k:ad:e:g:w:f:l:b:nmc:v:", long_options, &option_index);
+      c = getopt_long (argc, argv, "hVi:t:q:Q:k:ad:e:g:G:w:f:l:b:nmc:v:", long_options, &option_index);
 
       if (c == -1) break;	/* Exit from `while (1)' loop.  */
 
             goto failure;
         
           break;
+        case 'Q':	/* Query weights file.  */
+        
+        
+          if (update_arg( (void *)&(args_info->genew_arg), 
+               &(args_info->genew_orig), &(args_info->genew_given),
+              &(local_args_info.genew_given), optarg, 0, 0, ARG_STRING,
+              check_ambiguity, override, 0, 0,
+              "genew", 'Q',
+              additional_error))
+            goto failure;
+        
+          break;
         case 'k':	/* Size of query neighborhood.  */
         
         
             goto failure;
         
           break;
+        case 'G':	/* Gene exclusion file.  */
+        
+        
+          if (update_arg( (void *)&(args_info->genex_arg), 
+               &(args_info->genex_orig), &(args_info->genex_given),
+              &(local_args_info.genex_given), optarg, 0, 0, ARG_STRING,
+              check_ambiguity, override, 0, 0,
+              "genex", 'G',
+              additional_error))
+            goto failure;
+        
+          break;
         case 'w':	/* Known interactions (DAT/DAB) to ignore.  */
         
         

tools/Dat2Graph/cmdline.h

   char * geneq_arg;	/**< @brief Query inclusion file.  */
   char * geneq_orig;	/**< @brief Query inclusion file original value given at command line.  */
   const char *geneq_help; /**< @brief Query inclusion file help description.  */
+  char * genew_arg;	/**< @brief Query weights file.  */
+  char * genew_orig;	/**< @brief Query weights file original value given at command line.  */
+  const char *genew_help; /**< @brief Query weights file help description.  */
   int neighbors_arg;	/**< @brief Size of query neighborhood (default='-1').  */
   char * neighbors_orig;	/**< @brief Size of query neighborhood original value given at command line.  */
   const char *neighbors_help; /**< @brief Size of query neighborhood help description.  */
   char * genes_arg;	/**< @brief Gene inclusion file.  */
   char * genes_orig;	/**< @brief Gene inclusion file original value given at command line.  */
   const char *genes_help; /**< @brief Gene inclusion file help description.  */
+  char * genex_arg;	/**< @brief Gene exclusion file.  */
+  char * genex_orig;	/**< @brief Gene exclusion file original value given at command line.  */
+  const char *genex_help; /**< @brief Gene exclusion file help description.  */
   char * knowns_arg;	/**< @brief Known interactions (DAT/DAB) to ignore.  */
   char * knowns_orig;	/**< @brief Known interactions (DAT/DAB) to ignore original value given at command line.  */
   const char *knowns_help; /**< @brief Known interactions (DAT/DAB) to ignore help description.  */
   unsigned int input_given ;	/**< @brief Whether input was given.  */
   unsigned int format_given ;	/**< @brief Whether format was given.  */
   unsigned int geneq_given ;	/**< @brief Whether geneq was given.  */
+  unsigned int genew_given ;	/**< @brief Whether genew was given.  */
   unsigned int neighbors_given ;	/**< @brief Whether neighbors was given.  */
   unsigned int hefalmp_given ;	/**< @brief Whether hefalmp was given.  */
   unsigned int edges_given ;	/**< @brief Whether edges was given.  */
   unsigned int cutoff_given ;	/**< @brief Whether cutoff was given.  */
   unsigned int genes_given ;	/**< @brief Whether genes was given.  */
+  unsigned int genex_given ;	/**< @brief Whether genex was given.  */
   unsigned int knowns_given ;	/**< @brief Whether knowns was given.  */
   unsigned int features_given ;	/**< @brief Whether features was given.  */
   unsigned int colors_given ;	/**< @brief Whether colors was given.  */

tools/Funcifier/Funcifier.cpp

 	vector<CGenes*>			vecpGenes;
 	vector<string>			vecstrNames;
 	vector<vector<size_t> >	vecveciGenes;
-	float					d, dAveIn, dAveOut;
+	vector<vector<float> >	vecvecdGenes;
+	float					d, dAveIn, dAveOut, dOne, dTwo;
+	double					dCountIn;
 	ofstream				ofsm;
 	EShared					eShared;
 
 		vecveciGenes[ i ].resize( vecpGenes[ i ]->GetGenes( ) );
 		for( j = 0; j < vecpGenes[ i ]->GetGenes( ); ++j )
 			vecveciGenes[ i ][ j ] = DatIn.GetGene( vecpGenes[ i ]->GetGene( j ).GetName( ) ); }
+	if( sArgs.weights_arg ) {	// optional: only load per-set, per-gene weights when a -w/--weights file was supplied
+		CPCL	PCLWeights;
+
+		vecvecdGenes.resize( vecpGenes.size( ) );	// one weight vector per gene set; stays empty (meaning "all weights 1" downstream) when no file is given
+		if( !PCLWeights.Open( sArgs.weights_arg, 0 ) ) {	// NOTE(review): second argument is presumably the number of PCL columns to skip - confirm against CPCL::Open
+			cerr << "Could not open: " << sArgs.weights_arg << endl;
+			return 1; }
+		for( i = 0; i < vecstrNames.size( ); ++i ) {	// assumes vecstrNames and vecpGenes are parallel (same length, same order) - TODO confirm where they are filled
+			vecvecdGenes[i].resize( vecveciGenes[i].size( ) );	// weight slots line up index-for-index with the set's gene list
+			if( ( iOne = PCLWeights.GetExperiment( vecstrNames[i] ) ) == -1 ) {	// gene set name is looked up as a PCL experiment (column); -1 means not found
+				cerr << "Could not find gene set weight: " << vecstrNames[i] << endl;
+				fill( vecvecdGenes[i].begin( ), vecvecdGenes[i].end( ), 0 );	// set missing from the weights file: every gene in it gets weight 0
+				continue; }
+			for( j = 0; j < vecpGenes[i]->GetGenes( ); ++j )	// per gene in this set, looked up by name in the weights PCL
+				vecvecdGenes[i][j] = ( ( iTwo = PCLWeights.GetGene( vecpGenes[i]->GetGene( j ).GetName( ) ) ) == -1 ) ?	// gene absent from the weights PCL -> weight 0
+					0 : PCLWeights.Get( iTwo, iOne ); } }	// otherwise the weight is the PCL value at (gene row, gene-set column)
 
 	{
 		CDat	DatOut;
 					else
 						iterGene->second++; }
 
-				dAveIn = 0;
-				for( iCountIn = i = 0; i < vecveciGenes[ iF1 ].size( ); ++i ) {
+				dCountIn = dAveIn = 0;
+				for( i = 0; i < vecveciGenes[ iF1 ].size( ); ++i ) {
 					if( ( iOne = vecveciGenes[ iF1 ][ i ] ) == -1 )
 						continue;
+					dOne = vecvecdGenes.empty( ) ? 1 : vecvecdGenes[iF1][i];
 					iSharedOne = mappiGenes[ &vecpGenes[ iF1 ]->GetGene( i ) ];
 					if( ( eShared == ESharedDiscard ) && ( iSharedOne > 1 ) )
 						continue;
 
 						if( ( ( iTwo = vecveciGenes[ iF2 ][ j ] ) != -1 ) &&
 							!CMeta::IsNaN( d = DatIn.Get( iOne, iTwo ) ) ) {
-							iCountIn++;
-							dAveIn += d; } } }
-				DatOut.Set( iF1, iF2, dAveIn / iCountIn ); }
+							dTwo = vecvecdGenes.empty( ) ? 1 : vecvecdGenes[iF2][j];
+							dCountIn += dOne * dTwo;
+							dAveIn += d * dOne * dTwo; } } }
+				DatOut.Set( iF1, iF2, dCountIn ? (float)( dAveIn / dCountIn ) : CMeta::GetNaN( ) ); }
 		if( sArgs.zscore_flag )
 			DatOut.Normalize( CDat::ENormalizeZScore );
 		DatOut.Save( sArgs.output_arg );

tools/Funcifier/Funcifier.ggo

 							values="ignore","discard","oneonly"	default="discard"
 option	"colors"		l	"Function cohesiveness output file"
 							string	typestr="filename"
+option	"weights"		w	"PCL file of set-by-gene weights"
+							string	typestr="filename"
 
 section "Optional"
 option	"normalize"		n	"Normalize input to the range [0,1]"

tools/Funcifier/cmdline.c

 const char *gengetopt_args_info_description = "";
 
 const char *gengetopt_args_info_help[] = {
-  "  -h, --help             Print help and exit",
-  "  -V, --version          Print version and exit",
+  "  -h, --help              Print help and exit",
+  "  -V, --version           Print version and exit",
   "\nMain:",
-  "  -i, --input=filename   Input interaction network",
-  "  -o, --output=filename  Output function network",
+  "  -i, --input=filename    Input interaction network",
+  "  -o, --output=filename   Output function network",
   "\nMiscellaneous:",
-  "  -s, --shared=STRING    Determine shared gene handling  (possible \n                           values=\"ignore\", \"discard\", \"oneonly\" \n                           default=`discard')",
-  "  -l, --colors=filename  Function cohesiveness output file",
+  "  -s, --shared=STRING     Determine shared gene handling  (possible \n                            values=\"ignore\", \"discard\", \"oneonly\" \n                            default=`discard')",
+  "  -l, --colors=filename   Function cohesiveness output file",
+  "  -w, --weights=filename  PCL file of set-by-gene weights",
   "\nOptional:",
-  "  -n, --normalize        Normalize input to the range [0,1]  (default=off)",
-  "  -z, --zscore           Normalize output by z-scoring  (default=off)",
-  "  -m, --memmap           Memory map input  (default=off)",
-  "  -v, --verbosity=INT    Message verbosity  (default=`5')",
+  "  -n, --normalize         Normalize input to the range [0,1]  (default=off)",
+  "  -z, --zscore            Normalize output by z-scoring  (default=off)",
+  "  -m, --memmap            Memory map input  (default=off)",
+  "  -v, --verbosity=INT     Message verbosity  (default=`5')",
     0
 };
 
   args_info->output_given = 0 ;
   args_info->shared_given = 0 ;
   args_info->colors_given = 0 ;
+  args_info->weights_given = 0 ;
   args_info->normalize_given = 0 ;
   args_info->zscore_given = 0 ;
   args_info->memmap_given = 0 ;
   args_info->shared_orig = NULL;
   args_info->colors_arg = NULL;
   args_info->colors_orig = NULL;
+  args_info->weights_arg = NULL;
+  args_info->weights_orig = NULL;
   args_info->normalize_flag = 0;
   args_info->zscore_flag = 0;
   args_info->memmap_flag = 0;
   args_info->output_help = gengetopt_args_info_help[4] ;
   args_info->shared_help = gengetopt_args_info_help[6] ;
   args_info->colors_help = gengetopt_args_info_help[7] ;
-  args_info->normalize_help = gengetopt_args_info_help[9] ;
-  args_info->zscore_help = gengetopt_args_info_help[10] ;
-  args_info->memmap_help = gengetopt_args_info_help[11] ;
-  args_info->verbosity_help = gengetopt_args_info_help[12] ;
+  args_info->weights_help = gengetopt_args_info_help[8] ;
+  args_info->normalize_help = gengetopt_args_info_help[10] ;
+  args_info->zscore_help = gengetopt_args_info_help[11] ;
+  args_info->memmap_help = gengetopt_args_info_help[12] ;
+  args_info->verbosity_help = gengetopt_args_info_help[13] ;
   
 }
 
   free_string_field (&(args_info->shared_orig));
   free_string_field (&(args_info->colors_arg));
   free_string_field (&(args_info->colors_orig));
+  free_string_field (&(args_info->weights_arg));
+  free_string_field (&(args_info->weights_orig));
   free_string_field (&(args_info->verbosity_orig));
   
   
     write_into_file(outfile, "shared", args_info->shared_orig, cmdline_parser_shared_values);
   if (args_info->colors_given)
     write_into_file(outfile, "colors", args_info->colors_orig, 0);
+  if (args_info->weights_given)
+    write_into_file(outfile, "weights", args_info->weights_orig, 0);
   if (args_info->normalize_given)
     write_into_file(outfile, "normalize", 0, 0 );
   if (args_info->zscore_given)
         { "output",	1, NULL, 'o' },
         { "shared",	1, NULL, 's' },
         { "colors",	1, NULL, 'l' },
+        { "weights",	1, NULL, 'w' },
         { "normalize",	0, NULL, 'n' },
         { "zscore",	0, NULL, 'z' },
         { "memmap",	0, NULL, 'm' },
         { NULL,	0, NULL, 0 }
       };
 
-      c = getopt_long (argc, argv, "hVi:o:s:l:nzmv:", long_options, &option_index);
+      c = getopt_long (argc, argv, "hVi:o:s:l:w:nzmv:", long_options, &option_index);
 
       if (c == -1) break;	/* Exit from `while (1)' loop.  */
 
             goto failure;
         
           break;
+        case 'w':	/* PCL file of set-by-gene weights.  */
+        
+        
+          if (update_arg( (void *)&(args_info->weights_arg), 
+               &(args_info->weights_orig), &(args_info->weights_given),
+              &(local_args_info.weights_given), optarg, 0, 0, ARG_STRING,
+              check_ambiguity, override, 0, 0,
+              "weights", 'w',
+              additional_error))
+            goto failure;
+        
+          break;
         case 'n':	/* Normalize input to the range [0,1].  */
         
         

tools/Funcifier/cmdline.h

   char * colors_arg;	/**< @brief Function cohesiveness output file.  */
   char * colors_orig;	/**< @brief Function cohesiveness output file original value given at command line.  */
   const char *colors_help; /**< @brief Function cohesiveness output file help description.  */
+  char * weights_arg;	/**< @brief PCL file of set-by-gene weights.  */
+  char * weights_orig;	/**< @brief PCL file of set-by-gene weights original value given at command line.  */
+  const char *weights_help; /**< @brief PCL file of set-by-gene weights help description.  */
   int normalize_flag;	/**< @brief Normalize input to the range [0,1] (default=off).  */
   const char *normalize_help; /**< @brief Normalize input to the range [0,1] help description.  */
   int zscore_flag;	/**< @brief Normalize output by z-scoring (default=off).  */
   unsigned int output_given ;	/**< @brief Whether output was given.  */
   unsigned int shared_given ;	/**< @brief Whether shared was given.  */
   unsigned int colors_given ;	/**< @brief Whether colors was given.  */
+  unsigned int weights_given ;	/**< @brief Whether weights was given.  */
   unsigned int normalize_given ;	/**< @brief Whether normalize was given.  */
   unsigned int zscore_given ;	/**< @brief Whether zscore was given.  */
   unsigned int memmap_given ;	/**< @brief Whether memmap was given.  */

tools/Normalizer/Normalizer.cpp

 	{"globalz",		CPCL::ENormalizeZScore},
 	{"0to1",		CPCL::ENormalizeMinMax},
 	{"colcenter",	CPCL::ENormalizeColumnCenter},
+	{"colfrac",		CPCL::ENormalizeColumnFraction},
 	{NULL,			CPCL::ENormalizeNone}
 };
 

tools/Normalizer/Normalizer.ggo

 option	"itype"		t	"Data file type"
 						values="pcl","dat"	default="dat"
 option	"otype"		T	"Normalization type"
-						values="columnz","rowz","globalz","column0","0to1","colcenter","medmult"	default="globalz"
+						values="columnz","rowz","globalz","column0","0to1","colcenter","medmult","colfrac"	default="globalz"
 
 section "Optional"
 option	"flip"		f	"Flip high/low scores"

tools/Normalizer/cmdline.c

   "  -i, --input=filename   Input/output PCL/DAT/DAB file",
   "  -o, --output=filename  Output PCL/DAB file",
   "  -t, --itype=STRING     Data file type  (possible values=\"pcl\", \"dat\" \n                           default=`dat')",
-  "  -T, --otype=STRING     Normalization type  (possible values=\"columnz\", \n                           \"rowz\", \"globalz\", \"column0\", \"0to1\", \n                           \"colcenter\", \"medmult\" default=`globalz')",
+  "  -T, --otype=STRING     Normalization type  (possible values=\"columnz\", \n                           \"rowz\", \"globalz\", \"column0\", \"0to1\", \n                           \"colcenter\", \"medmult\", \"colfrac\" \n                           default=`globalz')",
   "\nOptional:",
   "  -f, --flip             Flip high/low scores  (default=off)",
   "  -s, --skip=INT         Columns to skip in input PCL  (default=`2')",
 
 
 char *cmdline_parser_itype_values[] = {"pcl", "dat", 0} ;	/* Possible values for itype.  */
-char *cmdline_parser_otype_values[] = {"columnz", "rowz", "globalz", "column0", "0to1", "colcenter", "medmult", 0} ;	/* Possible values for otype.  */
+char *cmdline_parser_otype_values[] = {"columnz", "rowz", "globalz", "column0", "0to1", "colcenter", "medmult", "colfrac", 0} ;	/* Possible values for otype.  */