Commits

chut...@hutlab3.sph.harvard.edu  committed 85aae76

Update documentation to include new email and Mercurial repo location
Update Combiner to make random effects + AD normality the default
Fix a scary-stupid memory leak (my fault) in CHalfMatrix::SetSize
Update COntologyKEGG to parse more recent versions of the KEGG Ontology
Add PCC normalization to CDat and Normalizer - thanks to Arjun Krishnan!
Add minimum edge count between groups for inclusion to Funcifier
Add CStatistics::AndersonDarlingScore and Test as a normality test
Add qmeta (AndersonDarling-only) weighting option to Combiner

  • Participants
  • Parent commits 1f6a970

Comments (0)

Files changed (19)

File src/annotationi.h

 	static const char	c_szDefinition[];
 	static const char	c_szClass[];
 	static const char	c_szPath[];
+	static const char	c_szReference[];
+	static const char	c_szDisease[];
+	static const char	c_szPathway[];
+	static const char	c_szModule[];
 	static const char	c_szBR[];
 	static const char	c_szDBLinks[];
 	static const char	c_szGenes[];
 
 	bool Open( SParserKEGG& );
 	bool OpenEntry( SParserKEGG& );
+	bool OpenReferences( SParserKEGG& );
+	bool OpenReference( SParserKEGG& );
 	bool OpenName( SParserKEGG& );
+	bool OpenDisease( SParserKEGG& );
+	bool OpenPathway( SParserKEGG& );
+	bool OpenModule( SParserKEGG& );
 	bool OpenDefinition( SParserKEGG& );
 	bool OpenClass( SParserKEGG& );
 	bool OpenDBLinks( SParserKEGG& );

File src/annotationkegg.cpp

 const char	COntologyKEGGImpl::c_szDefinition[]	= "DEFINITION";
 const char	COntologyKEGGImpl::c_szClass[]		= "CLASS";
 const char	COntologyKEGGImpl::c_szPath[]		= "PATH:";
+const char	COntologyKEGGImpl::c_szReference[]	= "REFERENCE";
+const char	COntologyKEGGImpl::c_szDisease[]	= "DISEASE";
+const char	COntologyKEGGImpl::c_szPathway[]	= "PATHWAY";
+const char	COntologyKEGGImpl::c_szModule[]		= "MODULE";
 const char	COntologyKEGGImpl::c_szBR[]			= "BR:";
 const char	COntologyKEGGImpl::c_szDBLinks[]	= "DBLINKS";
 const char	COntologyKEGGImpl::c_szGenes[]		= "GENES";
 
 	sParser.Reset( );
 	return ( OpenEntry( sParser ) && OpenName( sParser ) &&
-		OpenDefinition( sParser ) && OpenClass( sParser ) &&
-		OpenDBLinks( sParser ) && OpenGenes( sParser ) &&
+		OpenDefinition( sParser ) && OpenPathway( sParser ) &&
+		OpenModule( sParser ) && OpenDisease( sParser ) &&
+		OpenClass( sParser ) && OpenDBLinks( sParser ) &&
+		OpenGenes( sParser ) && OpenReferences( sParser ) &&
 		OpenEnd( sParser ) ); }
 
 bool COntologyKEGGImpl::OpenEntry( SParserKEGG& sParser ) {
 	g_CatSleipnir( ).debug( "COntologyKEGGImpl::OpenName( ) %s", sParser.m_szLine );
 	return ( sParser.IsStart( c_szName ) ? sParser.GetLine( ) : true ); }
 
+bool COntologyKEGGImpl::OpenPathway( SParserKEGG& sParser ) {
+
+	if( !sParser.IsStart( c_szPathway ) )
+		return true;
+
+	do
+		if( !sParser.GetLine( ) )
+			return false;
+	while( isspace( sParser.m_szLine[ 0 ] ) );
+
+	return true; }
+
+bool COntologyKEGGImpl::OpenReferences( SParserKEGG& sParser ) {
+
+	while( OpenReference( sParser ) );
+
+	return true; }
+
+bool COntologyKEGGImpl::OpenReference( SParserKEGG& sParser ) {
+
+	if( !sParser.IsStart( c_szReference ) )
+		return false;
+
+	do
+		if( !sParser.GetLine( ) )
+			return false;
+	while( isspace( sParser.m_szLine[ 0 ] ) );
+
+	return true; }
+
+bool COntologyKEGGImpl::OpenDisease( SParserKEGG& sParser ) {
+
+	if( !sParser.IsStart( c_szDisease ) )
+		return true;
+
+	do
+		if( !sParser.GetLine( ) )
+			return false;
+	while( isspace( sParser.m_szLine[ 0 ] ) );
+
+	return true; }
+
+bool COntologyKEGGImpl::OpenModule( SParserKEGG& sParser ) {
+
+	if( !sParser.IsStart( c_szModule ) )
+		return true;
+
+	do
+		if( !sParser.GetLine( ) )
+			return false;
+	while( isspace( sParser.m_szLine[ 0 ] ) );
+
+	return true; }
+
 bool COntologyKEGGImpl::OpenDefinition( SParserKEGG& sParser ) {
 
 	if( !sParser.IsStart( c_szDefinition ) )
 			for( j = ( i + 1 ); j < GetGenes( ); ++j )
 				Set( i, j, ( Get( i, j ) - dMin ) / dMax ); }
 
+void CDatImpl::NormalizePCC( ) {
+	size_t			i, j;
+	vector<float>	vecdAves, vecdStds;
+	vector<size_t>	veciCounts;
+	float			d, dOne, dTwo;
+
+	vecdAves.resize( GetGenes( ) );
+	vecdStds.resize( GetGenes( ) );
+	veciCounts.resize( GetGenes( ) );
+	for( i = 0; i < GetGenes( ); ++i )
+		for( j = ( i + 1 ); j < GetGenes( ); ++j )
+			if( !CMeta::IsNaN( d = Get( i, j ) ) ) {
+				veciCounts[i]++;
+				veciCounts[j]++;
+				vecdAves[i] += d;
+				vecdAves[j] += d;
+				d *= d;
+				vecdStds[i] += d;
+				vecdStds[j] += d; }
+	for( i = 0; i < GetGenes( ); ++i ) {
+		if( veciCounts[i] ) {
+			vecdAves[i] /= veciCounts[i];
+			vecdStds[i] = sqrt( ( vecdStds[i] / ( max( veciCounts[i], 2 ) - 1 ) ) -
+				( vecdAves[i] * vecdAves[i] ) ); }
+		if( !vecdStds[i] )
+			vecdStds[i] = 1; }
+	for( i = 0; i < GetGenes( ); ++i ) {
+		for( j = ( i + 1 ); j < GetGenes( ); ++j )
+			if( !CMeta::IsNaN( d = Get( i, j ) ) ) {
+				dOne = ( d - vecdAves[i] ) / vecdStds[i];
+				dTwo = ( d - vecdAves[j] ) / vecdStds[j];
+				Set( i, j, sqrt( ( dOne * dOne ) + ( dTwo * dTwo ) ) ); } } }
+
 /*!
  * \brief
  * Replace each finite value in the CDat with one minus that value.
 		 * Sigmoid transform scores to the range [0, 1].
 		 */
 		ENormalizeSigmoid	= ENormalizeZScore + 1,
-		ENormalizeNormCDF	= ENormalizeSigmoid + 1
+		ENormalizeNormCDF	= ENormalizeSigmoid + 1,
+		ENormalizePCC		= ENormalizeNormCDF + 1
 	};
 
 	bool Open( const char* szFile, bool fMemmap = false, size_t iSkip = 2, bool fZScore = false,
 				NormalizeNormCDF( );
 				break;
 
+			case ENormalizePCC:
+				NormalizePCC( );
+				break;
+
 			default:
 				NormalizeSigmoid( ); } }
 
 	void SlimCache( const CSlim&, std::vector<std::vector<size_t> >& ) const;
 	void AveStd( double&, double&, size_t&, size_t = -1 ) const;
 	void NormalizeMinmax( );
+	void NormalizePCC( );
 	void NormalizeStdev( );
 	void NormalizeSigmoid( );
 	void NormalizeNormCDF( );

File src/halfmatrix.h

 				memcpy( aaData[i], m_aaData[i], ( min( iSize, m_iSize ) - i - 1 ) * sizeof(*aaData[i]) );
 			if( fClear && ( iSize > m_iSize ) )
 				std::fill( aaData[i] + ( ( i < m_iSize ) ? ( m_iSize - i - 1 ) : 0 ), aaData[i] + iCur, CMeta::GetNaN( ) ); }
+		Reset( );
 		m_iSize = iSize;
 		m_aaData = aaData;
 

File src/statistics.h

 
 		return ( 1 - Normal01CDF( fabs( dZScore ) * sqrt( (double)iN ) ) ); }
 
+	template<class tType, class tIter>
+	static double AndersonDarlingScore( tIter Begin, tIter End ) {
+		tIter				Cur;
+		double				d, dA2, dAve, dStd;
+		size_t				i, iN;
+		std::vector<tType>	vecValues;
+
+		dAve = dStd = 0;
+		for( iN = 0,Cur = Begin; Cur != End; ++iN,++Cur ) {
+			dAve += *Cur;
+			dStd += *Cur * *Cur; }
+		if( iN < 2 )
+			return CMeta::GetNaN( );
+		dAve /= iN;
+		dStd = sqrt( ( dStd / ( iN - 1 ) ) - ( dAve * dAve ) );
+		if( dStd <= 0 )
+			dStd = 1;
+
+		vecValues.resize( iN );
+		std::copy( Begin, End, vecValues.begin( ) );
+		std::sort( vecValues.begin( ), vecValues.end( ) );
+
+		dA2 = 0;
+		for( i = 0; i < vecValues.size( ); ++i ) {
+			d = Normal01CDF( ( vecValues[i] - dAve ) / dStd );
+			if( d <= std::numeric_limits<double>::epsilon( ) )
+				d = std::numeric_limits<double>::epsilon( );
+			else if( ( 1 - d ) <= std::numeric_limits<double>::epsilon( ) )
+				d = 1 - std::numeric_limits<double>::epsilon( );
+			dA2 += ( ( ( 2 * ( i + 1 ) ) - 1 ) * log( d ) ) + ( ( ( 2 * ( iN - i ) ) - 1 ) * log( 1 - d ) ); }
+		dA2 = ( -dA2 / iN ) - iN;
+		dA2 *= 1 + ( 0.75 / iN ) + ( 2.25 / ( iN * iN ) );
+
+		return dA2; }
+
+	static double AndersonDarlingTest( double dA2 ) {
+		double	dRet;
+
+		if( dA2 < 0.2 )
+			dRet = 1 - exp( -13.436 + ( 101.14 * dA2 ) - ( 223.73 * dA2 * dA2 ) );
+		else if( dA2 < 0.34 )
+			dRet = 1 - exp( -8.318 + ( 42.796 * dA2 ) - ( 59.938 * dA2 * dA2 ) );
+		else if( dA2 < 0.6 )
+			dRet = exp( 0.9177 - ( 4.279 * dA2 ) - ( 1.38 * dA2 * dA2 ) );
+		else if( dA2 < 13 )
+			dRet = exp( 1.2937 - ( 5.709 * dA2 ) + ( 0.0186 * dA2 * dA2 ) );
+		else
+			dRet = 0;
+
+		return dRet; }
+
 	/*!
 	 * \brief
 	 * Returns the root-mean-square error distance between two input arrays.

File src/stdafx.cpp

  * 
  * - <a href="sleipnir-current.tar.gz">sleipnir-current.tar.gz</a>, the current source code.
  * - <a href="sleipnir-doc-current.tar.gz">sleipnir-doc-current.tar.gz</a>, the current documentation.
- * - You can also access the Sleipnir <a href="http://subversion.tigris.org/">Subversion</a> repository at:
- *	<tt>svn://gen-svn-anon.princeton.edu/sleipnir/</tt>.
+ * - You can also access the Sleipnir <a href="http://mercurial.selenic.com">Mercurial</a> repository at:
+ *	<tt>http://huttenhower.sph.harvard.edu/hg/sleipnir/</tt>.
  * 
  * Sleipnir and its associated tools are provided as source code that can be compiled under Linux (using
  * gcc), Windows (using Visual Studio or cygwin), or MacOS (using gcc).  For more information, see
  * 
  * If you use Sleipnir, please cite our publication:
  * 
- * <b>Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
- *	<a href="mailto:ogt@princeton.edu">Olga G. Troyanskaya</a>
+ * <b><a href="mailto:chuttenh@hsph.harvard.edu">Curtis Huttenhower</a>, Mark Schroeder, Maria D. Chikina, and
+ *	Olga G. Troyanskaya
  *	"The Sleipnir library for computational functional genomics", Bioinformatics 2008</b>
- * <a href="http://www.ncbi.nlm.nih.gov/sites/entrez?Db=pubmed&Cmd=ShowDetailView&TermToSearch=18499696">PMID 18499696</a>
+ * <a href="http://www.ncbi.nlm.nih.gov/pubmed/18499696">PMID 18499696</a>
  * 
  * \section sec_building Building Sleipnir
  * 
  * We avoid distributing binaries directly due to licensing issues, but if you have problems building
  * Sleipnir or need a binary distribution for some other reason, please
- * <a href="mail:ogt@princeton.edu">contact us</a>!  We're happy to help, and if you have suggestions or
 * <a href="mailto:chuttenh@hsph.harvard.edu">contact us</a>!  We're happy to help, and if you have suggestions or
  * contributions, we'll post them here with appropriate credit.
  * 
  * \subsection ssec_building_prerequisites Prerequisites
  * - If the Sleipnir tools instantaneously crash with a segmentation fault, try compiling Sleipnir \em and
  *	its external dependencies with \c CXXFLAGS=-fno-threadsafe-statics.  This works around a bug in certain
  *	versions of g++ and pthreads.
- * - If all else fails, <a href="mailto:ogt@princeton.edu">contact us</a>!  We're happy to receive
+ * - If all else fails, <a href="mailto:chuttenh@hsph.harvard.edu">contact us</a>!  We're happy to receive
  *	feedback about Sleipnir, good or bad, and we're more than willing to update the code or documentation
  *	based on user contributions.  If you'd like to see your name in lights on this web page, feel free to
  *	send patches, questions, or suggestions, and we'll post them whenever possible.
  * include (with full credit, of course) any patches submitted by the community.  If you're interested in
  * developing new Sleipnir tools or library components, the following steps may be useful:
  * 
- * - First, <a href="mailto:ogt@princeton.edu">let us know</a>!  We'd love to hear how people are using
+ * - First, <a href="mailto:chuttenh@hsph.harvard.edu">let us know</a>!  We'd love to hear how people are using
  *	Sleipnir and what you plan to do with it.  We're happy to answer questions and offer development tips
  *	whenever possible.
  * 
- * - Check out our <a href="http://subversion.tigris.org/">Subversion</a> repository at
- *	<tt>svn://gen-svn-anon.princeton.edu/sleipnir/</tt>.  The \c trunk branch always contains the latest development
+ * - Check out our <a href="http://mercurial.selenic.com/">Mercurial</a> repository at
+ *	<tt>http://huttenhower.sph.harvard.edu/hg/sleipnir/</tt>.  The \c trunk branch always contains the latest development
  *	version of Sleipnir, and official versioned releases appear under \c tags.  If you'd like to submit
  *	patches to us for inclusion in Sleipnir, please try to do so against the current development version
  *	(\c trunk).
  *	includes your modifications and additions.  Alternatively, if you've built an independent tool that
  *	relies on Sleipnir, we can include a link to it on the Sleipnir web site.
  * 
- * - And finally, <a href="mailto:ogt@princeton.edu">let us know</a> again!  We'll do our best to include any
+ * - And finally, <a href="mailto:chuttenh@hsph.harvard.edu">let us know</a> again!  We'll do our best to include any
  *	patch or link you send us, always with full credit to the creators.
  * 
  * \section sec_history Version History
  * 
  * - <a href="sleipnir-2.2.tar.gz">2.2</a>, *** <br>
- * Fix confusing documentation in \ref Answerer - thanks to Arjun Krishnan!
- * Fix missing \c SIZE_MAX definition on Mac OS X - thanks to Alice Koechlin!
+ * Fix confusing documentation in \ref Answerer - thanks to Arjun Krishnan! <br>
+ * Fix missing \c SIZE_MAX definition on Mac OS X - thanks to Alice Koechlin! <br>
+ * Add Partial Correlation Coefficient normalization to \ref CDat and \ref Normalizer - thanks to Arjun Krishnan!
  * 
  * - <a href="sleipnir-2.1.tar.gz">2.1</a>, 12-20-09 <br>
  * Update includes for gcc 4.3 compatibility - thanks to Casey Greene! <br>

File tools/Combiner/Combiner.cpp

 	EMethodMin		= EMethodMax + 1,
 	EMethodDiff		= EMethodMin + 1,
 	EMethodMeta		= EMethodDiff + 1,
-	EMethodEnd		= EMethodMeta + 1
+	EMethodQMeta	= EMethodMeta + 1,
+	EMethodEnd		= EMethodQMeta + 1
 };
 
 static const char*	c_aszMethods[]	= {
-	"mean", "sum", "gmean", "hmean", "max", "min", "diff", "meta", NULL
+	"mean", "sum", "gmean", "hmean", "max", "min", "diff", "meta", "qmeta", NULL
 };
 
 int main( int iArgs, char** aszArgs ) {
 //*/
 	return ( 1 / ( ( 1 / sCallbackMeta.m_DatWei.Get( sCallback.m_iOne, sCallback.m_iTwo ) ) + dDelta2 ) ); }
 
+void callback_andersondarling( SCallbackVars& sCallback ) {
+	static const float	c_dFrac		= 0.001f;
+	vector<float>		vecdValues;
+	size_t				i, j, iGenes;
+	float				d;
+
+	iGenes = sCallback.m_pDatCur->GetGenes( );
+	vecdValues.reserve( (size_t)( iGenes * iGenes * c_dFrac ) );
+	for( i = 0; i < iGenes; ++i )
+		for( j = ( i + 1 ); j < iGenes; ++j )
+			if( !CMeta::IsNaN( d = sCallback.m_pDatCur->Get( i, j ) ) &&
+				( ( (float)rand( ) / RAND_MAX ) < c_dFrac ) )
+				vecdValues.push_back( d );
+	sCallback.m_dWeight = (float)( log( CStatistics::AndersonDarlingScore<float>(
+		vecdValues.begin( ), vecdValues.end( ) ) ) / log( 2.0 ) ); }
+
 void callback_combine( SCallbackVars& sCallback ) {
 
 	if( sCallback.m_eMethod == EMethodMeta )
 			sCallback.m_pMatCounts->Get( sCallback.m_iOne, sCallback.m_iTwo ) += sCallback.m_dWeight; } }
 
 void callback_wei( SCallbackVars& sCallback ) {
+	static const float				c_dFrac			= 0.001f;
 	SCallbackMeta&					sCallbackMeta	= sCallback.m_sCallbackMeta;
 	const vector<vector<size_t> >&	vecveciGenes	= *sCallback.m_pvecveciGenes;
 	const vector<set<size_t> >&		vecsetiGenes	= *sCallback.m_pvecsetiGenes;
 	size_t							i, j, iOne, iTwo, iA, iB, iGenes;
 	float							d, dSum, dSumSqs;
-	vector<float>					vecdSums, vecdSumSqs;
+	vector<float>					vecdSums, vecdSumSqs, vecdValues;
 
 	iGenes = sCallback.m_pDatCur->GetGenes( );
+	vecdValues.reserve( (size_t)( iGenes * ( iGenes - 1 ) * c_dFrac ) );
 	vecdSums.resize( sCallback.m_pDatOut->GetGenes( ) );
 	fill( vecdSums.begin( ), vecdSums.end( ), 0.0f );
 	vecdSumSqs.resize( sCallback.m_pDatOut->GetGenes( ) );
 					iTwo = vecveciGenes[j][iB];
 					if( vecsetiGenes[iTwo].find( i ) != vecsetiGenes[iTwo].end( ) )
 						continue;
+					if( !( iA || iB ) && ( ( (float)rand( ) / RAND_MAX ) < c_dFrac ) )
+						vecdValues.push_back( d );
 					dSum += d;
 					vecdSums[iOne] += d;
 					vecdSums[iTwo] += d;
 		d = ( vecdSums[i] /= iGenes - 1 );
 		vecdSumSqs[i] = ( vecdSumSqs[i] / ( iGenes - 2 ) ) - ( d * d ); }
 
+	d = (float)( log( CStatistics::AndersonDarlingScore<float>( vecdValues.begin( ), vecdValues.end( ) ) ) /
+		log( 2.0 ) );
 	sCallbackMeta.m_DatWei.Open( sCallback.m_pDatOut->GetGeneNames( ) );
 	sCallbackMeta.m_DatWei.Clear( 0 );
 	for( i = 0; i < vecdSumSqs.size( ); ++i )
 		for( j = ( i + 1 ); j < vecdSumSqs.size( ); ++j )
 			sCallbackMeta.m_DatWei.Set( i, j,
 //				2 / ( vecdSumSqs[i] + vecdSumSqs[j] )			// pooled variance of adjacent genes
-				3 / ( vecdSumSqs[i] + vecdSumSqs[j] + dSumSqs )	// unweighted pool of adjanced genes + whole network
+//				3 / ( vecdSumSqs[i] + vecdSumSqs[j] + dSumSqs )	// unweighted pool of adjacent genes + whole network
+				3 * d / ( vecdSumSqs[i] + vecdSumSqs[j] + dSumSqs )
 			); }
 
 void callback_deltaed( SCallbackVars& sCallback ) {
 	vector<float>			vecdWis;
 	int						iRet;
 	SCallback				sCallback( sArgs, GenesIn, PCLWeights, vecstrTerms, vecpTerms );
+	void (*pfnInitialize)( SCallbackVars& );
 
 	if( !sArgs.inputs_num )
 		return 1;
 	if( eMethod == EMethodMeta )
 		initialize_meta( sCallback );
 
-	if( iRet = iterate_inputs( sCallback, callback_combine, ( eMethod == EMethodMeta ) ? callback_wei : NULL ) )
+	switch( eMethod ) {
+		case EMethodMeta:
+			pfnInitialize = callback_wei;
+			break;
+
+		case EMethodQMeta:
+			pfnInitialize = callback_andersondarling;
+			break;
+
+		default:
+			pfnInitialize = NULL; }
+	if( iRet = iterate_inputs( sCallback, callback_combine, pfnInitialize ) )
 		return iRet;
 	for( i = 0; i < DatOut.GetGenes( ); ++i )
 		for( j = ( i + 1 ); j < DatOut.GetGenes( ); ++j )
 			switch( eMethod ) {
 				case EMethodMean:
 				case EMethodMeta:
+				case EMethodQMeta:
 					DatOut.Set( i, j, ( d = MatCounts.Get( i, j ) ) ? ( DatOut.Get( i, j ) / ( sArgs.reweight_flag ? 1 : d ) ) :
 						CMeta::GetNaN( ) );
 					break;

File tools/Combiner/Combiner.ggo

 option	"type"		t	"Output data file type"
 						values="pcl","dat","dab","module","revdat"	default="pcl"
 option	"method"	m	"Combination method"
-						values="min","max","mean","gmean","hmean","sum","diff","meta"	default="mean"
+						values="min","max","mean","gmean","hmean","sum","diff","meta","qmeta"	default="mean"
 option	"output"	o	"Output file"
 						string	typestr="filename"
 option	"weights"	w	"Weights file"

File tools/Combiner/cmdline.c

   "  -V, --version              Print version and exit",
   "\nMain:",
   "  -t, --type=STRING          Output data file type  (possible values=\"pcl\", \n                               \"dat\", \"dab\", \"module\", \"revdat\" \n                               default=`pcl')",
-  "  -m, --method=STRING        Combination method  (possible values=\"min\", \n                               \"max\", \"mean\", \"gmean\", \"hmean\", \n                               \"sum\", \"diff\", \"meta\" default=`mean')",
+  "  -m, --method=STRING        Combination method  (possible values=\"min\", \n                               \"max\", \"mean\", \"gmean\", \"hmean\", \n                               \"sum\", \"diff\", \"meta\", \"qmeta\" \n                               default=`mean')",
   "  -o, --output=filename      Output file",
   "  -w, --weights=filename     Weights file",
   "\nModules:",
 
 
 char *cmdline_parser_type_values[] = {"pcl", "dat", "dab", "module", "revdat", 0} ;	/* Possible values for type.  */
-char *cmdline_parser_method_values[] = {"min", "max", "mean", "gmean", "hmean", "sum", "diff", "meta", 0} ;	/* Possible values for method.  */
+char *cmdline_parser_method_values[] = {"min", "max", "mean", "gmean", "hmean", "sum", "diff", "meta", "qmeta", 0} ;	/* Possible values for method.  */
 
 static char *
 gengetopt_strdup (const char *s);

File tools/Combiner/stdafx.h

 #include "genome.h"
 #include "meta.h"
 #include "pcl.h"
+#include "statistics.h"
 using namespace Sleipnir;
 
 #include "cmdline.h"

File tools/Funcifier/Funcifier.cpp

 							dTwo = vecvecdGenes.empty( ) ? 1 : vecvecdGenes[iF2][j];
 							dCountIn += dOne * dTwo;
 							dAveIn += d * dOne * dTwo; } } }
-				DatOut.Set( iF1, iF2, dCountIn ? (float)( dAveIn / dCountIn ) : CMeta::GetNaN( ) ); }
+				DatOut.Set( iF1, iF2, ( dCountIn >= sArgs.minimum_arg ) ? (float)( dAveIn / dCountIn ) : CMeta::GetNaN( ) ); }
 		if( sArgs.zscore_flag )
 			DatOut.Normalize( CDat::ENormalizeZScore );
 		DatOut.Save( sArgs.output_arg );

File tools/Funcifier/Funcifier.ggo

 							string	typestr="filename"
 option	"weights"		w	"PCL file of set-by-gene weights"
 							string	typestr="filename"
+option	"minimum"		u	"Minimum edge count/weight to use"
+							double	default="0"
 
 section "Optional"
 option	"normalize"		n	"Normalize input to the range [0,1]"

File tools/Funcifier/cmdline.c

   "  -s, --shared=STRING     Determine shared gene handling  (possible \n                            values=\"ignore\", \"discard\", \"oneonly\" \n                            default=`discard')",
   "  -l, --colors=filename   Function cohesiveness output file",
   "  -w, --weights=filename  PCL file of set-by-gene weights",
+  "  -u, --minimum=DOUBLE    Minimum edge count/weight to use  (default=`0')",
   "\nOptional:",
   "  -n, --normalize         Normalize input to the range [0,1]  (default=off)",
   "  -z, --zscore            Normalize output by z-scoring  (default=off)",
   , ARG_FLAG
   , ARG_STRING
   , ARG_INT
+  , ARG_DOUBLE
 } cmdline_parser_arg_type;
 
 static
   args_info->shared_given = 0 ;
   args_info->colors_given = 0 ;
   args_info->weights_given = 0 ;
+  args_info->minimum_given = 0 ;
   args_info->normalize_given = 0 ;
   args_info->zscore_given = 0 ;
   args_info->memmap_given = 0 ;
   args_info->colors_orig = NULL;
   args_info->weights_arg = NULL;
   args_info->weights_orig = NULL;
+  args_info->minimum_arg = 0;
+  args_info->minimum_orig = NULL;
   args_info->normalize_flag = 0;
   args_info->zscore_flag = 0;
   args_info->memmap_flag = 0;
   args_info->shared_help = gengetopt_args_info_help[6] ;
   args_info->colors_help = gengetopt_args_info_help[7] ;
   args_info->weights_help = gengetopt_args_info_help[8] ;
-  args_info->normalize_help = gengetopt_args_info_help[10] ;
-  args_info->zscore_help = gengetopt_args_info_help[11] ;
-  args_info->memmap_help = gengetopt_args_info_help[12] ;
-  args_info->verbosity_help = gengetopt_args_info_help[13] ;
+  args_info->minimum_help = gengetopt_args_info_help[9] ;
+  args_info->normalize_help = gengetopt_args_info_help[11] ;
+  args_info->zscore_help = gengetopt_args_info_help[12] ;
+  args_info->memmap_help = gengetopt_args_info_help[13] ;
+  args_info->verbosity_help = gengetopt_args_info_help[14] ;
   
 }
 
   free_string_field (&(args_info->colors_orig));
   free_string_field (&(args_info->weights_arg));
   free_string_field (&(args_info->weights_orig));
+  free_string_field (&(args_info->minimum_orig));
   free_string_field (&(args_info->verbosity_orig));
   
   
     write_into_file(outfile, "colors", args_info->colors_orig, 0);
   if (args_info->weights_given)
     write_into_file(outfile, "weights", args_info->weights_orig, 0);
+  if (args_info->minimum_given)
+    write_into_file(outfile, "minimum", args_info->minimum_orig, 0);
   if (args_info->normalize_given)
     write_into_file(outfile, "normalize", 0, 0 );
   if (args_info->zscore_given)
   case ARG_INT:
     if (val) *((int *)field) = strtol (val, &stop_char, 0);
     break;
+  case ARG_DOUBLE:
+    if (val) *((double *)field) = strtod (val, &stop_char);
+    break;
   case ARG_STRING:
     if (val) {
       string_field = (char **)field;
   /* check numeric conversion */
   switch(arg_type) {
   case ARG_INT:
+  case ARG_DOUBLE:
     if (val && !(stop_char && *stop_char == '\0')) {
       fprintf(stderr, "%s: invalid numeric value: %s\n", package_name, val);
       return 1; /* failure */
         { "shared",	1, NULL, 's' },
         { "colors",	1, NULL, 'l' },
         { "weights",	1, NULL, 'w' },
+        { "minimum",	1, NULL, 'u' },
         { "normalize",	0, NULL, 'n' },
         { "zscore",	0, NULL, 'z' },
         { "memmap",	0, NULL, 'm' },
         { NULL,	0, NULL, 0 }
       };
 
-      c = getopt_long (argc, argv, "hVi:o:s:l:w:nzmv:", long_options, &option_index);
+      c = getopt_long (argc, argv, "hVi:o:s:l:w:u:nzmv:", long_options, &option_index);
 
       if (c == -1) break;	/* Exit from `while (1)' loop.  */
 
             goto failure;
         
           break;
+        case 'u':	/* Minimum edge count/weight to use.  */
+        
+        
+          if (update_arg( (void *)&(args_info->minimum_arg), 
+               &(args_info->minimum_orig), &(args_info->minimum_given),
+              &(local_args_info.minimum_given), optarg, 0, "0", ARG_DOUBLE,
+              check_ambiguity, override, 0, 0,
+              "minimum", 'u',
+              additional_error))
+            goto failure;
+        
+          break;
         case 'n':	/* Normalize input to the range [0,1].  */
         
         

File tools/Funcifier/cmdline.h

   char * weights_arg;	/**< @brief PCL file of set-by-gene weights.  */
   char * weights_orig;	/**< @brief PCL file of set-by-gene weights original value given at command line.  */
   const char *weights_help; /**< @brief PCL file of set-by-gene weights help description.  */
+  double minimum_arg;	/**< @brief Minimum edge count/weight to use (default='0').  */
+  char * minimum_orig;	/**< @brief Minimum edge count/weight to use original value given at command line.  */
+  const char *minimum_help; /**< @brief Minimum edge count/weight to use help description.  */
   int normalize_flag;	/**< @brief Normalize input to the range [0,1] (default=off).  */
   const char *normalize_help; /**< @brief Normalize input to the range [0,1] help description.  */
   int zscore_flag;	/**< @brief Normalize output by z-scoring (default=off).  */
   unsigned int shared_given ;	/**< @brief Whether shared was given.  */
   unsigned int colors_given ;	/**< @brief Whether colors was given.  */
   unsigned int weights_given ;	/**< @brief Whether weights was given.  */
+  unsigned int minimum_given ;	/**< @brief Whether minimum was given.  */
   unsigned int normalize_given ;	/**< @brief Whether normalize was given.  */
   unsigned int zscore_given ;	/**< @brief Whether zscore was given.  */
   unsigned int memmap_given ;	/**< @brief Whether memmap was given.  */

File tools/Normalizer/Normalizer.cpp

 	{"0to1",	CDat::ENormalizeMinMax},
 	{"sigmoid",	CDat::ENormalizeSigmoid},
 	{"normcdf",	CDat::ENormalizeNormCDF},
+	{"pcc",		CDat::ENormalizePCC},
 	{NULL,		CDat::ENormalizeNone}
 };
 

File tools/Normalizer/Normalizer.ggo

 option	"itype"		t	"Data file type"
 						values="pcl","dat"	default="dat"
 option	"otype"		T	"Normalization type"
-						values="columnz","rowz","globalz","column0","0to1","colcenter","medmult","colfrac","sigmoid","normcdf"	default="globalz"
+						values="columnz","rowz","globalz","column0","0to1","colcenter","medmult","colfrac","sigmoid","normcdf","pcc"	default="globalz"
 
 section "Optional"
 option	"flip"		f	"Flip high/low scores"

File tools/Normalizer/cmdline.c

   "  -i, --input=filename   Input/output PCL/DAT/DAB file",
   "  -o, --output=filename  Output PCL/DAB file",
   "  -t, --itype=STRING     Data file type  (possible values=\"pcl\", \"dat\" \n                           default=`dat')",
-  "  -T, --otype=STRING     Normalization type  (possible values=\"columnz\", \n                           \"rowz\", \"globalz\", \"column0\", \"0to1\", \n                           \"colcenter\", \"medmult\", \"colfrac\", \n                           \"sigmoid\", \"normcdf\" default=`globalz')",
+  "  -T, --otype=STRING     Normalization type  (possible values=\"columnz\", \n                           \"rowz\", \"globalz\", \"column0\", \"0to1\", \n                           \"colcenter\", \"medmult\", \"colfrac\", \n                           \"sigmoid\", \"normcdf\", \"pcc\" default=`globalz')",
   "\nOptional:",
   "  -f, --flip             Flip high/low scores  (default=off)",
   "  -s, --skip=INT         Columns to skip in input PCL  (default=`2')",
 
 
 char *cmdline_parser_itype_values[] = {"pcl", "dat", 0} ;	/* Possible values for itype.  */
-char *cmdline_parser_otype_values[] = {"columnz", "rowz", "globalz", "column0", "0to1", "colcenter", "medmult", "colfrac", "sigmoid", "normcdf", 0} ;	/* Possible values for otype.  */
+char *cmdline_parser_otype_values[] = {"columnz", "rowz", "globalz", "column0", "0to1", "colcenter", "medmult", "colfrac", "sigmoid", "normcdf", "pcc", 0} ;	/* Possible values for otype.  */
 
 static char *
 gengetopt_strdup (const char *s);