Commits

chut...@hutlab3.sph.harvard.edu  committed d8f4484

Add per-column frequency weighting to Distancer and CPCL::Distance
Weights each column in the similarity measure by (roughly) the fraction of rows without a positive value in that column
Fix Combiner absolute reweighting
Add edge-specific filtering to Dat2Dab
Add output z-score normalization to Funcifier and Combiner
Fix Hubber calculations to be more accurate/efficient/comprehensive

  • Participants
  • Parent commits 52db7f6

Comments (0)

Files changed (19)

  */
 int CPCL::Distance( const char* szFile, size_t iSkip, const char* szSimilarityMeasure, bool fNormalize,
 	bool fZScore, bool fAutocorrelate, const char* szGeneFile, float dCutoff, size_t iLimit, CPCL& PCL,
-	CDat& Dat, IMeasure::EMap eMap ) {
+	CDat& Dat, IMeasure::EMap eMap, bool fFrequencyWeight ) {
 	size_t						i, j, iOne, iTwo;
 	float						d;
 	ifstream					ifsm;
 	CGenes						GenesIn( Genome );
 	vector<size_t>				veciGenes;
 	const float*				adOne;
+	const float*				adWeights;
+	vector<float>				vecdWeights;
 	IMeasure*					pMeasure;
 	CMeasurePearson				Pearson;
 	CMeasureEuclidean			Euclidean;
 	if( !pMeasure )
 		return 1;
 
+	if( fFrequencyWeight ) {
+		vecdWeights.resize( PCL.GetExperiments( ) );
+		for( i = 0; i < vecdWeights.size( ); ++i ) {
+			for( iOne = j = 0; j < PCL.GetGenes( ); ++j )
+				if( !CMeta::IsNaN( d = PCL.Get( j, i ) ) && ( d > 0 ) )
+					iOne++;
+			vecdWeights[ i ] = (float)( ( PCL.GetGenes( ) + 1 ) - iOne ) / PCL.GetGenes( ); } }
+	adWeights = vecdWeights.empty( ) ? NULL : &vecdWeights[ 0 ];
+
 	CMeasureAutocorrelate		Autocorrelate( pMeasure, false );
 	if( fAutocorrelate )
 		pMeasure = &Autocorrelate;
 			for( j = ( i + 1 ); j < GenesIn.GetGenes( ); ++j )
 				if( ( iTwo = veciGenes[ j ] ) != -1 )
 					Dat.Set( i, j, (float)pMeasure->Measure(
-						adOne, PCL.GetExperiments( ), PCL.Get( iTwo ), PCL.GetExperiments( ), eMap ) ); }
+						adOne, PCL.GetExperiments( ), PCL.Get( iTwo ), PCL.GetExperiments( ), eMap, adWeights, adWeights ) ); }
 
 		if( fNormalize || fZScore )
 			Dat.Normalize( fZScore ? CDat::ENormalizeZScore : CDat::ENormalizeMinMax );
 
 	static int Distance( const char* szFile, size_t iSkip, const char* szSimilarityMeasure, bool fNormalize,
 		bool fZScore, bool fAutocorrelate, const char* szGeneFile, float dCutoff, size_t iLimit, CPCL& PCL,
-		CDat& Dat, IMeasure::EMap eMap = IMeasure::EMapCenter );
+		CDat& Dat, IMeasure::EMap eMap = IMeasure::EMapCenter, bool fFrequencyWeight = false );
 
 	/*!
 	 * \brief

File tools/Combiner/Combiner.cpp

 	CHalfMatrix<float>		MatCounts;
 	size_t					i, j, k, iOne, iTwo, iA, iB;
 	vector<vector<size_t> >	vecveciGenes;
-	float					d, dWeight1, dWeight2;
+	float					d, dWeight;
 	vector<string>			vecstrFiles, vecstrTerms;
 	CPCL					PCLWeights( false );
 	CGenome					Genome;
 			if( ( j = PCLWeights.GetGene( CMeta::Deextension( CMeta::Basename( sArgs.inputs[ i ] ) ) ) ) == -1 ) {
 				cerr << "Ignoring unweighted graph: " << sArgs.inputs[ i ] << endl;
 				continue; }
-			dWeight1 = PCLWeights.Get( j, 0 );
-			dWeight2 = sArgs.reweight_flag ? 1 : dWeight1; }
+			dWeight = PCLWeights.Get( j, 0 ); }
 		else
-			dWeight1 = dWeight2 = 1;
+			dWeight = 1;
 		cerr << "Opened: " << sArgs.inputs[ i ] << endl;
 		if( sArgs.normalize_flag )
 			DatCur.Normalize( CDat::ENormalizeZScore );
 							continue;
 						switch( eMethod ) {
 							case EMethodGMean:
-								DatOut.Get( iOne, iTwo ) *= pow( d, dWeight1 );
-								MatCounts.Get( iOne, iTwo ) += dWeight2;
+								DatOut.Get( iOne, iTwo ) *= pow( d, dWeight );
+								MatCounts.Get( iOne, iTwo ) += dWeight;
 								break;
 
 							case EMethodHMean:
-								DatOut.Get( iOne, iTwo ) += dWeight1 / d;
-								MatCounts.Get( iOne, iTwo ) += dWeight2;
+								DatOut.Get( iOne, iTwo ) += dWeight / d;
+								MatCounts.Get( iOne, iTwo ) += dWeight;
 								break;
 
 							case EMethodMax:
 								break;
 
 							default:
-								DatOut.Get( iOne, iTwo ) += dWeight1 * d;
-								MatCounts.Get( iOne, iTwo ) += dWeight2; } } } } }
+								DatOut.Get( iOne, iTwo ) += dWeight * d;
+								MatCounts.Get( iOne, iTwo ) += dWeight; } } } } }
 	for( i = 0; i < DatOut.GetGenes( ); ++i )
 		for( j = ( i + 1 ); j < DatOut.GetGenes( ); ++j )
 			switch( eMethod ) {
 				case EMethodMean:
-					DatOut.Set( i, j, ( d = MatCounts.Get( i, j ) ) ? ( DatOut.Get( i, j ) / d ) :
+					DatOut.Set( i, j, ( d = MatCounts.Get( i, j ) ) ? ( DatOut.Get( i, j ) / ( sArgs.reweight_flag ? 1 : d ) ) :
 						CMeta::GetNaN( ) );
 					break;
 
 				case EMethodGMean:
 					DatOut.Set( i, j, ( d = MatCounts.Get( i, j ) ) ?
-						(float)pow( (double)DatOut.Get( i, j ), 1.0 / d ) : CMeta::GetNaN( ) );
+						(float)pow( (double)DatOut.Get( i, j ), 1.0 / ( sArgs.reweight_flag ? 1 : d ) ) : CMeta::GetNaN( ) );
 					break;
 
 				case EMethodHMean:
-					DatOut.Set( i, j, ( d = MatCounts.Get( i, j ) ) ? ( d / DatOut.Get( i, j ) ) :
+					DatOut.Set( i, j, ( d = MatCounts.Get( i, j ) ) ? ( ( sArgs.reweight_flag ? 1 : d ) / DatOut.Get( i, j ) ) :
 						CMeta::GetNaN( ) );
 					break;
 
 					if( DatOut.Get( i, j ) == FLT_MAX )
 						DatOut.Set( i, j, CMeta::GetNaN( ) ); }
 
+	if( sArgs.zscore_flag )
+		DatOut.Normalize( CDat::ENormalizeZScore );
 	if( !sArgs.memmap_flag )
 		DatOut.Save( sArgs.output_arg );
 

File tools/Combiner/Combiner.ggo

 						flag	off
 option	"normalize"	n	"Normalize inputs before combining"
 						flag	off
+option	"zscore"	z	"Z-score output after combining"
+						flag	off
 option	"subset"	s	"Subset size (none if zero)"
 						int	default="0"
 option	"verbosity"	v	"Message verbosity"

File tools/Combiner/cmdline.c

   "  -k, --skip=INT             Columns to skip in input PCLs  (default=`2')",
   "  -p, --memmap               Memory map input files  (default=off)",
   "  -n, --normalize            Normalize inputs before combining  (default=off)",
+  "  -z, --zscore               Z-score output after combining  (default=off)",
   "  -s, --subset=INT           Subset size (none if zero)  (default=`0')",
   "  -v, --verbosity=INT        Message verbosity  (default=`5')",
     0
   args_info->skip_given = 0 ;
   args_info->memmap_given = 0 ;
   args_info->normalize_given = 0 ;
+  args_info->zscore_given = 0 ;
   args_info->subset_given = 0 ;
   args_info->verbosity_given = 0 ;
 }
   args_info->skip_orig = NULL;
   args_info->memmap_flag = 0;
   args_info->normalize_flag = 0;
+  args_info->zscore_flag = 0;
   args_info->subset_arg = 0;
   args_info->subset_orig = NULL;
   args_info->verbosity_arg = 5;
   args_info->skip_help = gengetopt_args_info_help[15] ;
   args_info->memmap_help = gengetopt_args_info_help[16] ;
   args_info->normalize_help = gengetopt_args_info_help[17] ;
-  args_info->subset_help = gengetopt_args_info_help[18] ;
-  args_info->verbosity_help = gengetopt_args_info_help[19] ;
+  args_info->zscore_help = gengetopt_args_info_help[18] ;
+  args_info->subset_help = gengetopt_args_info_help[19] ;
+  args_info->verbosity_help = gengetopt_args_info_help[20] ;
   
 }
 
     write_into_file(outfile, "memmap", 0, 0 );
   if (args_info->normalize_given)
     write_into_file(outfile, "normalize", 0, 0 );
+  if (args_info->zscore_given)
+    write_into_file(outfile, "zscore", 0, 0 );
   if (args_info->subset_given)
     write_into_file(outfile, "subset", args_info->subset_orig, 0);
   if (args_info->verbosity_given)
         { "skip",	1, NULL, 'k' },
         { "memmap",	0, NULL, 'p' },
         { "normalize",	0, NULL, 'n' },
+        { "zscore",	0, NULL, 'z' },
         { "subset",	1, NULL, 's' },
         { "verbosity",	1, NULL, 'v' },
         { NULL,	0, NULL, 0 }
       };
 
-      c = getopt_long (argc, argv, "hVt:m:o:w:j:r:g:e:Wk:pns:v:", long_options, &option_index);
+      c = getopt_long (argc, argv, "hVt:m:o:w:j:r:g:e:Wk:pnzs:v:", long_options, &option_index);
 
       if (c == -1) break;	/* Exit from `while (1)' loop.  */
 
             goto failure;
         
           break;
+        case 'z':	/* Z-score output after combining.  */
+        
+        
+          if (update_arg((void *)&(args_info->zscore_flag), 0, &(args_info->zscore_given),
+              &(local_args_info.zscore_given), optarg, 0, 0, ARG_FLAG,
+              check_ambiguity, override, 1, 0, "zscore", 'z',
+              additional_error))
+            goto failure;
+        
+          break;
         case 's':	/* Subset size (none if zero).  */
         
         

File tools/Combiner/cmdline.h

   const char *memmap_help; /**< @brief Memory map input files help description.  */
   int normalize_flag;	/**< @brief Normalize inputs before combining (default=off).  */
   const char *normalize_help; /**< @brief Normalize inputs before combining help description.  */
+  int zscore_flag;	/**< @brief Z-score output after combining (default=off).  */
+  const char *zscore_help; /**< @brief Z-score output after combining help description.  */
   int subset_arg;	/**< @brief Subset size (none if zero) (default='0').  */
   char * subset_orig;	/**< @brief Subset size (none if zero) original value given at command line.  */
   const char *subset_help; /**< @brief Subset size (none if zero) help description.  */
   unsigned int skip_given ;	/**< @brief Whether skip was given.  */
   unsigned int memmap_given ;	/**< @brief Whether memmap was given.  */
   unsigned int normalize_given ;	/**< @brief Whether normalize was given.  */
+  unsigned int zscore_given ;	/**< @brief Whether zscore was given.  */
   unsigned int subset_given ;	/**< @brief Whether subset was given.  */
   unsigned int verbosity_given ;	/**< @brief Whether verbosity was given.  */
 

File tools/Dat2Dab/Dat2Dab.cpp

 					( ( (float)rand( ) / RAND_MAX ) > sArgs.subsample_arg ) )
 					Dat.Set( i, j, CMeta::GetNaN( ) );
 
+	if( sArgs.edges_arg ) {
+		CDat			DatLk1;
+		vector<size_t>	veciGenesOne;
+		size_t			iOne, iTwo;
+
+		if( !DatLk1.Open( sArgs.edges_arg ) ) {
+			cerr << "Could not open: " << sArgs.edges_arg << endl;
+			return 1; }
+		veciGenesOne.resize( Dat.GetGenes( ) );
+		for( i = 0; i < veciGenesOne.size( ); ++i )
+			veciGenesOne[ i ] = DatLk1.GetGene( Dat.GetGene( i ) );
+		for( i = 0; i < Dat.GetGenes( ); ++i ) {
+			if( ( iOne = veciGenesOne[ i ] ) == -1 ) {
+				for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j )
+					Dat.Set( i, j, CMeta::GetNaN( ) );
+				continue; }
+			for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j )
+				if( ( ( iTwo = veciGenesOne[ j ] ) == -1 ) ||
+					CMeta::IsNaN( DatLk1.Get( iOne, iTwo ) ) )
+					Dat.Set( i, j, CMeta::GetNaN( ) ); } }
 	if( sArgs.lookups1_arg ) {
 		CGenes			GenesLk1( Genome );
 		vector<size_t>	veciGenesOne;

File tools/Dat2Dab/Dat2Dab.ggo

 							string	typestr="filename"
 option	"genex"			G	"Exclude all genes from the given set"
 							string	typestr="filename"
+option	"edges"			e	"Process only edges from the given DAT/DAB"
+							string	typestr="filename"
 option	"cutoff"		c	"Exclude edges below cutoff"
 							double
-option	"zero"			e	"Zero missing values"
+option	"zero"			Z	"Zero missing values"
 							flag	off
 option	"duplicates"	d	"Allow dissimilar duplicate values"
 							flag	off

File tools/Dat2Dab/cmdline.c

   "\nFiltering:",
   "  -g, --genes=filename     Process only genes from the given set",
   "  -G, --genex=filename     Exclude all genes from the given set",
+  "  -e, --edges=filename     Process only edges from the given DAT/DAB",
   "  -c, --cutoff=DOUBLE      Exclude edges below cutoff",
-  "  -e, --zero               Zero missing values  (default=off)",
+  "  -Z, --zero               Zero missing values  (default=off)",
   "  -d, --duplicates         Allow dissimilar duplicate values  (default=off)",
   "  -u, --subsample=FLOAT    Fraction of output to randomly subsample  \n                             (default=`1')",
   "\nLookups:",
   args_info->randomize_given = 0 ;
   args_info->genes_given = 0 ;
   args_info->genex_given = 0 ;
+  args_info->edges_given = 0 ;
   args_info->cutoff_given = 0 ;
   args_info->zero_given = 0 ;
   args_info->duplicates_given = 0 ;
   args_info->genes_orig = NULL;
   args_info->genex_arg = NULL;
   args_info->genex_orig = NULL;
+  args_info->edges_arg = NULL;
+  args_info->edges_orig = NULL;
   args_info->cutoff_orig = NULL;
   args_info->zero_flag = 0;
   args_info->duplicates_flag = 0;
   args_info->randomize_help = gengetopt_args_info_help[10] ;
   args_info->genes_help = gengetopt_args_info_help[12] ;
   args_info->genex_help = gengetopt_args_info_help[13] ;
-  args_info->cutoff_help = gengetopt_args_info_help[14] ;
-  args_info->zero_help = gengetopt_args_info_help[15] ;
-  args_info->duplicates_help = gengetopt_args_info_help[16] ;
-  args_info->subsample_help = gengetopt_args_info_help[17] ;
-  args_info->lookup1_help = gengetopt_args_info_help[19] ;
-  args_info->lookup2_help = gengetopt_args_info_help[20] ;
-  args_info->lookups1_help = gengetopt_args_info_help[21] ;
-  args_info->lookups2_help = gengetopt_args_info_help[22] ;
-  args_info->genelist_help = gengetopt_args_info_help[23] ;
-  args_info->paircount_help = gengetopt_args_info_help[24] ;
-  args_info->remap_help = gengetopt_args_info_help[26] ;
-  args_info->table_help = gengetopt_args_info_help[27] ;
-  args_info->skip_help = gengetopt_args_info_help[28] ;
-  args_info->memmap_help = gengetopt_args_info_help[29] ;
-  args_info->verbosity_help = gengetopt_args_info_help[30] ;
+  args_info->edges_help = gengetopt_args_info_help[14] ;
+  args_info->cutoff_help = gengetopt_args_info_help[15] ;
+  args_info->zero_help = gengetopt_args_info_help[16] ;
+  args_info->duplicates_help = gengetopt_args_info_help[17] ;
+  args_info->subsample_help = gengetopt_args_info_help[18] ;
+  args_info->lookup1_help = gengetopt_args_info_help[20] ;
+  args_info->lookup2_help = gengetopt_args_info_help[21] ;
+  args_info->lookups1_help = gengetopt_args_info_help[22] ;
+  args_info->lookups2_help = gengetopt_args_info_help[23] ;
+  args_info->genelist_help = gengetopt_args_info_help[24] ;
+  args_info->paircount_help = gengetopt_args_info_help[25] ;
+  args_info->remap_help = gengetopt_args_info_help[27] ;
+  args_info->table_help = gengetopt_args_info_help[28] ;
+  args_info->skip_help = gengetopt_args_info_help[29] ;
+  args_info->memmap_help = gengetopt_args_info_help[30] ;
+  args_info->verbosity_help = gengetopt_args_info_help[31] ;
   
 }
 
   free_string_field (&(args_info->genes_orig));
   free_string_field (&(args_info->genex_arg));
   free_string_field (&(args_info->genex_orig));
+  free_string_field (&(args_info->edges_arg));
+  free_string_field (&(args_info->edges_orig));
   free_string_field (&(args_info->cutoff_orig));
   free_string_field (&(args_info->subsample_orig));
   free_string_field (&(args_info->lookup1_arg));
     write_into_file(outfile, "genes", args_info->genes_orig, 0);
   if (args_info->genex_given)
     write_into_file(outfile, "genex", args_info->genex_orig, 0);
+  if (args_info->edges_given)
+    write_into_file(outfile, "edges", args_info->edges_orig, 0);
   if (args_info->cutoff_given)
     write_into_file(outfile, "cutoff", args_info->cutoff_orig, 0);
   if (args_info->zero_given)
         { "randomize",	0, NULL, 'a' },
         { "genes",	1, NULL, 'g' },
         { "genex",	1, NULL, 'G' },
+        { "edges",	1, NULL, 'e' },
         { "cutoff",	1, NULL, 'c' },
-        { "zero",	0, NULL, 'e' },
+        { "zero",	0, NULL, 'Z' },
         { "duplicates",	0, NULL, 'd' },
         { "subsample",	1, NULL, 'u' },
         { "lookup1",	1, NULL, 'l' },
         { NULL,	0, NULL, 0 }
       };
 
-      c = getopt_long (argc, argv, "hVi:o:fnzrag:G:c:edu:l:L:t:T:EPp:bs:mv:", long_options, &option_index);
+      c = getopt_long (argc, argv, "hVi:o:fnzrag:G:e:c:Zdu:l:L:t:T:EPp:bs:mv:", long_options, &option_index);
 
       if (c == -1) break;	/* Exit from `while (1)' loop.  */
 
             goto failure;
         
           break;
+        case 'e':	/* Process only edges from the given DAT/DAB.  */
+        
+        
+          if (update_arg( (void *)&(args_info->edges_arg), 
+               &(args_info->edges_orig), &(args_info->edges_given),
+              &(local_args_info.edges_given), optarg, 0, 0, ARG_STRING,
+              check_ambiguity, override, 0, 0,
+              "edges", 'e',
+              additional_error))
+            goto failure;
+        
+          break;
         case 'c':	/* Exclude edges below cutoff.  */
         
         
             goto failure;
         
           break;
-        case 'e':	/* Zero missing values.  */
+        case 'Z':	/* Zero missing values.  */
         
         
           if (update_arg((void *)&(args_info->zero_flag), 0, &(args_info->zero_given),
               &(local_args_info.zero_given), optarg, 0, 0, ARG_FLAG,
-              check_ambiguity, override, 1, 0, "zero", 'e',
+              check_ambiguity, override, 1, 0, "zero", 'Z',
               additional_error))
             goto failure;
         

File tools/Dat2Dab/cmdline.h

   char * genex_arg;	/**< @brief Exclude all genes from the given set.  */
   char * genex_orig;	/**< @brief Exclude all genes from the given set original value given at command line.  */
   const char *genex_help; /**< @brief Exclude all genes from the given set help description.  */
+  char * edges_arg;	/**< @brief Process only edges from the given DAT/DAB.  */
+  char * edges_orig;	/**< @brief Process only edges from the given DAT/DAB original value given at command line.  */
+  const char *edges_help; /**< @brief Process only edges from the given DAT/DAB help description.  */
   double cutoff_arg;	/**< @brief Exclude edges below cutoff.  */
   char * cutoff_orig;	/**< @brief Exclude edges below cutoff original value given at command line.  */
   const char *cutoff_help; /**< @brief Exclude edges below cutoff help description.  */
   unsigned int randomize_given ;	/**< @brief Whether randomize was given.  */
   unsigned int genes_given ;	/**< @brief Whether genes was given.  */
   unsigned int genex_given ;	/**< @brief Whether genex was given.  */
+  unsigned int edges_given ;	/**< @brief Whether edges was given.  */
   unsigned int cutoff_given ;	/**< @brief Whether cutoff was given.  */
   unsigned int zero_given ;	/**< @brief Whether zero was given.  */
   unsigned int duplicates_given ;	/**< @brief Whether duplicates was given.  */

File tools/Distancer/Distancer.cpp

 
 	if( iRet = CPCL::Distance( sArgs.input_arg, sArgs.skip_arg, sArgs.distance_arg, !!sArgs.normalize_flag,
 		!!sArgs.zscore_flag, !!sArgs.autocorrelate_flag, sArgs.genes_arg, sArgs.cutoff_given ?
-		(float)sArgs.cutoff_arg : CMeta::GetNaN( ), sArgs.limit_arg, PCL, Dat ) ) {
+		(float)sArgs.cutoff_arg : CMeta::GetNaN( ), sArgs.limit_arg, PCL, Dat, IMeasure::EMapCenter, !!sArgs.freqweight_flag ) ) {
 		cmdline_parser_print_help( );
 		return iRet; }
 	if( sArgs.flip_flag )

File tools/Distancer/Distancer.ggo

 							string	typestr="filename"
 option	"autocorrelate"	a	"Autocorrelate distances"
 							flag	off
+option	"freqweight"	q	"Weight conditions by frequency"
+							flag	off
 
 section "Preprocessing"
 option	"normalize"		n	"Normalize distances"

File tools/Distancer/cmdline.c

   "\nMiscellaneous:",
   "  -w, --weights=filename  Input weights file",
   "  -a, --autocorrelate     Autocorrelate distances  (default=off)",
+  "  -q, --freqweight        Weight conditions by frequency  (default=off)",
   "\nPreprocessing:",
   "  -n, --normalize         Normalize distances  (default=off)",
   "  -z, --zscore            Convert correlations to z-scores  (default=on)",
   args_info->distance_given = 0 ;
   args_info->weights_given = 0 ;
   args_info->autocorrelate_given = 0 ;
+  args_info->freqweight_given = 0 ;
   args_info->normalize_given = 0 ;
   args_info->zscore_given = 0 ;
   args_info->flip_given = 0 ;
   args_info->weights_arg = NULL;
   args_info->weights_orig = NULL;
   args_info->autocorrelate_flag = 0;
+  args_info->freqweight_flag = 0;
   args_info->normalize_flag = 0;
   args_info->zscore_flag = 1;
   args_info->flip_flag = 0;
   args_info->distance_help = gengetopt_args_info_help[5] ;
   args_info->weights_help = gengetopt_args_info_help[7] ;
   args_info->autocorrelate_help = gengetopt_args_info_help[8] ;
-  args_info->normalize_help = gengetopt_args_info_help[10] ;
-  args_info->zscore_help = gengetopt_args_info_help[11] ;
-  args_info->flip_help = gengetopt_args_info_help[12] ;
-  args_info->genes_help = gengetopt_args_info_help[14] ;
-  args_info->cutoff_help = gengetopt_args_info_help[15] ;
-  args_info->skip_help = gengetopt_args_info_help[17] ;
-  args_info->limit_help = gengetopt_args_info_help[18] ;
-  args_info->verbosity_help = gengetopt_args_info_help[19] ;
+  args_info->freqweight_help = gengetopt_args_info_help[9] ;
+  args_info->normalize_help = gengetopt_args_info_help[11] ;
+  args_info->zscore_help = gengetopt_args_info_help[12] ;
+  args_info->flip_help = gengetopt_args_info_help[13] ;
+  args_info->genes_help = gengetopt_args_info_help[15] ;
+  args_info->cutoff_help = gengetopt_args_info_help[16] ;
+  args_info->skip_help = gengetopt_args_info_help[18] ;
+  args_info->limit_help = gengetopt_args_info_help[19] ;
+  args_info->verbosity_help = gengetopt_args_info_help[20] ;
   
 }
 
     write_into_file(outfile, "weights", args_info->weights_orig, 0);
   if (args_info->autocorrelate_given)
     write_into_file(outfile, "autocorrelate", 0, 0 );
+  if (args_info->freqweight_given)
+    write_into_file(outfile, "freqweight", 0, 0 );
   if (args_info->normalize_given)
     write_into_file(outfile, "normalize", 0, 0 );
   if (args_info->zscore_given)
         { "distance",	1, NULL, 'd' },
         { "weights",	1, NULL, 'w' },
         { "autocorrelate",	0, NULL, 'a' },
+        { "freqweight",	0, NULL, 'q' },
         { "normalize",	0, NULL, 'n' },
         { "zscore",	0, NULL, 'z' },
         { "flip",	0, NULL, 'f' },
         { NULL,	0, NULL, 0 }
       };
 
-      c = getopt_long (argc, argv, "hVi:o:d:w:anzfg:e:s:l:v:", long_options, &option_index);
+      c = getopt_long (argc, argv, "hVi:o:d:w:aqnzfg:e:s:l:v:", long_options, &option_index);
 
       if (c == -1) break;	/* Exit from `while (1)' loop.  */
 
             goto failure;
         
           break;
+        case 'q':	/* Weight conditions by frequency.  */
+        
+        
+          if (update_arg((void *)&(args_info->freqweight_flag), 0, &(args_info->freqweight_given),
+              &(local_args_info.freqweight_given), optarg, 0, 0, ARG_FLAG,
+              check_ambiguity, override, 1, 0, "freqweight", 'q',
+              additional_error))
+            goto failure;
+        
+          break;
         case 'n':	/* Normalize distances.  */
         
         

File tools/Distancer/cmdline.h

   const char *weights_help; /**< @brief Input weights file help description.  */
   int autocorrelate_flag;	/**< @brief Autocorrelate distances (default=off).  */
   const char *autocorrelate_help; /**< @brief Autocorrelate distances help description.  */
+  int freqweight_flag;	/**< @brief Weight conditions by frequency (default=off).  */
+  const char *freqweight_help; /**< @brief Weight conditions by frequency help description.  */
   int normalize_flag;	/**< @brief Normalize distances (default=off).  */
   const char *normalize_help; /**< @brief Normalize distances help description.  */
   int zscore_flag;	/**< @brief Convert correlations to z-scores (default=on).  */
   unsigned int distance_given ;	/**< @brief Whether distance was given.  */
   unsigned int weights_given ;	/**< @brief Whether weights was given.  */
   unsigned int autocorrelate_given ;	/**< @brief Whether autocorrelate was given.  */
+  unsigned int freqweight_given ;	/**< @brief Whether freqweight was given.  */
   unsigned int normalize_given ;	/**< @brief Whether normalize was given.  */
   unsigned int zscore_given ;	/**< @brief Whether zscore was given.  */
   unsigned int flip_given ;	/**< @brief Whether flip was given.  */

File tools/Funcifier/Funcifier.cpp

 							iCountIn++;
 							dAveIn += d; } } }
 				DatOut.Set( iF1, iF2, dAveIn / iCountIn ); }
+		if( sArgs.zscore_flag )
+			DatOut.Normalize( CDat::ENormalizeZScore );
 		DatOut.Save( sArgs.output_arg );
 	}
 

File tools/Funcifier/Funcifier.ggo

 							string	typestr="filename"
 
 section "Optional"
-option	"normalize"		n	"Normalize to the range [0,1]"
+option	"normalize"		n	"Normalize input to the range [0,1]"
+							flag	off
+option	"zscore"		z	"Normalize output by z-scoring"
 							flag	off
 option	"memmap"		m	"Memory map input"
 							flag	off

File tools/Funcifier/cmdline.c

   "  -s, --shared=STRING    Determine shared gene handling  (possible \n                           values=\"ignore\", \"discard\", \"oneonly\" \n                           default=`discard')",
   "  -l, --colors=filename  Function cohesiveness output file",
   "\nOptional:",
-  "  -n, --normalize        Normalize to the range [0,1]  (default=off)",
+  "  -n, --normalize        Normalize input to the range [0,1]  (default=off)",
+  "  -z, --zscore           Normalize output by z-scoring  (default=off)",
   "  -m, --memmap           Memory map input  (default=off)",
   "  -v, --verbosity=INT    Message verbosity  (default=`5')",
     0
   args_info->shared_given = 0 ;
   args_info->colors_given = 0 ;
   args_info->normalize_given = 0 ;
+  args_info->zscore_given = 0 ;
   args_info->memmap_given = 0 ;
   args_info->verbosity_given = 0 ;
 }
   args_info->colors_arg = NULL;
   args_info->colors_orig = NULL;
   args_info->normalize_flag = 0;
+  args_info->zscore_flag = 0;
   args_info->memmap_flag = 0;
   args_info->verbosity_arg = 5;
   args_info->verbosity_orig = NULL;
   args_info->shared_help = gengetopt_args_info_help[6] ;
   args_info->colors_help = gengetopt_args_info_help[7] ;
   args_info->normalize_help = gengetopt_args_info_help[9] ;
-  args_info->memmap_help = gengetopt_args_info_help[10] ;
-  args_info->verbosity_help = gengetopt_args_info_help[11] ;
+  args_info->zscore_help = gengetopt_args_info_help[10] ;
+  args_info->memmap_help = gengetopt_args_info_help[11] ;
+  args_info->verbosity_help = gengetopt_args_info_help[12] ;
   
 }
 
     write_into_file(outfile, "colors", args_info->colors_orig, 0);
   if (args_info->normalize_given)
     write_into_file(outfile, "normalize", 0, 0 );
+  if (args_info->zscore_given)
+    write_into_file(outfile, "zscore", 0, 0 );
   if (args_info->memmap_given)
     write_into_file(outfile, "memmap", 0, 0 );
   if (args_info->verbosity_given)
         { "shared",	1, NULL, 's' },
         { "colors",	1, NULL, 'l' },
         { "normalize",	0, NULL, 'n' },
+        { "zscore",	0, NULL, 'z' },
         { "memmap",	0, NULL, 'm' },
         { "verbosity",	1, NULL, 'v' },
         { NULL,	0, NULL, 0 }
       };
 
-      c = getopt_long (argc, argv, "hVi:o:s:l:nmv:", long_options, &option_index);
+      c = getopt_long (argc, argv, "hVi:o:s:l:nzmv:", long_options, &option_index);
 
       if (c == -1) break;	/* Exit from `while (1)' loop.  */
 
             goto failure;
         
           break;
-        case 'n':	/* Normalize to the range [0,1].  */
+        case 'n':	/* Normalize input to the range [0,1].  */
         
         
           if (update_arg((void *)&(args_info->normalize_flag), 0, &(args_info->normalize_given),
             goto failure;
         
           break;
+        case 'z':	/* Normalize output by z-scoring.  */
+        
+        
+          if (update_arg((void *)&(args_info->zscore_flag), 0, &(args_info->zscore_given),
+              &(local_args_info.zscore_given), optarg, 0, 0, ARG_FLAG,
+              check_ambiguity, override, 1, 0, "zscore", 'z',
+              additional_error))
+            goto failure;
+        
+          break;
         case 'm':	/* Memory map input.  */
         
         

File tools/Funcifier/cmdline.h

   char * colors_arg;	/**< @brief Function cohesiveness output file.  */
   char * colors_orig;	/**< @brief Function cohesiveness output file original value given at command line.  */
   const char *colors_help; /**< @brief Function cohesiveness output file help description.  */
-  int normalize_flag;	/**< @brief Normalize to the range [0,1] (default=off).  */
-  const char *normalize_help; /**< @brief Normalize to the range [0,1] help description.  */
+  int normalize_flag;	/**< @brief Normalize input to the range [0,1] (default=off).  */
+  const char *normalize_help; /**< @brief Normalize input to the range [0,1] help description.  */
+  int zscore_flag;	/**< @brief Normalize output by z-scoring (default=off).  */
+  const char *zscore_help; /**< @brief Normalize output by z-scoring help description.  */
   int memmap_flag;	/**< @brief Memory map input (default=off).  */
   const char *memmap_help; /**< @brief Memory map input help description.  */
   int verbosity_arg;	/**< @brief Message verbosity (default='5').  */
   unsigned int shared_given ;	/**< @brief Whether shared was given.  */
   unsigned int colors_given ;	/**< @brief Whether colors was given.  */
   unsigned int normalize_given ;	/**< @brief Whether normalize was given.  */
+  unsigned int zscore_given ;	/**< @brief Whether zscore was given.  */
   unsigned int memmap_given ;	/**< @brief Whether memmap was given.  */
   unsigned int verbosity_given ;	/**< @brief Whether verbosity was given.  */
 

File tools/Hubber/Hubber.cpp

 
 static const char	c_szDab[]	= ".dab";
 
-struct SDatum {
-	float						m_dHubbiness;
-	float						m_dHubbinessStd;
-	float						m_dCliquiness;
-	float						m_dCliquinessStd;
-	vector<pair<size_t,float> >	m_vecprSpecific;
-};
-
 struct SWithin {	// NOTE(review): no use of this struct is visible in this excerpt; member roles below are read off the types only
 	const CDat*						m_pDat;	// read-only pointer to a CDat
 	const vector<vector<size_t> >*	m_pvecveciSets;	// read-only pointer to a list of index lists -- presumably gene sets; TODO confirm
 	size_t							m_iLength;	// a length/count; exact meaning not visible here -- TODO confirm
 };
 
-static size_t hubs( const CDat&, vector<float>& );
-static void cliques( const CDat&, size_t, const vector<float>&, bool, SDatum&, const CGenes* );
+/*!
+ * \brief
+ * Running accumulator of a count, a sum, and a sum of squares, from which an average and a
+ * standard deviation can be derived without storing the individual values.
+ */
+struct SSummary {
+	size_t	m_iCount;		// number of values accumulated
+	float	m_dSum;			// sum of the accumulated values
+	float	m_dSumSquares;	// sum of the squares of the accumulated values
+
+	SSummary( ) {
+
+		Clear( ); }
+
+	// Accumulate a single value.
+	void Add( float d ) {
+
+		m_iCount++;
+		m_dSum += d;
+		m_dSumSquares += d * d; }
+
+	// Merge another accumulator into this one.
+	void Add( const SSummary& sSummary ) {
+
+		m_iCount += sSummary.m_iCount;
+		m_dSum += sSummary.m_dSum;
+		m_dSumSquares += sSummary.m_dSumSquares; }
+
+	// Scale the count and both sums by d; the scaled count is truncated back to an integer.
+	void Multiply( float d ) {
+
+		m_iCount = (size_t)( m_iCount * d );
+		m_dSum *= d;
+		m_dSumSquares *= d; }
+
+	// Reset to the empty state.
+	void Clear( ) {
+
+		m_iCount = 0;
+		m_dSum = m_dSumSquares = 0; }
+
+	// Mean of the accumulated values, or NaN when nothing has been accumulated.
+	float GetAverage( ) const {
+
+		return ( m_iCount ? ( m_dSum / m_iCount ) : CMeta::GetNaN( ) ); }
+
+	// Sample standard deviation (n - 1 denominator, or 1 when only one value is present), or NaN
+	// when nothing has been accumulated.
+	float GetStdev( ) const {
+		float	dAve, dVar;
+
+		if( !m_iCount )
+			return CMeta::GetNaN( );
+		dAve = GetAverage( );
+		// The squared mean must be scaled by n before dividing by n - 1; dividing only the sum of
+		// squares by n - 1 reports a nonzero deviation for identical values.
+		dVar = ( m_dSumSquares - ( m_iCount * dAve * dAve ) ) / ( max( (size_t)2, m_iCount ) - 1 );
+		// Float rounding can push a near-zero variance slightly negative; clamp so sqrt stays real.
+		return sqrt( max( 0.0f, dVar ) ); }
+};
+
+/*!
+ * \brief
+ * Per-query result record: a hubbiness summary, a cliquiness summary, and a list of
+ * (gene index, float value) pairs.
+ */
+struct SDatum {
+	SSummary					m_sHubbiness;
+	SSummary					m_sCliquiness;
+	vector<pair<size_t,float> >	m_vecprSpecific;
+
+	// Empty the pair list and reset both summaries to their empty state.
+	void Clear( ) {
+
+		m_vecprSpecific.clear( );
+		m_sCliquiness.Clear( );
+		m_sHubbiness.Clear( ); }
+};
+
+static size_t hubs( const CDat&, vector<SSummary>& );
+static size_t cliques( const CDat&, size_t, const vector<SSummary>&, size_t, SDatum&, const CGenes* );
 static void enset( const CDat&, const vector<vector<string> >&, vector<vector<size_t> >& );
 static int sets( const char*, const vector<string>&, vector<vector<string> >& );
 static int process( const char*, bool, bool, const vector<vector<string> >&, const vector<vector<string> >&,
 	CGenome				Genome;
 	CDat				Dat;
 	size_t				i, j, iGenes, iTotal;
-	vector<float>		vecdHub;
+	vector<SSummary>	vecsHubs;
 	SDatum				sDatum;
 
 	if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
 	if( sArgs.normalize_flag )
 		Dat.Normalize( CDat::ENormalizeSigmoid );
 
-	iTotal = hubs( Dat, vecdHub );
+	iTotal = hubs( Dat, vecsHubs );
 	if( sArgs.genes_arg == -1 ) {
 		cout << "Function";
 		for( i = 0; i < Dat.GetGenes( ); ++i )
 			cout << '\t' << Dat.GetGene( i ); }
 	else {
-		cliques( Dat, iTotal, vecdHub, true, sDatum, NULL );
-		cout << "name	size	hubbiness	hubbiness std.	cliquiness	cliquiness std." << endl;
-		cout << "total	" << iTotal << '\t' << sDatum.m_dHubbiness << '\t' <<
-			sDatum.m_dHubbinessStd << '\t' << sDatum.m_dCliquiness << '\t' << sDatum.m_dCliquinessStd; }
+		cliques( Dat, iTotal, vecsHubs, 0, sDatum, NULL );
+		cout << "name	size	hubbiness	hubbiness std.	hubbiness n	cliquiness	cliquiness std.	cliquiness n" << endl;
+		cout << "total	" << iTotal << '\t' << sDatum.m_sHubbiness.GetAverage( ) << '\t' <<
+			sDatum.m_sHubbiness.GetStdev( ) << '\t' << sDatum.m_sHubbiness.m_iCount << '\t' << sDatum.m_sCliquiness.GetAverage( ) << '\t' <<
+			sDatum.m_sCliquiness.GetStdev( ) << '\t' << sDatum.m_sCliquiness.m_iCount; }
 	cout << endl;
 
 	for( iGenes = 0; iGenes < sArgs.inputs_num; ++iGenes ) {
 		CGenes		Genes( Genome );
 		ifstream	ifsm;
-		size_t		i;
+		size_t		i, iCur;
 
 		if( !( iGenes % 25 ) )
 			cerr << iGenes << '/' << sArgs.inputs_num << endl;
 			cerr << "Could not open: " << sArgs.inputs[ iGenes ] << endl;
 			return 1; }
 		ifsm.close( );
-		cliques( Dat, iTotal, vecdHub, sArgs.genes_arg != -1, sDatum, &Genes );
+		iCur = cliques( Dat, iTotal, vecsHubs, sArgs.genes_arg, sDatum, &Genes );
 		cout << CMeta::Basename( sArgs.inputs[ iGenes ] );
 		if( sArgs.genes_arg == -1 )
 			for( i = 0; i < sDatum.m_vecprSpecific.size( ); ++i )
 				cout << '\t' << ( sDatum.m_vecprSpecific[ i ].second *
 					( Genes.IsGene( Dat.GetGene( sDatum.m_vecprSpecific[ i ].first ) ) ? -1 : 1 ) );
 		else {
-			cout << '\t' << Genes.GetGenes( ) << '\t' << sDatum.m_dHubbiness << '\t' <<
-				sDatum.m_dHubbinessStd << '\t' << sDatum.m_dCliquiness << '\t' <<
-				sDatum.m_dCliquinessStd;
+			cout << '\t' << iCur << '\t' << sDatum.m_sHubbiness.GetAverage( ) << '\t' <<
+				sDatum.m_sHubbiness.GetStdev( ) << '\t' << sDatum.m_sHubbiness.m_iCount << '\t' << sDatum.m_sCliquiness.GetAverage( ) << '\t' <<
+				sDatum.m_sCliquiness.GetStdev( ) << '\t' << sDatum.m_sCliquiness.m_iCount;
 			for( i = 0; i < min( (size_t)sArgs.genes_arg, sDatum.m_vecprSpecific.size( ) ); ++i )
 				cout << '\t' << Dat.GetGene( sDatum.m_vecprSpecific[ i ].first ) << '|' <<
 					sDatum.m_vecprSpecific[ i ].second << '|' <<
 #endif // WIN32
 	return 0; }
 
-size_t hubs( const CDat& Dat, vector<float>& vecdHub ) {
-	size_t			i, j, iRet;
-	float			d;
-	vector<size_t>	veciHub;
+size_t hubs( const CDat& Dat, vector<SSummary>& vecsHubs ) { // Adds every non-NaN pairwise value in Dat to the summaries of both genes involved; returns the number of genes with at least one such value.
+	size_t	i, j, iRet;
+	float	d;
 
-	vecdHub.resize( Dat.GetGenes( ) );
-	veciHub.resize( vecdHub.size( ) );
-	for( i = 0; i < vecdHub.size( ); ++i )
-		vecdHub[ i ] = 0;
+	vecsHubs.resize( Dat.GetGenes( ) ); // one summary slot per gene in Dat
 	for( i = 0; i < Dat.GetGenes( ); ++i )
 		for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j ) {
 			if( CMeta::IsNaN( d = Dat.Get( i, j ) ) )
 				continue;
-			veciHub[ i ]++;
-			vecdHub[ i ] += d;
-			veciHub[ j ]++;
-			vecdHub[ j ] += d; }
-	for( iRet = i = 0; i < vecdHub.size( ); ++i ) {
-		if( veciHub[ i ] > iRet )
-			iRet = veciHub[ i ];
-		vecdHub[ i ] = veciHub[ i ] ? ( vecdHub[ i ] / veciHub[ i ] ) : CMeta::GetNaN( ); }
+			vecsHubs[ i ].Add( d ); // the pair ( i, j ) is visited once, so credit the value to both genes
+			vecsHubs[ j ].Add( d ); }
+	for( iRet = i = 0; i < vecsHubs.size( ); ++i )
+		if( vecsHubs[ i ].m_iCount ) // gene has at least one non-NaN value
+			iRet++;
 
 	return iRet; }
 
 		return ( prOne.second > prTwo.second ); }
 };
 
-void cliques( const CDat& Dat, size_t iGenes, const vector<float>& vecdHub, bool fSort, SDatum& sDatum,
+size_t cliques( const CDat& Dat, size_t iGenes, const vector<SSummary>& vecsHubs, size_t iSort, SDatum& sDatum, // Fills sDatum for the gene set pGenes (every gene when pGenes is NULL); returns the number of in-set genes with at least one non-NaN value to an in-set gene.  iGenes is not read by this version.
 	const CGenes* pGenes ) {
-	size_t			i, j, iCount;
-	float			d;
-	vector<float>	vecdClique, vecdClique2;
-	vector<size_t>	veciClique;
-	vector<bool>	vecfOutside;
+	size_t				i, j, iRet;
+	float				d;
+	vector<SSummary>	vecsClique; // vecsClique[ g ] summarizes the values between gene g and in-set genes
+	vector<bool>		vecfOutside; // true for genes that are not in pGenes
 
 	vecfOutside.resize( Dat.GetGenes( ) );
 	if( pGenes ) {
 		for( i = 0; i < vecfOutside.size( ); ++i )
 			if( !pGenes->IsGene( Dat.GetGene( i ) ) )
 				vecfOutside[ i ] = true; }
-	veciClique.resize( Dat.GetGenes( ) );
-	vecdClique.resize( Dat.GetGenes( ) );
-	vecdClique2.resize( Dat.GetGenes( ) );
+	vecsClique.resize( Dat.GetGenes( ) );
 	for( i = 0; i < Dat.GetGenes( ); ++i )
 		for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j ) {
 			if( CMeta::IsNaN( d = Dat.Get( i, j ) ) )
 				continue;
-			if( !vecfOutside[ i ] ) {
-				veciClique[ j ]++;
-				vecdClique[ j ] += d;
-				vecdClique2[ j ] += d * d; }
-			if( !vecfOutside[ j ] ) {
-				veciClique[ i ]++;
-				vecdClique[ i ] += d;
-				vecdClique2[ i ] += d * d; } }
-	for( sDatum.m_dCliquiness = sDatum.m_dCliquinessStd = 0,iCount = i = 0; i < vecdClique.size( ); ++i ) {
-//		vecdClique[ i ] /= veciClique[ i ] ? veciClique[ i ] : 1;
+			if( !vecfOutside[ i ] ) // i is in the set, so the value counts toward j's in-set summary
+				vecsClique[ j ].Add( d );
+			if( !vecfOutside[ j ] ) // and symmetrically for i when j is in the set
+				vecsClique[ i ].Add( d ); }
+	sDatum.Clear( ); // discard whatever a previous call left in sDatum
+	for( iRet = i = 0; i < vecsClique.size( ); ++i )
 		if( !vecfOutside[ i ] ) {
-			iCount += veciClique[ i ];
-			sDatum.m_dCliquiness += vecdClique[ i ];
-			sDatum.m_dCliquinessStd += vecdClique2[ i ]; } }
-	iCount /= 2;
-	sDatum.m_dCliquiness /= 2;
-	sDatum.m_dCliquinessStd /= 2;
-	sDatum.m_dCliquiness /= iCount;
-	sDatum.m_dCliquinessStd = (float)sqrt( ( sDatum.m_dCliquinessStd / ( iCount - 1 ) ) -
-		( sDatum.m_dCliquiness * sDatum.m_dCliquiness ) );
+			if( vecsClique[ i ].m_iCount )
+				iRet++;
+			sDatum.m_sCliquiness.Add( vecsClique[ i ] ); } // pool the in-set genes' in-set summaries
+	sDatum.m_sCliquiness.Multiply( 0.5 ); // each in-set pair was added once per endpoint above, so halve the pooled totals
 
-	sDatum.m_dHubbiness = sDatum.m_dHubbinessStd = 0;
-	for( iCount = i = 0; i < Dat.GetGenes( ); ++i )
-		if( !vecfOutside[ i ] && !CMeta::IsNaN( d = vecdHub[ i ] ) ) {
-			iCount++;
-			sDatum.m_dHubbiness += d;
-			sDatum.m_dHubbinessStd += d * d; }
-	i = pGenes ? iCount : iGenes;
-	sDatum.m_dHubbiness /= i;
-	sDatum.m_dHubbinessStd = (float)sqrt( ( sDatum.m_dHubbinessStd / ( i - 1 ) ) -
-		( sDatum.m_dHubbiness * sDatum.m_dHubbiness ) );
+	for( i = 0; i < Dat.GetGenes( ); ++i )
+		if( !vecfOutside[ i ] )
+			sDatum.m_sHubbiness.Add( vecsHubs[ i ] ); // hubbiness pools the caller-supplied per-gene summaries of the in-set genes
 
-	sDatum.m_vecprSpecific.resize( Dat.GetGenes( ) );
-	for( i = 0; i < sDatum.m_vecprSpecific.size( ); ++i ) {
-		sDatum.m_vecprSpecific[ i ].first = i;
-		sDatum.m_vecprSpecific[ i ].second = vecdClique[ i ] / vecdHub[ i ]; }
-	if( fSort )
-		sort( sDatum.m_vecprSpecific.begin( ), sDatum.m_vecprSpecific.end( ), SSorter( ) ); }
+	if( iSort ) { // iSort == 0 leaves m_vecprSpecific empty
+		sDatum.m_vecprSpecific.resize( Dat.GetGenes( ) );
+		for( i = 0; i < sDatum.m_vecprSpecific.size( ); ++i ) {
+			sDatum.m_vecprSpecific[ i ].first = i;
+			sDatum.m_vecprSpecific[ i ].second = vecsClique[ i ].GetAverage( ) / vecsHubs[ i ].GetAverage( ); } // ratio of gene i's vecsClique average to its vecsHubs average
+		if( iSort != -1 ) // -1 (converted to size_t) keeps the entries in gene-index order; any other nonzero value sorts them with SSorter
+			sort( sDatum.m_vecprSpecific.begin( ), sDatum.m_vecprSpecific.end( ), SSorter( ) ); }
+
+	return iRet; }
 
 int process( const char* szFile, bool fMemmap, bool fNormalize, const vector<vector<string> >& vecvecstrSets1,
 	const vector<vector<string> >& vecvecstrSets2, long double* adResults, TFnProcessor* pfnProcessor,