Commits

Chris Park committed f798172

Adding a few changes made.

Comments (0)

Files changed (19)

tools/DChecker/DChecker.cpp

         cmdline_parser_print_help( );
         return 1;
     }
-    CMeta Meta( sArgs.verbosity_arg );
-
+    CMeta Meta( sArgs.verbosity_arg, sArgs.random_arg );
+    
     fMapAnswers = !!sArgs.memmap_flag && !( sArgs.genes_arg || sArgs.genet_arg || sArgs.genex_arg || sArgs.genee_arg );
     if( !Answers.Open( sArgs.answers_arg, fMapAnswers ) ) {
         cerr << "Couldn't open: " << sArgs.answers_arg << endl;
         cerr << "Couldn't open: " << sArgs.genex_arg << endl;
         return 1;
     }
+
+    if( sArgs.genec_given ) {
+      CGenome		tGenome;
+      CGenes		Genes( tGenome );
+      CGenome		uGenome;
+      CGenes		Genesubiq( uGenome );
+      vector<bool>    	    vecfctxt;
+      vector<bool>    	    vecfubiq;
+      float d;
+      
+      if( !Genes.Open( sArgs.genec_arg ) ) {
+	cerr << "Couldn't open: " << sArgs.genec_arg << endl;
+	return 1;
+      }
+      
+      vecfctxt.resize( Answers.GetGenes( ) );
+      for( i = 0; i < vecfctxt.size( ); ++i ) {
+	vecfctxt[ i ] = Genes.IsGene( Answers.GetGene( i ) );
+      }
+            
+      if( sArgs.ubiqg_given ) {
+	if( !Genesubiq.Open( sArgs.ubiqg_arg ) ) {
+	  cerr << "Could not open: " << sArgs.ubiqg_arg << endl;
+	  return 1;
+	}
+	vecfubiq.resize( Answers.GetGenes( ) );
+	for( i = 0; i < vecfubiq.size( ); ++i ) {
+	  vecfubiq[ i ] = Genesubiq.IsGene( Answers.GetGene( i ) );
+	}
+      }
+      
+
+      for( i = 0; i < Answers.GetGenes( ); ++i ) {
+	for( j = ( i + 1 ); j < Answers.GetGenes( ); ++j ) {
+	  
+	  if( CMeta::IsNaN( d = Answers.Get( i, j) ))
+	    continue;
+	  
+	  // remove all original negatives
+	  if( d < 1 ){
+	    Answers.Set( i, j, CMeta::GetNaN());
+	    continue;
+	  }
+	  
+	  if( !vecfctxt[ i ] && !vecfctxt[ j ] )
+	    Answers.Set( i, j, 0);
+	  else if( vecfctxt[ i ] && vecfctxt[ j ] )
+	    continue;
+	  else if( sArgs.ubiqg_given && 
+		   ((vecfubiq[ i ] && !vecfctxt[ j ]) || 
+		    (vecfubiq[ j ] && !vecfctxt[ i ]))  )
+	    Answers.Set( i, j, 0);
+	  else
+	    Answers.Set( i, j, CMeta::GetNaN());
+	}
+      }      
+    }
+    
     if( !Data.Open( sArgs.input_arg, !!sArgs.memmap_flag ) ) {
         cerr << "Couldn't open: " << sArgs.input_arg << endl;
         return 1;
     }
+        
+    if( sArgs.singlegene_flag ){
+      float d;
+      vector<size_t>	    veciPos, veciNeg, veciIndex, veciDgenes;
+      size_t pj, nj, x, y, iOne, iTwo;
+      CDat		    sAnswers;
+
+      veciDgenes.resize( Answers.GetGenes( ) );
+      for( i = 0; i < Answers.GetGenes( ); ++i )
+        veciDgenes[ i ] = Data.GetGene( Answers.GetGene( i ) );
+      
+      //copy over
+      for( i = 0; i < Answers.GetGenes( ); ++i ) {
+	iOne = veciDgenes[ i ];	
+	for( j = ( i + 1 ); j < Answers.GetGenes( ); ++j ) {
+	  iTwo = veciDgenes[ j ];	  
+	  if( iOne == -1 || iTwo == -1 || CMeta::IsNaN( d = Data.Get( iOne, iTwo ) ))
+	    Answers.Set( i, j, CMeta::GetNaN() );
+	}
+      }
+      
+      sAnswers.Open( Answers );
+      for( i = 0; i < Answers.GetGenes( ); ++i ) {
+	for( j = ( i + 1 ); j < Answers.GetGenes( ); ++j ) {
+	  Answers.Set( i, j, CMeta::GetNaN() );
+	}
+      }
+      
+      veciIndex.resize(sAnswers.GetGenes( ));
+      for( i = 0; i < sAnswers.GetGenes( ); ++i ) {
+	veciIndex[ i ] = i;
+      }
+      std::random_shuffle ( veciIndex.begin(), veciIndex.end() );
+      
+      for( x = 0; x < sAnswers.GetGenes( ); ++x ) {
+	i = veciIndex[x];
+	
+	veciPos.clear();
+	veciNeg.clear();
+	
+	for( j = 0; j < sAnswers.GetGenes( ); ++j ) {
+	  if( i == j || CMeta::IsNaN( d = sAnswers.Get( i, j ) )  )
+	    continue;	  
+	  
+	  if( d == 0 ){
+	    veciNeg.push_back(j);	    
+	  }else if( d == 1 ){
+	    veciPos.push_back(j);	    
+	  }
+	}
+	
+	pj = -1;
+	nj = -1;
+	if( veciPos.size() > 0 )
+	  pj = veciPos[ rand( ) % veciPos.size() ];	
+	if( veciNeg.size() > 0 )
+	  nj = veciNeg[ rand( ) % veciNeg.size() ];
+	
+	if( pj != -1 )
+	  Answers.Set( i, pj, 1);
+	if( nj != -1 )
+	  Answers.Set( i, nj, 0);
+	
+	// remove gene rows that have already been sampled
+	for( j = 0; j < sAnswers.GetGenes( ); ++j ) {
+	  if(  i != j  && j != pj && j != nj )
+	    sAnswers.Set( i, j, CMeta::GetNaN() );
+	}
+	
+	if( pj != -1 ){
+	  for(j = 0; j < sAnswers.GetGenes( ); ++j ) {
+	    if( j == pj || CMeta::IsNaN( d = sAnswers.Get( j, pj) )  )
+	      continue;	  	  	  
+	    if( d == 1 )
+	      sAnswers.Set( j, pj, CMeta::GetNaN() );	  
+	  }
+	}
+	if( nj != -1 ){
+	  for(j = 0; j < sAnswers.GetGenes( ); ++j ) {
+	    if( j == nj || CMeta::IsNaN( d = sAnswers.Get( j, nj) )  )
+	      continue;	  	  	  
+	    if( d == 0 )
+	      sAnswers.Set( j, nj, CMeta::GetNaN() );	  
+	  }
+	}
+      }      
+    }    
+        
     if( sArgs.normalize_flag )
-        Data.Normalize( CDat::ENormalizeMinMax );
+      Data.Normalize( CDat::ENormalizeMinMax );
     
 	
     if(sArgs.weights_arg){

tools/DChecker/DChecker.ggo

 							string	typestr="filename"
 option	"flipneg"           F       "Flip weights(one minus original) for negative standards"
                                                         flag    on
+option	"singlegene"  	        S       "Randomly subsample the standards so that a gene occurs at most once in positive and also in negative standards"
+                                                        flag    off
+option	"genec"			E	"Gene file to split positives into new positive examples and negative examples. All positive pairs with both genes in the gene list are only consided as positives. All positive pairs with both genes not in the gene list are considered as negatives. (all original negatives are ignored)."
+							string	typestr="filename"
 
 section "Preprocessing"
 option	"normalize"		n	"Normalize scores before processing"
 							flag	off
 option	"verbosity"		v	"Message verbosity"
 							int	default="5"
+option	"random"		r	"Seed random generator (default -1 uses current time)"
+							int	default="-1" no

tools/DChecker/cmdline.c

   "  -U, --outneg               Use negative edges outside the context  \n                               (default=off)",
   "  -W, --weights=filename     Weight file",
   "  -F, --flipneg              Flip weights(one minus original) for negative \n                               standards  (default=on)",
+  "  -S, --singlegene           Randomly subsample the standards so that a gene \n                               occurs at most once in positive and also in \n                               negative standards  (default=off)",
+  "  -E, --genec=filename       Gene file to split positives into new positive \n                               examples and negative examples. All positive \n                               pairs with both genes in the gene list are only \n                               consided as positives. All positive pairs with \n                               both genes not in the gene list are considered \n                               as negatives. (all original negatives are \n                               ignored).",
   "\nPreprocessing:",
   "  -n, --normalize            Normalize scores before processing  (default=off)",
   "  -t, --invert               Invert correlations to distances  (default=off)",
   "  -s, --sse                  Calculate sum of squared errors  (default=off)",
   "  -p, --memmap               Memory map input DABs  (default=off)",
   "  -v, --verbosity=INT        Message verbosity  (default=`5')",
+  "  -r, --random=INT           Seed random generator (default -1 uses current \n                               time)  (default=`-1')",
     0
 };
 
   args_info->outneg_given = 0 ;
   args_info->weights_given = 0 ;
   args_info->flipneg_given = 0 ;
+  args_info->singlegene_given = 0 ;
+  args_info->genec_given = 0 ;
   args_info->normalize_given = 0 ;
   args_info->invert_given = 0 ;
   args_info->abs_given = 0 ;
   args_info->sse_given = 0 ;
   args_info->memmap_given = 0 ;
   args_info->verbosity_given = 0 ;
+  args_info->random_given = 0 ;
 }
 
 static
   args_info->weights_arg = NULL;
   args_info->weights_orig = NULL;
   args_info->flipneg_flag = 1;
+  args_info->singlegene_flag = 0;
+  args_info->genec_arg = NULL;
+  args_info->genec_orig = NULL;
   args_info->normalize_flag = 0;
   args_info->invert_flag = 0;
   args_info->abs_arg = 0.0;
   args_info->memmap_flag = 0;
   args_info->verbosity_arg = 5;
   args_info->verbosity_orig = NULL;
+  args_info->random_arg = -1;
+  args_info->random_orig = NULL;
   
 }
 
   args_info->outneg_help = gengetopt_args_info_help[27] ;
   args_info->weights_help = gengetopt_args_info_help[28] ;
   args_info->flipneg_help = gengetopt_args_info_help[29] ;
-  args_info->normalize_help = gengetopt_args_info_help[31] ;
-  args_info->invert_help = gengetopt_args_info_help[32] ;
-  args_info->abs_help = gengetopt_args_info_help[33] ;
-  args_info->sse_help = gengetopt_args_info_help[35] ;
-  args_info->memmap_help = gengetopt_args_info_help[36] ;
-  args_info->verbosity_help = gengetopt_args_info_help[37] ;
+  args_info->singlegene_help = gengetopt_args_info_help[30] ;
+  args_info->genec_help = gengetopt_args_info_help[31] ;
+  args_info->normalize_help = gengetopt_args_info_help[33] ;
+  args_info->invert_help = gengetopt_args_info_help[34] ;
+  args_info->abs_help = gengetopt_args_info_help[35] ;
+  args_info->sse_help = gengetopt_args_info_help[37] ;
+  args_info->memmap_help = gengetopt_args_info_help[38] ;
+  args_info->verbosity_help = gengetopt_args_info_help[39] ;
+  args_info->random_help = gengetopt_args_info_help[40] ;
   
 }
 
   free_string_field (&(args_info->genep_orig));
   free_string_field (&(args_info->weights_arg));
   free_string_field (&(args_info->weights_orig));
+  free_string_field (&(args_info->genec_arg));
+  free_string_field (&(args_info->genec_orig));
   free_string_field (&(args_info->abs_orig));
   free_string_field (&(args_info->verbosity_orig));
+  free_string_field (&(args_info->random_orig));
   
   
   for (i = 0; i < args_info->inputs_num; ++i)
     write_into_file(outfile, "weights", args_info->weights_orig, 0);
   if (args_info->flipneg_given)
     write_into_file(outfile, "flipneg", 0, 0 );
+  if (args_info->singlegene_given)
+    write_into_file(outfile, "singlegene", 0, 0 );
+  if (args_info->genec_given)
+    write_into_file(outfile, "genec", args_info->genec_orig, 0);
   if (args_info->normalize_given)
     write_into_file(outfile, "normalize", 0, 0 );
   if (args_info->invert_given)
     write_into_file(outfile, "memmap", 0, 0 );
   if (args_info->verbosity_given)
     write_into_file(outfile, "verbosity", args_info->verbosity_orig, 0);
+  if (args_info->random_given)
+    write_into_file(outfile, "random", args_info->random_orig, 0);
   
 
   i = EXIT_SUCCESS;
         { "outneg",	0, NULL, 'U' },
         { "weights",	1, NULL, 'W' },
         { "flipneg",	0, NULL, 'F' },
+        { "singlegene",	0, NULL, 'S' },
+        { "genec",	1, NULL, 'E' },
         { "normalize",	0, NULL, 'n' },
         { "invert",	0, NULL, 't' },
         { "abs",	1, NULL, 'A' },
         { "sse",	0, NULL, 's' },
         { "memmap",	0, NULL, 'p' },
         { "verbosity",	1, NULL, 'v' },
+        { "random",	1, NULL, 'r' },
         { NULL,	0, NULL, 0 }
       };
 
-      c = getopt_long (argc, argv, "hVi:w:d:a:R:b:fm:M:e:g:G:P:c:C:l:qQjJuUW:FntA:spv:", long_options, &option_index);
+      c = getopt_long (argc, argv, "hVi:w:d:a:R:b:fm:M:e:g:G:P:c:C:l:qQjJuUW:FSE:ntA:spv:r:", long_options, &option_index);
 
       if (c == -1) break;	/* Exit from `while (1)' loop.  */
 
             goto failure;
         
           break;
+        case 'S':	/* Randomly subsample the standards so that a gene occurs at most once in positive and also in negative standards.  */
+        
+        
+          if (update_arg((void *)&(args_info->singlegene_flag), 0, &(args_info->singlegene_given),
+              &(local_args_info.singlegene_given), optarg, 0, 0, ARG_FLAG,
+              check_ambiguity, override, 1, 0, "singlegene", 'S',
+              additional_error))
+            goto failure;
+        
+          break;
+        case 'E':	/* Gene file to split positives into new positive examples and negative examples. All positive pairs with both genes in the gene list are only consided as positives. All positive pairs with both genes not in the gene list are considered as negatives. (all original negatives are ignored)..  */
+        
+        
+          if (update_arg( (void *)&(args_info->genec_arg), 
+               &(args_info->genec_orig), &(args_info->genec_given),
+              &(local_args_info.genec_given), optarg, 0, 0, ARG_STRING,
+              check_ambiguity, override, 0, 0,
+              "genec", 'E',
+              additional_error))
+            goto failure;
+        
+          break;
         case 'n':	/* Normalize scores before processing.  */
         
         
             goto failure;
         
           break;
+        case 'r':	/* Seed random generator (default -1 uses current time).  */
+        
+        
+          if (update_arg( (void *)&(args_info->random_arg), 
+               &(args_info->random_orig), &(args_info->random_given),
+              &(local_args_info.random_given), optarg, 0, "-1", ARG_INT,
+              check_ambiguity, override, 0, 0,
+              "random", 'r',
+              additional_error))
+            goto failure;
+        
+          break;
 
         case 0:	/* Long option with no short option */
         case '?':	/* Invalid option.  */

tools/DChecker/cmdline.h

   const char *weights_help; /**< @brief Weight file help description.  */
   int flipneg_flag;	/**< @brief Flip weights(one minus original) for negative standards (default=on).  */
   const char *flipneg_help; /**< @brief Flip weights(one minus original) for negative standards help description.  */
+  int singlegene_flag;	/**< @brief Randomly subsample the standards so that a gene occurs at most once in positive and also in negative standards (default=off).  */
+  const char *singlegene_help; /**< @brief Randomly subsample the standards so that a gene occurs at most once in positive and also in negative standards help description.  */
+  char * genec_arg;	/**< @brief Gene file to split positives into new positive examples and negative examples. All positive pairs with both genes in the gene list are only consided as positives. All positive pairs with both genes not in the gene list are considered as negatives. (all original negatives are ignored)..  */
+  char * genec_orig;	/**< @brief Gene file to split positives into new positive examples and negative examples. All positive pairs with both genes in the gene list are only consided as positives. All positive pairs with both genes not in the gene list are considered as negatives. (all original negatives are ignored). original value given at command line.  */
+  const char *genec_help; /**< @brief Gene file to split positives into new positive examples and negative examples. All positive pairs with both genes in the gene list are only consided as positives. All positive pairs with both genes not in the gene list are considered as negatives. (all original negatives are ignored). help description.  */
   int normalize_flag;	/**< @brief Normalize scores before processing (default=off).  */
   const char *normalize_help; /**< @brief Normalize scores before processing help description.  */
   int invert_flag;	/**< @brief Invert correlations to distances (default=off).  */
   int verbosity_arg;	/**< @brief Message verbosity (default='5').  */
   char * verbosity_orig;	/**< @brief Message verbosity original value given at command line.  */
   const char *verbosity_help; /**< @brief Message verbosity help description.  */
+  int random_arg;	/**< @brief Seed random generator (default -1 uses current time) (default='-1').  */
+  char * random_orig;	/**< @brief Seed random generator (default -1 uses current time) original value given at command line.  */
+  const char *random_help; /**< @brief Seed random generator (default -1 uses current time) help description.  */
   
   unsigned int help_given ;	/**< @brief Whether help was given.  */
   unsigned int version_given ;	/**< @brief Whether version was given.  */
   unsigned int outneg_given ;	/**< @brief Whether outneg was given.  */
   unsigned int weights_given ;	/**< @brief Whether weights was given.  */
   unsigned int flipneg_given ;	/**< @brief Whether flipneg was given.  */
+  unsigned int singlegene_given ;	/**< @brief Whether singlegene was given.  */
+  unsigned int genec_given ;	/**< @brief Whether genec was given.  */
   unsigned int normalize_given ;	/**< @brief Whether normalize was given.  */
   unsigned int invert_given ;	/**< @brief Whether invert was given.  */
   unsigned int abs_given ;	/**< @brief Whether abs was given.  */
   unsigned int sse_given ;	/**< @brief Whether sse was given.  */
   unsigned int memmap_given ;	/**< @brief Whether memmap was given.  */
   unsigned int verbosity_given ;	/**< @brief Whether verbosity was given.  */
+  unsigned int random_given ;	/**< @brief Whether random was given.  */
 
   char **inputs ; /**< @brief unamed options (options without names) */
   unsigned inputs_num ; /**< @brief unamed options number */

tools/Dat2Dab/Dat2Dab.cpp

 			}			
 		} 
 	}
-
+	
+	if( sArgs.summary_flag ) {
+	  double sum, sq_sum, cTotal, mean, variance, d;
+	  
+	  sum = 0.0;
+	  sq_sum = 0.0;
+	  cTotal = 0.0;
+	  
+	  for( i = 0; i < Dat.GetGenes( ); ++i )
+	    for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j )
+	      if( !CMeta::IsNaN( d = Dat.Get( i, j ) ) ) {
+		sum += d;
+		sq_sum += (d*d);
+		cTotal += 1;
+	      }
+	  
+	  mean = sum / cTotal;	  	  
+	  variance = ( (sq_sum)  -   cTotal *(mean * mean) ) / ( cTotal - 1 );
+	  cout << mean << endl;
+	  cout << sqrt(variance) << endl;
+	  return 0;
+	}
+		
 	if( sArgs.lookups1_arg ) {
 		CGenes			GenesLk1( Genome );
 		vector<size_t>	veciGenesOne;

tools/Dat2Dab/Dat2Dab.ggo

 							flag	off
 option	"mar"			J	"Output the maximum adjacency ratio for each gene"
 							flag	off
+option	"summary"		S	"Output the summary of values (mean, stand dev.)"
+							flag	off
 
 section "Optional"
 option	"remap"			p	"Gene name remapping file"

tools/Dat2Dab/cmdline.c

   "  -C, --ccoeff             Output clustering coefficient for each gene  \n                             (default=off)",
   "  -H, --hubbiness          Output the average edge weight for each gene  \n                             (default=off)",
   "  -J, --mar                Output the maximum adjacency ratio for each gene  \n                             (default=off)",
+  "  -S, --summary            Output the summary of values (mean, stand dev.)  \n                             (default=off)",
   "\nOptional:",
   "  -p, --remap=filename     Gene name remapping file",
   "  -b, --table              Produce table formatted output  (default=off)",
   args_info->ccoeff_given = 0 ;
   args_info->hubbiness_given = 0 ;
   args_info->mar_given = 0 ;
+  args_info->summary_given = 0 ;
   args_info->remap_given = 0 ;
   args_info->table_given = 0 ;
   args_info->skip_given = 0 ;
   args_info->ccoeff_flag = 0;
   args_info->hubbiness_flag = 0;
   args_info->mar_flag = 0;
+  args_info->summary_flag = 0;
   args_info->remap_arg = NULL;
   args_info->remap_orig = NULL;
   args_info->table_flag = 0;
   args_info->random_help = gengetopt_args_info_help[45] ;
   args_info->noise_help = gengetopt_args_info_help[46] ;
   args_info->verbosity_help = gengetopt_args_info_help[47] ;
-  
 }
 
 void
     write_into_file(outfile, "hubbiness", 0, 0 );
   if (args_info->mar_given)
     write_into_file(outfile, "mar", 0, 0 );
+  if (args_info->summary_given)
+    write_into_file(outfile, "summary", 0, 0 );
   if (args_info->remap_given)
     write_into_file(outfile, "remap", args_info->remap_orig, 0);
   if (args_info->table_given)
         { "ccoeff",	0, NULL, 'C' },
         { "hubbiness",	0, NULL, 'H' },
         { "mar",	0, NULL, 'J' },
+        { "summary",	0, NULL, 'S' },
         { "remap",	1, NULL, 'p' },
         { "table",	0, NULL, 'b' },
         { "skip",	1, NULL, 's' },
             goto failure;
         
           break;
+        case 'S':	/* Output the summary of values (mean, stand dev.).  */
+        
+        
+          if (update_arg((void *)&(args_info->summary_flag), 0, &(args_info->summary_given),
+              &(local_args_info.summary_given), optarg, 0, 0, ARG_FLAG,
+              check_ambiguity, override, 1, 0, "summary", 'S',
+              additional_error))
+            goto failure;
+        
+          break;
         case 'p':	/* Gene name remapping file.  */
         
         

tools/Dat2Dab/cmdline.h

   const char *hubbiness_help; /**< @brief Output the average edge weight for each gene help description.  */
   int mar_flag;	/**< @brief Output the maximum adjacency ratio for each gene (default=off).  */
   const char *mar_help; /**< @brief Output the maximum adjacency ratio for each gene help description.  */
+  int summary_flag;	/**< @brief Output the summary of values (mean, stand dev.) (default=off).  */
+  const char *summary_help; /**< @brief Output the summary of values (mean, stand dev.) help description.  */
   char * remap_arg;	/**< @brief Gene name remapping file.  */
   char * remap_orig;	/**< @brief Gene name remapping file original value given at command line.  */
   const char *remap_help; /**< @brief Gene name remapping file help description.  */
   unsigned int ccoeff_given ;	/**< @brief Whether ccoeff was given.  */
   unsigned int hubbiness_given ;	/**< @brief Whether hubbiness was given.  */
   unsigned int mar_given ;	/**< @brief Whether mar was given.  */
+  unsigned int summary_given ;	/**< @brief Whether summary was given.  */
   unsigned int remap_given ;	/**< @brief Whether remap was given.  */
   unsigned int table_given ;	/**< @brief Whether table was given.  */
   unsigned int skip_given ;	/**< @brief Whether skip was given.  */

tools/MIed/MIed.ggo

                                                 string  typestr="file"
 option	"zeros"		Z	"Read zeroed node IDs/outputs from the given file"
 						string	typestr="filename"
-option	"edges"		e	"Process only edges from the given DAT/DAB, should be used with Beta values"
+option	"edges"		e	"Process only edges from the given DAT/DAB"
 						string	typestr="filename"

tools/MIed/cmdline.c

 /*
   File autogenerated by gengetopt version 2.22
   generated with the following command:
-  /r03/cypark/sleipnir/../sleipnir-extlib/gengetopt-2.22/src/gengetopt -iMIed.ggo --default-optional -u -N -e 
+  /Genomics/ogtr03/cypark/sleipnir/../sleipnir-extlib/gengetopt-2.22/src/gengetopt -iMIed.ggo --default-optional -u -N -e 
 
   The developers of gengetopt consider the fixed text that goes in all
   gengetopt output files to be in the public domain:
   "  -d, --directory=directory  input directory",
   "  -f, --datasets=file        Calculate MI for datasets in given file against \n                               all datasets",
   "  -Z, --zeros=filename       Read zeroed node IDs/outputs from the given file",
-  "  -e, --edges=filename       Process only edges from the given DAT/DAB, should \n                               be used with Beta values",
+  "  -e, --edges=filename       Process only edges from the given DAT/DAB",
     0
 };
 
             goto failure;
         
           break;
-        case 'e':	/* Process only edges from the given DAT/DAB, should be used with Beta values.  */
+        case 'e':	/* Process only edges from the given DAT/DAB.  */
         
         
           if (update_arg( (void *)&(args_info->edges_arg), 

tools/MIed/cmdline.h

   char * zeros_arg;	/**< @brief Read zeroed node IDs/outputs from the given file.  */
   char * zeros_orig;	/**< @brief Read zeroed node IDs/outputs from the given file original value given at command line.  */
   const char *zeros_help; /**< @brief Read zeroed node IDs/outputs from the given file help description.  */
-  char * edges_arg;	/**< @brief Process only edges from the given DAT/DAB, should be used with Beta values.  */
-  char * edges_orig;	/**< @brief Process only edges from the given DAT/DAB, should be used with Beta values original value given at command line.  */
-  const char *edges_help; /**< @brief Process only edges from the given DAT/DAB, should be used with Beta values help description.  */
+  char * edges_arg;	/**< @brief Process only edges from the given DAT/DAB.  */
+  char * edges_orig;	/**< @brief Process only edges from the given DAT/DAB original value given at command line.  */
+  const char *edges_help; /**< @brief Process only edges from the given DAT/DAB help description.  */
   
   unsigned int help_given ;	/**< @brief Whether help was given.  */
   unsigned int version_given ;	/**< @brief Whether version was given.  */

tools/SVMperfing/SVMperfing.cpp

 		
 	}
 	ifsm.close();
+	return true;
+}
+
+// Reads a gene-to-holdout-fold assignment file into mapGene2Fold.
+//
+// Each data line is expected to hold exactly two tokens (as split by
+// CMeta::Tokenize): a gene name followed by an integer fold index. Lines
+// starting with '#' are treated as comments, blank lines are ignored, and
+// lines with any other token count are reported on stderr and skipped.
+//
+// ifsm          Input stream, already opened by the caller; the caller
+//               remains responsible for closing it.
+// mapGene2Fold  Receives one entry per valid line; a gene listed more than
+//               once keeps its last fold.
+//
+// Returns false if the stream is unusable on entry (e.g. the file could not
+// be opened), true otherwise.
+bool ReadGenesHoldoutFoldFile(ifstream& ifsm, map<string, size_t>& mapGene2Fold) {
+	string strLine;
+	vector<string> vecstrTokens;
+
+	// A stream that failed to open has failbit set but not eofbit, so an
+	// eof()-driven loop would never terminate; report the failure so the
+	// caller can print its "Could not open" message instead.
+	if (ifsm.fail())
+		return false;
+
+	// Driving the loop off getline's result stops on EOF and on any read
+	// error, and std::string imposes no fixed limit on line length.
+	while (getline(ifsm, strLine)) {
+		// Handle comments before counting tokens so that a comment with
+		// any number of words is skipped rather than flagged as illegal.
+		if (!strLine.empty() && strLine[0] == '#') {
+			cerr << "skipping " << strLine << endl;
+			continue;
+		}
+
+		vecstrTokens.clear();
+		CMeta::Tokenize(strLine.c_str(), vecstrTokens);
+		if (vecstrTokens.empty())
+			continue;
+		if (vecstrTokens.size() != 2) {
+			cerr << "Illegal line (" << vecstrTokens.size() << "): "
+					<< strLine << endl;
+			continue;
+		}
+
+		// NOTE(review): atoi yields 0 for non-numeric text and a negative
+		// value wraps when stored as size_t - confirm inputs are clean.
+		mapGene2Fold[ vecstrTokens[0] ] = atoi( vecstrTokens[1].c_str() );
+	}
+	return true;
 }
 
 // Read in the 
 	vector<size_t> mapTgene2fold;
 	vector<int> tgeneCount;
 	
+	map<string, size_t> mapGene2Fold;
+
 	DIR* dp;
 	struct dirent* ep;	
 	CGenome Genome;
 	CGenome GenomeThree;
         CGenes Allgenes(GenomeThree);
 	
+	CGenome GenomeFour;
+        CGenes labelgenes(GenomeFour);
+	
 	if (cmdline_parser(iArgs, aszArgs, &sArgs)) {
 		cmdline_parser_print_help();
 		return 1;
 	  }	  
 	}
 	
+	// read in the gene list
+	if( sArgs.genes_arg ) {
+	  ifsm.open( sArgs.genes_arg );
+	  if( !labelgenes.Open( ifsm ) ) {
+	    cerr << "Could not open: " << sArgs.genes_arg << endl;
+	    return 1; }
+	  ifsm.close( ); }
+	
 	// read target gene list
 	if(sArgs.tgene_given ) {
 	  ifstream ifsm;
 	  }
 	  ifsm.close();
 	}
-	
+
 	// read context gene list
 	if(sArgs.context_given ) {
 	  ifstream ifsm;
 	  }
 	  ifsm.close();
 	}
-		
+
 	// read all gene list
 	// IF given this flag predict for all gene pairs
 	if(sArgs.allgenes_given ) {
 	  }
 	  ifsm.close();
 	}
+
+	// read the gene holdout fold
+	if( sArgs.GenesHoldoutFold_given ){
+	  ifstream ifsm;
+	  ifsm.open( sArgs.GenesHoldoutFold_arg );
+	  
+	  if (!ReadGenesHoldoutFoldFile(ifsm, mapGene2Fold) ) {
+	    cerr << "Could not open: " << sArgs.GenesHoldoutFold_arg << endl;
+	    return 1;
+	  }
+	  
+	  ifsm.close();	  	  
+	}
 	
 	///######################
 	// Chris added
 	    cerr << "Could not open input labels Dat" << endl;
 	    return 1;
 	  }
-	  
+
+	  if( labelgenes.GetGenes( ) )
+	    Labels.FilterGenes( labelgenes, CDat::EFilterInclude );
+	  	  
 	  // random sample labels
 	  if( sArgs.subsample_given ){
 	    cerr << "Sub-sample labels to rate:" << sArgs.subsample_arg << endl;
 	    
 	    // keep track of positive gene counts
 	    tgeneCount.resize(Labels.GetGenes());
-	    	    
+	    
 	    // if given a target gene file
 	    // Only keep edges that have only one gene in this target gene list
 	    if( sArgs.onetgene_flag ){
 		mapCgene[i] = true;
 	    }
 	  }
-	  
+
 	  // Set target prior
 	  if(sArgs.prior_given){
 	    numpos = 0;
 		    ++numneg;
 		  }
 		}
-	    
+
 	    if( ((float)numpos / (numpos + numneg)) < sArgs.prior_arg){
 	      
 	      cerr << "Convert prior from orig: " << ((float)numpos / (numpos + numneg)) << " to target: " << sArgs.prior_arg << endl;
 	  }
 	  
 	  // Exclude labels without context genes
-	  if(sArgs.context_given )
-	    Labels.FilterGenes( Context, CDat::EFilterInclude );
-	  
+	  if(sArgs.context_given && !sArgs.allContextPred_flag){
+	    if( sArgs.touchContext_flag ){
+	      Labels.FilterGenes( sArgs.context_arg, CDat::EFilterEdge );
+	    }else{
+	      Labels.FilterGenes( Context, CDat::EFilterInclude );
+	    }
+	  }
 	  
 	  // If not given a SVM model/models we are in learning mode, thus construct each SVMLabel object for label
 	  if( !sArgs.model_given && !sArgs.modelPrefix_given ){
 		  mapTgene2fold[i] = -1; 
 		  continue;
 		}
-		//cerr << "what's up?" << endl;
-		mapTgene2fold[i] = rand() % sArgs.cross_validation_arg;
+		
+		if( sArgs.GenesHoldoutFold_given ){
+		  // Does not check if this gene has been assigned a random fold
+		  mapTgene2fold[i] = mapGene2Fold[ Labels.GetGene(i) ];
+		}else{
+		  mapTgene2fold[i] = rand() % sArgs.cross_validation_arg;
+		}
+		
 	      }
 	      
 	      // cross-fold by target gene
 	      for (i = 0; i < sArgs.cross_validation_arg; i++) {
 		cerr << "cross validation holds setup:" << i << endl;
 		
-		// keep track of positive gene counts
-		if(sArgs.balance_flag){
-		  cerr << "Set up balance: " << i << endl;
-		  for(j = 0; j < Labels.GetGenes(); j++)
-		    tgeneCount[j] = 0;
-		  
-		  for(j = 0; j < vecLabels.size(); j++)
-		    if(vecLabels[j]->Target > 0){
-		      ++(tgeneCount[vecLabels[j]->iidx]);
-		      ++(tgeneCount[vecLabels[j]->jidx]);
-		    }
-		  
-		  if(sArgs.bfactor_given)
-		    for(j = 0; j < vecLabels.size(); j++)
-		      if(tgeneCount[vecLabels[j]->jidx] < 500)
-			tgeneCount[vecLabels[j]->jidx] = sArgs.bfactor_arg*tgeneCount[vecLabels[j]->jidx];
-		}
-		
 		for (j = 0; j < vecLabels.size(); j++) {
 		  //if( j % 1000 == 0)
 		  //cerr << "cross validation push labels:" << j << endl;
 		  
-		  // assume only one gene is a target gene in a edge
-		  if(mapTgene[vecLabels[j]->iidx]){
-		    if(vecLabels[j]->Target < 0){
-		      --(tgeneCount[vecLabels[j]->iidx]);
-		    }
+		  if(mapTgene[vecLabels[j]->iidx] || mapTgene[vecLabels[j]->jidx]){
 		    
-		    if(mapTgene2fold[vecLabels[j]->iidx] == i)			    
+		    if(mapTgene2fold[vecLabels[j]->iidx] == i || mapTgene2fold[vecLabels[j]->jidx] == i){
+		      
+		      // only add if both genes are in context
+		      if( sArgs.context_given  && 
+			  !sArgs.allContextPred_flag  &&
+			  !sArgs.touchContext_flag &&
+			  ( !mapCgene[vecLabels[j]->iidx] || !mapCgene[vecLabels[j]->jidx])){
+
+			if( !( sArgs.onlyPos_flag && vecLabels[j]->Target < 0 ) )
+			  continue;
+		      }
+		      
+		      if( sArgs.context_given  && 
+			  !sArgs.allContextPred_flag  &&
+			  sArgs.touchContext_flag &&
+			  ( !mapCgene[vecLabels[j]->iidx] && !mapCgene[vecLabels[j]->jidx])){
+			
+			if( !( sArgs.onlyPos_flag && vecLabels[j]->Target < 0 ) )
+			  continue;			
+		      }
+
 		      pTestVector[i].push_back(vecLabels[j]);
-		    else{
-		      //cerr << tgeneCount[vecLabels[j]->iidx] << endl;
-		      
-		      if( sArgs.balance_flag && vecLabels[j]->Target < 0 && tgeneCount[vecLabels[j]->iidx] < 0){
-			continue;
+		    }else{		      		      
+
+		      // only add if both genes are in context
+		      if( sArgs.context_given  && 
+			  !sArgs.touchContext_flag &&
+			  ( !mapCgene[vecLabels[j]->iidx] || !mapCgene[vecLabels[j]->jidx])){
+
+			if( !( sArgs.onlyPos_flag && vecLabels[j]->Target < 0 ) )
+			  continue;
+
 		      }
 		      
 		      // touchContext mode: only add if at least one gene is in context
-		      if( sArgs.context_given  && ( !mapCgene[vecLabels[j]->iidx] || !mapCgene[vecLabels[j]->jidx]))
-			continue;
+		      if( sArgs.context_given  && 
+			  sArgs.touchContext_flag &&
+			  ( !mapCgene[vecLabels[j]->iidx] && !mapCgene[vecLabels[j]->jidx])){
+			
+			if( !( sArgs.onlyPos_flag && vecLabels[j]->Target < 0 ) )
+			  continue;
+		      }
 		      
 		      pTrainVector[i].push_back(vecLabels[j]); 
-		    }
+		    }	
+		    /*else if(mapTgene2fold[vecLabels[j]->iidx] != i && mapTgene2fold[vecLabels[j]->jidx] != i){		      		      
+		      pTrainVector[i].push_back(vecLabels[j]); 
+		      }*/    
+		  }else{
+		    cerr << "Error: edge exist without a target gene" << endl; 
+		    return 1;
 		  }
-		  else if(mapTgene[vecLabels[j]->jidx]){
-		    if(vecLabels[j]->Target < 0)
-		      --(tgeneCount[vecLabels[j]->jidx]);
-		    
-		    if(mapTgene2fold[vecLabels[j]->jidx] == i)
-		      pTestVector[i].push_back(vecLabels[j]);
-		    else{
-		      //cerr << tgeneCount[vecLabels[j]->jidx] << endl;
-		      
-		      if( sArgs.balance_flag && vecLabels[j]->Target < 0 && tgeneCount[vecLabels[j]->jidx] < 0){
-			continue;
-		      }
-		      
-		      // only add if both genes are in context
-		      if( sArgs.context_given && ( !mapCgene[vecLabels[j]->iidx] || !mapCgene[vecLabels[j]->jidx]))
-			continue;
-		      
-		      pTrainVector[i].push_back(vecLabels[j]); 
-		    }
-		  }
-		  else{
-		    cerr << "Error: edge exist without a target gene" << endl; return 1;
-		  }
-		}
+		}		
 		
 		cerr << "test,"<< i <<": " << pTestVector[i].size() << endl;
 		int numpos = 0;
+		int numpos_test = 0;
 		for(j=0; j < pTrainVector[i].size(); j++)
 		  if(pTrainVector[i][j]->Target > 0)
 		    ++numpos;
+		for(j=0; j < pTestVector[i].size(); j++)
+		  if(pTestVector[i][j]->Target > 0)
+		    ++numpos_test;
 		
 		if( numpos < 1 || (sArgs.mintrain_given && sArgs.mintrain_arg > numpos) ){						
 		  cerr << "Not enough positive examples from fold: " << i  << " file: " << sArgs.labels_arg << " numpos: " << numpos << endl;
 		}
 		
 		cerr << "train,"<< i <<","<<numpos<<": " << pTrainVector[i].size() << endl;
-		
+		cerr << "test,"<< i <<","<<numpos_test<<": " <<  pTestVector[i].size() << endl;
 	      }
 	    }
 	    else{ //randomly set edges into cross-fold
-	      if( sArgs.context_given ){
+	      cerr << "Edge holdout" << endl;
+	      /*
+		if( sArgs.context_given ){
 		cerr << "context not implemented yet for random edge holdout" << endl;
 		return 1;
-	      }
+		}
+	      */
 	      
 	      for (i = 0; i < sArgs.cross_validation_arg; i++) {
 		pTestVector[i].reserve((size_t) vecLabels.size()
 		for (j = 0; j < vecLabels.size(); j++) {
 		  if (j % sArgs.cross_validation_arg == i) {
 		    pTestVector[i].push_back(vecLabels[j]);
-		  } else {
-		    pTrainVector[i].push_back((vecLabels[j]));
+		  }else{
+		    if( sArgs.context_given ){
+		      if( mapCgene[vecLabels[j]->iidx] && mapCgene[vecLabels[j]->jidx] )
+			pTrainVector[i].push_back((vecLabels[j]));
+		    }else{
+		      pTrainVector[i].push_back((vecLabels[j]));
+		    }
 		  }
 		}
-	      }
+	      	
+		// print out number of examples
+		int numpos = 0;
+		int numpos_test = 0;
+		for(j=0; j < pTrainVector[i].size(); j++)
+		  if(pTrainVector[i][j]->Target > 0)
+		    ++numpos;
+		for(j=0; j < pTestVector[i].size(); j++)
+		  if(pTestVector[i][j]->Target > 0)
+		    ++numpos_test;		
+		cerr << "train: "<< i <<" , "<<numpos<<": " << pTrainVector[i].size() << endl;
+		cerr << "test: "<< i <<" , "<<numpos_test<<": " <<  pTestVector[i].size() << endl;
+		
+	      }      	      
 	    }
 	  }
 	  else{ // if you have less than 2 fold cross, no cross validation is done, all train genes are used and predicted
 					  vecLabels,
 					  Labels.GetGeneNames(),
 					  Sleipnir::CDat::ENormalizeMinMaxNPone);
+	  }else if(sArgs.zscore_flag){
+	    SVMLight::CSVMPERF::CreateDoc(vecstrDatasets,
+					  vecLabels,
+					  Labels.GetGeneNames(),
+					  Sleipnir::CDat::ENormalizeZScore);
 	  }else{
 	    SVMLight::CSVMPERF::CreateDoc(vecstrDatasets,
 					  vecLabels,
 	    }else if(sArgs.normalizeNPone_flag){
 	      cerr << "Normalize input [-1,1] data" << endl;
 	      wDat.Normalize( Sleipnir::CDat::ENormalizeMinMaxNPone );
+	    }else if(sArgs.zscore_flag){
+	      cerr << "Normalize input data to zscore" << endl;
+	      wDat.Normalize( Sleipnir::CDat::ENormalizeZScore );
 	    }
 	    
 	    // map result gene list to dataset gene list
 	  }
 	  
 	  // Exclude pairs without context genes
-	  if(sArgs.context_given ){
+	  if(sArgs.context_given && !sArgs.allContextPred_flag){
 	    for(iSVM = 0; iSVM < sArgs.cross_validation_arg; iSVM++){
-	      vecResults[ iSVM ]->FilterGenes( Context, CDat::EFilterInclude );
+	      if( sArgs.touchContext_flag ){
+		vecResults[ iSVM ]->FilterGenes( sArgs.context_arg, CDat::EFilterEdge );
+	      }else{
+		vecResults[ iSVM ]->FilterGenes( Context, CDat::EFilterInclude );
+	      }
 	    }
 	  }
 	  

tools/SVMperfing/SVMperfing.ggo

 							int	default="-1" no
 option	"tgene"				T	"Target gene list, use this gene list as gene holdout cross-validation and also filter labels that only have one gene in given target gene list"
 							string	typestr="filename"
-option	"balance"			b	"DEBUG: check before usage, Balance the training gene ratios"
-										flag	off
-option  "bfactor"    			F   "DEBUG: only for < 500, When balancing neg and pos counts exmaples for training what factor to increase. default is 1."
-										float  no
 option	"prob"				B	"Output prediction values as estimated probablity (Platt method)"
 										flag	off
 option	"probCross"			D	"Cross-validation setting for output prediction values as estimated probablity (Platt method)"
                                                         flag    off
 option  "normalizeNPone"             	N       "Normalize input data to the range [-1, 1]"
                                                         flag    off
+option	"zscore"			Z	"Normalize input data to convert values to z-scores"
+							flag	off
 option  "edgeholdout"             	X       "For cross-validation perform edge holdout (Default is gene holdout)"
                                                         flag    off
 option  "skipSVM"	             	Q       "If given this flag, skip training SVM models when file already exist. Often used when cluster runs timeout/error and need to re-run jobs."
 										float  no
 option	"OutLabels"			U	"Save the sampled labels to the file and exit"
 							string	typestr="filename"
+option	"GenesHoldoutFold"		H	"Input the gene holdout fold"
+							string	typestr="filename"
+option	"touchContext"			f	"If given context gene list, context is defined by all edges touch the context. (default is both genes in edge need to be in context)"
+							flag	off
+option	"onlyPos"			w	"When given the context file, only filter for positive examples and leave negative examples as originally given."
+							flag	off
 
 section "Filtering"
+option	"genes"				g	"Process only genes from the given set from labels"
+							string	typestr="filename"
 option	"onetgene"			q	"Only keep edges from lables that have one gene in the target gene list"
 							flag off
-option "prior"    			P   "Randomly sub-sample the negative labels to reach target prior. If cannot reach target prior, set to closest prior."
+option "prior"    			P   	"Randomly sub-sample the negative labels to reach target prior. If cannot reach target prior, set to closest prior."
 										float  no
 option	"savemodel"			s	"Save model to file"
 										flag	off
-option "mintrain"	    		E   "Minimum number of total positive examples to allow training, if not met exit"
+option "mintrain"	    		E   	"Minimum number of total positive examples to allow training, if not met exit"
 										float  no
 option	"context"			C	"Context gene list"
 							string	typestr="filename"
+option	"allContextPred"		V	"When given context genes list, allow prediction too all genes"
+										flag	off

tools/SVMperfing/cmdline.c

 
 const char *gengetopt_args_info_help[] = {
   "  -h, --help                    Print help and exit",
-  "  -V, --version                 Print version and exit",
+  "      --version                 Print version and exit",
   "\nMain:",
   "  -l, --labels=filename         Labels file",
   "  -o, --output=filename         Output file ",
   "  -M, --mmap                    Memory map binary input  (default=off)",
   "  -R, --random=INT              Seed random generator (default -1 uses current \n                                  time)  (default=`-1')",
   "  -T, --tgene=filename          Target gene list, use this gene list as gene \n                                  holdout cross-validation and also filter \n                                  labels that only have one gene in given \n                                  target gene list",
-  "  -b, --balance                 DEBUG: check before usage, Balance the training \n                                  gene ratios  (default=off)",
-  "  -F, --bfactor=FLOAT           DEBUG: only for < 500, When balancing neg and \n                                  pos counts exmaples for training what factor \n                                  to increase. default is 1.",
   "  -B, --prob                    Output prediction values as estimated \n                                  probablity (Platt method)  (default=off)",
   "  -D, --probCross               Cross-validation setting for output prediction \n                                  values as estimated probablity (Platt method) \n                                   (default=off)",
   "  -z, --normalizeZero           Normalize input data to the range [0, 1]  \n                                  (default=off)",
   "  -N, --normalizeNPone          Normalize input data to the range [-1, 1]  \n                                  (default=off)",
+  "  -Z, --zscore                  Normalize input data to convert values to \n                                  z-scores  (default=off)",
   "  -X, --edgeholdout             For cross-validation perform edge holdout \n                                  (Default is gene holdout)  (default=off)",
   "  -Q, --skipSVM                 If given this flag, skip training SVM models \n                                  when file already exist. Often used when \n                                  cluster runs timeout/error and need to re-run \n                                  jobs.  (default=off)",
   "  -x, --aggregateMax            If given this flag, when predicting for all \n                                  gene pairs with multiple SVM models(bagging) \n                                  aggregate using the maximum prediction value \n                                  (Default: average)  (default=off)",
   "  -y, --SampledLabels=filename  Save the sampled final training labels to this \n                                  file",
   "  -A, --subsample=FLOAT         Sample the labels to the following rate",
   "  -U, --OutLabels=filename      Save the sampled labels to the file and exit",
+  "  -H, --GenesHoldoutFold=filename\n                                Input the gene holdout fold",
+  "  -f, --touchContext            If given context gene list, context is defined \n                                  by all edges touch the context. (default is \n                                  both genes in edge need to be in context)  \n                                  (default=off)",
+  "  -w, --onlyPos                 When given the context file, only filter for \n                                  positive examples and leave negative examples \n                                  as originally given.  (default=off)",
   "\nFiltering:",
+  "  -g, --genes=filename          Process only genes from the given set from \n                                  labels",
   "  -q, --onetgene                Only keep edges from lables that have one gene \n                                  in the target gene list  (default=off)",
   "  -P, --prior=FLOAT             Randomly sub-sample the negative labels to \n                                  reach target prior. If cannot reach target \n                                  prior, set to closest prior.",
   "  -s, --savemodel               Save model to file  (default=off)",
   "  -E, --mintrain=FLOAT          Minimum number of total positive examples to \n                                  allow training, if not met exit",
   "  -C, --context=filename        Context gene list",
+  "  -V, --allContextPred          When given context genes list, allow prediction \n                                  too all genes  (default=off)",
     0
 };
 
   args_info->mmap_given = 0 ;
   args_info->random_given = 0 ;
   args_info->tgene_given = 0 ;
-  args_info->balance_given = 0 ;
-  args_info->bfactor_given = 0 ;
   args_info->prob_given = 0 ;
   args_info->probCross_given = 0 ;
   args_info->normalizeZero_given = 0 ;
   args_info->normalizeNPone_given = 0 ;
+  args_info->zscore_given = 0 ;
   args_info->edgeholdout_given = 0 ;
   args_info->skipSVM_given = 0 ;
   args_info->aggregateMax_given = 0 ;
   args_info->SampledLabels_given = 0 ;
   args_info->subsample_given = 0 ;
   args_info->OutLabels_given = 0 ;
+  args_info->GenesHoldoutFold_given = 0 ;
+  args_info->touchContext_given = 0 ;
+  args_info->onlyPos_given = 0 ;
+  args_info->genes_given = 0 ;
   args_info->onetgene_given = 0 ;
   args_info->prior_given = 0 ;
   args_info->savemodel_given = 0 ;
   args_info->mintrain_given = 0 ;
   args_info->context_given = 0 ;
+  args_info->allContextPred_given = 0 ;
 }
 
 static
   args_info->random_orig = NULL;
   args_info->tgene_arg = NULL;
   args_info->tgene_orig = NULL;
-  args_info->balance_flag = 0;
-  args_info->bfactor_orig = NULL;
   args_info->prob_flag = 0;
   args_info->probCross_flag = 0;
   args_info->normalizeZero_flag = 0;
   args_info->normalizeNPone_flag = 0;
+  args_info->zscore_flag = 0;
   args_info->edgeholdout_flag = 0;
   args_info->skipSVM_flag = 0;
   args_info->aggregateMax_flag = 0;
   args_info->subsample_orig = NULL;
   args_info->OutLabels_arg = NULL;
   args_info->OutLabels_orig = NULL;
+  args_info->GenesHoldoutFold_arg = NULL;
+  args_info->GenesHoldoutFold_orig = NULL;
+  args_info->touchContext_flag = 0;
+  args_info->onlyPos_flag = 0;
+  args_info->genes_arg = NULL;
+  args_info->genes_orig = NULL;
   args_info->onetgene_flag = 0;
   args_info->prior_orig = NULL;
   args_info->savemodel_flag = 0;
   args_info->mintrain_orig = NULL;
   args_info->context_arg = NULL;
   args_info->context_orig = NULL;
+  args_info->allContextPred_flag = 0;
   
 }
 
   args_info->mmap_help = gengetopt_args_info_help[18] ;
   args_info->random_help = gengetopt_args_info_help[19] ;
   args_info->tgene_help = gengetopt_args_info_help[20] ;
-  args_info->balance_help = gengetopt_args_info_help[21] ;
-  args_info->bfactor_help = gengetopt_args_info_help[22] ;
-  args_info->prob_help = gengetopt_args_info_help[23] ;
-  args_info->probCross_help = gengetopt_args_info_help[24] ;
-  args_info->normalizeZero_help = gengetopt_args_info_help[25] ;
-  args_info->normalizeNPone_help = gengetopt_args_info_help[26] ;
-  args_info->edgeholdout_help = gengetopt_args_info_help[27] ;
-  args_info->skipSVM_help = gengetopt_args_info_help[28] ;
-  args_info->aggregateMax_help = gengetopt_args_info_help[29] ;
-  args_info->NoCrossPredict_help = gengetopt_args_info_help[30] ;
-  args_info->CrossResult_help = gengetopt_args_info_help[31] ;
-  args_info->SampledLabels_help = gengetopt_args_info_help[32] ;
-  args_info->subsample_help = gengetopt_args_info_help[33] ;
-  args_info->OutLabels_help = gengetopt_args_info_help[34] ;
-  args_info->onetgene_help = gengetopt_args_info_help[36] ;
-  args_info->prior_help = gengetopt_args_info_help[37] ;
-  args_info->savemodel_help = gengetopt_args_info_help[38] ;
-  args_info->mintrain_help = gengetopt_args_info_help[39] ;
-  args_info->context_help = gengetopt_args_info_help[40] ;
+  args_info->prob_help = gengetopt_args_info_help[21] ;
+  args_info->probCross_help = gengetopt_args_info_help[22] ;
+  args_info->normalizeZero_help = gengetopt_args_info_help[23] ;
+  args_info->normalizeNPone_help = gengetopt_args_info_help[24] ;
+  args_info->zscore_help = gengetopt_args_info_help[25] ;
+  args_info->edgeholdout_help = gengetopt_args_info_help[26] ;
+  args_info->skipSVM_help = gengetopt_args_info_help[27] ;
+  args_info->aggregateMax_help = gengetopt_args_info_help[28] ;
+  args_info->NoCrossPredict_help = gengetopt_args_info_help[29] ;
+  args_info->CrossResult_help = gengetopt_args_info_help[30] ;
+  args_info->SampledLabels_help = gengetopt_args_info_help[31] ;
+  args_info->subsample_help = gengetopt_args_info_help[32] ;
+  args_info->OutLabels_help = gengetopt_args_info_help[33] ;
+  args_info->GenesHoldoutFold_help = gengetopt_args_info_help[34] ;
+  args_info->touchContext_help = gengetopt_args_info_help[35] ;
+  args_info->onlyPos_help = gengetopt_args_info_help[36] ;
+  args_info->genes_help = gengetopt_args_info_help[38] ;
+  args_info->onetgene_help = gengetopt_args_info_help[39] ;
+  args_info->prior_help = gengetopt_args_info_help[40] ;
+  args_info->savemodel_help = gengetopt_args_info_help[41] ;
+  args_info->mintrain_help = gengetopt_args_info_help[42] ;
+  args_info->context_help = gengetopt_args_info_help[43] ;
+  args_info->allContextPred_help = gengetopt_args_info_help[44] ;
   
 }
 
   free_string_field (&(args_info->random_orig));
   free_string_field (&(args_info->tgene_arg));
   free_string_field (&(args_info->tgene_orig));
-  free_string_field (&(args_info->bfactor_orig));
   free_string_field (&(args_info->CrossResult_arg));
   free_string_field (&(args_info->CrossResult_orig));
   free_string_field (&(args_info->SampledLabels_arg));
   free_string_field (&(args_info->subsample_orig));
   free_string_field (&(args_info->OutLabels_arg));
   free_string_field (&(args_info->OutLabels_orig));
+  free_string_field (&(args_info->GenesHoldoutFold_arg));
+  free_string_field (&(args_info->GenesHoldoutFold_orig));
+  free_string_field (&(args_info->genes_arg));
+  free_string_field (&(args_info->genes_orig));
   free_string_field (&(args_info->prior_orig));
   free_string_field (&(args_info->mintrain_orig));
   free_string_field (&(args_info->context_arg));
     write_into_file(outfile, "random", args_info->random_orig, 0);
   if (args_info->tgene_given)
     write_into_file(outfile, "tgene", args_info->tgene_orig, 0);
-  if (args_info->balance_given)
-    write_into_file(outfile, "balance", 0, 0 );
-  if (args_info->bfactor_given)
-    write_into_file(outfile, "bfactor", args_info->bfactor_orig, 0);
   if (args_info->prob_given)
     write_into_file(outfile, "prob", 0, 0 );
   if (args_info->probCross_given)
     write_into_file(outfile, "normalizeZero", 0, 0 );
   if (args_info->normalizeNPone_given)
     write_into_file(outfile, "normalizeNPone", 0, 0 );
+  if (args_info->zscore_given)
+    write_into_file(outfile, "zscore", 0, 0 );
   if (args_info->edgeholdout_given)
     write_into_file(outfile, "edgeholdout", 0, 0 );
   if (args_info->skipSVM_given)
     write_into_file(outfile, "subsample", args_info->subsample_orig, 0);
   if (args_info->OutLabels_given)
     write_into_file(outfile, "OutLabels", args_info->OutLabels_orig, 0);
+  if (args_info->GenesHoldoutFold_given)
+    write_into_file(outfile, "GenesHoldoutFold", args_info->GenesHoldoutFold_orig, 0);
+  if (args_info->touchContext_given)
+    write_into_file(outfile, "touchContext", 0, 0 );
+  if (args_info->onlyPos_given)
+    write_into_file(outfile, "onlyPos", 0, 0 );
+  if (args_info->genes_given)
+    write_into_file(outfile, "genes", args_info->genes_orig, 0);
   if (args_info->onetgene_given)
     write_into_file(outfile, "onetgene", 0, 0 );
   if (args_info->prior_given)
     write_into_file(outfile, "mintrain", args_info->mintrain_orig, 0);
   if (args_info->context_given)
     write_into_file(outfile, "context", args_info->context_orig, 0);
+  if (args_info->allContextPred_given)
+    write_into_file(outfile, "allContextPred", 0, 0 );
   
 
   i = EXIT_SUCCESS;
 
       static struct option long_options[] = {
         { "help",	0, NULL, 'h' },
-        { "version",	0, NULL, 'V' },
+        { "version",	0, NULL, 0 },
         { "labels",	1, NULL, 'l' },
         { "output",	1, NULL, 'o' },
         { "directory",	1, NULL, 'd' },
         { "mmap",	0, NULL, 'M' },
         { "random",	1, NULL, 'R' },
         { "tgene",	1, NULL, 'T' },
-        { "balance",	0, NULL, 'b' },
-        { "bfactor",	1, NULL, 'F' },
         { "prob",	0, NULL, 'B' },
         { "probCross",	0, NULL, 'D' },
         { "normalizeZero",	0, NULL, 'z' },
         { "normalizeNPone",	0, NULL, 'N' },
+        { "zscore",	0, NULL, 'Z' },
         { "edgeholdout",	0, NULL, 'X' },
         { "skipSVM",	0, NULL, 'Q' },
         { "aggregateMax",	0, NULL, 'x' },
         { "SampledLabels",	1, NULL, 'y' },
         { "subsample",	1, NULL, 'A' },
         { "OutLabels",	1, NULL, 'U' },
+        { "GenesHoldoutFold",	1, NULL, 'H' },
+        { "touchContext",	0, NULL, 'f' },
+        { "onlyPos",	0, NULL, 'w' },
+        { "genes",	1, NULL, 'g' },
         { "onetgene",	0, NULL, 'q' },
         { "prior",	1, NULL, 'P' },
         { "savemodel",	0, NULL, 's' },
         { "mintrain",	1, NULL, 'E' },
         { "context",	1, NULL, 'C' },
+        { "allContextPred",	0, NULL, 'V' },
         { NULL,	0, NULL, 0 }
       };
 
-      c = getopt_long (argc, argv, "hVl:o:d:m:L:Sv:c:e:k:t:a:p:nMR:T:bF:BDzNXQxru:y:A:U:qP:sE:C:", long_options, &option_index);
+      c = getopt_long (argc, argv, "hl:o:d:m:L:Sv:c:e:k:t:a:p:nMR:T:BDzNZXQxru:y:A:U:H:fwg:qP:sE:C:V", long_options, &option_index);
 
       if (c == -1) break;	/* Exit from `while (1)' loop.  */
 
           cmdline_parser_free (&local_args_info);
           exit (EXIT_SUCCESS);
 
-        case 'V':	/* Print version and exit.  */
-        
-        
-          if (update_arg( 0 , 
-               0 , &(args_info->version_given),
-              &(local_args_info.version_given), optarg, 0, 0, ARG_NO,
-              check_ambiguity, override, 0, 0,
-              "version", 'V',
-              additional_error))
-            goto failure;
-          cmdline_parser_free (&local_args_info);
-          return 0;
-        
-          break;
         case 'l':	/* Labels file.  */
         
         
             goto failure;
         
           break;
-        case 'b':	/* DEBUG: check before usage, Balance the training gene ratios.  */
-        
-        
-          if (update_arg((void *)&(args_info->balance_flag), 0, &(args_info->balance_given),
-              &(local_args_info.balance_given), optarg, 0, 0, ARG_FLAG,
-              check_ambiguity, override, 1, 0, "balance", 'b',
-              additional_error))
-            goto failure;
-        
-          break;
-        case 'F':	/* DEBUG: only for < 500, When balancing neg and pos counts exmaples for training what factor to increase. default is 1..  */
-        
-        
-          if (update_arg( (void *)&(args_info->bfactor_arg), 
-               &(args_info->bfactor_orig), &(args_info->bfactor_given),
-              &(local_args_info.bfactor_given), optarg, 0, 0, ARG_FLOAT,
-              check_ambiguity, override, 0, 0,
-              "bfactor", 'F',
-              additional_error))
-            goto failure;
-        
-          break;
         case 'B':	/* Output prediction values as estimated probablity (Platt method).  */
         
         
             goto failure;
         
           break;
+        case 'Z':	/* Normalize input data to convert values to z-scores.  */
+        
+        
+          if (update_arg((void *)&(args_info->zscore_flag), 0, &(args_info->zscore_given),
+              &(local_args_info.zscore_given), optarg, 0, 0, ARG_FLAG,
+              check_ambiguity, override, 1, 0, "zscore", 'Z',
+              additional_error))
+            goto failure;
+        
+          break;
         case 'X':	/* For cross-validation perform edge holdout (Default is gene holdout).  */
         
         
             goto failure;
         
           break;
+        case 'H':	/* Input the gene holdout fold.  */
+        
+        
+          if (update_arg( (void *)&(args_info->GenesHoldoutFold_arg), 
+               &(args_info->GenesHoldoutFold_orig), &(args_info->GenesHoldoutFold_given),
+              &(local_args_info.GenesHoldoutFold_given), optarg, 0, 0, ARG_STRING,
+              check_ambiguity, override, 0, 0,
+              "GenesHoldoutFold", 'H',
+              additional_error))
+            goto failure;
+        
+          break;
+        case 'f':	/* If given context gene list, context is defined by all edges touch the context. (default is both genes in edge need to be in context).  */
+        
+        
+          if (update_arg((void *)&(args_info->touchContext_flag), 0, &(args_info->touchContext_given),
+              &(local_args_info.touchContext_given), optarg, 0, 0, ARG_FLAG,
+              check_ambiguity, override, 1, 0, "touchContext", 'f',
+              additional_error))
+            goto failure;
+        
+          break;
+        case 'w':	/* When given the context file, only filter for positive examples and leave negative examples as originally given..  */
+        
+        
+          if (update_arg((void *)&(args_info->onlyPos_flag), 0, &(args_info->onlyPos_given),
+              &(local_args_info.onlyPos_given), optarg, 0, 0, ARG_FLAG,
+              check_ambiguity, override, 1, 0, "onlyPos", 'w',
+              additional_error))
+            goto failure;
+        
+          break;
+        case 'g':	/* Process only genes from the given set from labels.  */
+        
+        
+          if (update_arg( (void *)&(args_info->genes_arg), 
+               &(args_info->genes_orig), &(args_info->genes_given),
+              &(local_args_info.genes_given), optarg, 0, 0, ARG_STRING,
+              check_ambiguity, override, 0, 0,
+              "genes", 'g',
+              additional_error))
+            goto failure;
+        
+          break;
         case 'q':	/* Only keep edges from lables that have one gene in the target gene list.  */
         
         
             goto failure;
         
           break;
+        case 'V':	/* When given context genes list, allow prediction too all genes.  */
+        
+        
+          if (update_arg((void *)&(args_info->allContextPred_flag), 0, &(args_info->allContextPred_given),
+              &(local_args_info.allContextPred_given), optarg, 0, 0, ARG_FLAG,
+              check_ambiguity, override, 1, 0, "allContextPred", 'V',
+              additional_error))
+            goto failure;
+        
+          break;
 
         case 0:	/* Long option with no short option */
+          /* Print version and exit.  */
+          if (strcmp (long_options[option_index].name, "version") == 0)
+          {
+          
+          
+            if (update_arg( 0 , 
+                 0 , &(args_info->version_given),
+                &(local_args_info.version_given), optarg, 0, 0, ARG_NO,
+                check_ambiguity, override, 0, 0,
+                "version", 'V',
+                additional_error))
+              goto failure;
+            cmdline_parser_free (&local_args_info);
+            return 0;
+          
+          }
+          
+          break;
         case '?':	/* Invalid option.  */
           /* `getopt_long' already printed an error message.  */
           goto failure;

tools/SVMperfing/cmdline.h

   char * tgene_arg;	/**< @brief Target gene list, use this gene list as gene holdout cross-validation and also filter labels that only have one gene in given target gene list.  */
   char * tgene_orig;	/**< @brief Target gene list, use this gene list as gene holdout cross-validation and also filter labels that only have one gene in given target gene list original value given at command line.  */
   const char *tgene_help; /**< @brief Target gene list, use this gene list as gene holdout cross-validation and also filter labels that only have one gene in given target gene list help description.  */
-  int balance_flag;	/**< @brief DEBUG: check before usage, Balance the training gene ratios (default=off).  */
-  const char *balance_help; /**< @brief DEBUG: check before usage, Balance the training gene ratios help description.  */
-  float bfactor_arg;	/**< @brief DEBUG: only for < 500, When balancing neg and pos counts exmaples for training what factor to increase. default is 1..  */
-  char * bfactor_orig;	/**< @brief DEBUG: only for < 500, When balancing neg and pos counts exmaples for training what factor to increase. default is 1. original value given at command line.  */
-  const char *bfactor_help; /**< @brief DEBUG: only for < 500, When balancing neg and pos counts exmaples for training what factor to increase. default is 1. help description.  */
   int prob_flag;	/**< @brief Output prediction values as estimated probablity (Platt method) (default=off).  */
   const char *prob_help; /**< @brief Output prediction values as estimated probablity (Platt method) help description.  */
   int probCross_flag;	/**< @brief Cross-validation setting for output prediction values as estimated probablity (Platt method) (default=off).  */
   const char *normalizeZero_help; /**< @brief Normalize input data to the range [0, 1] help description.  */
   int normalizeNPone_flag;	/**< @brief Normalize input data to the range [-1, 1] (default=off).  */
   const char *normalizeNPone_help; /**< @brief Normalize input data to the range [-1, 1] help description.  */
+  int zscore_flag;	/**< @brief Normalize input data to convert values to z-scores (default=off).  */
+  const char *zscore_help; /**< @brief Normalize input data to convert values to z-scores help description.  */
   int edgeholdout_flag;	/**< @brief For cross-validation perform edge holdout (Default is gene holdout) (default=off).  */
   const char *edgeholdout_help; /**< @brief For cross-validation perform edge holdout (Default is gene holdout) help description.  */
   int skipSVM_flag;	/**< @brief If given this flag, skip training SVM models when file already exist. Often used when cluster runs timeout/error and need to re-run jobs. (default=off).  */
   char * OutLabels_arg;	/**< @brief Save the sampled labels to the file and exit.  */
   char * OutLabels_orig;	/**< @brief Save the sampled labels to the file and exit original value given at command line.  */
   const char *OutLabels_help; /**< @brief Save the sampled labels to the file and exit help description.  */
+  char * GenesHoldoutFold_arg;	/**< @brief Input the gene holdout fold.  */
+  char * GenesHoldoutFold_orig;	/**< @brief Input the gene holdout fold original value given at command line.  */
+  const char *GenesHoldoutFold_help; /**< @brief Input the gene holdout fold help description.  */
+  int touchContext_flag;	/**< @brief If given context gene list, context is defined by all edges touch the context. (default is both genes in edge need to be in context) (default=off).  */
+  const char *touchContext_help; /**< @brief If given context gene list, context is defined by all edges touch the context. (default is both genes in edge need to be in context) help description.  */
+  int onlyPos_flag;	/**< @brief When given the context file, only filter for positive examples and leave negative examples as originally given. (default=off).  */
+  const char *onlyPos_help; /**< @brief When given the context file, only filter for positive examples and leave negative examples as originally given. help description.  */
+  char * genes_arg;	/**< @brief Process only genes from the given set from labels.  */
+  char * genes_orig;	/**< @brief Process only genes from the given set from labels original value given at command line.  */
+  const char *genes_help; /**< @brief Process only genes from the given set from labels help description.  */
   int onetgene_flag;	/**< @brief Only keep edges from lables that have one gene in the target gene list (default=off).  */
   const char *onetgene_help; /**< @brief Only keep edges from lables that have one gene in the target gene list help description.  */
   float prior_arg;	/**< @brief Randomly sub-sample the negative labels to reach target prior. If cannot reach target prior, set to closest prior..  */
   char * context_arg;	/**< @brief Context gene list.  */
   char * context_orig;	/**< @brief Context gene list original value given at command line.  */
   const char *context_help; /**< @brief Context gene list help description.  */
+  int allContextPred_flag;	/**< @brief When given context genes list, allow prediction too all genes (default=off).  */
+  const char *allContextPred_help; /**< @brief When given context genes list, allow prediction too all genes help description.  */
   
   unsigned int help_given ;	/**< @brief Whether help was given.  */
   unsigned int version_given ;	/**< @brief Whether version was given.  */
   unsigned int mmap_given ;	/**< @brief Whether mmap was given.  */
   unsigned int random_given ;	/**< @brief Whether random was given.  */
   unsigned int tgene_given ;	/**< @brief Whether tgene was given.  */
-  unsigned int balance_given ;	/**< @brief Whether balance was given.  */
-  unsigned int bfactor_given ;	/**< @brief Whether bfactor was given.  */
   unsigned int prob_given ;	/**< @brief Whether prob was given.  */
   unsigned int probCross_given ;	/**< @brief Whether probCross was given.  */
   unsigned int normalizeZero_given ;	/**< @brief Whether normalizeZero was given.  */
   unsigned int normalizeNPone_given ;	/**< @brief Whether normalizeNPone was given.  */
+  unsigned int zscore_given ;	/**< @brief Whether zscore was given.  */
   unsigned int edgeholdout_given ;	/**< @brief Whether edgeholdout was given.  */
   unsigned int skipSVM_given ;	/**< @brief Whether skipSVM was given.  */
   unsigned int aggregateMax_given ;	/**< @brief Whether aggregateMax was given.  */
   unsigned int SampledLabels_given ;	/**< @brief Whether SampledLabels was given.  */
   unsigned int subsample_given ;	/**< @brief Whether subsample was given.  */
   unsigned int OutLabels_given ;	/**< @brief Whether OutLabels was given.  */
+  unsigned int GenesHoldoutFold_given ;	/**< @brief Whether GenesHoldoutFold was given.  */
+  unsigned int touchContext_given ;	/**< @brief Whether touchContext was given.  */
+  unsigned int onlyPos_given ;	/**< @brief Whether onlyPos was given.  */
+  unsigned int genes_given ;	/**< @brief Whether genes was given.  */
   unsigned int onetgene_given ;	/**< @brief Whether onetgene was given.  */
   unsigned int prior_given ;	/**< @brief Whether prior was given.  */
   unsigned int savemodel_given ;	/**< @brief Whether savemodel was given.  */
   unsigned int mintrain_given ;	/**< @brief Whether mintrain was given.  */
   unsigned int context_given ;	/**< @brief Whether context was given.  */
+  unsigned int allContextPred_given ;	/**< @brief Whether allContextPred was given.  */
 
   char **inputs ; /**< @brief unamed options (options without names) */
   unsigned inputs_num ; /**< @brief unamed options number */

tools/SparseNetCombiner/SparseNetCombiner.cpp

 	EMethodMean		= EMethodBegin,
 	EMethodMax		= EMethodMean + 1,
 	EMethodQuant		= EMethodMax + 1,
-	EMethodEnd		= EMethodQuant + 1
+	EMethodMedian		= EMethodQuant + 1,
+	EMethodSelectMean	= EMethodMedian + 1,
+	EMethodEnd		= EMethodSelectMean + 1
 };
 
 static const char*	c_aszMethods[]	= {
-  "mean", "max", "quant",NULL
+  "mean", "max", "quant", "median", "selectmean", NULL
 };
 
 
   }
 }
 
+// Median of the given values.
+//
+// vecVals is sorted in place (ascending) whenever it holds two or more
+// elements, so the caller's ordering is not preserved.
+//
+// Returns CMeta::GetNaN() for an empty vector, the sole element for a
+// one-element vector, the middle element for an odd count, and the average
+// of the two middle elements for an even count.
+float Median(vector<float>& vecVals) {
+  const size_t cValues = vecVals.size();
+
+  switch( cValues ) {
+    case 0:
+      return CMeta::GetNaN();
+    case 1:
+      return vecVals[0];
+    default:
+      break;
+  }
+
+  std::sort(vecVals.begin(), vecVals.end());
+
+  const size_t iUpper = cValues / 2;
+  if( cValues & 1 )
+    return vecVals[iUpper];
+
+  // even count: average the two values straddling the midpoint
+  return ((vecVals[iUpper - 1] + vecVals[iUpper]) * 0.5);
+}
+
+// Mean of the largest values in vecVals (roughly the top quartile).
+//
+// vecVals is sorted in place (ascending) whenever it holds two or more
+// elements, so the caller's ordering is not preserved.
+//
+// The averaged range starts at index (size / 4) * 3 of the sorted vector and
+// runs to the end.  With fewer than four values that start index is 0, so
+// the result is the plain mean of every value.
+//
+// Returns CMeta::GetNaN() for an empty vector and the sole element for a
+// one-element vector.
+float SelectMean(vector<float>& vecVals) {
+  const size_t iSize = vecVals.size();
+  if(iSize == 0)
+    return CMeta::GetNaN();
+
+  if(iSize == 1)
+    return vecVals[0];
+
+  std::sort(vecVals.begin(), vecVals.end());
+
+  // first index of the top quartile; integer division keeps idx < iSize,
+  // so at least one value is always averaged
+  const size_t idx = (iSize / 4) * 3;
+
+  float sum = 0.0;
+  for(size_t i = idx; i < iSize; ++i)
+    sum += vecVals[i];
+
+  return sum / (iSize - idx);
+}
+
 int main( int iArgs, char** aszArgs ) {
 	gengetopt_args_info	sArgs;
 	int					iRet;
 	  cerr << "Total number of datasets combining with non-zero weights: " << numDataset << endl;
 	}
 	
+	if( eMethod == EMethodMedian or eMethod == EMethodSelectMean){
+	  vector<float>	vecVals;
+	  float val;
+	  vector<CDat*> vecData;
+	  
+	  vecData.resize( vecstrDatasets.size( ) );
+	  // now iterate dat/dab networks
+	  for( i = 0; i < vecstrDatasets.size( ); ++i ) {
+	    vecData[ i ] = new CDat( );
+	    if( !vecData[ i ]->Open( vecstrDatasets[ i ].c_str() ) ) {
+	      cerr << "Couldn't open: " << vecstrDatasets[ i ] << endl;
+	      return 1; }
+	    
+	    if( sArgs.rank_flag )
+	      vecData[ i ]->Rank( );
+	    if( sArgs.zscore_flag )
+	      vecData[ i ]->Normalize( CDat::ENormalizeZScore );	    
+	    
+	    cerr << "open: " << vecstrDatasets[ i ] << endl;
+	  }
+	  
+	  // initialized the output dab and pair value vector
+	  DatOut.Open(vecData[ 0 ]->GetGeneNames());
+	  vecVals.resize(vecstrDatasets.size( ));
+	  
+	  // debug
+	  cerr << "num dataset: " << vecstrDatasets.size( ) << endl;
+	  
+	  for( i = 0; i < DatOut.GetGenes(); ++i ){
+	    for( j = i+1; j < DatOut.GetGenes(); ++j ){
+	      // iterate over each dataset
+	      vecVals.clear();
+	      for( k = 0; k < vecstrDatasets.size( ); ++k ){		
+		if( CMeta::IsNaN(val =  vecData[ k ]->Get( i, j)))
+		  continue;		
+		
+		if( sArgs.weight_given ){
+		  val *= vecWeights[k];
+		}		
+		vecVals.push_back(val);
+	      }
+	      
+	      if(vecVals.size() < 1)
+		continue;
+	      
+	      if( eMethod == EMethodMedian)
+		// find median
+		DatOut.Set(i, j, Median(vecVals));
+	      else if( eMethod == EMethodSelectMean )
+		DatOut.Set(i, j, SelectMean(vecVals));
+	    }
+	  }
+	  
+	  DatOut.Save( sArgs.output_arg );
+	  return 0;	  
+	}
+	
 	/// IF combine method is Quantile
 	/// Be aware that values are quantized to allow full read-in of the input datasets
 	if( eMethod == EMethodQuant ){
 	      //for(size_t t = 0; t < vecVals.size(); t++ )
 	      //	cerr << vecVals[t] << ' ';
 	      //cerr << endl;
-	    }
+	    }	
 	  }
 	  
 	  DatOut.Save( sArgs.output_arg );
 	    if( !DatCur.Open( vecstrDatasets[ i ].c_str() ) ) {
 	      cerr << "Couldn't open: " << vecstrDatasets[ i ] << endl;
 	      return 1; }
+	    	    
+	    if( sArgs.rank_flag )
+	      DatCur.Rank( );
+	    if( sArgs.zscore_flag )
+	      DatCur.Normalize( CDat::ENormalizeZScore );
 	    
 	    DatOut.Open( DatCur );	    	    
 	    DatTrack.Open( DatCur );
 	    // this Dat is used to track various values (count, max)
 	    for( j = 0; j < DatTrack.GetGenes( ); ++j )
 	      for( k = ( j + 1 ); k < DatTrack.GetGenes( ); ++k ){
+		if(eMethod == EMethodMean)
+		  DatTrack.Set( j, k, 0.0);		
 		if( CMeta::IsNaN( d = DatCur.Get( j, k)))
 		  continue;
 		
 	    return 1; }
 	  cerr << "opened: " << vecstrDatasets[ i ] << endl;
 	  
+	  if( sArgs.rank_flag )
+	    DatCur.Rank( );
+	  if( sArgs.zscore_flag )
+	    DatCur.Normalize( CDat::ENormalizeZScore );
+	  
 	  if( sArgs.map_flag ){
 	    // Get gene index match	  
 	    cerr << "inside map flag" << endl;
 	case EMethodMean:
 	  // now convert sum to mean
 	  for( j = 0; j < DatOut.GetGenes( ); ++j )
-	    for( k = ( j + 1 ); k < DatOut.GetGenes( ); ++k )
-	      DatOut.Set( j, k, DatOut.Get( j, k ) / DatTrack.Get( j, k ) );
+	    for( k = ( j + 1 ); k < DatOut.GetGenes( ); ++k ){
+	      if( CMeta::IsNaN( d = DatOut.Get(  j, k ) ) )
+		continue;
+	      DatOut.Set( j, k, d / DatTrack.Get( j, k ) );
+	    }
 	}
-
+	
 	// Filter dat
 	if( sArgs.genes_given ) {
 	  ifsm.clear( );

tools/SparseNetCombiner/SparseNetCombiner.ggo

                                                 string  typestr="directory"
 option	"map"		m	"Map gene index among the network dabs to combine. (Should be used when the gene intex are not identical among network dabs)"
 						flag	off
-option	"method"	M	"Combination method"
-						values="max","mean","quant"	default="mean"
+option	"method"	M	"Combination method, (selectmean computes the mea of the upper quartile values)"
+						values="max","mean","median","quant","selectmean"	default="mean"
 section "Optional"
 option	"quantile"	q	"If combine method is Quantile, set the returning quantile (default is median qunatile 0.5)"
 						float	default="0.5"
 option	"weight"	w	"File with dataset weights, if given each dataset values if weighted by the dataset weight. Skips datasets with no-entry or with zero weights. File format: dataset name<tab>weight"
 						string	typestr="filename"
+option	"zscore"	z	"Convert values to z-scores before combine"
+						flag	off
+option	"rank"		r	"Rank transform data before combine"
+						flag	off
 
 section "Filtering"
 option	"genes"		g	"Process only genes from the given set"

tools/SparseNetCombiner/cmdline.c

   "  -v, --verbosity=INT        Message verbosity  (default=`5')",
   "  -d, --directory=directory  input directory (must only contain input files)",
   "  -m, --map                  Map gene index among the network dabs to combine. \n                               (Should be used when the gene intex are not \n                               identical among network dabs)  (default=off)",
-  "  -M, --method=STRING        Combination method  (possible values=\"max\", \n                               \"mean\", \"quant\" default=`mean')",
+  "  -M, --method=STRING        Combination method, (selectmean computes the mea \n                               of the upper quartile values)  (possible \n                               values=\"max\", \"mean\", \"median\", \"quant\", \n                               \"selectmean\" default=`mean')",
   "\nOptional:",
   "  -q, --quantile=FLOAT       If combine method is Quantile, set the returning \n                               quantile (default is median qunatile 0.5)  \n                               (default=`0.5')",
   "  -w, --weight=filename      File with dataset weights, if given each dataset \n                               values if weighted by the dataset weight. Skips \n                               datasets with no-entry or with zero weights. \n                               File format: dataset name<tab>weight",
+  "  -z, --zscore               Convert values to z-scores before combine  \n                               (default=off)",
+  "  -r, --rank                 Rank transform data before combine  (default=off)",
   "\nFiltering:",
   "  -g, --genes=filename       Process only genes from the given set",
   "  -D, --genee=filename       Process only edges including a gene from the given \n                               set",
                         struct cmdline_parser_params *params, const char *additional_error);
 
 
-char *cmdline_parser_method_values[] = {"max", "mean", "quant", 0} ;	/* Possible values for method.  */
+char *cmdline_parser_method_values[] = {"max", "mean", "median", "quant", "selectmean", 0} ;	/* Possible values for method.  */
 
 static char *
 gengetopt_strdup (const char *s);
   args_info->method_given = 0 ;
   args_info->quantile_given = 0 ;
   args_info->weight_given = 0 ;
+  args_info->zscore_given = 0 ;
+  args_info->rank_given = 0 ;
   args_info->genes_given = 0 ;
   args_info->genee_given = 0 ;
 }
   args_info->quantile_orig = NULL;
   args_info->weight_arg = NULL;
   args_info->weight_orig = NULL;
+  args_info->zscore_flag = 0;
+  args_info->rank_flag = 0;
   args_info->genes_arg = NULL;
   args_info->genes_orig = NULL;
   args_info->genee_arg = NULL;
   args_info->method_help = gengetopt_args_info_help[7] ;
   args_info->quantile_help = gengetopt_args_info_help[9] ;
   args_info->weight_help = gengetopt_args_info_help[10] ;
-  args_info->genes_help = gengetopt_args_info_help[12] ;
-  args_info->genee_help = gengetopt_args_info_help[13] ;
+  args_info->zscore_help = gengetopt_args_info_help[11] ;
+  args_info->rank_help = gengetopt_args_info_help[12] ;
+  args_info->genes_help = gengetopt_args_info_help[14] ;
+  args_info->genee_help = gengetopt_args_info_help[15] ;
   
 }
 
     write_into_file(outfile, "quantile", args_info->quantile_orig, 0);
   if (args_info->weight_given)
     write_into_file(outfile, "weight", args_info->weight_orig, 0);
+  if (args_info->zscore_given)
+    write_into_file(outfile, "zscore", 0, 0 );
+  if (args_info->rank_given)
+    write_into_file(outfile, "rank", 0, 0 );
   if (args_info->genes_given)
     write_into_file(outfile, "genes", args_info->genes_orig, 0);
   if (args_info->genee_given)
         { "method",	1, NULL, 'M' },
         { "quantile",	1, NULL, 'q' },
         { "weight",	1, NULL, 'w' },
+        { "zscore",	0, NULL, 'z' },
+        { "rank",	0, NULL, 'r' },
         { "genes",	1, NULL, 'g' },
         { "genee",	1, NULL, 'D' },
         { NULL,	0, NULL, 0 }
       };
 
-      c = getopt_long (argc, argv, "hVo:v:d:mM:q:w:g:D:", long_options, &option_index);
+      c = getopt_long (argc, argv, "hVo:v:d:mM:q:w:zrg:D:", long_options, &option_index);
 
       if (c == -1) break;	/* Exit from `while (1)' loop.  */
 
             goto failure;
         
           break;
-        case 'M':	/* Combination method.  */
+        case 'M':	/* Combination method, (selectmean computes the mea of the upper quartile values).  */
         
         
           if (update_arg( (void *)&(args_info->method_arg), 
             goto failure;
         
           break;
+        case 'z':	/* Convert values to z-scores before combine.  */
+        
+        
+          if (update_arg((void *)&(args_info->zscore_flag), 0, &(args_info->zscore_given),
+              &(local_args_info.zscore_given), optarg, 0, 0, ARG_FLAG,
+              check_ambiguity, override, 1, 0, "zscore", 'z',
+              additional_error))
+            goto failure;
+        
+          break;
+        case 'r':	/* Rank transform data before combine.  */
+        
+        
+          if (update_arg((void *)&(args_info->rank_flag), 0, &(args_info->rank_given),
+              &(local_args_info.rank_given), optarg, 0, 0, ARG_FLAG,
+              check_ambiguity, override, 1, 0, "rank", 'r',
+              additional_error))
+            goto failure;
+        
+          break;
         case 'g':	/* Process only genes from the given set.  */
         
         

tools/SparseNetCombiner/cmdline.h

   const char *directory_help; /**< @brief input directory (must only contain input files) help description.  */
   int map_flag;	/**< @brief Map gene index among the network dabs to combine. (Should be used when the gene intex are not identical among network dabs) (default=off).  */
   const char *map_help; /**< @brief Map gene index among the network dabs to combine. (Should be used when the gene intex are not identical among network dabs) help description.  */
-  char * method_arg;	/**< @brief Combination method (default='mean').  */
-  char * method_orig;	/**< @brief Combination method original value given at command line.  */
-  const char *method_help; /**< @brief Combination method help description.  */
+  char * method_arg;	/**< @brief Combination method, (selectmean computes the mea of the upper quartile values) (default='mean').  */
+  char * method_orig;	/**< @brief Combination method, (selectmean computes the mea of the upper quartile values) original value given at command line.  */
+  const char *method_help; /**< @brief Combination method, (selectmean computes the mea of the upper quartile values) help description.  */
   float quantile_arg;	/**< @brief If combine method is Quantile, set the returning quantile (default is median qunatile 0.5) (default='0.5').  */
   char * quantile_orig;	/**< @brief If combine method is Quantile, set the returning quantile (default is median qunatile 0.5) original value given at command line.  */
   const char *quantile_help; /**< @brief If combine method is Quantile, set the returning quantile (default is median qunatile 0.5) help description.  */
   char * weight_arg;	/**< @brief File with dataset weights, if given each dataset values if weighted by the dataset weight. Skips datasets with no-entry or with zero weights. File format: dataset name<tab>weight.  */
   char * weight_orig;	/**< @brief File with dataset weights, if given each dataset values if weighted by the dataset weight. Skips datasets with no-entry or with zero weights. File format: dataset name<tab>weight original value given at command line.  */
   const char *weight_help; /**< @brief File with dataset weights, if given each dataset values if weighted by the dataset weight. Skips datasets with no-entry or with zero weights. File format: dataset name<tab>weight help description.  */
+  int zscore_flag;	/**< @brief Convert values to z-scores before combine (default=off).  */
+  const char *zscore_help; /**< @brief Convert values to z-scores before combine help description.  */
+  int rank_flag;	/**< @brief Rank transform data before combine (default=off).  */
+  const char *rank_help; /**< @brief Rank transform data before combine help description.  */
   char * genes_arg;	/**< @brief Process only genes from the given set.  */
   char * genes_orig;	/**< @brief Process only genes from the given set original value given at command line.  */
   const char *genes_help; /**< @brief Process only genes from the given set help description.  */
   unsigned int method_given ;	/**< @brief Whether method was given.  */
   unsigned int quantile_given ;	/**< @brief Whether quantile was given.  */
   unsigned int weight_given ;	/**< @brief Whether weight was given.  */
+  unsigned int zscore_given ;	/**< @brief Whether zscore was given.  */
+  unsigned int rank_given ;	/**< @brief Whether rank was given.  */
   unsigned int genes_given ;	/**< @brief Whether genes was given.  */
   unsigned int genee_given ;	/**< @brief Whether genee was given.  */