Commits

YOUNG-SUK LEE  committed 1d2b199

multiple cv runs and negative subsampling

  • Participants
  • Parent commits 0b0b0bb

Comments (0)

Files changed (4)

File tools/LibSVMer/LibSVMer.cpp

 
 using namespace LIBSVM;
 
+/*
+ * NOTE(review): disabled draft of a negative-subsampling helper; it is commented out and
+ * never compiled. Problems visible in the draft that must be fixed before re-enabling it:
+ *   - Negatives.empty() / Positives.empty() only report whether the vector is empty; they
+ *     do not clear it, so entries accumulate across iSample iterations while numPos resets.
+ *   - ppTmpTrain is a local array of pointers that are never assigned (the `new` line is
+ *     commented out) but are dereferenced in the subsample loop.
+ *   - ppTmpTrain is indexed two different ways: (*ppTmpTrain[ index ]) and (*ppTmpTrain)[ index ].
+ *   - The function returns the address of the local array ppTmpTrain, which does not match
+ *     the declared return type and would dangle after return.
+ *   - rand() % Negatives.size() divides by zero when no negatives were collected.
+ *
+vector<LIBSVM::SVMLabel>* Subsampling( vector<LIBSVM::SVMLabel>* pTrainVector, size_t num, size_t numSample) {
+  size_t iSample, iSubsample, numPos, index, len;
+  size_t i;
+
+cerr << "subsampling: " << num << endl;
+
+  len = numSample;
+  
+cerr << "number of samples: " << len << endl;
+
+  vector<LIBSVM::SVMLabel>* ppTmpTrain[len * num];
+
+  vector<LIBSVM::SVMLabel> Negatives;
+  vector<LIBSVM::SVMLabel> Positives;
+  
+  for( iSample = 0 ; iSample < len ; iSample ++ ) {
+    numPos = 0;
+    Negatives.empty();
+    Positives.empty();
+    
+    for(vector<LIBSVM::SVMLabel>::iterator it = pTrainVector[iSample].begin() ;
+        it != pTrainVector[iSample].end(); it++){
+      if ( (*it).Target == 1 ) { // if positive
+        numPos ++;
+        Positives.push_back(*it);
+      }else if ( (*it).Target == -1 )
+        Negatives.push_back(*it);
+    }
+
+
+    for( iSubsample = 0 ; iSubsample < num ; iSubsample ++ ) {
+      index = num * iSample + iSubsample;
+      (*ppTmpTrain[ index ]).reserve((size_t) (numPos * 10));
+//pTmpTrain[ index ] = new vector<LIBSVM::SVMLabel>;
+      //copy( Positives.begin( ), Positives.end( ), pTmpTrain[ index ].begin( ) ); doesn't work..
+      for( i = 0 ; i < numPos ; i ++ ) {
+        (*ppTmpTrain)[ index ].push_back(Positives.at( i ) );
+        (*ppTmpTrain)[ index ].push_back(Negatives.at( rand() % Negatives.size() )) ; //with replacement!!
+      }
+
+cerr << "blah" << endl;
+cerr << (*ppTmpTrain[ index ]).size() << endl;
+    }
+  }
+
+  return &ppTmpTrain;
+//  pTmpTest
+}*/
 
 vector<LIBSVM::SVMLabel> ReadLabels(ifstream & ifsm) {
 
           cerr << "number of cv runs is < 1. Must be set at least 1" << endl;
           return 1;
         }
+
+        // Reject a negative subsample count; 0 (the option's default) leaves subsampling off.
+        if (sArgs.negative_subsamples_arg < 0){
+          cerr << "number of negative subsample runs is < 0. Must be non-negative" << endl;
+          return 1;
+        }
+
+        // The subsampling path only iterates over the folds of a single CV run, so it
+        // cannot be combined with repeated cross-validation runs yet.
+        if ( (sArgs.negative_subsamples_arg > 0 && sArgs.num_cv_runs_arg > 1) ) {
+          cerr << "negative subsampling for multiple cv runs has not yet been implemented." << endl;
+          return 1;
+        }
       
         SVM.SetTradeoff(sArgs.tradeoff_arg);
         SVM.SetNu(sArgs.nu_arg);
 	}
 
 	LIBSVM::SAMPLE* pTrainSample;
-	vector<LIBSVM::SVMLabel> pTrainVector[sArgs.cross_validation_arg * sArgs.num_cv_runs_arg];
-	vector<LIBSVM::SVMLabel> pTestVector[sArgs.cross_validation_arg * sArgs.num_cv_runs_arg];
+
+        size_t numSample;
+        if(sArgs.negative_subsamples_arg > 0)
+          numSample = sArgs.cross_validation_arg * sArgs.num_cv_runs_arg * sArgs.negative_subsamples_arg;
+        else
+          numSample = sArgs.cross_validation_arg * sArgs.num_cv_runs_arg;
+	vector<LIBSVM::SVMLabel> pTrainVector[numSample];
+	vector<LIBSVM::SVMLabel> pTestVector[numSample];
 	vector<LIBSVM::Result> AllResults;
 	vector<LIBSVM::Result> tmpAllResults;
 
 			cerr << "Could not open output file" << endl;
 		}
 	} else if (sArgs.output_given && sArgs.labels_given) {
+                size_t ii, index;
 		//do learning and classifying with cross validation
-/*	        if( sArgs.cross_validation_arg > 1){	    
-		  for (i = 0; i < sArgs.cross_validation_arg; i++) {
-		    pTestVector[i].reserve((size_t) vecLabels.size()
-					   / sArgs.cross_validation_arg + sArgs.cross_validation_arg);
-		    pTrainVector[i].reserve((size_t) vecLabels.size()
-					    / (sArgs.cross_validation_arg)
-					    * (sArgs.cross_validation_arg - 1)
-					    + sArgs.cross_validation_arg);
-		    for (j = 0; j < vecLabels.size(); j++) {
+//                if( sArgs.cross_validation_arg > 1 && sArgs.bagging )
+                if( sArgs.cross_validation_arg > 1 && sArgs.negative_subsamples_arg > 0){
+cerr << "negative subsampling" << endl;
+        	  vector<LIBSVM::SVMLabel> pTmpTrain[sArgs.cross_validation_arg * sArgs.num_cv_runs_arg];
+        	  vector<LIBSVM::SVMLabel> pTmpTest[sArgs.cross_validation_arg * sArgs.num_cv_runs_arg];
+          
+                  for(i = 0; i < sArgs.cross_validation_arg; i++) {
+                    index = i;
+                      
+                    pTmpTest[index].reserve((size_t) vecLabels.size()
+  			   / sArgs.cross_validation_arg + sArgs.cross_validation_arg);
+                    pTmpTrain[index].reserve((size_t) vecLabels.size()
+			    / (sArgs.cross_validation_arg)
+			    * (sArgs.cross_validation_arg - 1)
+			    + sArgs.cross_validation_arg);
+                    for (j = 0; j < vecLabels.size(); j++) {
+//cerr << vecLabels[j].GeneName << endl;
 		      if (j % sArgs.cross_validation_arg == i) {
-			pTestVector[i].push_back(vecLabels[j]);
+		        pTmpTest[index].push_back(vecLabels[j]);
 		      } else {
-			pTrainVector[i].push_back((vecLabels[j]));
+		        pTmpTrain[index].push_back(vecLabels[j]);
 		      }
 		    }
-		  }
-		}*/
-                if( sArgs.cross_validation_arg > 1 && sArgs.num_cv_runs_arg >= 1 ){
-                  size_t ii, index;
-                  for (ii = 0; ii < sArgs.num_cv_runs_arg; ii++) {
+                  }
+                
+// Negative subsampling: for every CV fold, build `num` training sets that pair each positive
+// label of the fold's training partition (pTmpTrain) with one negative drawn at random, with
+// replacement, from the same partition. The set for (fold, k) is stored at
+// pTrainVector[num * fold + k]; the fold's test partition is copied to the same pTestVector slot.
+// Labels whose Target is neither 1 nor -1 are ignored.
+size_t iSample, iSubsample, numPos;
+size_t len, num;
+num = sArgs.negative_subsamples_arg;
+cerr << "subsampling: " << num << endl;
+len = sArgs.cross_validation_arg;
+cerr << "number of samples: " << len << endl;
+
+vector<LIBSVM::SVMLabel> Negatives;
+vector<LIBSVM::SVMLabel> Positives;
+  
+for( iSample = 0 ; iSample < len ; iSample ++ ) {
+    numPos = 0;
+    // clear(), not empty(): empty() only reports whether the vector is empty, so labels
+    // from earlier folds would otherwise stay in the pools and leak into this fold.
+    Negatives.clear();
+    Positives.clear();
+    
+    for(vector<LIBSVM::SVMLabel>::iterator it = pTmpTrain[iSample].begin() ;
+        it != pTmpTrain[iSample].end(); ++it){
+      if ( (*it).Target == 1 ) { // if positive
+        numPos ++;
+        Positives.push_back(*it);
+      }else if ( (*it).Target == -1 )
+        Negatives.push_back(*it);
+    }
+
+    // rand() % 0 is undefined, so a fold without negatives can only contribute positives.
+    if( Negatives.empty() )
+      cerr << "warning: no negative examples in training fold " << iSample << endl;
+
+    for( iSubsample = 0 ; iSubsample < num ; iSubsample ++ ) {
+      index = num * iSample + iSubsample;
+      // At most one positive and one negative are appended per positive label.
+      pTrainVector[ index ].reserve( 2 * numPos );
+      for( i = 0 ; i < numPos ; i ++ ) {
+        pTrainVector[ index ].push_back(Positives.at( i ) );
+        if( !Negatives.empty() )
+          pTrainVector[ index ].push_back(Negatives.at( rand() % Negatives.size() )) ; //with replacement!!
+      }
+
+cerr << "blah" << endl;
+cerr << pTrainVector[ index ].size() << endl;
+      pTestVector[ index ] = pTmpTest[ iSample ] ;
+    }
+}
+
+                }
+                else if( sArgs.cross_validation_arg > 1 && sArgs.num_cv_runs_arg >= 1 ){
+//                  size_t ii, index;
+                  for (ii = 0; ii < sArgs.num_cv_runs_arg; ii++) {                    
                     std::random_shuffle(vecLabels.begin(), vecLabels.end());
 
                   for (i = 0; i < sArgs.cross_validation_arg; i++) {                  

File tools/LibSVMer/LibSVMer.ggo

 										int default="5" no
 option  "num_cv_runs"                   r       "Number of cross-validation runs"
                                                                                 int default="1" no
+option  "negative_subsamples"            g       "Number of subsample runs"
+                                                                                int default="0" no
 option "svm_type"                       v       "Sets type of SVM (default 0)
 0\tC-SVC
 1\tnu-SVC

File tools/LibSVMer/cmdline.c

 const char *gengetopt_args_info_description = "";
 
 const char *gengetopt_args_info_help[] = {
-  "  -h, --help                  Print help and exit",
-  "  -V, --version               Print version and exit",
+  "  -h, --help                    Print help and exit",
+  "  -V, --version                 Print version and exit",
   "\nMain:",
-  "  -l, --labels=filename       Labels file",
-  "  -o, --output=filename       Output file ",
-  "  -i, --input=filename        Input PCL file ",
-  "  -m, --model=filename        Model file",
-  "  -a, --all                   Always classify all genes in PCLs  (default=off)",
+  "  -l, --labels=filename         Labels file",
+  "  -o, --output=filename         Output file ",
+  "  -i, --input=filename          Input PCL file ",
+  "  -m, --model=filename          Model file",
+  "  -a, --all                     Always classify all genes in PCLs  \n                                  (default=off)",
   "\nOptions:",
-  "  -s, --skip=INT              Number of columns to skip in input pcls  \n                                (default=`2')",
-  "  -n, --normalize             Normalize PCLS to 0 mean 1 variance  \n                                (default=off)",
-  "  -c, --cross_validation=INT  Number of cross-validation sets ( arg of 1 will \n                                turn off cross-validation )  (default=`5')",
-  "  -r, --num_cv_runs=INT       Number of cross-validation runs  (default=`1')",
-  "  -v, --svm_type=INT          Sets type of SVM (default 0)\n\n                                0\tC-SVC\n\n                                1\tnu-SVC\n\n                                2\tone-class SVM\n                                  (default=`0')",
-  "  -b, --balance               weight classes such that C_P * n_P = C_N * n_N  \n                                (default=off)",
-  "  -t, --tradeoff=FLOAT        SVM tradeoff constant C of C-SVC  (default=`1')",
-  "  -u, --nu=FLOAT              nu parameter of nu-SVC, one-class SVM  \n                                (default=`0.5')",
-  "  -p, --params=filename       Parameter file",
-  "  -M, --mmap                  Memory map binary input  (default=off)",
+  "  -s, --skip=INT                Number of columns to skip in input pcls  \n                                  (default=`2')",
+  "  -n, --normalize               Normalize PCLS to 0 mean 1 variance  \n                                  (default=off)",
+  "  -c, --cross_validation=INT    Number of cross-validation sets ( arg of 1 will \n                                  turn off cross-validation )  (default=`5')",
+  "  -r, --num_cv_runs=INT         Number of cross-validation runs  (default=`1')",
+  "  -g, --negative_subsamples=INT Number of subsample runs  (default=`0')",
+  "  -v, --svm_type=INT            Sets type of SVM (default 0)\n\n                                  0\tC-SVC\n\n                                  1\tnu-SVC\n\n                                  2\tone-class SVM\n                                    (default=`0')",
+  "  -b, --balance                 weight classes such that C_P * n_P = C_N * n_N  \n                                  (default=off)",
+  "  -t, --tradeoff=FLOAT          SVM tradeoff constant C of C-SVC  (default=`1')",
+  "  -u, --nu=FLOAT                nu parameter of nu-SVC, one-class SVM  \n                                  (default=`0.5')",
+  "  -p, --params=filename         Parameter file",
+  "  -M, --mmap                    Memory map binary input  (default=off)",
     0
 };
 
   args_info->normalize_given = 0 ;
   args_info->cross_validation_given = 0 ;
   args_info->num_cv_runs_given = 0 ;
+  args_info->negative_subsamples_given = 0 ;
   args_info->svm_type_given = 0 ;
   args_info->balance_given = 0 ;
   args_info->tradeoff_given = 0 ;
   args_info->cross_validation_orig = NULL;
   args_info->num_cv_runs_arg = 1;
   args_info->num_cv_runs_orig = NULL;
+  args_info->negative_subsamples_arg = 0;
+  args_info->negative_subsamples_orig = NULL;
   args_info->svm_type_arg = 0;
   args_info->svm_type_orig = NULL;
   args_info->balance_flag = 0;
   args_info->normalize_help = gengetopt_args_info_help[10] ;
   args_info->cross_validation_help = gengetopt_args_info_help[11] ;
   args_info->num_cv_runs_help = gengetopt_args_info_help[12] ;
-  args_info->svm_type_help = gengetopt_args_info_help[13] ;
-  args_info->balance_help = gengetopt_args_info_help[14] ;
-  args_info->tradeoff_help = gengetopt_args_info_help[15] ;
-  args_info->nu_help = gengetopt_args_info_help[16] ;
-  args_info->params_help = gengetopt_args_info_help[17] ;
-  args_info->mmap_help = gengetopt_args_info_help[18] ;
+  args_info->negative_subsamples_help = gengetopt_args_info_help[13] ;
+  args_info->svm_type_help = gengetopt_args_info_help[14] ;
+  args_info->balance_help = gengetopt_args_info_help[15] ;
+  args_info->tradeoff_help = gengetopt_args_info_help[16] ;
+  args_info->nu_help = gengetopt_args_info_help[17] ;
+  args_info->params_help = gengetopt_args_info_help[18] ;
+  args_info->mmap_help = gengetopt_args_info_help[19] ;
   
 }
 
   free_string_field (&(args_info->skip_orig));
   free_string_field (&(args_info->cross_validation_orig));
   free_string_field (&(args_info->num_cv_runs_orig));
+  free_string_field (&(args_info->negative_subsamples_orig));
   free_string_field (&(args_info->svm_type_orig));
   free_string_field (&(args_info->tradeoff_orig));
   free_string_field (&(args_info->nu_orig));
     write_into_file(outfile, "cross_validation", args_info->cross_validation_orig, 0);
   if (args_info->num_cv_runs_given)
     write_into_file(outfile, "num_cv_runs", args_info->num_cv_runs_orig, 0);
+  if (args_info->negative_subsamples_given)
+    write_into_file(outfile, "negative_subsamples", args_info->negative_subsamples_orig, 0);
   if (args_info->svm_type_given)
     write_into_file(outfile, "svm_type", args_info->svm_type_orig, 0);
   if (args_info->balance_given)
         { "normalize",	0, NULL, 'n' },
         { "cross_validation",	1, NULL, 'c' },
         { "num_cv_runs",	1, NULL, 'r' },
+        { "negative_subsamples",	1, NULL, 'g' },
         { "svm_type",	1, NULL, 'v' },
         { "balance",	0, NULL, 'b' },
         { "tradeoff",	1, NULL, 't' },
         { NULL,	0, NULL, 0 }
       };
 
-      c = getopt_long (argc, argv, "hVl:o:i:m:as:nc:r:v:bt:u:p:M", long_options, &option_index);
+      c = getopt_long (argc, argv, "hVl:o:i:m:as:nc:r:g:v:bt:u:p:M", long_options, &option_index);
 
       if (c == -1) break;	/* Exit from `while (1)' loop.  */
 
             goto failure;
         
           break;
+        case 'g':	/* Number of subsample runs.  */
+        
+        
+          if (update_arg( (void *)&(args_info->negative_subsamples_arg), 
+               &(args_info->negative_subsamples_orig), &(args_info->negative_subsamples_given),
+              &(local_args_info.negative_subsamples_given), optarg, 0, "0", ARG_INT,
+              check_ambiguity, override, 0, 0,
+              "negative_subsamples", 'g',
+              additional_error))
+            goto failure;
+        
+          break;
         case 'v':	/* Sets type of SVM (default 0)
         0\tC-SVC
         1\tnu-SVC

File tools/LibSVMer/cmdline.h

   int num_cv_runs_arg;	/**< @brief Number of cross-validation runs (default='1').  */
   char * num_cv_runs_orig;	/**< @brief Number of cross-validation runs original value given at command line.  */
   const char *num_cv_runs_help; /**< @brief Number of cross-validation runs help description.  */
+  int negative_subsamples_arg;	/**< @brief Number of subsample runs (default='0').  */
+  char * negative_subsamples_orig;	/**< @brief Number of subsample runs original value given at command line.  */
+  const char *negative_subsamples_help; /**< @brief Number of subsample runs help description.  */
   int svm_type_arg;	/**< @brief Sets type of SVM (default 0)
   0\tC-SVC
   1\tnu-SVC
   unsigned int normalize_given ;	/**< @brief Whether normalize was given.  */
   unsigned int cross_validation_given ;	/**< @brief Whether cross_validation was given.  */
   unsigned int num_cv_runs_given ;	/**< @brief Whether num_cv_runs was given.  */
+  unsigned int negative_subsamples_given ;	/**< @brief Whether negative_subsamples was given.  */
   unsigned int svm_type_given ;	/**< @brief Whether svm_type was given.  */
   unsigned int balance_given ;	/**< @brief Whether balance was given.  */
   unsigned int tradeoff_given ;	/**< @brief Whether tradeoff was given.  */