Commits

Qian Zhu committed 465c7b0

Allow PCLServer to use a second CDatabase collection

  • Participants
  • Parent commits 65ac4e5

Comments (0)

Files changed (5)

src/seekcentral.cpp

 			CSeekIntIntMap *mapG = m_vc[d]->GetGeneMap();
 			CSeekIntIntMap *mapQ = m_vc[d]->GetQueryMap();
 
-			//if(mapG->GetNumSet()<10000){
-			//	continue;
-			//}
+			if(mapG->GetNumSet()<10000){
+				continue;
+			}
 
 			if(mapQ==NULL ||mapQ->GetNumSet()==0){
 				if(DEBUG) fprintf(stderr, "This dataset is skipped\n");

tools/PCLServer/PCLServer.cpp

 
 pthread_mutex_t mutexGet;
 
-string strPrepInputDirectory;
+vector<CSeekDBSetting*> cc;
+
+/*string strPrepInputDirectory;
 string strSinfoInputDirectory;
 string strDatasetInputDirectory;
-string strPlatformInputDirectory;
+string strPlatformInputDirectory;*/
 map<string, int> mapstrintGene;
 
 vector<string> vecstrGeneID;
 vector<string> vecstrGenes;
 vector<string> vecstrDatasets;
 vector<string> vecstrDP;
+
+map<string, int> mapstrintDatasetDB;
 map<string, ushort> mapstriPlatform;
 map<string, string> mapstrstrDatasetPlatform;
 map<string, ushort> mapstrintDataset;
 	//for each dataset
 
 	vector<float> quant;
-	CSeekTools::ReadQuantFile("/home/qzhu/Seek/quant2", quant);
+
+	//QUANT file must be consistent
+	CSeekTools::ReadQuantFile(cc[0]->GetValue("quant"), quant);
+	//CSeekTools::ReadQuantFile("/home/qzhu/Seek/quant2", quant);
 
 	#pragma omp parallel for \
 	private(i) \
 		if(outputCoexpression || outputQueryCoexpression){
 			vd = new CSeekDataset();
 			string strFileStem = datasetName[i].substr(0, datasetName[i].find(".bin"));
-			string strAvgPath = strPrepInputDirectory + "/" + strFileStem + ".gavg";
-			string strPresencePath = strPrepInputDirectory + "/" + strFileStem + ".gpres";
-			string strSinfoPath = strSinfoInputDirectory + "/" + strFileStem + ".sinfo";
+
+			// Resolve which CDatabase collection (index into cc) owns this dataset.
+			// find() is used instead of operator[]: operator[] inserts a new entry
+			// when the key is missing, which would mutate the shared global map.
+			// NOTE(review): this hunk appears to run inside the OpenMP parallel-for
+			// above, where such an insert would be a data race -- confirm.
+			// A dataset that is not in the map falls back to collection 0, the same
+			// value operator[] would have produced.
+			map<string, int>::const_iterator itDB = mapstrintDatasetDB.find(strFileStem);
+			const int dbID = (itDB==mapstrintDatasetDB.end()) ? 0 : itDB->second;
+
+			string strAvgPath = cc[dbID]->GetValue("prep") + "/" + strFileStem + ".gavg"; //avg and prep path share same directory
+			string strPresencePath = cc[dbID]->GetValue("prep") + "/" + strFileStem + ".gpres";
+			string strSinfoPath = cc[dbID]->GetValue("sinfo") + "/" + strFileStem + ".sinfo";
+
 			vd->ReadGeneAverage(strAvgPath);
 			vd->ReadGenePresence(strPresencePath);
 			vd->ReadDatasetAverageStdev(strSinfoPath);
 	}
 
 	signal(SIGPIPE, SIG_IGN);
-	size_t i;
+	size_t i, j;
 	for(i=0; i<NUM_THREADS; i++){
 		THREAD_OCCUPIED[i] = 0;
 	}
 
 	PORT = sArgs.port_arg;
+
+	CSeekDBSetting *dbSetting = new CSeekDBSetting(
+		"NA", //default gvar arg, argument not needed for PCLServer
+		sArgs.sinfo_arg, sArgs.platform_arg, sArgs.prep_arg,
+		".",  //default DB arg, argument not needed for PCLServer
+		sArgs.gene_arg, sArgs.quant_arg, sArgs.dset_arg,
+		21702 //default num_db arg, argument not needed for PCLServer
+	);
+
+	//vector<CSeekDBSetting*> cc;
+	cc.push_back(dbSetting);
+
+	// Optional second CDatabase collection. --additional_db names a settings
+	// file of KEY<tab>VALUE lines; "NA" (the option default) means none given.
+	string add_db = sArgs.additional_db_arg;
+	if(add_db!="NA"){
+		ifstream ifsm(add_db.c_str());
+		if(!ifsm.is_open()){
+			fprintf(stderr, "Error opening file %s\n", add_db.c_str());
+			return false;
+		}
+
+		// Read the settings. std::getline into a string removes the fixed
+		// line-length limit and terminates cleanly at end of file.
+		map<string,string> parameters;
+		string strLine;
+		while(std::getline(ifsm, strLine)){
+			if(strLine.empty()) break; //an empty line ends the settings (as before)
+			vector<string> tok;
+			CMeta::Tokenize(strLine.c_str(), tok); //separator tab
+			if(tok.size()<2){
+				// Previously tok[1] was read unconditionally (out of bounds
+				// for a line that has no tab).
+				fprintf(stderr, "Malformed line in %s (expected KEY<tab>VALUE): %s\n",
+					add_db.c_str(), strLine.c_str());
+				return false;
+			}
+			parameters[tok[0]] = tok[1];
+		}
+		ifsm.close();
+
+		// SINFO_DIR must be present and must not be the "NA" placeholder.
+		// The iterator is checked against end() before it is dereferenced.
+		map<string,string>::const_iterator itSinfo = parameters.find("SINFO_DIR");
+		if(itSinfo==parameters.end() || itSinfo->second=="NA"){
+			fprintf(stderr, "Please specify an sinfo directory for the extra db\n");
+			return false;
+		}
+		const string sinfo_dir = itSinfo->second;
+
+		// GVAR_DIR is optional and stays "NA" when absent.
+		string gvar_dir = "NA";
+		map<string,string>::const_iterator itGvar = parameters.find("GVAR_DIR");
+		if(itGvar!=parameters.end())
+			gvar_dir = itGvar->second;
+
+		// Every remaining key is mandatory.
+		static const char* const c_aszRequired[] = {
+			"PREP_DIR", "PLATFORM_DIR", "DB_DIR", "DSET_MAP_FILE",
+			"GENE_MAP_FILE", "QUANT_FILE", "NUMBER_OF_DB"
+		};
+		const size_t iRequired = sizeof(c_aszRequired)/sizeof(c_aszRequired[0]);
+		for(size_t k=0; k<iRequired; k++){
+			if(parameters.find(c_aszRequired[k])==parameters.end()){
+				fprintf(stderr, "Some arguments are missing. Please make sure the following are provided:\n");
+				fprintf(stderr, "SINFO_DIR, PREP_DIR, PLATFORM_DIR, DB_DIR, DSET_MAP_FILE, GENE_MAP_FILE, QUANT_FILE, NUMBER_OF_DB\n");
+				fprintf(stderr, "Missing: %s\n", c_aszRequired[k]);
+				return false;
+			}
+		}
+
+		// Safe to dereference: presence of each key was verified above.
+		const string platform_dir = parameters.find("PLATFORM_DIR")->second;
+		const string db_dir = parameters.find("DB_DIR")->second;
+		const string prep_dir = parameters.find("PREP_DIR")->second;
+		const string dset_map_file = parameters.find("DSET_MAP_FILE")->second;
+		const string gene_map_file = parameters.find("GENE_MAP_FILE")->second;
+		const string quant_file = parameters.find("QUANT_FILE")->second;
+		const int num_db = atoi(parameters.find("NUMBER_OF_DB")->second.c_str());
+
+		// Register the extra collection after the command-line one (cc[0]).
+		CSeekDBSetting *dbSetting2 = new CSeekDBSetting(gvar_dir, sinfo_dir,
+			platform_dir, prep_dir, db_dir, gene_map_file, quant_file, dset_map_file,
+			num_db);
+		cc.push_back(dbSetting2);
+	}
 	
-	strPrepInputDirectory = sArgs.prep_arg;
-	strSinfoInputDirectory = sArgs.sinfo_arg;
-	strPlatformInputDirectory = sArgs.platform_arg;
-	strDatasetInputDirectory = sArgs.dset_arg;
-
 	if(!CSeekTools::ReadListTwoColumns(sArgs.gene_arg, vecstrGeneID, vecstrGenes))
 		return false;
 	for(i=0; i<vecstrGenes.size(); i++)
 		mapstrintGene[vecstrGenes[i]] = (int) i;
 
-	//Read datasets and Read platforms
-	if(!CSeekTools::ReadListTwoColumns(sArgs.dset_arg, vecstrDatasets, vecstrDP))
-		return false;
+	// Merge the dataset listing and the platform table of every configured
+	// collection in cc into the server-wide globals.
+	for(i=0; i<cc.size(); i++){
+		// Datasets: append to the global lists and remember the owning collection.
+		vector<string> vD, vDP;
+		if(!CSeekTools::ReadListTwoColumns(cc[i]->GetValue("dset"), vD, vDP))
+			return false;
+		for(j=0; j<vD.size(); j++){
+			vecstrDatasets.push_back(vD[j]);
+			vecstrDP.push_back(vDP[j]);
+			mapstrintDatasetDB[vD[j]] = (int) i;
+		}
+
+		// Platforms: read this collection's table into temporaries.
+		vector<string> vP;
+		map<string,ushort> mP;
+		vector<CSeekPlatform> vpx;
+		CSeekTools::ReadPlatforms(cc[i]->GetValue("platform"), vpx, vP, mP);
+
+		// This collection's platforms are appended to vp starting at offset cur,
+		// so the name -> index entries must be shifted by the same offset.
+		// Storing the un-shifted index made names from the second collection
+		// point at entries of the first.
+		// Assumes ReadPlatforms fills mP with indices into vpx -- TODO confirm.
+		// NOTE(review): a platform name present in more than one collection
+		// still resolves to the last collection read.
+		const size_t cur = vp.size();
+		for(map<string,ushort>::iterator it=mP.begin();
+			it!=mP.end(); ++it){
+			mapstriPlatform[it->first] = (ushort) (it->second + cur);
+		}
+		vp.resize(cur+vpx.size());
+		for(j=0; j<vpx.size(); j++)
+			vp[cur+j].Copy(vpx[j]);
+	}
 
 	for(i=0; i<vecstrDatasets.size(); i++){
 		mapstrstrDatasetPlatform[vecstrDatasets[i]] = vecstrDP[i];
 		mapstrintDataset[vecstrDatasets[i]] = i;
 	}
 
-	CSeekTools::ReadPlatforms(sArgs.platform_arg, vp, vecstrPlatforms,
-		mapstriPlatform);
 	//==================================================
 
 

tools/PCLServer/PCLServer.ggo

 option	"port"				p	"Port to listen to"
 								string typestr="9000"
 option	"input"				i	"Input PCL directory"
-								string	typestr="directory"
+								string	typestr="directory"	yes
 option	"sinfo"				s	"Sinfo directory"
-								string	typestr="directory"
+								string	typestr="directory"	yes
 option	"prep"				q	"Prep directory"
-								string	typestr="directory"
+								string	typestr="directory"	yes
 option	"gene"				g	"Gene map file"
-								string	typestr="file"
+								string	typestr="file"	yes
 option	"platform"			P	"Platform directory"
-								string	typestr="directory"
+								string	typestr="directory"	yes
 option	"dset"				x	"Dataset listing"
-								string	typestr="file"
+								string	typestr="file"	yes
+option	"quant"				Q	"quant file (assuming all datasets use the same quantization)"
+								string	typestr="filename"	yes
+						
+section "MISC"								
+option	"additional_db"		B	"Utilize a second CDatabase collection. Path to the second CDatabase's setting file."
+								string default="NA"	

tools/PCLServer/cmdline.c

 const char *gengetopt_args_info_description = "";
 
 const char *gengetopt_args_info_help[] = {
-  "  -h, --help                Print help and exit",
-  "  -V, --version             Print version and exit",
+  "  -h, --help                  Print help and exit",
+  "  -V, --version               Print version and exit",
   "\nMain:",
-  "  -p, --port=9000           Port to listen to",
-  "  -i, --input=directory     Input PCL directory",
-  "  -s, --sinfo=directory     Sinfo directory",
-  "  -q, --prep=directory      Prep directory",
-  "  -g, --gene=file           Gene map file",
-  "  -P, --platform=directory  Platform directory",
-  "  -x, --dset=file           Dataset listing",
+  "  -p, --port=9000             Port to listen to",
+  "  -i, --input=directory       Input PCL directory",
+  "  -s, --sinfo=directory       Sinfo directory",
+  "  -q, --prep=directory        Prep directory",
+  "  -g, --gene=file             Gene map file",
+  "  -P, --platform=directory    Platform directory",
+  "  -x, --dset=file             Dataset listing",
+  "  -Q, --quant=filename        quant file (assuming all datasets use the same \n                                quantization)",
+  "\nMISC:",
+  "  -B, --additional_db=STRING  Utilize a second CDatabase collection. Path to \n                                the second CDatabase's setting file.  \n                                (default=`NA')",
     0
 };
 
 cmdline_parser_internal (int argc, char **argv, struct gengetopt_args_info *args_info,
                         struct cmdline_parser_params *params, const char *additional_error);
 
+static int
+cmdline_parser_required2 (struct gengetopt_args_info *args_info, const char *prog_name, const char *additional_error);
 
 static char *
 gengetopt_strdup (const char *s);
   args_info->gene_given = 0 ;
   args_info->platform_given = 0 ;
   args_info->dset_given = 0 ;
+  args_info->quant_given = 0 ;
+  args_info->additional_db_given = 0 ;
 }
 
 static
   args_info->platform_orig = NULL;
   args_info->dset_arg = NULL;
   args_info->dset_orig = NULL;
+  args_info->quant_arg = NULL;
+  args_info->quant_orig = NULL;
+  args_info->additional_db_arg = gengetopt_strdup ("NA");
+  args_info->additional_db_orig = NULL;
   
 }
 
   args_info->gene_help = gengetopt_args_info_help[7] ;
   args_info->platform_help = gengetopt_args_info_help[8] ;
   args_info->dset_help = gengetopt_args_info_help[9] ;
+  args_info->quant_help = gengetopt_args_info_help[10] ;
+  args_info->additional_db_help = gengetopt_args_info_help[12] ;
   
 }
 
   free_string_field (&(args_info->platform_orig));
   free_string_field (&(args_info->dset_arg));
   free_string_field (&(args_info->dset_orig));
+  free_string_field (&(args_info->quant_arg));
+  free_string_field (&(args_info->quant_orig));
+  free_string_field (&(args_info->additional_db_arg));
+  free_string_field (&(args_info->additional_db_orig));
   
   
   for (i = 0; i < args_info->inputs_num; ++i)
     write_into_file(outfile, "platform", args_info->platform_orig, 0);
   if (args_info->dset_given)
     write_into_file(outfile, "dset", args_info->dset_orig, 0);
+  if (args_info->quant_given)
+    write_into_file(outfile, "quant", args_info->quant_orig, 0);
+  if (args_info->additional_db_given)
+    write_into_file(outfile, "additional_db", args_info->additional_db_orig, 0);
   
 
   i = EXIT_SUCCESS;
 int
 cmdline_parser_required (struct gengetopt_args_info *args_info, const char *prog_name)
 {
-  FIX_UNUSED (args_info);
-  FIX_UNUSED (prog_name);
-  return EXIT_SUCCESS;
+  int result = EXIT_SUCCESS;
+
+  if (cmdline_parser_required2(args_info, prog_name, 0) > 0)
+    result = EXIT_FAILURE;
+
+  return result;
+}
+
+int
+cmdline_parser_required2 (struct gengetopt_args_info *args_info, const char *prog_name, const char *additional_error)
+{
+  int error = 0;
+  FIX_UNUSED (additional_error);
+
+  /* checks for required options */
+  if (! args_info->input_given)
+    {
+      fprintf (stderr, "%s: '--input' ('-i') option required%s\n", prog_name, (additional_error ? additional_error : ""));
+      error = 1;
+    }
+  
+  if (! args_info->sinfo_given)
+    {
+      fprintf (stderr, "%s: '--sinfo' ('-s') option required%s\n", prog_name, (additional_error ? additional_error : ""));
+      error = 1;
+    }
+  
+  if (! args_info->prep_given)
+    {
+      fprintf (stderr, "%s: '--prep' ('-q') option required%s\n", prog_name, (additional_error ? additional_error : ""));
+      error = 1;
+    }
+  
+  if (! args_info->gene_given)
+    {
+      fprintf (stderr, "%s: '--gene' ('-g') option required%s\n", prog_name, (additional_error ? additional_error : ""));
+      error = 1;
+    }
+  
+  if (! args_info->platform_given)
+    {
+      fprintf (stderr, "%s: '--platform' ('-P') option required%s\n", prog_name, (additional_error ? additional_error : ""));
+      error = 1;
+    }
+  
+  if (! args_info->dset_given)
+    {
+      fprintf (stderr, "%s: '--dset' ('-x') option required%s\n", prog_name, (additional_error ? additional_error : ""));
+      error = 1;
+    }
+  
+  if (! args_info->quant_given)
+    {
+      fprintf (stderr, "%s: '--quant' ('-Q') option required%s\n", prog_name, (additional_error ? additional_error : ""));
+      error = 1;
+    }
+  
+  
+  /* checks for dependences among options */
+
+  return error;
 }
 
 
         { "gene",	1, NULL, 'g' },
         { "platform",	1, NULL, 'P' },
         { "dset",	1, NULL, 'x' },
+        { "quant",	1, NULL, 'Q' },
+        { "additional_db",	1, NULL, 'B' },
         { 0,  0, 0, 0 }
       };
 
-      c = getopt_long (argc, argv, "hVp:i:s:q:g:P:x:", long_options, &option_index);
+      c = getopt_long (argc, argv, "hVp:i:s:q:g:P:x:Q:B:", long_options, &option_index);
 
       if (c == -1) break;	/* Exit from `while (1)' loop.  */
 
             goto failure;
         
           break;
+        case 'Q':	/* quant file (assuming all datasets use the same quantization).  */
+        
+        
+          if (update_arg( (void *)&(args_info->quant_arg), 
+               &(args_info->quant_orig), &(args_info->quant_given),
+              &(local_args_info.quant_given), optarg, 0, 0, ARG_STRING,
+              check_ambiguity, override, 0, 0,
+              "quant", 'Q',
+              additional_error))
+            goto failure;
+        
+          break;
+        case 'B':	/* Utilize a second CDatabase collection. Path to the second CDatabase's setting file..  */
+        
+        
+          if (update_arg( (void *)&(args_info->additional_db_arg), 
+               &(args_info->additional_db_orig), &(args_info->additional_db_given),
+              &(local_args_info.additional_db_given), optarg, 0, "NA", ARG_STRING,
+              check_ambiguity, override, 0, 0,
+              "additional_db", 'B',
+              additional_error))
+            goto failure;
+        
+          break;
 
         case 0:	/* Long option with no short option */
         case '?':	/* Invalid option.  */
 
 
 
+  if (check_required)
+    {
+      error += cmdline_parser_required2 (args_info, argv[0], additional_error);
+    }
 
   cmdline_parser_release (&local_args_info);
 

tools/PCLServer/cmdline.h

   char * dset_arg;	/**< @brief Dataset listing.  */
   char * dset_orig;	/**< @brief Dataset listing original value given at command line.  */
   const char *dset_help; /**< @brief Dataset listing help description.  */
+  char * quant_arg;	/**< @brief quant file (assuming all datasets use the same quantization).  */
+  char * quant_orig;	/**< @brief quant file (assuming all datasets use the same quantization) original value given at command line.  */
+  const char *quant_help; /**< @brief quant file (assuming all datasets use the same quantization) help description.  */
+  char * additional_db_arg;	/**< @brief Utilize a second CDatabase collection. Path to the second CDatabase's setting file. (default='NA').  */
+  char * additional_db_orig;	/**< @brief Utilize a second CDatabase collection. Path to the second CDatabase's setting file. original value given at command line.  */
+  const char *additional_db_help; /**< @brief Utilize a second CDatabase collection. Path to the second CDatabase's setting file. help description.  */
   
   unsigned int help_given ;	/**< @brief Whether help was given.  */
   unsigned int version_given ;	/**< @brief Whether version was given.  */
   unsigned int gene_given ;	/**< @brief Whether gene was given.  */
   unsigned int platform_given ;	/**< @brief Whether platform was given.  */
   unsigned int dset_given ;	/**< @brief Whether dset was given.  */
+  unsigned int quant_given ;	/**< @brief Whether quant was given.  */
+  unsigned int additional_db_given ;	/**< @brief Whether additional_db was given.  */
 
   char **inputs ; /**< @brief unamed options (options without names) */
   unsigned inputs_num ; /**< @brief unamed options number */