Commits

Jian Zhou committed a2865c8 Merge

merge

  • Participants
  • Parent commits e09676d, c8eb988

Comments (0)

Files changed (17)

src/seekcentral.cpp

 	m_mapstrintDataset.clear();
 	m_mapstrintGene.clear();
 	m_searchdsetMap.clear();
-	m_DB = NULL;
+	m_vecDB.clear();
+	m_vecDBDataset.clear();
 	m_rData = NULL;
 	m_maxNumDB = 50;
 
 	m_counts.clear();
 	m_weight.clear();
 	m_final.clear();
+	m_vecDBDataset.clear();
 
 	m_vecstrAllQuery.clear();
 	m_Query.clear();
 	m_mapstriPlatform.clear();
 	m_vecstrPlatform.clear();
 
-	if(m_DB!=NULL){
+	if(m_vecDB.size()!=0){
+		if(!m_bSharedDB){
+			for(i=0; i<m_vecDB.size(); i++)
+				delete m_vecDB[i];
+		}
+		for(i=0; i<m_vecDB.size(); i++)
+			m_vecDB[i] = NULL;
+		m_vecDB.clear();
+	}
+
+	/*if(m_DB!=NULL){
 		if(!m_bSharedDB){
 			delete m_DB;
 		}
 		m_DB = NULL;
-	}
+	}*/
 	m_iDatasets = 0;
 	m_iGenes = 0;
 	m_numThreads = 0;
 	const enum CSeekDataset::DistanceMeasure eDistMeasure,
 	const bool bSubtractGeneAvg, const bool bNormPlatform){
 
-	//fprintf(stderr, "B0 %lu\n", CMeta::GetMemoryUsage());
 	m_output_dir = output_dir; //LATER, TO BE DELETED
 	m_maxNumDB = src->m_maxNumDB;
 	m_bSharedDB = true;
 	m_iDatasets = m_vecstrDatasets.size();
 	m_iGenes = m_vecstrGenes.size();
 
-	//fprintf(stderr, "%d %d\n", m_iDatasets, m_iGenes);
-	//fprintf(stderr, "B1 %lu\n", CMeta::GetMemoryUsage());
-
 	//read search datasets
 	vector<string> sd;
 	CMeta::Tokenize(search_dset.c_str(), sd, "|", false);
 	m_vecstrSearchDatasets.resize(sd.size());
 	for(i=0; i<sd.size(); i++){
 		CMeta::Tokenize(sd[i].c_str(), m_vecstrSearchDatasets[i], " ", false);
-		//fprintf(stderr, "%s\n", sd[i].c_str());
 	}
 	//read queries
 	vector<string> sq;
 	m_vecstrAllQuery.resize(sq.size());
 	for(i=0; i<sq.size(); i++){
 		CMeta::Tokenize(sq[i].c_str(), m_vecstrAllQuery[i], " ", false);
-		//fprintf(stderr, "%s\n", sq[i].c_str());
 	}
-	//fprintf(stderr, "%s\n", output_dir.c_str());
+
 	m_searchdsetMap.resize(m_vecstrAllQuery.size());
 	for(i=0; i<m_vecstrAllQuery.size(); i++){
 		m_searchdsetMap[i] = new CSeekIntIntMap(m_vecstrDatasets.size());
 				m_mapstrintDataset[m_vecstrSearchDatasets[i][j]]);
 	}
 
-	//fprintf(stderr, "B2 %lu\n", CMeta::GetMemoryUsage());
+	m_vecDB.resize(src->m_vecDB.size());
+	for(i=0; i<m_vecDB.size(); i++)
+		m_vecDB[i] = src->m_vecDB[i];
+	//m_DB = src->m_DB; //shared DB
 
-	m_DB = src->m_DB; //shared DB
+	m_vecDBDataset.resize(src->m_vecDB.size());
+	for(i=0; i<m_vecDB.size(); i++)
+		copy(src->m_vecDBDataset[i].begin(), src->m_vecDBDataset[i].end(),
+		m_vecDBDataset[i].begin());
 
-	CSeekTools::LoadDatabase(*m_DB, m_vc, src->m_vc, m_vp, src->m_vp,
-		m_vecstrDatasets, m_mapstrstrDatasetPlatform, m_mapstriPlatform);
-
-	//fprintf(stderr, "B3 %lu\n", CMeta::GetMemoryUsage());
+	CSeekTools::LoadDatabase(m_vecDB, m_iGenes, m_iDatasets,
+		m_vc, src->m_vc, m_vp, src->m_vp, m_vecstrDatasets,
+		m_mapstrstrDatasetPlatform, m_mapstriPlatform);
 
 	if(!CalculateRestart())
 		return false;
 				count[j]++;
 				present++;
 			}
+
 			//datasets that contains all query genes (very stringent)
 			//if(present==m_vecstrAllQuery[l].size()){
 
 }
 
 //load everything except query, search datasets, output directory
-bool CSeekCentral::Initialize(const char *gene, const char *quant,
-	const char *dset, const char *platform, const char *db,
-	const char *prep, const char *gvar, const char *sinfo,
-	const ushort num_db,
+bool CSeekCentral::Initialize(const vector<CSeekDBSetting*> &vecDBSetting,
 	const ushort buffer, const bool to_output_text,
 	const bool bOutputWeightComponent, const bool bSimulateWeight,
 	const enum CSeekDataset::DistanceMeasure dist_measure,
 	m_bLogit = bLogit;
 	m_eDistMeasure = dist_measure;
 
-	string strGvarDirectory = gvar;
-	string strSinfoDirectory = sinfo;
-	if(dist_measure==CSeekDataset::CORRELATION && sinfo=="NA"){
-		fprintf(stderr, "Error: not specifying sinfo!\n");
-		return false;
-	}
+	bool bCorrelation = false;
 
-	if(dist_measure==CSeekDataset::CORRELATION && 
-		(m_bSubtractGeneAvg || m_bNormPlatform || m_bLogit)){
-		fprintf(stderr, 
-			"Warning: setting subtract_avg, norm platform to false\n");
-		m_bSubtractGeneAvg = false;
-		m_bNormPlatform = false;
-		m_bLogit = false;
+	if(dist_measure==CSeekDataset::CORRELATION){
+		bCorrelation = true;
+		if(m_bSubtractGeneAvg || m_bNormPlatform || m_bLogit){
+			fprintf(stderr,
+				"Warning: setting subtract_avg, norm platform to false\n");
+			m_bSubtractGeneAvg = false;
+			m_bNormPlatform = false;
+			m_bLogit = false;
+		}
 	}
 
 	//read genes
 	vector<string> vecstrGeneID;
-	if(!CSeekTools::ReadListTwoColumns(gene, vecstrGeneID, m_vecstrGenes))
+	if(!CSeekTools::ReadListTwoColumns(vecDBSetting[0]->GetValue("gene"),
+		vecstrGeneID, m_vecstrGenes))
 		return false;
-
 	for(i=0; i<m_vecstrGenes.size(); i++)
 		m_mapstrintGene[m_vecstrGenes[i]] = i;
 
-	CSeekTools::ReadQuantFile(quant, m_quant);
-	m_DB = new CDatabase(useNibble);
+	//read quant file
+	CSeekTools::ReadQuantFile(vecDBSetting[0]->GetValue("quant"), m_quant);
 
-	//read datasets
-	if(!CSeekTools::ReadListTwoColumns(dset, m_vecstrDatasets, m_vecstrDP))
-		return false;
+	m_vecstrDatasets.clear();
+	m_vecstrDP.clear();
+	m_mapstriPlatform.clear();
+	m_mapstrstrDatasetPlatform.clear();
+	m_mapstrintDataset.clear();
+	m_vp.clear();
+
+	m_vecDB.resize(vecDBSetting.size());
+	m_vecDBDataset.resize(vecDBSetting.size());
+	for(i=0; i<vecDBSetting.size(); i++)
+		m_vecDB[i] = NULL;
+
+	for(i=0; i<vecDBSetting.size(); i++){
+		if(dist_measure==CSeekDataset::CORRELATION &&
+		vecDBSetting[i]->GetValue("sinfo")=="NA"){
+			fprintf(stderr, "Error: not specifying sinfo!\n");
+			return false;
+		}
+
+		m_vecDB[i] = new CDatabase(useNibble);
+		//read datasets
+		vector<string> vD, vDP;
+		if(!CSeekTools::ReadListTwoColumns(vecDBSetting[i]->GetValue("dset"), vD, vDP))
+			return false;
+
+		for(j=0; j<vD.size(); j++){
+			m_vecstrDatasets.push_back(vD[j]);
+			m_vecDBDataset[i].push_back(vD[j]);
+			m_vecstrDP.push_back(vDP[j]);
+		}
+
+		vector<string> vecstrPlatforms;
+		map<string,ushort> mapstriPlatform;
+		vector<CSeekPlatform> vp;
+		CSeekTools::ReadPlatforms(vecDBSetting[i]->GetValue("platform"), vp,
+			vecstrPlatforms, mapstriPlatform);
+		for(map<string,ushort>::iterator it=mapstriPlatform.begin();
+			it!=mapstriPlatform.end(); it++){
+			m_mapstriPlatform[it->first] = it->second;
+		}
+
+		int cur = m_vp.size();
+		m_vp.resize(cur+vp.size());
+		for(j=0; j<vp.size(); j++)
+			m_vp[cur+j].Copy(vp[j]);
+	}
 
 	for(i=0; i<m_vecstrDatasets.size(); i++){
 		m_mapstrstrDatasetPlatform[m_vecstrDatasets[i]] = m_vecstrDP[i];
 		m_mapstrintDataset[m_vecstrDatasets[i]] = i;
 	}
 
-	vector<string> vecstrPlatforms;
-	CSeekTools::ReadPlatforms(platform, m_vp, vecstrPlatforms,
-		m_mapstriPlatform);
-
 	m_iDatasets = m_vecstrDatasets.size();
 	m_iGenes = m_vecstrGenes.size();
 
-	m_DB->Open(db, m_vecstrGenes, m_iDatasets, num_db);
-	CSeekTools::LoadDatabase(*m_DB, prep, gvar, sinfo, m_vecstrDatasets,
-		m_mapstrstrDatasetPlatform, m_mapstriPlatform, m_vp, m_vc);
+	for(i=0; i<vecDBSetting.size(); i++){
+		m_vecDB[i]->Open(vecDBSetting[i]->GetValue("db"),
+			m_vecstrGenes, m_vecDBDataset[i].size(), vecDBSetting[i]->GetNumDB());
+	}
+
+	CSeekTools::LoadDatabase(m_vecDB, m_iGenes, m_iDatasets,
+		vecDBSetting, m_vecstrDatasets, m_mapstrstrDatasetPlatform,
+		m_mapstriPlatform, m_vp, m_vc, m_vecDBDataset, m_mapstrintDataset,
+		false, bCorrelation);
 
 	return true;
 }
 
-
-bool CSeekCentral::Initialize(const char *gene, const char *quant,
-	const char *dset, const char *search_dset,
-	const char *query, const char *platform, const char *db,
-	const char *prep, const char *gvar, const char *sinfo,
-	const ushort num_db, const char *output_dir,
-	const ushort buffer, const bool to_output_text,
+bool CSeekCentral::Initialize(
+	const vector<CSeekDBSetting*> &vecDBSetting,
+	const char *search_dset, const char *query,
+	const char *output_dir, const ushort buffer, const bool to_output_text,
 	const bool bOutputWeightComponent, const bool bSimulateWeight,
 	const enum CSeekDataset::DistanceMeasure dist_measure,
 	const bool bSubtractAvg, const bool bNormPlatform,
 	const bool bSquareZ, const bool bRandom, const int iNumRandom,
 	gsl_rng *rand, const bool useNibble){
 
-	if(!CSeekCentral::Initialize(gene, quant, dset, platform, 
-		db, prep, gvar, sinfo, num_db, buffer, to_output_text,
+	if(!CSeekCentral::Initialize(vecDBSetting, buffer, to_output_text,
 		bOutputWeightComponent, bSimulateWeight, dist_measure,
 		bSubtractAvg, bNormPlatform, bLogit, fCutOff, fPercentRequired,
 		bSquareZ, bRandom, iNumRandom, rand, useNibble)){
 
 	if(!CalculateRestart()) return false;
 
-
 	return true;
 }
 
 	vector<ushort> queryGenes;
 	ushort j;
 	for(j=0; j<vecstrQuery.size(); j++){
-		size_t m = m_DB->GetGene(vecstrQuery[j]);
-		if(m==-1) continue;
-		queryGenes.push_back(m);
+		if(m_mapstrintGene.find(vecstrQuery[j])==
+			m_mapstrintGene.end()) continue;
+		//size_t m = m_DB->GetGene(vecstrQuery[j]);
+		//if(m==-1) continue;
+		//queryGenes.push_back(m);
+		queryGenes.push_back(m_mapstrintGene[vecstrQuery[j]]);
 	}
 	queryGenes.resize(queryGenes.size());
 	query.InitializeQuery(queryGenes, m_iGenes);
 	const vector<char> &cQuery = query.GetQueryPresence();
 	for(ii=0, jj=0; jj<500; ii++){
 		if(cQuery[final[ii].i]==1) continue;
+		//fprintf(stderr, "%s %.5f\n",
+		//	m_DB->GetGene((size_t)final[ii].i).c_str(), final[ii].f);
 		fprintf(stderr, "%s %.5f\n",
-			m_DB->GetGene((size_t)final[ii].i).c_str(), final[ii].f);
+			m_vecstrGenes[(size_t)final[ii].i].c_str(), final[ii].f);
 		jj++;
 	}
 	return true;
 
 	for(i=0; i<m_vecstrAllQuery.size(); i++){
 
-
 		//simulated weight case ======================
 		/*if(simulateWeight && redoWithEqual>=1) //1 or 2 
 			current_sm = EQUAL;
 
 		if(m_mapLoadTime.find(i)!=m_mapLoadTime.end()){
 			if(!m_bRandom || l==0){ //l==0: first random repetition
-				CSeekTools::ReadDatabaselets(*m_DB, m_mapLoadTime[i], m_vc, 
-				m_iClient, m_bEnableNetwork);
+				CSeekTools::ReadDatabaselets(m_vecDB, m_iGenes, m_iDatasets,
+					m_mapLoadTime[i], m_vc, m_mapstrintGene, m_vecDBDataset,
+					m_mapstrintDataset, m_iClient, m_bEnableNetwork);
 			}
 		}
 
 }
 
 ushort CSeekCentral::GetGene(const string &strGene) const{ // Translate a gene name into the index stored for it in m_mapstrintGene.
-	return (ushort) m_DB->GetGene(strGene);
+	if(m_mapstrintGene.find(strGene)==m_mapstrintGene.end())
+		return CSeekTools::GetNaN(); // name is not in the map: return the NaN sentinel rather than an index
+	return m_mapstrintGene.find(strGene)->second; // second lookup of the same key; yields the stored index
 }
 string CSeekCentral::GetGene(const ushort &geneID) const{ // Translate a gene index back into its name (returned by value).
-	return m_DB->GetGene((size_t) geneID);
+	return m_vecstrGenes[(size_t) geneID]; // unchecked operator[]: geneID is not validated against m_vecstrGenes.size() here
 }
 
 const vector<vector<float> >& CSeekCentral::GetAllWeight() const{

src/seekcentral.h

  * The Seek search algorithms perform the coexpression search of the user's
  * query genes in a large compendium of microarray datasets. 
  * The output of the search algorithms is a ranking of genes based on their
- * gene score, where the gene score represents the overall weighted coexpression
+ * gene score, which is determined by the overall weighted coexpression
  * to the query genes. 
  *
  * One of the first steps in a search is to weight
  * the datasets in such a way as to prioritize informative datasets.
- * Then, with the dataset weight generated, the final gene-score is given by:
+ * Then, with the weights generated, the final gene-score is given by:
  * \f[FS(g, Q)=\alpha\sum_{d \in D}{w_d \cdot s_d(g, Q)}\f]
  * where \f$w_d\f$ is the weight of the dataset, \f$s_d(g, Q)\f$ is the score
  * of \f$g\f$ to the query in the dataset, \f$\alpha\f$ is the normalization 
      * the contribution of each dataset, the simulated weight is computed from the distance of a dataset's coexpression ranking to the final gene ranking.
      * \remark This function is designed to be used by SeekMiner.
      */
-	bool Initialize(const char *gene, const char *quant,
-		const char *dset, const char *search_dset,
-		const char *query, const char *platform, const char* db,
-		const char *prep, const char *gvar, const char *sinfo,
-		const ushort num_db, const char* output_dir,
+	bool Initialize(
+		const vector<CSeekDBSetting*> &vecDBSetting,
+		const char *search_dset, const char *query,
+		const char* output_dir,
 		const ushort buffer = 20, const bool to_output_text = false,
 		const bool bOutputWeightComponent = false, const bool bSimulateWeight = false,
 		const enum CSeekDataset::DistanceMeasure dist_measure = CSeekDataset::Z_SCORE,
      * the contribution of each dataset, the simulated weight is computed from the distance of a dataset's coexpression ranking to the final gene ranking.
      * \remark This function is designed to be used by SeekMiner.
      */
-	bool Initialize(const char *gene, const char *quant,
-		const char *dset, const char *platform, const char* db,
-		const char *prep, const char *gvar, const char *sinfo,
-		const ushort num_db,
+	bool Initialize(
+		const vector<CSeekDBSetting*> &vecDBSetting,
 		const ushort buffer = 20, const bool to_output_text = false,
 		const bool bOutputWeightComponent = false, const bool bSimulateWeight = false,
 		const enum CSeekDataset::DistanceMeasure dist_measure = CSeekDataset::Z_SCORE,
 	/* random gene scores over all repetitions */
 	//vector<vector<float> > m_vecRandScore; 
 
-	/* Gene-gene correlation matrix for all datasets*/
+	/* Gene-gene correlation matrix for all datasets
+	 Organized per thread */
 	ushort ***m_rData;
 
 	/* Correlation discretization */
 	/* Correlation transformation options */
 	bool m_bSubtractGeneAvg;
 	bool m_bNormPlatform;
-	//bool m_bSubtractPlatformAvg;
-	//bool m_bDividePlatformStdev;
 	enum CSeekDataset::DistanceMeasure m_eDistMeasure;
-	//bool m_bCorrelation;
 	bool m_bLogit;
 	bool m_bSquareZ;
 
 	map<string, ushort> m_mapstriPlatform;
 	vector<string> m_vecstrPlatform;
 
-	CDatabase *m_DB;
+	//CDatabase reference
+	vector<CDatabase*> m_vecDB;
+	vector<vector<string> > m_vecDBDataset; //A list of dsets in each CDatabase
 
 	size_t m_iDatasets;
 	size_t m_iGenes;
 	bool m_bSharedDB; //if the CDatabase objects in m_vecDB are shared between multiple CSeekCentral instances
 };
 
+
+
+
+
 }
 #endif

src/seekdataset.cpp

 		for(i=0; i<src->geneAverage.size(); i++){
 			geneAverage[i] = src->geneAverage[i];
 		}
-		//copy(src->geneAverage.begin(), src->geneAverage.end(), 
-		//	geneAverage.begin());
 	}
 	if(src->genePresence.size()>0){
 		//fprintf(stderr, "Great b!\n");
 		for(i=0; i<src->genePresence.size(); i++){
 			genePresence[i] = src->genePresence[i];
 		}
-		//copy(src->genePresence.begin(), src->genePresence.end(),
-		//	genePresence.begin());
 	}
 	if(src->geneVariance.size()>0){
 		geneVariance.resize(src->geneVariance.size());
 	return *platform;
 }
 
-
-
-
 }

src/seekdataset.h

 #include "seekplatform.h"
 
 namespace Sleipnir {
+
+class CSeekDBSetting{ // Value holder for the directory paths, file paths and DB count that describe one CDatabase collection.
+public:
+	CSeekDBSetting(const string &gvar, // Constructor taking std::string arguments; each argument is copied into the matching member.
+		const string &sinfo, const string &plat,
+		const string &prep, const string &db,
+		const string &gene, const string &quant,
+		const string &dset, const ushort &numDB){ // note the argument order: directories first, then the gene/quant/dset files, then the count
+		m_gvarDirectory = gvar;
+		m_sinfoDirectory = sinfo;
+		m_platformDirectory = plat;
+		m_prepDirectory = prep;
+		m_dbDirectory = db;
+		m_geneMapFile = gene;
+		m_quantFile = quant;
+		m_dsetFile = dset;
+		m_numDB = numDB;
+	}
+	CSeekDBSetting(const char *gvar, // Constructor taking C strings; same argument order and assignments as the std::string overload.
+		const char* sinfo, const char* plat,
+		const char* prep, const char* db,
+		const char* gene, const char* quant,
+		const char* dset, const ushort &numDB){ // the pointers are assigned to std::string members without a null check
+		m_gvarDirectory = gvar;
+		m_sinfoDirectory = sinfo;
+		m_platformDirectory = plat;
+		m_prepDirectory = prep;
+		m_dbDirectory = db;
+		m_geneMapFile = gene;
+		m_quantFile = quant;
+		m_dsetFile = dset;
+		m_numDB = numDB;
+	}
+
+	~CSeekDBSetting(){ // Empty destructor: the class declares only std::string and ushort members.
+	}
+
+	string GetValue(const string &str){ // Return (by value) the setting selected by the key str; not declared const.
+		if(str=="gene") // recognized keys: gene, dset, quant, gvar, sinfo, db, prep, platform
+			return m_geneMapFile;
+		else if(str=="dset")
+			return m_dsetFile;
+		else if(str=="quant")
+			return m_quantFile;
+		else if(str=="gvar")
+			return m_gvarDirectory;
+		else if(str=="sinfo")
+			return m_sinfoDirectory;
+		else if(str=="db")
+			return m_dbDirectory;
+		else if(str=="prep")
+			return m_prepDirectory;
+		else if(str=="platform")
+			return m_platformDirectory;
+		else
+			return "NULL"; // unrecognized key: yields the literal text "NULL", not an empty string
+	}
+
+	ushort GetNumDB(){ // Return the stored DB count (m_numDB); not declared const.
+		return m_numDB;
+	}
+
+private:
+	string m_gvarDirectory; // returned for key "gvar"
+	string m_sinfoDirectory; // returned for key "sinfo"
+	string m_platformDirectory; // returned for key "platform"
+	string m_prepDirectory; // returned for key "prep"
+	string m_dbDirectory; // returned for key "db"
+	string m_geneMapFile; // returned for key "gene"
+	string m_quantFile; // returned for key "quant"
+	string m_dsetFile; // returned for key "dset"
+	ushort m_numDB; // returned by GetNumDB(); presumably the number of database files in the collection — TODO confirm
+};
+
+
+
 /*!
  * \brief Representation of a microarray dataset that is used by Seek
  *
 
 	float sum_weight;
 	bool m_bIsNibble;
-
 };
 
 

src/seekreader.cpp

 	return false;
 }
 
-bool CSeekTools::ReadDatabaselets(const CDatabase &DB, 
+ushort CSeekTools::GetNaN(){ // Return the ushort value used as the NaN sentinel.
+	return 65535; // presumably the largest value of a 16-bit ushort — confirm ushort's width on the target platform
+}
+
+bool CSeekTools::ReadDatabaselets(const vector<CDatabase*> &DB,
+	const size_t &iGenes, const size_t &iDatasets,
 	const vector< vector<string> > &vecstrAllQuery,
-	vector<CSeekDataset*> &vc, 
+	vector<CSeekDataset*> &vc, const map<string,ushort> &mapstriGenes,
+	const vector<vector<string> > &dbDatasets,
+	const map<string,ushort> &mapstriDatasets,
 	//network mode (data sent to client)
 	const int &iClient, const bool &bNetwork){
 
 	//requires LoadDatabase to be called beforehand
-	size_t iGenes = DB.GetGenes();
-	size_t iDatasets = DB.GetDatasets();
 	size_t i, j, k;
 	vector<char> cAllQuery;
 
 
 	for(i=0; i<vecstrAllQuery.size(); i++){
 		for(j=0; j<vecstrAllQuery[i].size(); j++){
-			if((k = DB.GetGene(vecstrAllQuery[i][j]))==-1) continue;
+			if(mapstriGenes.find(vecstrAllQuery[i][j])==mapstriGenes.end()) continue;
+			ushort k = mapstriGenes.find(vecstrAllQuery[i][j])->second;
 			cAllQuery[k] = 1;
 		}
 	}
 	vector<ushort> allQ;
 	for(i=0; i<cAllQuery.size(); i++) if(cAllQuery[i]==1) allQ.push_back(i);
 	allQ.resize(allQ.size());
-	/*for(i=0; i<allQ.size(); i++){
-		fprintf(stderr, "allQ: %d %d\n", i, allQ[i]);
-	}*/
 
 	//for now
 	for(i=0; i<iDatasets; i++){
 	
 	//fprintf(stderr, "Here\n");	
 	#pragma omp parallel for \
-	shared(allQ) private(i) firstprivate(iDatasets) schedule(dynamic)
+	shared(allQ) private(i) schedule(dynamic)
 	for(i=0; i<iDatasets; i++){
 		vc[i]->InitializeQueryBlock(allQ);
 	}
 	}
 
 	size_t m;
+	size_t d;
 
 	for(i=0; i<allQ.size(); i++){
 		m = allQ[i];
-		vector<unsigned char> Qi;
-		if(!DB.GetGene(m, Qi)){
-			cerr << "Gene does not exist" << endl;
-			continue;
+		for(d=0; d<DB.size(); d++){
+			vector<unsigned char> Qi;
+			if(!DB[d]->GetGene(m, Qi)){
+				cerr << "Gene does not exist" << endl;
+				continue;
+			}
+			ushort db;
+			CSeekIntIntMap *qu = NULL;
+			unsigned char **r = NULL;
+			vector<ushort> vecDatasetID;
+			for(j=0; j<dbDatasets[d].size(); j++){
+				ushort qq = mapstriDatasets.find(dbDatasets[d][j])->second;
+				vecDatasetID.push_back(qq);
+			}
+			#pragma omp parallel for \
+			shared(Qi) private(j, k) \
+			firstprivate(m, qu, r, db) schedule(dynamic)
+			for(j=0; j<vecDatasetID.size(); j++){
+				if((qu=vc[vecDatasetID[j]]->GetDBMap())==NULL) continue;
+				if(CSeekTools::IsNaN(db = (qu->GetForward(m)))) continue;
+				for(r = vc[vecDatasetID[j]]->GetMatrix(), k=0; k<iGenes; k++)
+					r[db][k] = Qi[k*vecDatasetID.size()+j];
+			}
+			Qi.clear();
 		}
-
-		ushort db;
-		CSeekIntIntMap *qu = NULL;
-		unsigned char **r = NULL;
-
-		#pragma omp parallel for \
-		shared(Qi) private(j, k) \
-		firstprivate(iDatasets, iGenes, m, qu, r, db) schedule(dynamic)
-		for(j=0; j<iDatasets; j++){
-			if((qu=vc[j]->GetDBMap())==NULL) continue;
-			if(CSeekTools::IsNaN(db = (qu->GetForward(m)))) continue;
-			for(r = vc[j]->GetMatrix(), k=0; k<iGenes; k++)
-				r[db][k] = Qi[k*iDatasets+j];
-
-			/*vector<unsigned char>::iterator iterQ = Qi.begin() + j;
-			unsigned char *rp = &r[db][0];
-			unsigned char *rp_end = &r[db][0] + iGenes;
-			for(; rp!=rp_end; rp++, iterQ+=iDatasets){
-				*rp = *iterQ;
-			}*/
-		}
-
-		Qi.clear();
 	}
 
 	fprintf(stderr, "Finished reading query genes' correlations\n");
 	return true;
 }
 
-bool CSeekTools::LoadDatabase(const CDatabase &DB,
-	const string &strPrepInputDirectory, 
-	const string &strGvarInputDirectory,
-	const string &strSinfoInputDirectory,
-	const vector<string> &vecstrDatasets,
-	const map<string, string> &mapstrstrDatasetPlatform,
-	const map<string, ushort> &mapstriPlatform, vector<CSeekPlatform> &vp,
-	vector<CSeekDataset*> &vc){
-	return CSeekTools::LoadDatabase(DB, strPrepInputDirectory.c_str(),
-		strGvarInputDirectory.c_str(), strSinfoInputDirectory.c_str(),
-		vecstrDatasets, mapstrstrDatasetPlatform, mapstriPlatform, vp, vc);
-}
-
-bool CSeekTools::LoadDatabase(const CDatabase &DB, 
+bool CSeekTools::LoadDatabase(const vector<CDatabase*> &DB,
+	const size_t &iGenes, const size_t &iDatasets,
 	vector<CSeekDataset*> &vc, const vector<CSeekDataset*> &vc_src, 
 	vector<CSeekPlatform> &vp, const vector<CSeekPlatform> &vp_src, 
 	const vector<string> &vecstrDatasets,
 	const map<string, string> &mapstrstrDatasetPlatform, 
 	const map<string, ushort> &mapstriPlatform){
 
-	size_t iDatasets = DB.GetDatasets();
-	size_t iGenes = DB.GetGenes();
 	size_t i, j, k;
 
 	vc.clear();
 
 	fprintf(stderr, "Initializing gene map\n"); ret = system("date +%s%N 1>&2");
 	#pragma omp parallel for \
-	private(i) firstprivate(iDatasets) schedule(dynamic)
+	private(i) schedule(dynamic)
 	for(i=0; i<iDatasets; i++){
 		vc[i] = new CSeekDataset();
 		vc[i]->Copy(vc_src[i]);
 	return true;
 }
 
-bool CSeekTools::LoadDatabase(const CDatabase &DB,
-	const char *prep_dir, const char *gvar_dir, const char *sinfo_dir,
+bool CSeekTools::LoadDatabase(const vector<CDatabase*> &DB,
+	const size_t &iGenes, const size_t &iDatasets,
+	const vector<CSeekDBSetting*> &DBSetting,
 	const vector<string> &vecstrDatasets,
 	const map<string, string> &mapstrstrDatasetPlatform,
 	const map<string, ushort> &mapstriPlatform, vector<CSeekPlatform> &vp,
-	vector<CSeekDataset*> &vc){
-		
-	size_t iDatasets = DB.GetDatasets();
-	size_t iGenes = DB.GetGenes();
+	vector<CSeekDataset*> &vc, const vector<vector<string> > &dbDataset,
+	const map<string,ushort> &mapstriDataset,
+	const bool bVariance, const bool bCorrelation){
+
 	size_t i, j, k;
 	vc.clear();
 	vc.resize(iDatasets);
-	string strPrepInputDirectory = prep_dir; //must be non NA
 
-	bool bVariance = false;
-	bool bCorrelation = false;
+	if(bCorrelation){
+		for(i=0; i<DB.size(); i++){
+			if(DBSetting[i]->GetValue("sinfo")=="NA"){
+				fprintf(stderr, "sinfo parameter must be given.\n");
+				return false;
+			}
+		}
+	}
 
-	string strSinfoInputDirectory = sinfo_dir;
-	string strGvarInputDirectory = gvar_dir;
-
-	if(strSinfoInputDirectory!="NA"){
-		bCorrelation = true;
-	}
-	if(strGvarInputDirectory!="NA"){
-		bVariance = true;
+	if(bVariance){
+		for(i=0; i<DB.size(); i++){
+			if(DBSetting[i]->GetValue("gvar")=="NA"){
+				fprintf(stderr, "gene variance parameter must be given.\n");
+				return false;
+			}
+		}
 	}
 
 	int ret; //system call return
 
 	fprintf(stderr, "Start reading average and presence files\n");
 	ret = system("date +%s%N 1>&2");
-	for(i=0; i<iDatasets; i++){
-		vc[i] = new CSeekDataset();
-		string strFileStem = vecstrDatasets[i];
-		string strAvgPath = strPrepInputDirectory + "/" +
-			strFileStem + ".gavg";
-		string strPresencePath = strPrepInputDirectory + "/" +
-			strFileStem + ".gpres";
-		vc[i]->ReadGeneAverage(strAvgPath);
-		vc[i]->ReadGenePresence(strPresencePath);
-		if(bVariance){
-			string strVariancePath = strGvarInputDirectory + "/" +
-				strFileStem + ".gexpvar";
-			vc[i]->ReadGeneVariance(strVariancePath);
+	for(i=0; i<DB.size(); i++){
+		const vector<string> &dset = dbDataset[i];
+		string strPrepInputDirectory = DBSetting[i]->GetValue("prep");
+		string strGvarInputDirectory = DBSetting[i]->GetValue("gvar");
+		string strSinfoInputDirectory = DBSetting[i]->GetValue("sinfo");
+
+		for(j=0; j<dset.size(); j++){
+			ushort d = mapstriDataset.find(dset[j])->second;
+			vc[d] = new CSeekDataset();
+			string strFileStem = dset[j];
+			string strAvgPath = strPrepInputDirectory + "/" +
+				strFileStem + ".gavg";
+			string strPresencePath = strPrepInputDirectory + "/" +
+				strFileStem + ".gpres";
+			vc[d]->ReadGeneAverage(strAvgPath);
+			vc[d]->ReadGenePresence(strPresencePath);
+			if(bVariance){
+				string strVariancePath = strGvarInputDirectory + "/" +
+					strFileStem + ".gexpvar";
+				vc[d]->ReadGeneVariance(strVariancePath);
+			}
+			if(bCorrelation){
+				string strSinfoPath = strSinfoInputDirectory + "/" +
+					strFileStem + ".sinfo";
+				vc[d]->ReadDatasetAverageStdev(strSinfoPath);
+			}
+			string strPlatform =
+				mapstrstrDatasetPlatform.find(strFileStem)->second;
+			ushort platform_id = mapstriPlatform.find(strPlatform)->second;
+			vc[d]->SetPlatform(vp[platform_id]);
 		}
-		if(bCorrelation){
-			string strSinfoPath = strSinfoInputDirectory + "/" + 
-				strFileStem + ".sinfo";
-			vc[i]->ReadDatasetAverageStdev(strSinfoPath);
-		}
-		string strPlatform =
-			mapstrstrDatasetPlatform.find(strFileStem)->second;
-		ushort platform_id = mapstriPlatform.find(strPlatform)->second;
-		vc[i]->SetPlatform(vp[platform_id]);
 	}
+
 	fprintf(stderr, "Done reading average and presence files\n");
 	ret = system("date +%s%N 1>&2");
 
 	fprintf(stderr, "Initializing gene map\n"); ret = system("date +%s%N 1>&2");
 	#pragma omp parallel for \
-	private(i) firstprivate(iDatasets) schedule(dynamic)
+	private(i) schedule(dynamic)
 	for(i=0; i<iDatasets; i++) vc[i]->InitializeGeneMap();
 
 	fprintf(stderr, "Done initializing gene map\n"); ret = system("date +%s%N 1>&2");
 	static bool IsNaN(const ushort &);
 
 	/*!
+	 * \brief Return the NaN value as a ushort
+	 */
+	static ushort GetNaN();
+
+	/*!
 	 * \brief Converts an integer to a string
 	 * \param number The given integer number
 	 * \return The string
 	 * \remarks
 	 * Assumes that the CSeekTools::LoadDatabase() has been called.
 	 */
-	static bool ReadDatabaselets(const CDatabase &, 
-		const vector< vector<string> > &, vector<CSeekDataset*> &, 
+	static bool ReadDatabaselets(const vector<CDatabase*>&,
+		const size_t&, const size_t&,
+		const vector<vector<string> >&,
+		vector<CSeekDataset*>&,
+		const map<string,ushort> &,
+		const vector<vector<string> > &, const map<string,ushort> &,
 		//network mode options
 		const int&, const bool&);
 
 	 * \param vc The vector of CSeekDataset, the output
 	 *
 	 */
-	static bool LoadDatabase(const CDatabase &, const string &,
-		const string &, const string &,
-		const vector<string> &, const map<string, string> &,
-		const map<string, ushort> &, vector<CSeekPlatform> &,
-		vector<CSeekDataset*> &);
-
-	/*!
-	 * \brief Read the search setting files and load the CDatabase
-	 *
-	 * Same as the previous CSeekTools::LoadDatabase() definition, except that this function
-	 * accepts string arguments as \c const \c char \c *.
-	 */
-	static bool LoadDatabase(const CDatabase &, const char *,
-		const char *, const char *,
-		const vector<string> &, const map<string, string> &,
-		const map<string, ushort> &, vector<CSeekPlatform> &,
-		vector<CSeekDataset*> &);
+	static bool LoadDatabase(const vector<CDatabase*>&,
+		const size_t&, const size_t&,
+		const vector<CSeekDBSetting*>&,
+		const vector<string>&,
+		const map<string,string>&,
+		const map<string,ushort>&, vector<CSeekPlatform>&,
+		vector<CSeekDataset*>&, const vector<vector<string> >&,
+		const map<string,ushort>&,
+		const bool=false, const bool=false);
 
 	/*!
 	 * \brief Load a CDatabase by copying from an existing instance
 	 * \param mapstrstrDatasetPlatform The dataset-platform mapping
 	 * \param mapstriPlatform Platform name-platform ID mapping
 	 */
-	static bool LoadDatabase(const CDatabase &, vector<CSeekDataset*>&,
+	static bool LoadDatabase(
+		const vector<CDatabase*>&, const size_t&, const size_t&,
+		vector<CSeekDataset*>&,
 		const vector<CSeekDataset*>&, vector<CSeekPlatform>&, 
 		const vector<CSeekPlatform>&, const vector<string>&, 
 		const map<string,string>&, const map<string,ushort>&);

src/seekwriter.cpp

 			num++;
 		}
 		vecResult[i] = sum / (float) num;
-		fprintf(stderr, "%.2f\n", vecResult[i]);
+		//fprintf(stderr, "%.2f\n", vecResult[i]);
 		free(v);
 	}
 	return true;

tools/SeekMiner/SeekMiner.cpp

 	pthread_win32_process_attach_np( );
 #endif // WIN32
 	gengetopt_args_info	sArgs;
+	const int lineSize = 1024;
 
 	if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
 		cmdline_parser_print_help( );
 	getchar();*/
 
 	CSeekCentral *csfinal = new CSeekCentral();
-	if(!csfinal->Initialize(sArgs.input_arg, sArgs.quant_arg, sArgs.dset_arg,
+	CSeekDBSetting *dbSetting = new CSeekDBSetting(sArgs.dir_gvar_arg,
+		sArgs.dir_sinfo_arg, sArgs.dir_platform_arg, sArgs.dir_prep_in_arg,
+		sArgs.dir_in_arg, sArgs.input_arg, sArgs.quant_arg, sArgs.dset_arg,
+		sArgs.num_db_arg);
+	vector<CSeekDBSetting*> cc;
+	cc.push_back(dbSetting);
+
+	string add_db = sArgs.additional_db_arg;
+	if(add_db!="NA"){
+		ifstream ifsm;
+		ifsm.open(add_db.c_str());
+		if(!ifsm.is_open()){
+			fprintf(stderr, "Error opening file %s\n", add_db.c_str());
+			return false;
+		}
+		char acBuffer[lineSize];
+		ushort c_iBuffer = lineSize;
+		map<string,string> parameters;
+		i=0;
+		while(!ifsm.eof()){
+			ifsm.getline(acBuffer, c_iBuffer-1);
+			if(acBuffer[0]==0) break;
+			acBuffer[c_iBuffer-1]=0;
+			vector<string> tok;
+			CMeta::Tokenize(acBuffer, tok); //separator tab
+			parameters[tok[0]] = tok[1];
+		}
+		ifsm.close();
+
+		string sinfo_dir = "NA";
+		string gvar_dir = "NA";
+		string platform_dir = "NA";
+		string prep_dir = "NA";
+		string db_dir = "NA";
+		string dset_map_file = "NA";
+		string gene_map_file = "NA";
+		string quant_file = "NA";
+		int num_db = -1;
+
+		if(eDistMeasure==CSeekDataset::CORRELATION){
+			if(parameters.find("SINFO_DIR")==parameters.end() ||
+				parameters.find("SINFO_DIR")->second=="NA"){
+				fprintf(stderr, "Please specify an sinfo directory for the extra db\n");
+				return false;
+			}
+			sinfo_dir = parameters.find("SINFO_DIR")->second;
+		}
+		if(parameters.find("GVAR_DIR")!=parameters.end())
+			gvar_dir = parameters.find("GVAR_DIR")->second;
+		if(parameters.find("PREP_DIR")==parameters.end() ||
+			parameters.find("PLATFORM_DIR")==parameters.end() ||
+			parameters.find("DB_DIR")==parameters.end() ||
+			parameters.find("DSET_MAP_FILE")==parameters.end() ||
+			parameters.find("GENE_MAP_FILE")==parameters.end() ||
+			parameters.find("QUANT_FILE")==parameters.end() ||
+			parameters.find("NUMBER_OF_DB")==parameters.end()){
+			fprintf(stderr, "Some arguments are missing. Please make sure the following are provided:\n");
+			fprintf(stderr, "PREP_DIR, DB_DIR, DSET_MAP_FILE, GENE_MAP_FILE, QUANT_FILE, NUMBER_OF_DB\n");
+		}
+
+		platform_dir = parameters.find("PLATFORM_DIR")->second;
+		db_dir = parameters.find("DB_DIR")->second;
+		prep_dir = parameters.find("PREP_DIR")->second;
+		dset_map_file = parameters.find("DSET_MAP_FILE")->second;
+		gene_map_file = parameters.find("GENE_MAP_FILE")->second;
+		quant_file = parameters.find("QUANT_FILE")->second;
+		num_db = atoi(parameters.find("NUMBER_OF_DB")->second.c_str());
+
+		CSeekDBSetting *dbSetting2 = new CSeekDBSetting(gvar_dir, sinfo_dir,
+			platform_dir, prep_dir, db_dir, gene_map_file, quant_file, dset_map_file,
+			num_db);
+		cc.push_back(dbSetting2);
+	}
+
+	if(!csfinal->Initialize(cc,
 		sArgs.search_dset_arg, 
 		//"/tmp/ex_query2.txt", 
 		sArgs.query_arg,
-		sArgs.dir_platform_arg,
-		sArgs.dir_in_arg, sArgs.dir_prep_in_arg, 
-		sArgs.dir_gvar_arg,
-		sArgs.dir_sinfo_arg,
-		sArgs.num_db_arg,
 		sArgs.output_dir_arg,
 		sArgs.buffer_arg, !!sArgs.output_text_flag,
 		bOutputWeightComponent, bSimulateWeight,
 	//csfinal->OrderStatistics();
 	csfinal->Destruct();
 	delete csfinal;
+	delete dbSetting;
+
+	if(add_db!="NA"){
+		delete cc[1];
+	}
+
+	cc.clear();
 
 #ifdef WIN32
 	pthread_win32_process_detach_np( );

tools/SeekMiner/SeekMiner.ggo

 								flag	off
 option	"simulate_w"		E	"If equal weighting or order-statistics weighting is selected, output simulated dataset weights"
 								flag	off
-								
+option	"additional_db"		B	"Utilize a second CDatabase collection. Path to the second CDatabase's setting file."
+								string default="NA"	

tools/SeekMiner/cmdline.c

 /*
   File autogenerated by gengetopt version 2.22.5
   generated with the following command:
-  /usr/local/bin/gengetopt -iSeekMiner.ggo --default-optional -u -N -e 
+  /usr/bin/gengetopt -iSeekMiner.ggo --default-optional -u -N -e 
 
   The developers of gengetopt consider the fixed text that goes in all
   gengetopt output files to be in the public domain:
   "  -o, --output_dir=directory    Output directory",
   "  -Y, --output_w_comp           Output dataset weight components (generates \n                                  .dweight_comp file)  (default=off)",
   "  -E, --simulate_w              If equal weighting or order-statistics \n                                  weighting is selected, output simulated \n                                  dataset weights  (default=off)",
+  "  -B, --additional_db=STRING    Utilize a second CDatabase collection. Path to \n                                  the second CDatabase's setting file.  \n                                  (default=`NA')",
     0
 };
 
   args_info->output_dir_given = 0 ;
   args_info->output_w_comp_given = 0 ;
   args_info->simulate_w_given = 0 ;
+  args_info->additional_db_given = 0 ;
 }
 
 static
   args_info->output_dir_orig = NULL;
   args_info->output_w_comp_flag = 0;
   args_info->simulate_w_flag = 0;
+  args_info->additional_db_arg = gengetopt_strdup ("NA");
+  args_info->additional_db_orig = NULL;
   
 }
 
   args_info->output_dir_help = gengetopt_args_info_help[41] ;
   args_info->output_w_comp_help = gengetopt_args_info_help[42] ;
   args_info->simulate_w_help = gengetopt_args_info_help[43] ;
+  args_info->additional_db_help = gengetopt_args_info_help[44] ;
   
 }
 
   free_string_field (&(args_info->buffer_orig));
   free_string_field (&(args_info->output_dir_arg));
   free_string_field (&(args_info->output_dir_orig));
+  free_string_field (&(args_info->additional_db_arg));
+  free_string_field (&(args_info->additional_db_orig));
   
   
   for (i = 0; i < args_info->inputs_num; ++i)
     write_into_file(outfile, "output_w_comp", 0, 0 );
   if (args_info->simulate_w_given)
     write_into_file(outfile, "simulate_w", 0, 0 );
+  if (args_info->additional_db_given)
+    write_into_file(outfile, "additional_db", args_info->additional_db_orig, 0);
   
 
   i = EXIT_SUCCESS;
         { "output_dir",	1, NULL, 'o' },
         { "output_w_comp",	0, NULL, 'Y' },
         { "simulate_w",	0, NULL, 'E' },
+        { "additional_db",	1, NULL, 'B' },
         { 0,  0, 0, 0 }
       };
 
-      c = getopt_long (argc, argv, "hx:D:i:q:d:p:P:u:U:Q:n:V:w:f:W:R:F:lSt:z:mMc:eC:I:X:G:Nb:Oo:YE", long_options, &option_index);
+      c = getopt_long (argc, argv, "hx:D:i:q:d:p:P:u:U:Q:n:V:w:f:W:R:F:lSt:z:mMc:eC:I:X:G:Nb:Oo:YEB:", long_options, &option_index);
 
       if (c == -1) break;	/* Exit from `while (1)' loop.  */
 
             goto failure;
         
           break;
+        case 'B':	/* Utilize a second CDatabase collection. Path to the second CDatabase's setting file..  */
+        
+        
+          if (update_arg( (void *)&(args_info->additional_db_arg), 
+               &(args_info->additional_db_orig), &(args_info->additional_db_given),
+              &(local_args_info.additional_db_given), optarg, 0, "NA", ARG_STRING,
+              check_ambiguity, override, 0, 0,
+              "additional_db", 'B',
+              additional_error))
+            goto failure;
+        
+          break;
 
         case 0:	/* Long option with no short option */
           /* Print version and exit.  */

tools/SeekMiner/cmdline.h

   const char *output_w_comp_help; /**< @brief Output dataset weight components (generates .dweight_comp file) help description.  */
   int simulate_w_flag;	/**< @brief If equal weighting or order-statistics weighting is selected, output simulated dataset weights (default=off).  */
   const char *simulate_w_help; /**< @brief If equal weighting or order-statistics weighting is selected, output simulated dataset weights help description.  */
+  char * additional_db_arg;	/**< @brief Utilize a second CDatabase collection. Path to the second CDatabase's setting file. (default='NA').  */
+  char * additional_db_orig;	/**< @brief Utilize a second CDatabase collection. Path to the second CDatabase's setting file. original value given at command line.  */
+  const char *additional_db_help; /**< @brief Utilize a second CDatabase collection. Path to the second CDatabase's setting file. help description.  */
   
   unsigned int help_given ;	/**< @brief Whether help was given.  */
   unsigned int version_given ;	/**< @brief Whether version was given.  */
   unsigned int output_dir_given ;	/**< @brief Whether output_dir was given.  */
   unsigned int output_w_comp_given ;	/**< @brief Whether output_w_comp was given.  */
   unsigned int simulate_w_given ;	/**< @brief Whether simulate_w was given.  */
+  unsigned int additional_db_given ;	/**< @brief Whether additional_db was given.  */
 
   char **inputs ; /**< @brief unamed options (options without names) */
   unsigned inputs_num ; /**< @brief unamed options number */

tools/SeekMiner/stdafx.cpp

 /*!
  * \page SeekMiner SeekMiner
  *
- * SeekMiner returns a gene-ranking based on the coexpressions to the user-specified
- * query genes. It finds relevant datasets by using one of the many dataset weighting
- * algorithms, including the query-coexpression weighting, the order statistics 
- * weighting, etc. Afterward, it performs a weighted integration of coexpressions
- * using the computed dataset weights.
- * The search algorithms employed by Seek are designed to be quick and efficient, and
- * they support the real-time weight calculations for thousands of microarray
- * datasets.
+ * SeekMiner is the main program for integrating coexpressions among thousands of
+ * microarray datasets. Users supply the program with a set of genes as input (the query), and the program
+ * returns other genes that are coexpressed with the input genes.
+ *
+ * The main challenge in performing the user's query is finding the right datasets.
+ * As not all microarrays are relevant to exploring the query's coexpression,
+ * SeekMiner particularly favors those datasets where the query genes are highly
+ * correlated with one another. As we would expect, coregulation of the query genes suggests that
+ * the biological process involving these genes is highly active. Datasets that meet this criterion are
+ * therefore very informative to the search process.
+ *
+ * In addition to the default coregulation-based weighting, SeekMiner supports other methods
+ * of scoring datasets, such as rank-based methods (order statistics) and equal weighting.
+ *
+ * Users can easily compare methods, adjust parameters in the search algorithms, specify
+ * the datasets to be integrated, and test a number of different queries of varying length,
+ * in order to achieve their desired results.
  *
  * \section sec_usage Usage
  * 
  *
  * SeekMiner supports the following weighting methods (\c -V):
  * \li Query cross-validated weighting (\c CV, default), where we iteratively use a subset of the
- * query to construct a search instance to retrieve the remaining query genes. The sum of the score of the
- * cross-validations forms the dataset weight.
+ * query to construct a search instance to retrieve the remaining query genes. This is a form of measuring
+ * the coregulation of query genes using a cross-validation setup.
  * \li Equal weighting (\c EQUAL), where all datasets are weighted equally.
  * \li Order statistics integration (\c ORDER_STAT), which is outlined in Adler et al (2009).
  * This method computes a P-value statistics by comparing the rank of correlation across datasets to the

tools/SeekServer/SeekServer.cpp

 	pthread_win32_process_attach_np( );
 #endif // WIN32
 	gengetopt_args_info	sArgs;
+	int lineSize = 1024;
 
 	if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
 		cmdline_parser_print_help( );
 	bool bLogit = false;
 
 	csfinal = new CSeekCentral();
-	if(!csfinal->Initialize(sArgs.input_arg, sArgs.quant_arg, sArgs.dset_arg,
+	CSeekDBSetting *dbSetting = new CSeekDBSetting(sArgs.dir_gvar_arg,
+		sArgs.dir_sinfo_arg, sArgs.dir_platform_arg, sArgs.dir_prep_in_arg,
+		sArgs.dir_in_arg, sArgs.input_arg, sArgs.quant_arg, sArgs.dset_arg,
+		sArgs.num_db_arg);
+	vector<CSeekDBSetting*> cc;
+	cc.push_back(dbSetting);
+
+	string add_db = sArgs.additional_db_arg;
+	if(add_db!="NA"){
+		ifstream ifsm;
+		ifsm.open(add_db.c_str());
+		if(!ifsm.is_open()){
+			fprintf(stderr, "Error opening file %s\n", add_db.c_str());
+			return false;
+		}
+		char acBuffer[lineSize];
+		ushort c_iBuffer = lineSize;
+		map<string,string> parameters;
+		i=0;
+		while(!ifsm.eof()){
+			ifsm.getline(acBuffer, c_iBuffer-1);
+			if(acBuffer[0]==0) break;
+			acBuffer[c_iBuffer-1]=0;
+			vector<string> tok;
+			CMeta::Tokenize(acBuffer, tok); //separator tab
+			parameters[tok[0]] = tok[1];
+		}
+		ifsm.close();
+
+		string sinfo_dir = "NA";
+		string gvar_dir = "NA";
+		string platform_dir = "NA";
+		string prep_dir = "NA";
+		string db_dir = "NA";
+		string dset_map_file = "NA";
+		string gene_map_file = "NA";
+		string quant_file = "NA";
+		int num_db = -1;
+
+		if(parameters.find("SINFO_DIR")->second=="NA"){
+			fprintf(stderr, "Please specify an sinfo directory for the extra db\n");
+			return false;
+		}
+		sinfo_dir = parameters.find("SINFO_DIR")->second;
+		if(parameters.find("GVAR_DIR")!=parameters.end())
+			gvar_dir = parameters.find("GVAR_DIR")->second;
+		if(parameters.find("PREP_DIR")==parameters.end() ||
+			parameters.find("PLATFORM_DIR")==parameters.end() ||
+			parameters.find("DB_DIR")==parameters.end() ||
+			parameters.find("DSET_MAP_FILE")==parameters.end() ||
+			parameters.find("GENE_MAP_FILE")==parameters.end() ||
+			parameters.find("QUANT_FILE")==parameters.end() ||
+			parameters.find("NUMBER_OF_DB")==parameters.end()){
+			fprintf(stderr, "Some arguments are missing. Please make sure the following are provided:\n");
+			fprintf(stderr, "PREP_DIR, DB_DIR, DSET_MAP_FILE, GENE_MAP_FILE, QUANT_FILE, NUMBER_OF_DB\n");
+			return false;
+		}
+
+		platform_dir = parameters.find("PLATFORM_DIR")->second;
+		db_dir = parameters.find("DB_DIR")->second;
+		prep_dir = parameters.find("PREP_DIR")->second;
+		dset_map_file = parameters.find("DSET_MAP_FILE")->second;
+		gene_map_file = parameters.find("GENE_MAP_FILE")->second;
+		quant_file = parameters.find("QUANT_FILE")->second;
+		num_db = atoi(parameters.find("NUMBER_OF_DB")->second.c_str());
+
+		CSeekDBSetting *dbSetting2 = new CSeekDBSetting(gvar_dir, sinfo_dir,
+			platform_dir, prep_dir, db_dir, gene_map_file, quant_file, dset_map_file,
+			num_db);
+		cc.push_back(dbSetting2);
+	}
+
+	if(!csfinal->Initialize(cc,
 		//"/tmp/ex_query2.txt", 
-		sArgs.dir_platform_arg,
-		sArgs.dir_in_arg, sArgs.dir_prep_in_arg, 
-		sArgs.dir_gvar_arg,
-		sArgs.dir_sinfo_arg,
-		sArgs.num_db_arg,
 		sArgs.buffer_arg, !!sArgs.output_text_flag,
 		bOutputWeightComponent, bSimulateWeight,
 		CSeekDataset::Z_SCORE, //to be overwritten by individual search instance's setting

tools/SeekServer/SeekServer.ggo

 								int default="20"
 option	"output_text"		O	"Output results (gene list and dataset weights) as text"
 								flag	off
-								
+option	"additional_db"		B	"Utilize a second CDatabase collection. Path to the second CDatabase's setting file."
+								string default="NA"	

tools/SeekServer/cmdline.c

 /*
   File autogenerated by gengetopt version 2.22.5
   generated with the following command:
-  /usr/local/bin/gengetopt -iSeekServer.ggo --default-optional -u -N -e 
+  /usr/bin/gengetopt -iSeekServer.ggo --default-optional -u -N -e 
 
   The developers of gengetopt consider the fixed text that goes in all
   gengetopt output files to be in the public domain:
   "  -N, --is_nibble               If true, the input DB is nibble type  \n                                  (default=off)",
   "  -b, --buffer=INT              Number of Databaselets to store in memory  \n                                  (default=`20')",
   "  -O, --output_text             Output results (gene list and dataset weights) \n                                  as text  (default=off)",
+  "  -B, --additional_db=STRING    Utilize a second CDatabase collection. Path to \n                                  the second CDatabase's setting file.  \n                                  (default=`NA')",
     0
 };
 
   args_info->is_nibble_given = 0 ;
   args_info->buffer_given = 0 ;
   args_info->output_text_given = 0 ;
+  args_info->additional_db_given = 0 ;
 }
 
 static
   args_info->buffer_arg = 20;
   args_info->buffer_orig = NULL;
   args_info->output_text_flag = 0;
+  args_info->additional_db_arg = gengetopt_strdup ("NA");
+  args_info->additional_db_orig = NULL;
   
 }
 
   args_info->is_nibble_help = gengetopt_args_info_help[18] ;
   args_info->buffer_help = gengetopt_args_info_help[19] ;
   args_info->output_text_help = gengetopt_args_info_help[20] ;
+  args_info->additional_db_help = gengetopt_args_info_help[21] ;
   
 }
 
   free_string_field (&(args_info->score_cutoff_orig));
   free_string_field (&(args_info->per_q_required_orig));
   free_string_field (&(args_info->buffer_orig));
+  free_string_field (&(args_info->additional_db_arg));
+  free_string_field (&(args_info->additional_db_orig));
   
   
   for (i = 0; i < args_info->inputs_num; ++i)
     write_into_file(outfile, "buffer", args_info->buffer_orig, 0);
   if (args_info->output_text_given)
     write_into_file(outfile, "output_text", 0, 0 );
+  if (args_info->additional_db_given)
+    write_into_file(outfile, "additional_db", args_info->additional_db_orig, 0);
   
 
   i = EXIT_SUCCESS;
   FIX_UNUSED (additional_error);
 
   /* checks for required options */
+  if (! args_info->port_given)
+    {
+      fprintf (stderr, "%s: '--port' ('-t') option required%s\n", prog_name, (additional_error ? additional_error : ""));
+      error = 1;
+    }
+  
   if (! args_info->dset_given)
     {
       fprintf (stderr, "%s: '--dset' ('-x') option required%s\n", prog_name, (additional_error ? additional_error : ""));
       error = 1;
     }
   
+  if (! args_info->dir_sinfo_given)
+    {
+      fprintf (stderr, "%s: '--dir_sinfo' ('-u') option required%s\n", prog_name, (additional_error ? additional_error : ""));
+      error = 1;
+    }
+  
   if (! args_info->quant_given)
     {
       fprintf (stderr, "%s: '--quant' ('-Q') option required%s\n", prog_name, (additional_error ? additional_error : ""));
         { "is_nibble",	0, NULL, 'N' },
         { "buffer",	1, NULL, 'b' },
         { "output_text",	0, NULL, 'O' },
+        { "additional_db",	1, NULL, 'B' },
         { 0,  0, 0, 0 }
       };
 
-      c = getopt_long (argc, argv, "hVt:x:i:d:p:P:u:U:Q:n:c:C:eNb:O", long_options, &option_index);
+      c = getopt_long (argc, argv, "hVt:x:i:d:p:P:u:U:Q:n:c:C:eNb:OB:", long_options, &option_index);
 
       if (c == -1) break;	/* Exit from `while (1)' loop.  */
 
             goto failure;
         
           break;
+        case 'B':	/* Utilize a second CDatabase collection. Path to the second CDatabase's setting file..  */
+        
+        
+          if (update_arg( (void *)&(args_info->additional_db_arg), 
+               &(args_info->additional_db_orig), &(args_info->additional_db_given),
+              &(local_args_info.additional_db_given), optarg, 0, "NA", ARG_STRING,
+              check_ambiguity, override, 0, 0,
+              "additional_db", 'B',
+              additional_error))
+            goto failure;
+        
+          break;
 
         case 0:	/* Long option with no short option */
         case '?':	/* Invalid option.  */

tools/SeekServer/cmdline.h

   const char *buffer_help; /**< @brief Number of Databaselets to store in memory help description.  */
   int output_text_flag;	/**< @brief Output results (gene list and dataset weights) as text (default=off).  */
   const char *output_text_help; /**< @brief Output results (gene list and dataset weights) as text help description.  */
+  char * additional_db_arg;	/**< @brief Utilize a second CDatabase collection. Path to the second CDatabase's setting file. (default='NA').  */
+  char * additional_db_orig;	/**< @brief Utilize a second CDatabase collection. Path to the second CDatabase's setting file. original value given at command line.  */
+  const char *additional_db_help; /**< @brief Utilize a second CDatabase collection. Path to the second CDatabase's setting file. help description.  */
   
   unsigned int help_given ;	/**< @brief Whether help was given.  */
   unsigned int version_given ;	/**< @brief Whether version was given.  */
   unsigned int is_nibble_given ;	/**< @brief Whether is_nibble was given.  */
   unsigned int buffer_given ;	/**< @brief Whether buffer was given.  */
   unsigned int output_text_given ;	/**< @brief Whether output_text was given.  */
+  unsigned int additional_db_given ;	/**< @brief Whether additional_db was given.  */
 
   char **inputs ; /**< @brief unamed options (options without names) */
   unsigned inputs_num ; /**< @brief unamed options number */

tools/SeekServer/stdafx.cpp

  * \page SeekServer SeekServer
  * 
  * SeekServer runs the coexpression mining algorithm using a multithreaded TCP/IP interface.
- * When it is running, SeekServer services requests over the network from multiple connected clients
- * for genes that co-express with the client's query genes.
- * SeekServer sends two lists to the client upon finishing the query search: 1) 
- * a list of genes that are found by the algorithm to be coexpressed with the query genes, and 2) a list of datasets
- * where this coexpression with the query is found to be occurring.
+ * When it is running, SeekServer services requests from multiple clients over the network. Examples of
+ * requests that SeekServer can handle include:
+ * \li retrieving a list of genes that are found to be coexpressed with the query genes;
+ * \li retrieving a list of datasets where this coexpression with the query is found to be occurring.
  *
  * \section sec_usage Usage
  *