Qian Zhu avatar Qian Zhu committed 88f9d12

Temporary changes to SeekIterative

Comments (0)

Files changed (12)

src/seekreader.cpp

 	return true;
 }
 
+bool CSeekTools::ReadMultipleNotQueries(const char *file,
+	vector<vector<vector<string> > > &qList, const int lineSize){
+	qList.clear();
+	FILE *infile;
+	if((infile=fopen(file, "r"))==NULL){
+		fprintf(stderr, "Error opening file %s\n", file);
+		return false;
+	}
+
+	char *acBuffer;
+	int MAX_CHAR_PER_LINE = lineSize;
+	int lineLen = MAX_CHAR_PER_LINE;
+	acBuffer = (char*)malloc(lineLen);
+	while(fgets(acBuffer, lineLen, infile)!=NULL){
+		while(strlen(acBuffer)==lineLen-1){
+			int len = strlen(acBuffer);
+			fseek(infile, -len, SEEK_CUR);
+			lineLen+=MAX_CHAR_PER_LINE;
+			acBuffer = (char*)realloc(acBuffer, lineLen);
+			char *ret = fgets(acBuffer, lineLen, infile);
+		}
+	}
+	rewind(infile);
+
+	while(fgets(acBuffer, lineLen, infile)!=NULL){
+		char *p = strtok(acBuffer, "\n");
+		vector<vector<string> > aQ;
+		vector<string> tok;
+		CMeta::Tokenize(p, tok, "|");
+		aQ.resize(tok.size());
+		int i;
+		for(i=0; i<tok.size(); i++){
+			vector<string> tmp;
+			CMeta::Tokenize(tok[i].c_str(), tmp, " ");
+			aQ[i] = tmp;
+		}
+		qList.push_back(aQ);
+	}
+	qList.resize(qList.size());
+	free(acBuffer);
+	fclose(infile);
+	return true;
+}
+
 bool CSeekTools::ReadMultiGeneOneLine(const string &strFile,
 	vector<string> &list, const int lineSize){
 	return CSeekTools::ReadMultiGeneOneLine(strFile.c_str(), list, lineSize);
 	static bool ReadMultipleQueries(const string &strFile,
 		vector< vector<string> > &qList, const int lineSize = 1024);
 
+
+	static bool ReadMultipleNotQueries(const char *file,
+		vector<vector<vector<string> > > &qList, const int lineSize = 1024);
+
 	/*!
 	 * \brief Read a list of queries
 	 *
 				ret = fread((char*)(&val),1,sizeof(val),f);
 				tType first = id;
 				tType second = id2;
+				//mat looks like a full matrix
 				mat.Add(first, second, rbp_score[val]);
 				mat.Add(second, first, rbp_score[val]);
 			}
 
 	//compatibility
 	template<class tType>
+	static bool RemoveDominant(CSparseFlatMatrix<float> &mat, 
+	CSeekIntIntMap &m, const vector<string> &vecstrGenes){
+	
+		size_t i, j;
+		vector<vector<float> > tmp_mat, tmp2;
+		tmp_mat.resize(vecstrGenes.size());
+		tmp2.resize(vecstrGenes.size());
+		for(i=0; i<vecstrGenes.size(); i++){
+			tmp_mat[i].resize(vecstrGenes.size());
+			tmp2[i].resize(vecstrGenes.size());
+			for(j=0; j<vecstrGenes.size(); j++){
+				tmp_mat[i][j] = 0;
+				tmp2[i][j] = 0;
+			}
+		}
+
+		size_t ii, jj;
+		const vector<utype> &allRGenes = m.GetAllReverse();
+		for(ii=0; ii<m.GetNumSet(); ii++){
+			i = (size_t) allRGenes[ii];
+			vector<CPair<float> >::iterator row_it;
+			for(row_it=mat.RowBegin(i); row_it!=mat.RowEnd(i); row_it++){
+				j = (size_t) row_it->i;
+				tmp_mat[i][j] = row_it->v;
+			}
+		}
+
+		int TOP = 100;
+		fprintf(stderr, "Started removing dominant...\n");
+		for(ii=0; ii<m.GetNumSet(); ii++){
+			i = (size_t) allRGenes[ii];
+			vector<CPair<float> > vp;
+			vp.resize(m.GetNumSet());
+			for(jj=0; jj<m.GetNumSet(); jj++){
+				j = (size_t) allRGenes[jj];
+				vp[jj].i = (utype) j;
+				vp[jj].v = tmp_mat[i][j];
+			}
+			nth_element(vp.begin(), vp.begin()+TOP, vp.end(), CDescendingValue<float>());
+			sort(vp.begin(), vp.begin()+TOP, CDescendingValue<float>());
+
+			//top 100
+			size_t k, l;
+			size_t max_ind = 0;
+			float max_val = -1;
+			for(k=0; k<TOP; k++){
+				size_t this_g = vp[k].i;
+				float v = 0;
+				for(l=0; l<TOP; l++){
+					size_t other_g = vp[l].i;
+					if(this_g==other_g) continue;
+					v+=tmp_mat[this_g][other_g];
+				}
+				if(v>max_val){
+					max_val = v;
+					max_ind = k;
+				}
+			}
+			for(jj=0; jj<m.GetNumSet(); jj++){
+				j = (size_t) allRGenes[jj];
+				tmp2[i][j] = tmp_mat[i][j] - tmp_mat[j][max_ind];
+				if(tmp2[i][j]<0)	
+					tmp2[i][j] = 0;
+			}
+		}
+
+		fprintf(stderr, "Started re-normalizing matrix...\n");
+
+		for(ii=0; ii<m.GetNumSet(); ii++){
+			i = (size_t) allRGenes[ii];
+			for(jj=ii+1; jj<m.GetNumSet(); jj++){
+				j = (size_t) allRGenes[jj];
+				float m = max(tmp2[i][j], tmp2[j][i]);
+				tmp2[i][j] = m;
+				tmp2[j][i] = m;
+			}
+		}
+
+		vector<float> vecSum;
+		CSeekTools::InitVector(vecSum, vecstrGenes.size(), (float) 0);
+		for(ii=0; ii<m.GetNumSet(); ii++){
+			i = (size_t) allRGenes[ii];
+			for(jj=0; jj<m.GetNumSet(); jj++){
+				j = (size_t) allRGenes[jj];
+				vecSum[i] += tmp2[i][j];
+			}
+		}
+
+		vector<float> vecSqrtSum;
+		CSeekTools::InitVector(vecSqrtSum, vecstrGenes.size(), (float) 0);
+		for(ii=0; ii<m.GetNumSet(); ii++){
+			i = (size_t) allRGenes[ii];
+			if(vecSum[i]==0) continue;
+			vecSqrtSum[i] = sqrtf(vecSum[i]);
+		}
+
+		for(ii=0; ii<m.GetNumSet(); ii++){
+			i = (size_t) allRGenes[ii];
+			vector<CPair<float> >::iterator row_it;
+			for(row_it=mat.RowBegin(i); row_it!=mat.RowEnd(i); row_it++){
+				j = (size_t) row_it->i;
+				if(vecSqrtSum[i]==0 || vecSqrtSum[j]==0) continue;
+				row_it->v = tmp2[i][j] / vecSqrtSum[i] / vecSqrtSum[j];
+				//fprintf(stderr, "%d %d %.3e\n", i, j, row_it->v);
+			}
+		}
+		return true;
+	}
+
+	//compatibility
+	template<class tType>
 	static bool WriteSparseMatrix(CDataPair &Dat, 
 	vector<map<tType,unsigned short> > &umat, 
 	const vector<string> &vecstrGenes, const char *fileName){

tools/SeekIterative/SeekIterative.cpp

 	CDat &mat, const size_t numGenes,
 	CSeekIntIntMap &d1,
 	vector<utype> &indexConvReverse,
-	vector<float> &q_weight){ 
+	vector<float> &q_weight, 
+	vector<vector<float> > &nq_weight
+){ 
 	//q_weight is query presence - 1.0 if gene at the index i is a query gene
 
 	vector<float> gene_count;
 		gene_score[gi] /= gene_count[gi];
 	}
 
+	utype ind;
+	for(ind=0; ind<nq_weight.size(); ind++){
+		vector<float> ngene_count, ngene_score;
+		CSeekTools::InitVector(ngene_score, numGenes, (float)CMeta::GetNaN());
+		CSeekTools::InitVector(ngene_count, numGenes, (float)0);
+
+		for(kk=0; kk<d1.GetNumSet(); kk++)
+			ngene_score[(utype)indexConvReverse[(utype)allGenes[kk]]] = 0;
+
+		for(qqi=0; qqi<d1.GetNumSet(); qqi++){
+			utype qi = allGenes[qqi];
+			utype qq = indexConvReverse[qi];
+			if(nq_weight[ind][qq]==0) //not a NOT query gene
+				continue;
+			//now a NOT query gene
+			float *vc = mat.GetFullRow(qi);
+			for(kk=0; kk<d1.GetNumSet(); kk++){
+				utype k = allGenes[kk];
+				utype gi = indexConvReverse[k];
+				float fl = vc[k];
+				ngene_score[gi] += fl;
+				ngene_count[gi] += 1.0;
+			}
+			delete[] vc;
+		}
+		for(kk=0; kk<d1.GetNumSet(); kk++){
+			utype gi = indexConvReverse[(utype)allGenes[kk]];
+			ngene_score[gi] /= ngene_count[gi];
+			gene_score[gi] -= ngene_score[gi]; //subtract the correlation to NOT genes
+		}
+	}
+
 	return true;
 }
 
 bool get_score(vector<float> &gene_score, 
-	//vector<vector<float> > &mat, 
 	CSparseFlatMatrix<float> &mat,
 	CSeekIntIntMap *geneMap, vector<float> &q_weight){
+
 	vector<float> gene_count;
 	int numGenes = geneMap->GetSize();
 	CSeekTools::InitVector(gene_score, numGenes, (float)CMeta::GetNaN());
 		utype qq = allGenes[qi];
 		if(q_weight[qq]==0) 
 			continue;
+			
 		const vector<CPair<float> > &vc = mat.GetRow(qq);
 		for(kk=0; kk<vc.size(); kk++){
 			float fl = vc[kk].v;
 	for(k=0; k<geneMap->GetNumSet(); k++){
 		utype gi = allGenes[k];
 		gene_score[gi] /= gene_count[gi];
+		if(gene_count[gi]==0){
+			gene_score[gi] = 0;
+		}
 	}
 	return true;
 }
 	return true;
 }*/
 
+bool weight_fast(vector<char> &is_query, vector<float> &d1,
+CSeekIntIntMap *geneMap, float &w){
+	utype i;
+	w = 0;
+	const vector<utype> &allGenes = geneMap->GetAllReverse();
+	for(i=0; i<geneMap->GetNumSet(); i++){
+		utype gi = allGenes[i];
+		if(is_query[gi]==1){
+			//if(CMeta::IsNaN(d1[gi])) continue;
+			w += d1[gi];
+		}
+	}
+	return true;	
+}
+
 bool weight(vector<char> &is_query, vector<float> &d1,
 	CSeekIntIntMap *geneMap, float rbp_p, float &w){
 	vector<AResultFloat> ar;
 	ar.resize(geneMap->GetSize());
 	utype i;
-	for(i=0; i<ar.size(); i++)
-		ar[i].i = i;
 	
 	const vector<utype> &allGenes = geneMap->GetAllReverse();
 	for(i=0; i<geneMap->GetNumSet(); i++){
 		utype gi = allGenes[i];
-		ar[gi].f = d1[gi];
+		ar[i].i = gi;
+		ar[i].f = d1[gi];
 	}
 	
 	int MAX = 5000;
 	return true;
 }
 
-bool cv_weight(vector<utype> &query, 
-	//vector<vector<float> > &mat,
-	CSparseFlatMatrix<float> &mat,
+bool cv_weight(vector<utype> &query, CSparseFlatMatrix<float> &mat,
 	CSeekIntIntMap *geneMap, float rbp_p, float &tot_w){
+
 	//leave one in cross-validation
 	utype i, j;
 	int numGenes = geneMap->GetSize();
 		CSeekTools::InitVector(q_weight, numGenes, (float) 0);
 		q_weight[query[i]] = 1.0;
 		float w = 0;
-		
 		for(j=0; j<query.size(); j++){
 			if(i==j) continue;
 			is_query[query[j]] = 1;
 		vector<float> gene_score;
 		get_score(gene_score, mat, geneMap, q_weight);
 		gene_score[query[i]] = 0;
-		weight(is_query, gene_score, geneMap, rbp_p, w);
+		//weight(is_query, gene_score, geneMap, rbp_p, w);
+		weight_fast(is_query, gene_score, geneMap, w);
 		tot_w += w;
 	}
-	tot_w /= query.size();
-	
+	tot_w /= query.size();	
 	return true;	
 }
 
 	string output_dir = sArgs.dir_out_arg;
 	int numGenes = vecstrGenes.size();
 	vector< vector<string> > vecstrAllQuery;
-		
 	fprintf(stderr, "Reading queries\n");
 	if(!CSeekTools::ReadMultipleQueries(sArgs.query_arg, vecstrAllQuery))
 		return -1;
 			qu[i].push_back(mapstriGenes[vecstrAllQuery[i][j]]);
 	}
 
+	vector<vector<vector<string> > > vecstrAllNotQuery;
+	vecstrAllNotQuery.resize(vecstrAllQuery.size());
+	string not_query = sArgs.not_query_arg;
+	if(not_query!="NA"){
+		fprintf(stderr, "Reading NOT queries\n");
+		if(!CSeekTools::ReadMultipleNotQueries(sArgs.not_query_arg, 
+		vecstrAllNotQuery))
+			return -1;
+	}
+
 	if(sArgs.visualize_flag==1){
 		string dab_base = sArgs.dab_basename_arg;
 		string file1 = dab_dir + "/" + dab_base + ".dab";
 			}
 			fprintf(stderr, "Query %d\n", j);
 
-			/*float init = 0.00001;
-			float step = 0.00001;
-			float upper = 0.001;
-			*/
-			float init = 0.05;
-			float step = 0.05;
-			float upper = 3;
-
-			float cutoff = init;
-			while(cutoff<upper){
-				int count = 0;
+			if(sArgs.print_distr_flag==1){
+				float min = 9999;
+				float max = -1;
 				for(k=0; k<vec_s.size(); k++){
 					for(l=k+1; l<vec_s.size(); l++){
-						if(!CMeta::IsNaN(V.Get(k,l)) && V.Get(k,l)>=cutoff){
-							count++;
+						float v = CD.Get(vec_g[k], vec_g[l]);
+						if(CMeta::IsNaN(v)) continue;
+						if(v>max) max = v;
+						if(v<min) min = v;
+					}
+				}
+				float init = min;
+				float step = (max - min)/100.0;
+				float upper = max;
+				float cutoff = init;
+				while(cutoff<upper){
+					int count = 0;
+					for(k=0; k<vec_s.size(); k++){
+						for(l=k+1; l<vec_s.size(); l++){
+							if(!CMeta::IsNaN(V.Get(k,l)) && V.Get(k,l)>=cutoff){
+								count++;
+							}
 						}
 					}
+					fprintf(stderr, "%.5f\t%d\n", cutoff, count);
+					cutoff+=step;
 				}
-				fprintf(stderr, "%.5f\t%d\n", cutoff, count);
-				cutoff+=step;
 			}
 
 			sprintf(acBuffer, "%s/%d.dot", output_dir.c_str(), j);			
 		}
 
 		vector<vector<float> > q_weight;
+		vector<vector<vector<float> > > nq_weight;
+
 		q_weight.resize(vecstrAllQuery.size());
 		for(i=0; i<vecstrAllQuery.size(); i++){
 			CSeekTools::InitVector(q_weight[i], numGenes, (float) 0);
 				q_weight[i][mapstriGenes[vecstrAllQuery[i][j]]] = 1;
 		}
 
+		nq_weight.resize(vecstrAllNotQuery.size());
+		if(not_query!="NA"){
+			for(i=0; i<vecstrAllNotQuery.size(); i++){
+				nq_weight[i].resize(vecstrAllNotQuery[i].size());
+				for(j=0; j<vecstrAllNotQuery[i].size(); j++){
+					CSeekTools::InitVector(nq_weight[i][j], numGenes, (float) 0);
+					for(k=0; k<vecstrAllNotQuery[i][j].size(); k++)
+						nq_weight[i][j][mapstriGenes[vecstrAllNotQuery[i][j][k]]] = 1;
+				}
+			}
+		}
+
+		fprintf(stderr, "Start reading dab\n");
 		CDat CD;
 		CD.Open(file1.c_str(), false, 2, false, false, false);
+		fprintf(stderr, "Finished reading dab\n");
 
 		CSeekIntIntMap d1(CD.GetGenes());
 		vector<utype> indexConvReverse;
 		for(j=0; j<vecstrAllQuery.size(); j++){
 			vector<float> tmp_score;
 			search_one_dab(tmp_score, CD, vecstrGenes.size(), d1, indexConvReverse,
-			q_weight[j]);
+			q_weight[j], nq_weight[j]);
 			for(kk=0; kk<d1.GetNumSet(); kk++){
 				utype k = allGenes[kk];
 				utype gi = indexConvReverse[k];
 				final_score[j][gi] = tmp_score[gi];
 			}
 		}
+		fprintf(stderr, "Writing output\n");
 
 		for(j=0; j<vecstrAllQuery.size(); j++){
 			for(k=0; k<final_score[j].size(); k++){
 			CSeekTools::WriteArray(acBuffer, final_score[j]);
 		}
 
+		fprintf(stderr, "Finished with search\n");
 		//Visualize
 		for(j=0; j<vecstrAllQuery.size(); j++){
+			//fprintf(stderr, "Query %d\n", j);
 			vector<AResultFloat> ar;
 			ar.resize(final_score[j].size());
 			for(k=0; k<final_score[j].size(); k++){
 			sort(ar.begin(), ar.end());
 			vector<utype> vec_g;
 			vector<string> vec_s;
-			int FIRST = 200;
+			vector<float> node_w;
+			int FIRST = sArgs.top_genes_arg;
 			for(k=0; k<FIRST; k++){
 				if(ar[k].f==-320) break;
 				vec_g.push_back(CD.GetGeneIndex(vecstrGenes[ar[k].i]));
 				vec_s.push_back(vecstrGenes[ar[k].i]);
+				node_w.push_back(0);
 			}
-			if(vec_g.size()!=FIRST) continue;
+			//if(vec_g.size()!=FIRST) continue;
+			for(k=0; k<vecstrAllQuery[j].size(); k++){
+				size_t ind = CD.GetGeneIndex(vecstrAllQuery[j][k]);
+				if(ind==(size_t)-1) continue;
+				vec_g.push_back(ind);
+				vec_s.push_back(vecstrAllQuery[j][k]);
+				node_w.push_back(1.0);
+			}
+
 			CDat V;
 			V.Open(vec_s);
 			for(k=0; k<vec_s.size(); k++){
 				for(l=k+1; l<vec_s.size(); l++){
-					V.Set(k, l, CD.Get(vec_g[k], vec_g[l]));
+					float v = CD.Get(vec_g[k], vec_g[l]);
+					V.Set(k, l, v);
 				}
 			}
 
-			fprintf(stderr, "Query %d\n", j);
-			float cutoff = 0.00001;
-			while(cutoff<0.001){
-				int count = 0;
+			if(sArgs.print_distr_flag==1){
+				float min = 9999;
+				float max = -1;
 				for(k=0; k<vec_s.size(); k++){
 					for(l=k+1; l<vec_s.size(); l++){
-						if(!CMeta::IsNaN(V.Get(k,l)) && V.Get(k,l)>=cutoff){
-							count++;
+						float v = CD.Get(vec_g[k], vec_g[l]);
+						if(CMeta::IsNaN(v)) continue;
+						if(v>max) max = v;
+						if(v<min) min = v;
+					}
+				}
+
+				float init = min;
+				float step = (max - min)/100.0;
+				float upper = max;
+
+				float cutoff = init;
+				while(cutoff<upper){
+					int count = 0;
+					for(k=0; k<vec_s.size(); k++){
+						for(l=k+1; l<vec_s.size(); l++){
+							if(!CMeta::IsNaN(V.Get(k,l)) && V.Get(k,l)>=cutoff){
+								count++;
+							}
 						}
 					}
+					fprintf(stderr, "%.5f\t%d\n", cutoff, count);
+					cutoff+=step;
 				}
-				fprintf(stderr, "%.5f\t%d\n", cutoff, count);
-				cutoff+=0.00001;
 			}
 
-			sprintf(acBuffer, "%s/%d.dot", output_dir.c_str(), j);			
-			ofstream ot(acBuffer);
-			V.SaveDOT(ot, cutoff_par, &g, false, true, NULL, NULL);
-			//V.SaveDOT(ot, 0.0001, NULL, true, false, NULL, NULL);
+			if(sArgs.generate_dot_flag==1){
+				sprintf(acBuffer, "%s/%d.dot", output_dir.c_str(), j);			
+				ofstream ot(acBuffer);
+				V.SaveDOT(ot, cutoff_par, &g, false, true, &node_w, NULL);
+				//V.SaveDOT(ot, 0.0001, NULL, true, false, NULL, NULL);
+			}
 		}
 		
-
 	}
 
 	if(sArgs.test_flag==1){
 			return -1;
 		}
 
+		vector<float> score_cutoff;
+		bool bDatasetCutoff = false;
+		string dset_cutoff_file = sArgs.dset_cutoff_file_arg;
+		if(dset_cutoff_file!="NA"){
+			CSeekTools::ReadArray(dset_cutoff_file.c_str(), score_cutoff);
+			bDatasetCutoff = true;
+			fprintf(stderr, "Filtered mode is on!\n");
+		}
+
 		float rbp_p = sArgs.rbp_p_arg;
 		int max_rank = sArgs.max_rank_arg;
-
 		int num_iter = sArgs.num_iter_arg;
 		vector<string> dab_list;
 		CSeekTools::ReadListOneColumn(sArgs.dab_list_arg, dab_list);
 				q_weight[i][mapstriGenes[vecstrAllQuery[i][j]]] = 1;
 		}
 		
-		
 		fprintf(stderr, "Reading sparse DAB\n");
 		vector<vector<float> > final_score, count;
+		vector<vector<float> > dweight;
 		vector<vector<int> > freq;
 		final_score.resize(vecstrAllQuery.size());
 		count.resize(vecstrAllQuery.size());
 		freq.resize(vecstrAllQuery.size());
+		dweight.resize(vecstrAllQuery.size());
 		for(j=0; j<vecstrAllQuery.size(); j++){
 			CSeekTools::InitVector(final_score[j], vecstrGenes.size(), (float)CMeta::GetNaN());
 			CSeekTools::InitVector(count[j], vecstrGenes.size(), (float) 0);
 			CSeekTools::InitVector(freq[j], vecstrGenes.size(), (int) 0);
+			CSeekTools::InitVector(dweight[j], dab_list.size(), (float) 0);
 		}
 	
-		//CSparseFlatHalfMatrix<float> res(0);
-		/*CHalfMatrix<float> res;
-		res.Initialize(vecstrGenes.size());
-		for(i=0; i<vecstrGenes.size(); i++)
-			for(j=i+1; j<vecstrGenes.size(); j++)
-				res.Set(i, j, 0);
-		*/
 		for(i=0; i<dab_list.size(); i++){
 			fprintf(stderr, "Reading %d: %s\n", i, dab_list[i].c_str());
-			//vector<vector<float> > sm;
-			//vector<map<utype,float> > sm;
 			CSeekIntIntMap d1(vecstrGenes.size());
 			string dabfile = dab_dir + "/" + dab_list[i];
-			//CDat Dat;
-			//Dat.Open(dabfile.c_str(), false, 2, false, false, false);
-			//CSeekWriter::NormalizeDAB(Dat, vecstrGenes, false, false, false, true);
-			//transfer(Dat, sm, &d1, vecstrGenes);
-			
 			CSparseFlatMatrix<float> sm (0);
-			//CSeekWriter::ReadSparseMatrix(dabfile.c_str(), sm, d1, 3000, 0.997, vecstrGenes);
 
 			if(sArgs.default_type_arg==0) //utype
 				CSeekWriter::ReadSeekSparseMatrix<utype>(dabfile.c_str(), sm, d1, max_rank, rbp_p, vecstrGenes);
 			else
 				CSeekWriter::ReadSeekSparseMatrix<unsigned short>(dabfile.c_str(), sm, d1, max_rank, rbp_p, vecstrGenes);
-			/*if(i==0){
-				fprintf(stderr, "Summing...\n");
-				res.Copy(sm);
-				fprintf(stderr, "Finished Summing...\n");
-			}else{*/
-			//	fprintf(stderr, "Summing...\n");
-			//	CSeekWriter::SumSparseMatrix(sm, res, d1, 1.0);
-			//	fprintf(stderr, "Finished Summing...\n");
-			/*}
-			*/
-			//fprintf(stderr, "Finished!\n");	
 			
 			const vector<utype> &allGenes = d1.GetAllReverse();			
 			for(j=0; j<vecstrAllQuery.size(); j++){
 				float dw = 1.0;
-				//cv_weight(qu[j], sm, &d1, 0.99, dw);
+				cv_weight(qu[j], sm, &d1, 0.99, dw);
+				if(bDatasetCutoff){
+					if(score_cutoff[i]>dw){
+						dw = 0;
+					}
+					//fprintf(stderr, "%.3e %.3e\n", score_cutoff[i], dw);
+				}
 				//fprintf(stderr, "%.3e\n", dw);
+				dweight[j][i] = dw;
+				//if(dw==0) 
+				//	continue;
 				vector<float> tmp_score;
 				get_score(tmp_score, sm, &d1, q_weight[j]);
 				for(k=0; k<d1.GetNumSet(); k++){
 			CSeekTools::WriteArrayText(acBuffer, vecstrAllQuery[j]);
 			sprintf(acBuffer, "%s/%d.gscore", output_dir.c_str(), j);
 			CSeekTools::WriteArray(acBuffer, final_score[j]);
+			sprintf(acBuffer, "%s/%d.dweight", output_dir.c_str(), j);
+			CSeekTools::WriteArray(acBuffer, dweight[j]);
 		}
 	
 		//MODE 2

tools/SeekIterative/SeekIterative.ggo

 section "Combined-DAB / Visualization mode"
 option	"dab_basename"		b	"Combined-dab basename, also shared with Test Mode"
 								string typestr="filename"
+option	"top_genes"			t	"Top genes to visualize (for combined-dab)"
+								int default="100"
+option	"generate_dot"		E	"Generate a dot file (for combined-dab)"
+								flag off
+option	"print_distr"		P	"Print distribution of edge values"
+								flag off
 
 section "Visualization mode"
 option	"cutoff"			c	"Cutoff value"
 								float default="-1"
 option	"max_rank"			M	"Maximum rank number in the sparse DAB matrix (must be specified)"
 								int default="-1"
+option	"dset_cutoff_file"	H	"Dataset score cutoff file"
+								string typestr="filename" default="NA"
 
 section "Input"
 option	"input"				i	"Gene mapping file"
 								string typestr="filename"	yes
 option	"dab_dir"			F	"DAB directory"
 								string typestr="directory"	yes
+option	"not_query"			Q	"NOT Query file (optional, for combined-DAB)"
+								string typestr="filename" default="NA"
 
 section "Output"
 option	"dir_out"			D	"Output directory"

tools/SeekIterative/cmdline.c

 const char *gengetopt_args_info_description = "";
 
 const char *gengetopt_args_info_help[] = {
-  "      --help                   Print help and exit",
-  "      --version                Print version and exit",
+  "      --help                    Print help and exit",
+  "      --version                 Print version and exit",
   "\nMode:",
-  "  -d, --dab                    Sparse Dab mode  (default=off)",
-  "  -e, --combined               Combined-dab mode  (default=off)",
-  "  -f, --test                   Test mode  (default=off)",
-  "  -g, --testcount              Test count mode  (default=off)",
-  "  -h, --testcombined           Test count mode  (default=off)",
-  "  -v, --visualize              Visualization mode  (default=off)",
+  "  -d, --dab                     Sparse Dab mode  (default=off)",
+  "  -e, --combined                Combined-dab mode  (default=off)",
+  "  -f, --test                    Test mode  (default=off)",
+  "  -g, --testcount               Test count mode  (default=off)",
+  "  -h, --testcombined            Test count mode  (default=off)",
+  "  -v, --visualize               Visualization mode  (default=off)",
   "\nCombined-DAB / Visualization mode:",
-  "  -b, --dab_basename=filename  Combined-dab basename, also shared with Test \n                                 Mode",
+  "  -b, --dab_basename=filename   Combined-dab basename, also shared with Test \n                                  Mode",
+  "  -t, --top_genes=INT           Top genes to visualize (for combined-dab)  \n                                  (default=`100')",
+  "  -E, --generate_dot            Generate a dot file (for combined-dab)  \n                                  (default=off)",
+  "  -P, --print_distr             Print distribution of edge values  \n                                  (default=off)",
   "\nVisualization mode:",
-  "  -c, --cutoff=FLOAT           Cutoff value  (default=`0.0001')",
-  "  -G, --genome=filename        Genome mapping file",
+  "  -c, --cutoff=FLOAT            Cutoff value  (default=`0.0001')",
+  "  -G, --genome=filename         Genome mapping file",
   "\nSparse DAB mode:",
-  "  -V, --dab_list=filename      DAB list",
-  "  -I, --num_iter=INT           Number of iterations  (default=`0')",
-  "  -T, --default_type=INT       Default gene index type (choose unsigned short \n                                 for genes, or unsigned int (32-bit) for \n                                 transcripts) (required for DAB mode) (0 - \n                                 unsigned int, 1 - unsigned short)  \n                                 (default=`-1')",
-  "  -R, --rbp_p=FLOAT            RBP p parameter (must be specified) (p<1.0) \n                                 (recommended > 0.95)  (default=`-1')",
-  "  -M, --max_rank=INT           Maximum rank number in the sparse DAB matrix \n                                 (must be specified)  (default=`-1')",
+  "  -V, --dab_list=filename       DAB list",
+  "  -I, --num_iter=INT            Number of iterations  (default=`0')",
+  "  -T, --default_type=INT        Default gene index type (choose unsigned short \n                                  for genes, or unsigned int (32-bit) for \n                                  transcripts) (required for DAB mode) (0 - \n                                  unsigned int, 1 - unsigned short)  \n                                  (default=`-1')",
+  "  -R, --rbp_p=FLOAT             RBP p parameter (must be specified) (p<1.0) \n                                  (recommended > 0.95)  (default=`-1')",
+  "  -M, --max_rank=INT            Maximum rank number in the sparse DAB matrix \n                                  (must be specified)  (default=`-1')",
+  "  -H, --dset_cutoff_file=filename\n                                Dataset score cutoff file  (default=`NA')",
   "\nInput:",
-  "  -i, --input=filename         Gene mapping file",
-  "  -q, --query=filename         Query file",
-  "  -F, --dab_dir=directory      DAB directory",
+  "  -i, --input=filename          Gene mapping file",
+  "  -q, --query=filename          Query file",
+  "  -F, --dab_dir=directory       DAB directory",
+  "  -Q, --not_query=filename      NOT Query file (optional, for combined-DAB)  \n                                  (default=`NA')",
   "\nOutput:",
-  "  -D, --dir_out=directory      Output directory",
+  "  -D, --dir_out=directory       Output directory",
     0
 };
 
   args_info->testcombined_given = 0 ;
   args_info->visualize_given = 0 ;
   args_info->dab_basename_given = 0 ;
+  args_info->top_genes_given = 0 ;
+  args_info->generate_dot_given = 0 ;
+  args_info->print_distr_given = 0 ;
   args_info->cutoff_given = 0 ;
   args_info->genome_given = 0 ;
   args_info->dab_list_given = 0 ;
   args_info->default_type_given = 0 ;
   args_info->rbp_p_given = 0 ;
   args_info->max_rank_given = 0 ;
+  args_info->dset_cutoff_file_given = 0 ;
   args_info->input_given = 0 ;
   args_info->query_given = 0 ;
   args_info->dab_dir_given = 0 ;
+  args_info->not_query_given = 0 ;
   args_info->dir_out_given = 0 ;
 }
 
   args_info->visualize_flag = 0;
   args_info->dab_basename_arg = NULL;
   args_info->dab_basename_orig = NULL;
+  args_info->top_genes_arg = 100;
+  args_info->top_genes_orig = NULL;
+  args_info->generate_dot_flag = 0;
+  args_info->print_distr_flag = 0;
   args_info->cutoff_arg = 0.0001;
   args_info->cutoff_orig = NULL;
   args_info->genome_arg = NULL;
   args_info->rbp_p_orig = NULL;
   args_info->max_rank_arg = -1;
   args_info->max_rank_orig = NULL;
+  args_info->dset_cutoff_file_arg = gengetopt_strdup ("NA");
+  args_info->dset_cutoff_file_orig = NULL;
   args_info->input_arg = NULL;
   args_info->input_orig = NULL;
   args_info->query_arg = NULL;
   args_info->query_orig = NULL;
   args_info->dab_dir_arg = NULL;
   args_info->dab_dir_orig = NULL;
+  args_info->not_query_arg = gengetopt_strdup ("NA");
+  args_info->not_query_orig = NULL;
   args_info->dir_out_arg = NULL;
   args_info->dir_out_orig = NULL;
   
   args_info->testcombined_help = gengetopt_args_info_help[7] ;
   args_info->visualize_help = gengetopt_args_info_help[8] ;
   args_info->dab_basename_help = gengetopt_args_info_help[10] ;
-  args_info->cutoff_help = gengetopt_args_info_help[12] ;
-  args_info->genome_help = gengetopt_args_info_help[13] ;
-  args_info->dab_list_help = gengetopt_args_info_help[15] ;
-  args_info->num_iter_help = gengetopt_args_info_help[16] ;
-  args_info->default_type_help = gengetopt_args_info_help[17] ;
-  args_info->rbp_p_help = gengetopt_args_info_help[18] ;
-  args_info->max_rank_help = gengetopt_args_info_help[19] ;
-  args_info->input_help = gengetopt_args_info_help[21] ;
-  args_info->query_help = gengetopt_args_info_help[22] ;
-  args_info->dab_dir_help = gengetopt_args_info_help[23] ;
-  args_info->dir_out_help = gengetopt_args_info_help[25] ;
+  args_info->top_genes_help = gengetopt_args_info_help[11] ;
+  args_info->generate_dot_help = gengetopt_args_info_help[12] ;
+  args_info->print_distr_help = gengetopt_args_info_help[13] ;
+  args_info->cutoff_help = gengetopt_args_info_help[15] ;
+  args_info->genome_help = gengetopt_args_info_help[16] ;
+  args_info->dab_list_help = gengetopt_args_info_help[18] ;
+  args_info->num_iter_help = gengetopt_args_info_help[19] ;
+  args_info->default_type_help = gengetopt_args_info_help[20] ;
+  args_info->rbp_p_help = gengetopt_args_info_help[21] ;
+  args_info->max_rank_help = gengetopt_args_info_help[22] ;
+  args_info->dset_cutoff_file_help = gengetopt_args_info_help[23] ;
+  args_info->input_help = gengetopt_args_info_help[25] ;
+  args_info->query_help = gengetopt_args_info_help[26] ;
+  args_info->dab_dir_help = gengetopt_args_info_help[27] ;
+  args_info->not_query_help = gengetopt_args_info_help[28] ;
+  args_info->dir_out_help = gengetopt_args_info_help[30] ;
   
 }
 
   unsigned int i;
   free_string_field (&(args_info->dab_basename_arg));
   free_string_field (&(args_info->dab_basename_orig));
+  free_string_field (&(args_info->top_genes_orig));
   free_string_field (&(args_info->cutoff_orig));
   free_string_field (&(args_info->genome_arg));
   free_string_field (&(args_info->genome_orig));
   free_string_field (&(args_info->default_type_orig));
   free_string_field (&(args_info->rbp_p_orig));
   free_string_field (&(args_info->max_rank_orig));
+  free_string_field (&(args_info->dset_cutoff_file_arg));
+  free_string_field (&(args_info->dset_cutoff_file_orig));
   free_string_field (&(args_info->input_arg));
   free_string_field (&(args_info->input_orig));
   free_string_field (&(args_info->query_arg));
   free_string_field (&(args_info->query_orig));
   free_string_field (&(args_info->dab_dir_arg));
   free_string_field (&(args_info->dab_dir_orig));
+  free_string_field (&(args_info->not_query_arg));
+  free_string_field (&(args_info->not_query_orig));
   free_string_field (&(args_info->dir_out_arg));
   free_string_field (&(args_info->dir_out_orig));
   
     write_into_file(outfile, "visualize", 0, 0 );
   if (args_info->dab_basename_given)
     write_into_file(outfile, "dab_basename", args_info->dab_basename_orig, 0);
+  if (args_info->top_genes_given)
+    write_into_file(outfile, "top_genes", args_info->top_genes_orig, 0);
+  if (args_info->generate_dot_given)
+    write_into_file(outfile, "generate_dot", 0, 0 );
+  if (args_info->print_distr_given)
+    write_into_file(outfile, "print_distr", 0, 0 );
   if (args_info->cutoff_given)
     write_into_file(outfile, "cutoff", args_info->cutoff_orig, 0);
   if (args_info->genome_given)
     write_into_file(outfile, "rbp_p", args_info->rbp_p_orig, 0);
   if (args_info->max_rank_given)
     write_into_file(outfile, "max_rank", args_info->max_rank_orig, 0);
+  if (args_info->dset_cutoff_file_given)
+    write_into_file(outfile, "dset_cutoff_file", args_info->dset_cutoff_file_orig, 0);
   if (args_info->input_given)
     write_into_file(outfile, "input", args_info->input_orig, 0);
   if (args_info->query_given)
     write_into_file(outfile, "query", args_info->query_orig, 0);
   if (args_info->dab_dir_given)
     write_into_file(outfile, "dab_dir", args_info->dab_dir_orig, 0);
+  if (args_info->not_query_given)
+    write_into_file(outfile, "not_query", args_info->not_query_orig, 0);
   if (args_info->dir_out_given)
     write_into_file(outfile, "dir_out", args_info->dir_out_orig, 0);
   
         { "testcombined",	0, NULL, 'h' },
         { "visualize",	0, NULL, 'v' },
         { "dab_basename",	1, NULL, 'b' },
+        { "top_genes",	1, NULL, 't' },
+        { "generate_dot",	0, NULL, 'E' },
+        { "print_distr",	0, NULL, 'P' },
         { "cutoff",	1, NULL, 'c' },
         { "genome",	1, NULL, 'G' },
         { "dab_list",	1, NULL, 'V' },
         { "default_type",	1, NULL, 'T' },
         { "rbp_p",	1, NULL, 'R' },
         { "max_rank",	1, NULL, 'M' },
+        { "dset_cutoff_file",	1, NULL, 'H' },
         { "input",	1, NULL, 'i' },
         { "query",	1, NULL, 'q' },
         { "dab_dir",	1, NULL, 'F' },
+        { "not_query",	1, NULL, 'Q' },
         { "dir_out",	1, NULL, 'D' },
         { NULL,	0, NULL, 0 }
       };
 
-      c = getopt_long (argc, argv, "defghvb:c:G:V:I:T:R:M:i:q:F:D:", long_options, &option_index);
+      c = getopt_long (argc, argv, "defghvb:t:EPc:G:V:I:T:R:M:H:i:q:F:Q:D:", long_options, &option_index);
 
       if (c == -1) break;	/* Exit from `while (1)' loop.  */
 
             goto failure;
         
           break;
+        case 't':	/* Top genes to visualize (for combined-dab).  */
+        
+        
+          if (update_arg( (void *)&(args_info->top_genes_arg), 
+               &(args_info->top_genes_orig), &(args_info->top_genes_given),
+              &(local_args_info.top_genes_given), optarg, 0, "100", ARG_INT,
+              check_ambiguity, override, 0, 0,
+              "top_genes", 't',
+              additional_error))
+            goto failure;
+        
+          break;
+        case 'E':	/* Generate a dot file (for combined-dab).  */
+        
+        
+          if (update_arg((void *)&(args_info->generate_dot_flag), 0, &(args_info->generate_dot_given),
+              &(local_args_info.generate_dot_given), optarg, 0, 0, ARG_FLAG,
+              check_ambiguity, override, 1, 0, "generate_dot", 'E',
+              additional_error))
+            goto failure;
+        
+          break;
+        case 'P':	/* Print distribution of edge values.  */
+        
+        
+          if (update_arg((void *)&(args_info->print_distr_flag), 0, &(args_info->print_distr_given),
+              &(local_args_info.print_distr_given), optarg, 0, 0, ARG_FLAG,
+              check_ambiguity, override, 1, 0, "print_distr", 'P',
+              additional_error))
+            goto failure;
+        
+          break;
         case 'c':	/* Cutoff value.  */
         
         
             goto failure;
         
           break;
+        case 'H':	/* Dataset score cutoff file.  */
+        
+        
+          if (update_arg( (void *)&(args_info->dset_cutoff_file_arg), 
+               &(args_info->dset_cutoff_file_orig), &(args_info->dset_cutoff_file_given),
+              &(local_args_info.dset_cutoff_file_given), optarg, 0, "NA", ARG_STRING,
+              check_ambiguity, override, 0, 0,
+              "dset_cutoff_file", 'H',
+              additional_error))
+            goto failure;
+        
+          break;
         case 'i':	/* Gene mapping file.  */
         
         
             goto failure;
         
           break;
+        case 'Q':	/* NOT Query file (optional, for combined-DAB).  */
+        
+        
+          if (update_arg( (void *)&(args_info->not_query_arg), 
+               &(args_info->not_query_orig), &(args_info->not_query_given),
+              &(local_args_info.not_query_given), optarg, 0, "NA", ARG_STRING,
+              check_ambiguity, override, 0, 0,
+              "not_query", 'Q',
+              additional_error))
+            goto failure;
+        
+          break;
         case 'D':	/* Output directory.  */
         
         

tools/SeekIterative/cmdline.h

   char * dab_basename_arg;	/**< @brief Combined-dab basename, also shared with Test Mode.  */
   char * dab_basename_orig;	/**< @brief Combined-dab basename, also shared with Test Mode original value given at command line.  */
   const char *dab_basename_help; /**< @brief Combined-dab basename, also shared with Test Mode help description.  */
+  int top_genes_arg;	/**< @brief Top genes to visualize (for combined-dab) (default='100').  */
+  char * top_genes_orig;	/**< @brief Top genes to visualize (for combined-dab) original value given at command line.  */
+  const char *top_genes_help; /**< @brief Top genes to visualize (for combined-dab) help description.  */
+  int generate_dot_flag;	/**< @brief Generate a dot file (for combined-dab) (default=off).  */
+  const char *generate_dot_help; /**< @brief Generate a dot file (for combined-dab) help description.  */
+  int print_distr_flag;	/**< @brief Print distribution of edge values (default=off).  */
+  const char *print_distr_help; /**< @brief Print distribution of edge values help description.  */
   float cutoff_arg;	/**< @brief Cutoff value (default='0.0001').  */
   char * cutoff_orig;	/**< @brief Cutoff value original value given at command line.  */
   const char *cutoff_help; /**< @brief Cutoff value help description.  */
   int max_rank_arg;	/**< @brief Maximum rank number in the sparse DAB matrix (must be specified) (default='-1').  */
   char * max_rank_orig;	/**< @brief Maximum rank number in the sparse DAB matrix (must be specified) original value given at command line.  */
   const char *max_rank_help; /**< @brief Maximum rank number in the sparse DAB matrix (must be specified) help description.  */
+  char * dset_cutoff_file_arg;	/**< @brief Dataset score cutoff file (default='NA').  */
+  char * dset_cutoff_file_orig;	/**< @brief Dataset score cutoff file original value given at command line.  */
+  const char *dset_cutoff_file_help; /**< @brief Dataset score cutoff file help description.  */
   char * input_arg;	/**< @brief Gene mapping file.  */
   char * input_orig;	/**< @brief Gene mapping file original value given at command line.  */
   const char *input_help; /**< @brief Gene mapping file help description.  */
   char * dab_dir_arg;	/**< @brief DAB directory.  */
   char * dab_dir_orig;	/**< @brief DAB directory original value given at command line.  */
   const char *dab_dir_help; /**< @brief DAB directory help description.  */
+  char * not_query_arg;	/**< @brief NOT Query file (optional, for combined-DAB) (default='NA').  */
+  char * not_query_orig;	/**< @brief NOT Query file (optional, for combined-DAB) original value given at command line.  */
+  const char *not_query_help; /**< @brief NOT Query file (optional, for combined-DAB) help description.  */
   char * dir_out_arg;	/**< @brief Output directory.  */
   char * dir_out_orig;	/**< @brief Output directory original value given at command line.  */
   const char *dir_out_help; /**< @brief Output directory help description.  */
   unsigned int testcombined_given ;	/**< @brief Whether testcombined was given.  */
   unsigned int visualize_given ;	/**< @brief Whether visualize was given.  */
   unsigned int dab_basename_given ;	/**< @brief Whether dab_basename was given.  */
+  unsigned int top_genes_given ;	/**< @brief Whether top_genes was given.  */
+  unsigned int generate_dot_given ;	/**< @brief Whether generate_dot was given.  */
+  unsigned int print_distr_given ;	/**< @brief Whether print_distr was given.  */
   unsigned int cutoff_given ;	/**< @brief Whether cutoff was given.  */
   unsigned int genome_given ;	/**< @brief Whether genome was given.  */
   unsigned int dab_list_given ;	/**< @brief Whether dab_list was given.  */
   unsigned int default_type_given ;	/**< @brief Whether default_type was given.  */
   unsigned int rbp_p_given ;	/**< @brief Whether rbp_p was given.  */
   unsigned int max_rank_given ;	/**< @brief Whether max_rank was given.  */
+  unsigned int dset_cutoff_file_given ;	/**< @brief Whether dset_cutoff_file was given.  */
   unsigned int input_given ;	/**< @brief Whether input was given.  */
   unsigned int query_given ;	/**< @brief Whether query was given.  */
   unsigned int dab_dir_given ;	/**< @brief Whether dab_dir was given.  */
+  unsigned int not_query_given ;	/**< @brief Whether not_query was given.  */
   unsigned int dir_out_given ;	/**< @brief Whether dir_out was given.  */
 
   char **inputs ; /**< @brief unamed options (options without names) */

tools/SeekPrep/SeekPrep.cpp

 			if(norm_mode==RANK_NORM){
 				CSeekWriter::ReadSeekSparseMatrix<tType>(dabfile.c_str(), sm, d1, 
 					MAX_RANK, rbp_p, vecstrGenes);
+				//NEW
+				CSeekWriter::RemoveDominant<tType>(sm, d1, vecstrGenes);
+				CSeekWriter::RemoveDominant<tType>(sm, d1, vecstrGenes);
 			}else if(norm_mode==Z_NORM){
 				CSeekWriter::ReadSeekSparseMatrix<tType>(dabfile.c_str(), sm, d1,
 					vecstrGenes, (int) (0.10*vecstrGenes.size()), exp);
 			CSeekWriter::SumSparseMatrix(sm, res, d1, w[i]);
 			fprintf(stderr, "Finished Summing...\n");
 		}
+
+		/*for(i=0; i<vecstrGenes.size(); i++){
+			for(j=i+1; j<vecstrGenes.size(); j++){
+				float v = res.Get(i,j);
+				if(CMeta::IsNaN(v)){
+					fprintf(stderr, "Error, %d %d is nan\n", i,j);
+				}
+			}
+		}*/
+
 		ofstream ofsm;
 		string ofile = outdir + "/" + outdab + ".half";
 		ofsm.open(ofile.c_str(), ios_base::binary);
 				fprintf(stderr, "Invalid default type!\n");
 				return -1;
 			}
+
+			//TEST========================================
+			/*CSeekIntIntMap d1(vecstrGenes.size());
+			CSparseFlatMatrix<float> sm(0);
+			float rbp_p = sArgs.rbp_p_arg;
+			CSeekWriter::ReadSeekSparseMatrix<unsigned short>(outFile, sm, d1, 
+			max_rank, rbp_p, vecstrGenes);*/
+			//============================================
+
 			/*fprintf(stderr, "Begin\n");
 			vector<unsigned short> l;
 			vector<vector<float> > mat;
 			fprintf(stderr, "Invalid default type!\n");
 			return -1;
 		}
-
 	}
 
 	if(sArgs.combined_dab_flag==1){

tools/SeekReader/SeekReader.cpp

 	for(i=0; i<vecstrGenes.size(); i++)
 		mapstrintGene[vecstrGenes[i]] = i;
 
+	if(sArgs.weight_flag==1){
+		vector<string> vecstrDataset;
+		if(!CSeekTools::ReadListOneColumn(sArgs.dweight_map_arg, vecstrDataset))
+			return false;
+
+		vector<vector<float> > vec_score;
+		vector<vector<float> > orig_score;
+		utype i, j;
+		int num_query = sArgs.dweight_num_arg; //random query
+		orig_score.resize(num_query);
+
+		vec_score.resize(vecstrDataset.size());
+		for(j=0; j<vec_score.size(); j++)
+			vec_score[j].resize(num_query);
+
+		char x[256];
+		for(i=0; i<num_query; i++){ //i is query id
+			vector<float> v;
+			sprintf(x, "%s/%d.dweight", sArgs.dweight_dir_arg, i);
+			CSeekTools::ReadArray(x, v);
+			orig_score[i] = v;
+			for(j=0; j<vec_score.size(); j++) //j is dataset id
+				vec_score[j][i] = v[j];
+		}
+
+		vector<float> score_cutoff;
+		score_cutoff.resize(vec_score.size());
+		for(j=0; j<vec_score.size(); j++){
+			sort(vec_score[j].begin(), vec_score[j].end());
+			score_cutoff[j] = vec_score[j][899];
+			//fprintf(stderr, "Dataset %d: %.4e\n", j, vec_score[j][899]);
+		}
+
+		sprintf(x, "/tmp/dataset.cutoff");
+		CSeekTools::WriteArray(x, score_cutoff);
+
+		vector<int> numGoodDataset;
+		CSeekTools::InitVector(numGoodDataset, num_query, (int) 0);
+		for(i=0; i<num_query; i++) //i is query id
+			for(j=0; j<vecstrDataset.size(); j++)
+				if(orig_score[i][j]>vec_score[j][899])
+					numGoodDataset[i]++;
+
+		sort(numGoodDataset.begin(), numGoodDataset.end());
+		fprintf(stderr, "10 percentile %d\n", numGoodDataset[99]);
+		fprintf(stderr, "90 percentile %d\n", numGoodDataset[899]);
+
+		int test_num_query = sArgs.dweight_test_num_arg;
+		for(i=0; i<test_num_query; i++){ //i is query id
+			vector<float> v;
+			sprintf(x, "%s/%d.dweight", sArgs.dweight_test_dir_arg, i);
+			CSeekTools::ReadArray(x, v);
+			int numGood = 0;
+			for(j=0; j<v.size(); j++)
+				if(v[j]>vec_score[j][899])
+					numGood++;
+			fprintf(stderr, "Query %d: ", i);
+			if(numGood > numGoodDataset[899])
+				fprintf(stderr, "Unique (upper)\n");
+			else if(numGood < numGoodDataset[99])
+				fprintf(stderr, "Unique (lower)\n");
+			else
+				fprintf(stderr, "Not unique\n");
+		}
+		fprintf(stderr, "Done!\n");
+	}
+
 	if(sArgs.dataset_flag==1){
+		string db = sArgs.db_arg;
+		string dset_list = sArgs.dset_list_arg;
+		string dir_in = sArgs.dir_in_arg;
+		string dir_prep = sArgs.dir_prep_in_arg;
+		if(db=="NA" || dset_list=="NA" || dir_in=="NA" ||
+		dir_prep=="NA"){
+			fprintf(stderr, "Requires: -x, -X, -d -p\n");
+			return false;
+		}
+
 		vector<string> vecstrDP, vecstrUserDP;
 		//dataset-platform mapping (required)
 		if(!CSeekTools::ReadListTwoColumns(sArgs.db_arg, vecstrDatasets, vecstrDP))
 			vc[i]->SetPlatform(vp[platform_id]);
 		}
 
-			
 		//fprintf(stderr, "Finished reading prep\n");
 
 		for(i=0; i<iDatasets; i++) vc[i]->InitializeGeneMap();
 
-
 		for(i=0; i<vecstrGenes.size(); i++){
 			utype ii = mapstrintGene[vecstrGenes[i]];
 			fprintf(stderr, "Gene %s ", vecstrGenes[i].c_str());
 		fprintf(stderr, "Done\n");
 		return false;
 
-
-
 		vector<vector<string> > vecstrQueries;
 		string multiQuery = sArgs.multi_query_arg;
 
 
 	}
 	else if(sArgs.databaselet_flag==1){
+
+		string db = sArgs.db_arg;
+		string dset_list = sArgs.dset_list_arg;
+		string dir_in = sArgs.dir_in_arg;
+		string dir_prep = sArgs.dir_prep_in_arg;
+		string single_query = sArgs.single_query_arg;
+
+		if(db=="NA" || dset_list=="NA" || dir_in=="NA" ||
+		dir_prep=="NA" || single_query=="NA"){
+			fprintf(stderr, "Requires: -x, -X, -d -p -q\n");
+			return false;
+		}
+
 		bool useNibble = false;
 		if(sArgs.is_nibble_flag==1) useNibble = true;
 		CDatabase DB(useNibble);
 			}
 		}
 
-
-	}else{
-		cerr << "Must give a db list." << endl;
-		return 1;
-
 	}
 
 #ifdef WIN32

tools/SeekReader/SeekReader.ggo

 								flag	off
 option	"dataset"			A	"Check which datasets contain query of interest, based on .gpres file"
 								flag	off
+option	"weight"			W	"Test dataset weights"
+								flag	off
 
+section "Weight"
+option	"dweight_dir"		E	"Dataset weight directory"
+								string typestr="directory" default="NA"
+option	"dweight_num"		n	"Number of .dweight files"
+								int	default="1000"
+option	"dweight_map"		M	"Dataset mapping file"
+								string typestr="filename" default="NA"
+option	"dweight_test_dir"	F	"Test dataset weight directory"
+								string typestr="directory" default="NA"
+option	"dweight_test_num"	G	"Test number of .dweight files"
+								int	default="1000"
 
 section "Main"
 option	"order_stat_single_gene_query"		O	"Order statistics mode (single-gene query)"
 								flag	off
 option	"db"				x	"Input dataset-platform definition"
-								string	typestr="filename"	yes
+								string	typestr="filename"
 option	"dset_list"			X	"Input a set of datasets"
-								string	typestr="filename"	yes
+								string	typestr="filename"
 option	"input"				i	"Input gene mapping"
-								string	typestr="filename"	yes
+								string	typestr="filename"
 option	"single_query"		q	"Query gene list"
 								string	typestr="filename"
 option	"dir_in"			d	"Database directory"
-								string	typestr="directory"	yes
+								string	typestr="directory"
 option	"dir_prep_in"		p	"Prep directory (containing .gavg, .gpres files)"
-								string	typestr="directory"	yes				
+								string	typestr="directory"				
 option	"dir_gvar_in"		r	"Prep directory (containing .gexpvar files)"
 								string	typestr="directory"	default="NA"	
 option	"dir_sinfo_in"		s	"Sinfo directory (containing .sinfo files)"

tools/SeekReader/cmdline.c

   "\nDiagnosis:",
   "  -D, --databaselet             Display values from databaselet(s)  \n                                  (default=off)",
   "  -A, --dataset                 Check which datasets contain query of interest, \n                                  based on .gpres file  (default=off)",
+  "  -W, --weight                  Test dataset weights  (default=off)",
+  "\nWeight:",
+  "  -E, --dweight_dir=directory   Dataset weight directory  (default=`NA')",
+  "  -n, --dweight_num=INT         Number of .dweight files  (default=`1000')",
+  "  -M, --dweight_map=filename    Dataset mapping file  (default=`NA')",
+  "  -F, --dweight_test_dir=directory\n                                Test dataset weight directory  (default=`NA')",
+  "  -G, --dweight_test_num=INT    Test number of .dweight files  (default=`1000')",
   "\nMain:",
   "  -O, --order_stat_single_gene_query\n                                Order statistics mode (single-gene query)  \n                                  (default=off)",
   "  -x, --db=filename             Input dataset-platform definition",
 typedef enum {ARG_NO
   , ARG_FLAG
   , ARG_STRING
+  , ARG_INT
   , ARG_FLOAT
 } cmdline_parser_arg_type;
 
 cmdline_parser_internal (int argc, char * const *argv, struct gengetopt_args_info *args_info,
                         struct cmdline_parser_params *params, const char *additional_error);
 
-static int
-cmdline_parser_required2 (struct gengetopt_args_info *args_info, const char *prog_name, const char *additional_error);
 
 static char *
 gengetopt_strdup (const char *s);
   args_info->version_given = 0 ;
   args_info->databaselet_given = 0 ;
   args_info->dataset_given = 0 ;
+  args_info->weight_given = 0 ;
+  args_info->dweight_dir_given = 0 ;
+  args_info->dweight_num_given = 0 ;
+  args_info->dweight_map_given = 0 ;
+  args_info->dweight_test_dir_given = 0 ;
+  args_info->dweight_test_num_given = 0 ;
   args_info->order_stat_single_gene_query_given = 0 ;
   args_info->db_given = 0 ;
   args_info->dset_list_given = 0 ;
 {
   args_info->databaselet_flag = 0;
   args_info->dataset_flag = 0;
+  args_info->weight_flag = 0;
+  args_info->dweight_dir_arg = gengetopt_strdup ("NA");
+  args_info->dweight_dir_orig = NULL;
+  args_info->dweight_num_arg = 1000;
+  args_info->dweight_num_orig = NULL;
+  args_info->dweight_map_arg = gengetopt_strdup ("NA");
+  args_info->dweight_map_orig = NULL;
+  args_info->dweight_test_dir_arg = gengetopt_strdup ("NA");
+  args_info->dweight_test_dir_orig = NULL;
+  args_info->dweight_test_num_arg = 1000;
+  args_info->dweight_test_num_orig = NULL;
   args_info->order_stat_single_gene_query_flag = 0;
   args_info->db_arg = NULL;
   args_info->db_orig = NULL;
   args_info->version_help = gengetopt_args_info_help[1] ;
   args_info->databaselet_help = gengetopt_args_info_help[3] ;
   args_info->dataset_help = gengetopt_args_info_help[4] ;
-  args_info->order_stat_single_gene_query_help = gengetopt_args_info_help[6] ;
-  args_info->db_help = gengetopt_args_info_help[7] ;
-  args_info->dset_list_help = gengetopt_args_info_help[8] ;
-  args_info->input_help = gengetopt_args_info_help[9] ;
-  args_info->single_query_help = gengetopt_args_info_help[10] ;
-  args_info->dir_in_help = gengetopt_args_info_help[11] ;
-  args_info->dir_prep_in_help = gengetopt_args_info_help[12] ;
-  args_info->dir_gvar_in_help = gengetopt_args_info_help[13] ;
-  args_info->dir_sinfo_in_help = gengetopt_args_info_help[14] ;
-  args_info->is_nibble_help = gengetopt_args_info_help[15] ;
-  args_info->platform_dir_help = gengetopt_args_info_help[16] ;
-  args_info->gvar_cutoff_help = gengetopt_args_info_help[17] ;
-  args_info->multi_query_help = gengetopt_args_info_help[18] ;
-  args_info->output_file_help = gengetopt_args_info_help[19] ;
+  args_info->weight_help = gengetopt_args_info_help[5] ;
+  args_info->dweight_dir_help = gengetopt_args_info_help[7] ;
+  args_info->dweight_num_help = gengetopt_args_info_help[8] ;
+  args_info->dweight_map_help = gengetopt_args_info_help[9] ;
+  args_info->dweight_test_dir_help = gengetopt_args_info_help[10] ;
+  args_info->dweight_test_num_help = gengetopt_args_info_help[11] ;
+  args_info->order_stat_single_gene_query_help = gengetopt_args_info_help[13] ;
+  args_info->db_help = gengetopt_args_info_help[14] ;
+  args_info->dset_list_help = gengetopt_args_info_help[15] ;
+  args_info->input_help = gengetopt_args_info_help[16] ;
+  args_info->single_query_help = gengetopt_args_info_help[17] ;
+  args_info->dir_in_help = gengetopt_args_info_help[18] ;
+  args_info->dir_prep_in_help = gengetopt_args_info_help[19] ;
+  args_info->dir_gvar_in_help = gengetopt_args_info_help[20] ;
+  args_info->dir_sinfo_in_help = gengetopt_args_info_help[21] ;
+  args_info->is_nibble_help = gengetopt_args_info_help[22] ;
+  args_info->platform_dir_help = gengetopt_args_info_help[23] ;
+  args_info->gvar_cutoff_help = gengetopt_args_info_help[24] ;
+  args_info->multi_query_help = gengetopt_args_info_help[25] ;
+  args_info->output_file_help = gengetopt_args_info_help[26] ;
   
 }
 
 cmdline_parser_release (struct gengetopt_args_info *args_info)
 {
   unsigned int i;
+  free_string_field (&(args_info->dweight_dir_arg));
+  free_string_field (&(args_info->dweight_dir_orig));
+  free_string_field (&(args_info->dweight_num_orig));
+  free_string_field (&(args_info->dweight_map_arg));
+  free_string_field (&(args_info->dweight_map_orig));
+  free_string_field (&(args_info->dweight_test_dir_arg));
+  free_string_field (&(args_info->dweight_test_dir_orig));
+  free_string_field (&(args_info->dweight_test_num_orig));
   free_string_field (&(args_info->db_arg));
   free_string_field (&(args_info->db_orig));
   free_string_field (&(args_info->dset_list_arg));
     write_into_file(outfile, "databaselet", 0, 0 );
   if (args_info->dataset_given)
     write_into_file(outfile, "dataset", 0, 0 );
+  if (args_info->weight_given)
+    write_into_file(outfile, "weight", 0, 0 );
+  if (args_info->dweight_dir_given)
+    write_into_file(outfile, "dweight_dir", args_info->dweight_dir_orig, 0);
+  if (args_info->dweight_num_given)
+    write_into_file(outfile, "dweight_num", args_info->dweight_num_orig, 0);
+  if (args_info->dweight_map_given)
+    write_into_file(outfile, "dweight_map", args_info->dweight_map_orig, 0);
+  if (args_info->dweight_test_dir_given)
+    write_into_file(outfile, "dweight_test_dir", args_info->dweight_test_dir_orig, 0);
+  if (args_info->dweight_test_num_given)
+    write_into_file(outfile, "dweight_test_num", args_info->dweight_test_num_orig, 0);
   if (args_info->order_stat_single_gene_query_given)
     write_into_file(outfile, "order_stat_single_gene_query", 0, 0 );
   if (args_info->db_given)
 int
 cmdline_parser_required (struct gengetopt_args_info *args_info, const char *prog_name)
 {
-  int result = EXIT_SUCCESS;
-
-  if (cmdline_parser_required2(args_info, prog_name, NULL) > 0)
-    result = EXIT_FAILURE;
-
-  return result;
-}
-
-int
-cmdline_parser_required2 (struct gengetopt_args_info *args_info, const char *prog_name, const char *additional_error)
-{
-  int error = 0;
-
-  /* checks for required options */
-  if (! args_info->db_given)
-    {
-      fprintf (stderr, "%s: '--db' ('-x') option required%s\n", prog_name, (additional_error ? additional_error : ""));
-      error = 1;
-    }
-  
-  if (! args_info->dset_list_given)
-    {
-      fprintf (stderr, "%s: '--dset_list' ('-X') option required%s\n", prog_name, (additional_error ? additional_error : ""));
-      error = 1;
-    }
-  
-  if (! args_info->input_given)
-    {
-      fprintf (stderr, "%s: '--input' ('-i') option required%s\n", prog_name, (additional_error ? additional_error : ""));
-      error = 1;
-    }
-  
-  if (! args_info->dir_in_given)
-    {
-      fprintf (stderr, "%s: '--dir_in' ('-d') option required%s\n", prog_name, (additional_error ? additional_error : ""));
-      error = 1;
-    }
-  
-  if (! args_info->dir_prep_in_given)
-    {
-      fprintf (stderr, "%s: '--dir_prep_in' ('-p') option required%s\n", prog_name, (additional_error ? additional_error : ""));
-      error = 1;
-    }
-  
-  
-  /* checks for dependences among options */
-
-  return error;
+  return EXIT_SUCCESS;
 }
 
 
   case ARG_FLAG:
     *((int *)field) = !*((int *)field);
     break;
+  case ARG_INT:
+    if (val) *((int *)field) = strtol (val, &stop_char, 0);
+    break;
   case ARG_FLOAT:
     if (val) *((float *)field) = (float)strtod (val, &stop_char);
     break;
 
   /* check numeric conversion */
   switch(arg_type) {
+  case ARG_INT:
   case ARG_FLOAT:
     if (val && !(stop_char && *stop_char == '\0')) {
       fprintf(stderr, "%s: invalid numeric value: %s\n", package_name, val);
         { "version",	0, NULL, 'V' },
         { "databaselet",	0, NULL, 'D' },
         { "dataset",	0, NULL, 'A' },
+        { "weight",	0, NULL, 'W' },
+        { "dweight_dir",	1, NULL, 'E' },
+        { "dweight_num",	1, NULL, 'n' },
+        { "dweight_map",	1, NULL, 'M' },
+        { "dweight_test_dir",	1, NULL, 'F' },
+        { "dweight_test_num",	1, NULL, 'G' },
         { "order_stat_single_gene_query",	0, NULL, 'O' },
         { "db",	1, NULL, 'x' },
         { "dset_list",	1, NULL, 'X' },
         { NULL,	0, NULL, 0 }
       };
 
-      c = getopt_long (argc, argv, "hVDAOx:X:i:q:d:p:r:s:NP:v:Q:o:", long_options, &option_index);
+      c = getopt_long (argc, argv, "hVDAWE:n:M:F:G:Ox:X:i:q:d:p:r:s:NP:v:Q:o:", long_options, &option_index);
 
       if (c == -1) break;	/* Exit from `while (1)' loop.  */
 
             goto failure;
         
           break;
+        case 'W':	/* Test dataset weights.  */
+        
+        
+          if (update_arg((void *)&(args_info->weight_flag), 0, &(args_info->weight_given),
+              &(local_args_info.weight_given), optarg, 0, 0, ARG_FLAG,
+              check_ambiguity, override, 1, 0, "weight", 'W',
+              additional_error))
+            goto failure;
+        
+          break;
+        case 'E':	/* Dataset weight directory.  */
+        
+        
+          if (update_arg( (void *)&(args_info->dweight_dir_arg), 
+               &(args_info->dweight_dir_orig), &(args_info->dweight_dir_given),
+              &(local_args_info.dweight_dir_given), optarg, 0, "NA", ARG_STRING,
+              check_ambiguity, override, 0, 0,
+              "dweight_dir", 'E',
+              additional_error))
+            goto failure;
+        
+          break;
+        case 'n':	/* Number of .dweight files.  */
+        
+        
+          if (update_arg( (void *)&(args_info->dweight_num_arg), 
+               &(args_info->dweight_num_orig), &(args_info->dweight_num_given),
+              &(local_args_info.dweight_num_given), optarg, 0, "1000", ARG_INT,
+              check_ambiguity, override, 0, 0,
+              "dweight_num", 'n',
+              additional_error))
+            goto failure;
+        
+          break;
+        case 'M':	/* Dataset mapping file.  */
+        
+        
+          if (update_arg( (void *)&(args_info->dweight_map_arg), 
+               &(args_info->dweight_map_orig), &(args_info->dweight_map_given),
+              &(local_args_info.dweight_map_given), optarg, 0, "NA", ARG_STRING,
+              check_ambiguity, override, 0, 0,
+              "dweight_map", 'M',
+              additional_error))
+            goto failure;
+        
+          break;
+        case 'F':	/* Test dataset weight directory.  */
+        
+        
+          if (update_arg( (void *)&(args_info->dweight_test_dir_arg), 
+               &(args_info->dweight_test_dir_orig), &(args_info->dweight_test_dir_given),
+              &(local_args_info.dweight_test_dir_given), optarg, 0, "NA", ARG_STRING,
+              check_ambiguity, override, 0, 0,
+              "dweight_test_dir", 'F',
+              additional_error))
+            goto failure;
+        
+          break;
+        case 'G':	/* Test number of .dweight files.  */
+        
+        
+          if (update_arg( (void *)&(args_info->dweight_test_num_arg), 
+               &(args_info->dweight_test_num_orig), &(args_info->dweight_test_num_given),
+              &(local_args_info.dweight_test_num_given), optarg, 0, "1000", ARG_INT,
+              check_ambiguity, override, 0, 0,
+              "dweight_test_num", 'G',
+              additional_error))
+            goto failure;
+        
+          break;
         case 'O':	/* Order statistics mode (single-gene query).  */
         
         
 
 
 
-  if (check_required)
-    {
-      error += cmdline_parser_required2 (args_info, argv[0], additional_error);
-    }
 
   cmdline_parser_release (&local_args_info);
 

tools/SeekReader/cmdline.h

   const char *databaselet_help; /**< @brief Display values from databaselet(s) help description.  */
   int dataset_flag;	/**< @brief Check which datasets contain query of interest, based on .gpres file (default=off).  */
   const char *dataset_help; /**< @brief Check which datasets contain query of interest, based on .gpres file help description.  */
+  int weight_flag;	/**< @brief Test dataset weights (default=off).  */
+  const char *weight_help; /**< @brief Test dataset weights help description.  */
+  char * dweight_dir_arg;	/**< @brief Dataset weight directory (default='NA').  */
+  char * dweight_dir_orig;	/**< @brief Dataset weight directory original value given at command line.  */
+  const char *dweight_dir_help; /**< @brief Dataset weight directory help description.  */
+  int dweight_num_arg;	/**< @brief Number of .dweight files (default='1000').  */
+  char * dweight_num_orig;	/**< @brief Number of .dweight files original value given at command line.  */
+  const char *dweight_num_help; /**< @brief Number of .dweight files help description.  */
+  char * dweight_map_arg;	/**< @brief Dataset mapping file (default='NA').  */
+  char * dweight_map_orig;	/**< @brief Dataset mapping file original value given at command line.  */
+  const char *dweight_map_help; /**< @brief Dataset mapping file help description.  */
+  char * dweight_test_dir_arg;	/**< @brief Test dataset weight directory (default='NA').  */
+  char * dweight_test_dir_orig;	/**< @brief Test dataset weight directory original value given at command line.  */
+  const char *dweight_test_dir_help; /**< @brief Test dataset weight directory help description.  */
+  int dweight_test_num_arg;	/**< @brief Test number of .dweight files (default='1000').  */
+  char * dweight_test_num_orig;	/**< @brief Test number of .dweight files original value given at command line.  */
+  const char *dweight_test_num_help; /**< @brief Test number of .dweight files help description.  */
   int order_stat_single_gene_query_flag;	/**< @brief Order statistics mode (single-gene query) (default=off).  */
   const char *order_stat_single_gene_query_help; /**< @brief Order statistics mode (single-gene query) help description.  */
   char * db_arg;	/**< @brief Input dataset-platform definition.  */
   unsigned int version_given ;	/**< @brief Whether version was given.  */
   unsigned int databaselet_given ;	/**< @brief Whether databaselet was given.  */
   unsigned int dataset_given ;	/**< @brief Whether dataset was given.  */
+  unsigned int weight_given ;	/**< @brief Whether weight was given.  */
+  unsigned int dweight_dir_given ;	/**< @brief Whether dweight_dir was given.  */
+  unsigned int dweight_num_given ;	/**< @brief Whether dweight_num was given.  */
+  unsigned int dweight_map_given ;	/**< @brief Whether dweight_map was given.  */
+  unsigned int dweight_test_dir_given ;	/**< @brief Whether dweight_test_dir was given.  */
+  unsigned int dweight_test_num_given ;	/**< @brief Whether dweight_test_num was given.  */
   unsigned int order_stat_single_gene_query_given ;	/**< @brief Whether order_stat_single_gene_query was given.  */
   unsigned int db_given ;	/**< @brief Whether db was given.  */
   unsigned int dset_list_given ;	/**< @brief Whether dset_list was given.  */
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.