Commits

Qian Zhu  committed da2d8ee

converting float addition to integer addition in LinearCombine

  • Participants
  • Parent commits 7006cae
  • Branches search_project

Comments (0)

Files changed (7)

File src/seekdataset.cpp

 	return true;
 }
 
-CFullMatrix<float>* CSeekDataset::GetFloatMatrix(){
+CFullMatrix<short>* CSeekDataset::GetDataMatrix(){
 	return rData;
 }
 
-bool CSeekDataset::InitializeFloatMatrix(bool bSubtractAvg,
+bool CSeekDataset::InitializeDataMatrix(bool bSubtractAvg,
 	bool bSubtractPlatformAvg){
 	/* assume platform is already set */
 
 		w+=0.1;
 	}
 	quant.resize(quant.size());
-	rData = new CFullMatrix<float>();
+
+	//rData = new CFullMatrix<float>();
+	rData = new CFullMatrix<short>();
 
 	/* transpose */
 	/* numGenes * numQueries */
 
 	size_t i,j;
 	if(bSubtractAvg){
-		float *platform_avg = new float[rData->GetColumns()];
-		float *platform_stdev = new float[rData->GetColumns()];
 
 		if(bSubtractPlatformAvg){
+			float *platform_avg = new float[rData->GetColumns()];
+			float *platform_stdev = new float[rData->GetColumns()];
+
 			for(j=0; j<rData->GetColumns(); j++){
 				size_t jj = queryMap->GetReverse(j);
 				platform_avg[j] = platform->GetPlatformAvg(jj);
 				platform_stdev[j] = platform->GetPlatformStdev(jj);
 			}
-		}
-
-		/* numGenes */
-		for(i=0; i<rData->GetRows(); i++){
-			float a = GetGeneAverage(i);
-			if(isnan(a) || isinf(a)){
-				for(j=0; j<rData->GetColumns(); j++){
-					rData->Set(i, j, -50.0);
-				}
-				continue;
-			}
-
-			/* numQueries */
-			for(j=0; j<rData->GetColumns(); j++){
-				unsigned char x = r->Get(j, i);
-
-				if(x==255){
-					rData->Set(i, j, -50.0);
-					//printf("Bad %.5f %d\n", x, r->Get(j, i));
-					//getchar();
-					/*}else if(x>=quant.size()){
-					printf("Bad oversize %d\n", x);
-					getchar();*/
+			for(i=0; i<rData->GetRows(); i++){
+				/* numGenes */
+				float a = GetGeneAverage(i);
+				if(CMeta::IsNaN(a)){
+					for(j=0; j<rData->GetColumns(); j++){
+						rData->Set(i, j, -32768);
+					}
 					continue;
 				}
-
-				float v = quant[x] - a;
-				if(bSubtractPlatformAvg){
+				/* numQueries */
+				for(j=0; j<rData->GetColumns(); j++){
+					unsigned char x = r->Get(j, i);
+					if(x==255){
+						rData->Set(i, j, -32768);
+						continue;
+					}
 					/*if(CMeta::IsNaN(platform_avg[j]) ||
 						CMeta::IsNaN(platform_stdev[j])){
 						printf("platform average or stdev is NaN\n");
 						getchar();
 						continue;
 					}*/
-					rData->Set(i, j, (v - platform_avg[j] / platform_stdev[j]));
+					float vv = (quant[x] - a - platform_avg[j]) / platform_stdev[j];
+					rData->Set(i, j, (short)(vv*100.0));
+				}
+			}
+			delete[] platform_avg;
+			delete[] platform_stdev;
+
+		}else{
+			for(i=0; i<rData->GetRows(); i++){
+				float a = GetGeneAverage(i);
+				if(CMeta::IsNaN(a)){
+					for(j=0; j<rData->GetColumns(); j++){
+						rData->Set(i, j, -32768);
+					}
 					continue;
 				}
-				rData->Set(i, j, v);
+				/* numQueries */
+				for(j=0; j<rData->GetColumns(); j++){
+					unsigned char x = r->Get(j, i);
+					if(x==255){
+						rData->Set(i, j, -32768);
+						continue;
+					}
+					float v = quant[x] - a;
+					rData->Set(i, j, (short)(v*100.0));
+				}
 			}
 
 		}
 
-		delete[] platform_avg;
-		delete[] platform_stdev;
 
 		return true;
 	}
 	for(i=0; i<rData->GetRows(); i++){
 		/* numQueries */
 		for(j=0; j<rData->GetColumns(); j++){
-			rData->Set(i, j, quant[r->Get(j, i)]);
+			rData->Set(i, j, (short) (quant[r->Get(j, i)] * 100.0));
 		}
 	}
 
 	return true;
 }
 
-bool CSeekDataset::FreeFloatMatrix(){
+bool CSeekDataset::FreeDataMatrix(){
 	delete rData;
 	return true;
 }

File src/seekdataset.h

 	bool SetQuery(size_t &, size_t &, unsigned char &);
 	bool SetQueryNoMapping(size_t &, size_t &, unsigned char &);
 	bool SetQuery(size_t &, vector<unsigned char> &);
-	CFullMatrix<float> *GetFloatMatrix();
-	bool InitializeFloatMatrix(bool=true, bool=true);
-	bool FreeFloatMatrix();
+
+	CFullMatrix<short> *GetDataMatrix();
+	bool InitializeDataMatrix(bool=true, bool=true);
+	bool FreeDataMatrix();
+
 	CFullMatrix<unsigned char> *GetMatrix();
 	CSeekIntIntMap* GetGeneMap();
 	CSeekIntIntMap* GetQueryMap();
 
 	vector<float> weight;
 	float sum_weight;
-	CFullMatrix<float> *rData;
+	//CFullMatrix<float> *rData;
+	CFullMatrix<short> *rData;
+
 	bool m_bIsNibble;
 
 };

File src/seekevaluate.cpp

 
 namespace Sleipnir {
 
-bool CSeekPerformanceMeasure::SortRankVector(vector<float> &rank,
+bool CSeekPerformanceMeasure::SortRankVector(vector<short> &rank,
 	CSeekIntIntMap &mapG, vector<AResult> &a){
 	a.clear();
 	int numGenesD = mapG.GetNumSet();
-	float old_target = 0;
-	float new_target = 0;
-	float prev_target = 0;
+	float old_target = -32769;
+	float new_target = -32769;
+	float prev_target = -32769;
 	int prev_numNonZero = 0;
 	int numNonZero = 0;
 	int ii, i, jj;
 			new_target += rank[i];
 			numNonZero++;
 		}
+
+		//printf("Non Zero %d %d\n", numNonZero, new_target);
+
 		/* 1000 is adjustable, this is the top number of items to sort */
 		if(numNonZero==0 || numNonZero<1000){
 			old_target = prev_target;
 			numNonZero = prev_numNonZero;
 			break;
 		}
+
 		new_target /= (float) numNonZero;
-		if(new_target == old_target){
+
+		if(new_target <= old_target){
+			numNonZero = prev_numNonZero;
 			break;
 		}
 		prev_target = old_target;
 
 /* designed specifically for a CSeekDataset */
 /* mask: the query genes which are not included in RBP calculation */
-bool CSeekPerformanceMeasure::RankBiasedPrecision(float rate, vector<float> &rank, float &rbp,
+bool CSeekPerformanceMeasure::RankBiasedPrecision(float rate, vector<short> &rank, float &rbp,
 	vector<char> &mask, vector<char> &gold, CSeekIntIntMap &mapG){
 
 	int i, ii, j, jj;
 	jj = 0;
 	int numNonZero = sing.size();
 	for(i=0; i<numNonZero; i++){
-		if(sing[i].f<=0) break;
+		if(sing[i].f<=-32768) break;
 		if(mask[sing[i].i]==1) continue;
 		if(gold[sing[i].i]==1){
 			x+=pow(rate, jj);

File src/seekevaluate.h

 
 struct AResult{
 	int i;
+	//float f;
+	short f;
+	bool operator<(const AResult& val) const{
+		if(f <= val.f){
+			return false;
+		}else{
+			return true;
+		}
+	}
+};
+
+struct AResultFloat{
+	int i;
 	float f;
-	bool operator<(const AResult& val) const{
+	bool operator<(const AResultFloat& val) const{
 		if(f <= val.f){
 			return false;
 		}else{
 
 class CSeekPerformanceMeasure{
 public:
-	static bool SortRankVector(vector<float> &rank,
+	static bool SortRankVector(vector<short> &rank,
 		CSeekIntIntMap &mapG, vector<AResult> &a);
 	/* designed specifically for a CSeekDataset */
 	/* mask: the query genes which are not included in RBP calculation */
-	static bool RankBiasedPrecision(float rate, vector<float> &rank, float &rbp,
+	static bool RankBiasedPrecision(float rate, vector<short> &rank, float &rbp,
 		vector<char> &mask, vector<char> &gold, CSeekIntIntMap &mapG);
 };
 

File src/seekweight.cpp

 
 namespace Sleipnir {
 
-bool CSeekWeighter::LinearCombine(vector<float> &rank, vector<int> &cv_query,
+bool CSeekWeighter::LinearCombine(vector<short> &rank, vector<int> &cv_query,
 	CSeekDataset &sDataset){
 	if(cv_query.size()==0){
 		cerr << "cv_query empty" << endl;
 		return true;
 	}
 	size_t iNumGenes = sDataset.GetNumGenes();
-
-	//vector<float> new_rank;
-	CSeekTools::InitVector(rank, iNumGenes, (float)0);
-	//CSeekTools::InitVector(new_rank, iNumGenes, (float)0);
+	CSeekTools::InitVector(rank, iNumGenes, (short)-32768);
 	size_t i, j, k;
 
 	int q_size = cv_query.size();
-	/*for(i=0; i<q_size; i++){
-		rank[cv_query[i]] = 1.0 / q_size;
-	}*/
-
-	/*if(q_size==0){
-		printf("Bad!\n");
-		getchar();
-	}*/
-
 	CSeekIntIntMap *mapG = sDataset.GetGeneMap();
 	CSeekIntIntMap *mapQ = sDataset.GetQueryMap();
-
-	CFullMatrix<float> *f = sDataset.GetFloatMatrix();
+	CFullMatrix<short> *f = sDataset.GetDataMatrix();
 
 	size_t iGenesPresent = mapG->GetNumSet();
+	/* rank[g] accumulates short values, so this is fine as long as the sum does
+	 * not overflow from too many queries; query size should be kept <100. */
 	for(i=0; i<iGenesPresent; i++){
 		size_t g = mapG->GetReverse(i);
+		rank[g] = 0;
 		for(j=0; j<q_size; j++){
 			int qq = cv_query[j];
 			if(g==qq) continue;
 			size_t q = mapQ->GetForward(qq);
-			/*if(f->Get(g,q)<-50.0 || f->Get(g,q)>50.0){
-				printf("Bad %.5f\n", f->Get(g,q));
-				getchar();
-			}*/
 			rank[g] += f->Get(g, q);
 		}
-		rank[g] /= (float) q_size;
+		rank[g] /= (short) q_size;
 	}
-
-	//for(i=0; i<iGenesPresent; i++){
-	//	size_t g = mapG->GetReverse(i);
-	//	rank[g] = new_rank[g];
-		//printf("Gene %d %.5f\n", g, rank[g]);
-	//}
-
-	//getchar();
-
 	return true;
 }
 
 			num_v++;
 		}
 
-		/*printf("Cross Val %d %d %d\n", qi, num_q, num_v);
-		printf("Cross Val %d\n", qi);
-		for(i=0; i<vi.size(); i++){
-			printf("%d ", vi[i]);
-		}
-		printf("\n");
-		*/
 		if(num_q==0 || num_v==0){
 			sDataset.SetCVWeight(qi, -1);
 		}else{
 			/* actual weighting */
-			vector<float> rank;
+			vector<short> rank;
 			float w = 0;
 			bool ret = LinearCombine(rank, cv_query, sDataset);
 			ret = CSeekPerformanceMeasure::RankBiasedPrecision(0.95,

File src/seekweight.h

 class CSeekWeighter{
 public:
 	/*cv_query must be present in sDataset */
-	static bool LinearCombine(vector<float> &rank, vector<int> &cv_query,
+	static bool LinearCombine(vector<short> &rank, vector<int> &cv_query,
 			CSeekDataset &sDataset);
 	static bool CVWeighting(CSeekQuery &sQuery, CSeekDataset &sDataset);
 };

File tools/SeekMiner/SeekMiner.cpp

 			shared(vc) \
 			private(d, j) \
 			firstprivate(iDatasets) \
-			schedule(static)
+			schedule(dynamic)
 
 			for(d=0; d<iDatasets; d++){
 				int tid = omp_get_thread_num();
 				}
 
 				//printf("Initializing\n");
-				vc[d]->InitializeFloatMatrix();
+				vc[d]->InitializeDataMatrix();
 				//printf("Weighting dataset\n");
 				CSeekWeighter::CVWeighting(query, *vc[d]);
 				float w = vc[d]->GetDatasetSumWeight();
 				if(w==-1){
 					//printf("Bad weight\n");
-					vc[d]->FreeFloatMatrix();
+					vc[d]->FreeDataMatrix();
 					continue;
 					//getchar();
 				}
-				vector<float> rank_normal;
+				vector<short> rank_normal;
 				//printf("Doing linear combination\n");
 				CSeekWeighter::LinearCombine(rank_normal, this_q, *vc[d]);
 				/*for(j=0; j<1000; j++){
 					size_t g = mapG->GetReverse(j);
-					printf("Gene %d %.5f\n", g, rank_normal[g]);
+					printf("Gene %d %d\n", g, rank_normal[g]);
 				}*/
-				vc[d]->FreeFloatMatrix();
+				vc[d]->FreeDataMatrix();
 
 				//printf("Adding contribution of dataset to master ranking: %.5f\n", w);
 				for(j=0; j<mapG->GetNumSet(); j++){
 					size_t g = mapG->GetReverse(j);
-					master_rank_threads[tid][g] += rank_normal[g] * w;
 					counts_threads[tid][g]++;
-					sum_weight_threads[tid][g] += w;
+				}
+
+				if(w>0.000005){
+					for(j=0; j<mapG->GetNumSet(); j++){
+						size_t g = mapG->GetReverse(j);
+						master_rank_threads[tid][g] += (float) rank_normal[g] / 100.0 * w;
+						sum_weight_threads[tid][g] += w;
+					}
 				}
 			}
 
 			}
 
 			printf("Sorting genes\n");
-			vector<AResult> a;
+			vector<AResultFloat> a;
 			a.clear();
 			a.resize(iGenes);
 			for(j=0; j<iGenes; j++){