Commits

Qian Zhu  committed b5560cd

Use anti-correlation only at the final gene-scoring step; dataset weighting is still based on positive correlation between query genes.

  • Participants
  • Parent commits c207a2c

Comments (0)

Files changed (3)

File src/seekcentral.cpp

 	m_bSimulateWeight = false;
 	m_bOutputText = false;
 	m_bSquareZ = false;
-	//m_bSharedDB = false;
+
 	m_bNegativeCor = false;
 	m_DEFAULT_NA = -320;
 
 	m_vecstrPlatform.clear();
 
 	if(m_vecDB.size()!=0){
-		//if(!m_bSharedDB){
 		for(i=0; i<m_vecDB.size(); i++){
 			delete m_vecDB[i];
 			m_vecDB[i] = NULL;
 		}
-		//}
-		//for(i=0; i<m_vecDB.size(); i++)
-		//	m_vecDB[i] = NULL;
 		m_vecDB.clear();
 	}
 
-	/*if(m_DB!=NULL){
-		if(!m_bSharedDB){
-			delete m_DB;
-		}
-		m_DB = NULL;
-	}*/
 	m_iDatasets = 0;
 	m_iGenes = 0;
 	m_numThreads = 0;
 	m_mapLoadTime.clear();
 	m_output_dir = "";
 	DEBUG = false;
-	//m_bSharedDB = false;
 
 	for(i=0; i<m_vecDBSetting.size(); i++)
 		if(m_vecDBSetting[i]!= NULL)
 	m_bLogit = src->m_bLogit;
 	m_eDistMeasure = eDistMeasure;
 
+	//if negative correlation, then need to use a different null value
 	m_bNegativeCor = bNegativeCor;
 	if(m_bNegativeCor){
 		m_DEFAULT_NA = 320;
 		final[j].f = m_master_rank[j];
 	}
 	if(DEBUG) fprintf(stderr, "Begin Sorting genes\n");
-	sort(final.begin(), final.end());
+	if(m_bNegativeCor){
+		sort(final.begin(), final.end(), AscendingFloat());
+	}else{
+		sort(final.begin(), final.end());
+	}
 	return true;
 }
 
 				if(current_sm==CV)
 					CSeekWeighter::CVWeighting(query, *m_vc[d], *RATE,
 						m_fPercentQueryAfterScoreCutOff, m_bSquareZ,
-						m_bNegativeCor, &m_rank_threads[tid]);
+						false, &m_rank_threads[tid]); //weighting always based on positive co-expression
 				else
 					CSeekWeighter::CVWeighting(query, *m_vc[d], *RATE,
 						m_fPercentQueryAfterScoreCutOff, m_bSquareZ,
-						m_bNegativeCor, &m_rank_threads[tid], &customGoldStd);
+						false, &m_rank_threads[tid], &customGoldStd); //weighting based on positive correlation
 
 				if( (w = m_vc[d]->GetDatasetSumWeight())==-1){
 					if(DEBUG) fprintf(stderr, "Bad weight\n");
 			}
 			else if(current_sm==AVERAGE_Z){
 				CSeekWeighter::AverageWeighting(query, *m_vc[d],
-					m_fPercentQueryAfterScoreCutOff, m_bSquareZ, w, m_bNegativeCor);
+					m_fPercentQueryAfterScoreCutOff, m_bSquareZ, w, false); //weighting based on positive correlation
 				if(w==-1) continue;
 			}
 			else if(current_sm==EQUAL && redoWithEqual==0){
 			const utype MIN_REQUIRED = max((utype) 1, (utype) (
 				m_fPercentQueryAfterScoreCutOff * this_q.size()));
 			CSeekWeighter::LinearCombine(m_rank_normal_threads[tid], this_q,
-				*m_vc[d], MIN_REQUIRED, m_bSquareZ);
+				*m_vc[d], m_bNegativeCor, MIN_REQUIRED, m_bSquareZ);
 
 			if(DEBUG) fprintf(stderr,
 				"Adding contribution of dataset %d to master ranking: %.5f\n", d, w);
 	
 		//random-ranking case =========================
 		if(m_bRandom){
+			if(m_bNegativeCor){
+				fprintf(stderr, "Error! Random-ranking case does not support Negative Correlations!\n");
+				continue;
+			}
+
 			sort(m_master_rank.begin(), m_master_rank.end(), greater<float>());
 			sort(weight.begin(), weight.end(), greater<float>());
 			copy(m_master_rank.begin(), m_master_rank.end(), vecRandScore[l].begin());

File src/seekweight.cpp

 //correlate with A in order to count A's query score
 bool CSeekWeighter::LinearCombine(vector<utype> &rank,
 	const vector<utype> &cv_query, CSeekDataset &sDataset,
+	const bool bNegativeCor,
 	const utype &MIN_REQUIRED, const bool &bSquareZ){
 
 	CSeekIntIntMap *mapG = sDataset.GetGeneMap();
 	utype q_size = cv_query.size();
 	utype **f = sDataset.GetDataMatrix();
 
-	//rank.resize(iNumGenes);
-	CSeekTools::InitVector(rank, iNumGenes, (utype) 0);
+	utype DEFAULT_NA = 0;
+	if(bNegativeCor){
+		DEFAULT_NA = 640;
+	}
+
+	CSeekTools::InitVector(rank, iNumGenes, (utype) DEFAULT_NA);
 
 	/* as long as rank[g] does not overflow, due to too many queries, we are fine
 	 * should control query size to be <100. */
 				if(totNonZero >= MIN_REQUIRED)
 					(*iter_g) = tmpScore / totNonZero;
 				else
-					(*iter_g) = 0;
+					(*iter_g) = DEFAULT_NA;
 			}
 		}
 		else{
 				if(totNonZero >= MIN_REQUIRED)
 					(*iter_g) = tmpScore / totNonZero;
 				else
-					(*iter_g) = 0;
+					(*iter_g) = DEFAULT_NA;
 			}
 		}
 	}
 	const utype &iGenes, utype **rank_d, const vector<utype> &counts,
 	vector<float> &master_rank, const utype &numThreads, const bool bNegativeCor){
 
-	float DEFAULT_NA = -320;
-	if(bNegativeCor){
-		DEFAULT_NA = 320;
-	}
+	//bNegativeCor: 
+	//do integration normally (ie based on positive correlations)
+	//then reverse the final ranking to get negative correlated gene ranking
+	float DEFAULT_NA = -320; //CAUTION!!
 
 	//vector<float> precompute;
 	//CSeekTools::ReadArray("/tmp/order_stats.binomial.bin", precompute);
 			this_d[kk].f = rank_d[j][k];
 			kk++;
 		}
-		if(bNegativeCor){
-			sort(this_d.begin(), this_d.end(), Ascending());
-		}else{
-			sort(this_d.begin(), this_d.end());
-		}
+		sort(this_d.begin(), this_d.end());
 		for(k=0; k<numNonZero; k++){
 			rank_f[j][this_d[k].i] =
 				(float) (k+1) / (float) numNonZero;
 	rks.clear();
 	gss.clear();
 
+
+	//REVERSE FINAL RANKING IF SORTING BY NEGATIVE CORRELATIONS
+	if(bNegativeCor){
+		DEFAULT_NA = 320;
+		float max = -320;
+		for(k=0; k<iGenes; k++){
+			if(master_rank[k]==max){
+				master_rank[k] = DEFAULT_NA;
+			}
+		}
+	}
+
 	//gsl_permutation_free(perm);
 	//gsl_permutation_free(rk);
 	//gsl_vector_float_free(gs);
 		const utype MIN_QUERY_REQUIRED =
 			max((utype) 1, (utype) (percent_required * query.size()));
 		bool ret = LinearCombine(rank, query, sDataset,
-			MIN_QUERY_REQUIRED, bSquareZ);
+			bNegativeCor, MIN_QUERY_REQUIRED, bSquareZ);
 		ret = CSeekPerformanceMeasure::RankBiasedPrecision(rate,
 			rank, w, is_query, is_gold, *mapG, &ar, bNegativeCor, TOP);
 		if(!ret) sDataset.SetCVWeight(0, -1);
 			float w = 0;
 			const utype MIN_QUERY_REQUIRED =
 				max((utype) 1, (utype) (percent_required * cv_query.size()));
-			bool ret = LinearCombine(rank, cv_query, sDataset,
+			bool ret = LinearCombine(rank, cv_query, sDataset, bNegativeCor,
 				MIN_QUERY_REQUIRED, bSquareZ);
 			ret = CSeekPerformanceMeasure::RankBiasedPrecision(rate,
 				rank, w, is_query_cross, is_gold, *mapG, &ar, bNegativeCor, TOP);

File src/seekweight.h

 	/*cv_query must be present in sDataset */
 	static bool LinearCombine(vector<utype> &rank,
 		const vector<utype> &cv_query, CSeekDataset &sDataset,
-		const utype &, const bool &);
+		const bool, const utype &, const bool &);
 
 	/*!
 	 * \brief