Commits

Qian Zhu committed 9cae675

Added error message for Dat.Open if missing quant file
Made rank-normalization in SeekPrep memory efficient

Comments (0)

Files changed (7)

 	if( !OpenGenes( istm, true, false ) )
 		return false;
 
+	fprintf(stderr, "Reading genes\n");
 	m_Data.Initialize( GetGenes( ) );
 	adScores = new float[ GetGenes( ) - 1 ];
 	for( i = 0; ( i + 1 ) < GetGenes( ); ++i ) {
 		return CDatImpl::GetRowSeek(m_ifsm, i);
 	}
 
+	size_t GetGeneIndex(const string &strGene) const {
+		return CDatImpl::GetGeneIndex(strGene);
+	}
+
 	void AveStd( double& a, double& b, size_t& c){
 		return CDatImpl::AveStd(a, b, c);
 	}
 	char		szBuf[ c_iBuf ];
 	ifstream	ifsm;
 
-	if( !CPairImpl::Open( szDatafile, ifsm ) )
+	if( !CPairImpl::Open( szDatafile, ifsm ) ){
+		fprintf(stderr, "Quant file does not exist or error opening it.\n");
 		return false;
+	}
 	ifsm.getline( szBuf, c_iBuf - 1 );
 	ifsm.close( );
 	return CPairImpl::Open( szBuf, m_vecdQuant ); }
 //typedef unsigned short ushort;
 typedef unsigned int utype;
 //#define MAX_UTYPE 65535
-const utype MAX_UTYPE = 4294967295;
+const utype MAX_UTYPE = -1;
 #endif
 
 

src/seekwriter.cpp

 namespace Sleipnir {
 
 bool CSeekWriter::ReadSparseMatrix(const char *fileName,
-	vector<vector<float> > &mat, CSeekIntIntMap &m, 
+	vector<map<utype,float> > &mat, CSeekIntIntMap &m, 
 	const int maxRank, const float rbp_p,
 	const vector<string> &vecstrGenes){
 
 
 	mat.resize(vecstrGenes.size());
 	for(i=0; i<vecstrGenes.size(); i++)
-		CSeekTools::InitVector(mat[i], vecstrGenes.size(), (float) 0);
+		mat[i] = map<utype,float>();
 
 	ret = fread((char*) (&numPresent), 1, sizeof(numPresent), f);
 	for(j=0; j<numPresent; j++){
 	for(i=0; i<maxRank; i++)
 		rbp_score[i] = (1.0 - rbp_p) * pow(rbp_p, i);
 
-	fprintf(stderr, "Begin assigning rbp scores\n");
-
 	for(i=0; i<numGenes; i++){
 		utype id, id2;
 		unsigned short numEntries;
 		for(j=0; j<numEntries; j++){
 			ret = fread((char*)(&id2),1,sizeof(id2),f);
 			ret = fread((char*)(&val),1,sizeof(val),f);
-			mat[id][id2] = rbp_score[val];
+			utype first = id;
+			utype second = id2;
+			if(first>=second){
+				first = id2;
+				second = id;
+			}
+			mat[first][second] = rbp_score[val];
 		}
 	}
 	fclose(f);
 
-	fprintf(stderr, "Filling zero rbp score\n");
-	/*for(i=0; i<vecstrGenes.size(); i++){
-		if(CSeekTools::IsNaN(m.GetForward(i))){
-			for(j=0; j<vecstrGenes.size(); j++){
-				mat[i][j] = CMeta::GetNaN();
-			}
-			continue;
-		}
-		for(j=0; j<vecstrGenes.size(); j++){
-			if(CSeekTools::IsNaN(m.GetForward(j))){
-				mat[i][j] = CMeta::GetNaN();
-			}
-		}
-	}*/
-
-	/*
-	for(ii=0; ii<m.GetNumSet(); ii++){
-		i = allRGenes[ii];
-		for(jj=ii+1; jj<m.GetNumSet(); jj++){
-			j = allRGenes[jj];
-			if(CMeta::IsNaN(mat[i][j])){
-				mat[i][j] = 0;
-				mat[j][i] = 0;
-			}
-		}
-	}*/
-
 	utype ii, jj;
 	const vector<utype> &allRGenes = m.GetAllReverse();
 	fprintf(stderr, "Begin calculating row sum\n");
 	CSeekTools::InitVector(vecSum, vecstrGenes.size(), (float) 0);
 	for(ii=0; ii<m.GetNumSet(); ii++){
 		i = allRGenes[ii];
-		for(jj=ii+1; jj<m.GetNumSet(); jj++){
-			j = allRGenes[jj];
-			//if(CMeta::IsNaN(mat[i][j])) continue;
-			//if(mat[i][j] < 0 || mat[i][j]>1){
-			//	fprintf(stderr, "Should not happen, error!\n");
-			//}
-			if(mat[i][j]==0) continue;
-			vecSum[i] += mat[i][j];
-			vecSum[j] += mat[i][j];
+		map<utype,float>::iterator it;
+		for(it=mat[i].begin(); it!=mat[i].end(); it++){
+			j = it->first;
+			float second = it->second;
+			vecSum[i] += second;
+			vecSum[j] += second;
 		}
 	}
 
 	fprintf(stderr, "Begin normalization using row sum\n");
 	for(ii=0; ii<m.GetNumSet(); ii++){
 		i = allRGenes[ii];
-		for(jj=ii+1; jj<m.GetNumSet(); jj++){
-			j = allRGenes[jj];
-			if(mat[i][j]==0 || vecSqrtSum[i]==0 || vecSqrtSum[j]==0) continue;
-			mat[i][j] = mat[i][j] / vecSqrtSum[i] / vecSqrtSum[j];
-			mat[j][i] = mat[i][j];
+		map<utype,float>::iterator it;
+		for(it=mat[i].begin(); it!=mat[i].end(); it++){
+			j = it->first;
+			if(vecSqrtSum[i]==0 || vecSqrtSum[j]==0) continue;
+			it->second = it->second / vecSqrtSum[i] / vecSqrtSum[j];
 		}
 	}
 	return true;
 }
 
-bool CSeekWriter::ProductNorm(const vector<vector<float> > &mat1,
-	const vector<vector<float> > &mat2, const CSeekIntIntMap &m1, 
-	const CSeekIntIntMap &m2, vector<vector<float> > &re){
+//Calculate the similarity of two distance matrices
+//by simply taking product of two matrix for corresponding entries
+bool CSeekWriter::ProductNorm(const vector<map<utype,float> > &mat1,
+	const vector<map<utype,float> > &mat2, const CSeekIntIntMap &m1, 
+	const CSeekIntIntMap &m2, vector<map<utype,float> > &re){
 
 	utype ii, jj;
 	utype i, j;
 
 	re.resize(mat1.size());
 	for(i=0; i<mat1.size(); i++)
-		CSeekTools::InitVector(re[i], mat1.size(), (float)0);
+		re[i] = map<utype,float>();
 
 	const vector<utype> &allRGenes1 = m1.GetAllReverse();
 	CSeekIntIntMap mi(mat1.size());
 	CSeekTools::InitVector(vecSum, mat1.size(), (float) 0);
 	for(ii=0; ii<mi.GetNumSet(); ii++){
 		i = allR[ii];
-		for(jj=ii+1; jj<mi.GetNumSet(); jj++){
-			j = allR[jj];
-			if(mat1[i][j]==0 || mat2[i][j]==0) continue;
-			re[i][j] = sqrtf(mat1[i][j] * mat2[i][j]);
-			re[j][i] = re[i][j];
+		map<utype,float>::const_iterator it;
+		for(it=mat1[i].begin(); it!=mat1[i].end(); it++){
+			j = it->first;
+			float f1 = it->second;
+			map<utype,float>::const_iterator it2;
+			if((it2 = mat2[i].find(j))==mat2[i].end()) continue;
+			float f2 = it2->second;
+			re[i][j] = sqrtf(f1*f2);
 			vecSum[i] += re[i][j];
 			vecSum[j] += re[i][j];
 		}
 		vecSqrtSum[i] = sqrtf(vecSum[i]);
 	}
 
-	utype numNonZero = 0;
 	fprintf(stderr, "Begin normalization using row sum\n");
-	vector<float> vf;
 	for(ii=0; ii<mi.GetNumSet(); ii++){
 		i = allR[ii];
-		for(jj=ii+1; jj<mi.GetNumSet(); jj++){
-			j = allR[jj];
-			if(mat1[i][j]==0 || mat2[i][j]==0) continue;
+		map<utype,float>::iterator it;
+		for(it=re[i].begin(); it!=re[i].end(); it++){
+			j = it->first;
 			if(vecSqrtSum[i]==0 || vecSqrtSum[j]==0) continue;
-			re[i][j] = re[i][j] / vecSqrtSum[i] / vecSqrtSum[j];
-			re[j][i] = re[i][j];
-			//vf.push_back(rx);
-			numNonZero++;
-			//fprintf(stderr, "%.3e\n", re[i][j]);
+			it->second = it->second / vecSqrtSum[i] / vecSqrtSum[j];
 		}
 	}
-	//sort(vf.begin(), vf.end(), greater<float>());
-	//int xi;
-	//for(xi=0; xi<vf.size(); xi++){
-		//if(isinf(vf[xi]) || isnan(vf[xi])){
-	//		fprintf(stderr, "%.3e\n", vf[xi]);
-		//}
-	//}
-	//fprintf(stderr, "Non Zero: %d\n", numNonZero);
 	return true;
 }
 
-bool CSeekWriter::WriteSparseMatrix(vector<vector<unsigned short> > &umat,
+bool CSeekWriter::WriteSparseMatrix(CDataPair &Dat,
+	vector< map<utype,unsigned short> > &umat, 
 	int maxRank, const vector<string> &vecstrGenes, const char *fileName){
 
 	FILE *f = fopen(fileName, "wb");
 	utype numGenes = 0;
 	utype i, j;
 
+	vector<utype> veciGenes;
+	veciGenes.clear();
+	veciGenes.resize(vecstrGenes.size());
+	for( i = 0; i < vecstrGenes.size( ); ++i )
+		veciGenes[ i ] = Dat.GetGeneIndex( vecstrGenes[i] );
+
 	CSeekIntIntMap mm(vecstrGenes.size());
-	for(i=0; i<vecstrGenes.size(); i++){
-		for(j=0; j<vecstrGenes.size(); j++)
-			if(!CSeekTools::IsNaN(umat[i][j])) break;
-		if(j!=vecstrGenes.size()){
+	for(i=0; i<vecstrGenes.size(); i++)
+		if(!CSeekTools::IsNaN(veciGenes[i]))
 			mm.Add(i);
-		}
-	}
 
 	utype numPresent = mm.GetNumSet();
 	//1 utype
 		fwrite((char*) (&allR[i]), 1, sizeof(allR[i]), f);
 
 	for(i=0; i<vecstrGenes.size(); i++){
-		for(j=i+1; j<vecstrGenes.size(); j++)
-			if(!CSeekTools::IsNaN(umat[i][j]) && umat[i][j]!=maxRank) 
-				break;
-		if(j==vecstrGenes.size()) 
-			continue;
+		if(umat[i].size()==0) continue;
 		numGenes++;
 	}
 
 	fwrite((char*) (&numGenes), 1, sizeof(numGenes), f);
 
 	for(i=0; i<vecstrGenes.size(); i++){
-		unsigned short numEntries = 0; //should be 1000
-		for(j=i+1; j<vecstrGenes.size(); j++){
-			if(CSeekTools::IsNaN(umat[i][j]) || umat[i][j]==maxRank)
-				continue;
-			numEntries++;
-		}
+		unsigned short numEntries = umat[i].size(); //should be 1000
 		if(numEntries==0) 
 			continue;
 		//1 utype
 		fwrite((char*) (&i), 1, sizeof(i), f);
 		//1 unsigned short
 		fwrite((char*) (&numEntries), 1, sizeof(numEntries), f);
-		for(j=i+1; j<vecstrGenes.size(); j++){
-			if(CSeekTools::IsNaN(umat[i][j]) || umat[i][j]==maxRank)
-				continue;
+		map<utype,unsigned short>::iterator it;
+		for(it=umat[i].begin(); it!=umat[i].end(); it++){
+			utype first = it->first;
+			unsigned short second = it->second;
 			//1 utype
-			fwrite((char*) (&j), 1, sizeof(j), f);
+			fwrite((char*) (&first), 1, sizeof(first), f);
 			//1 unsigned short
-			fwrite((char*) (&umat[i][j]), 1, sizeof(umat[i][j]), f);
+			fwrite((char*) (&second), 1, sizeof(second), f);
 		}
 	}
-
 	fclose(f);
 	return true;
 }
 
-bool CSeekWriter::GetSparseRankMatrix(CDat &Dat,
-	vector<vector<unsigned short> > &umat, const unsigned short nullValue,
+bool CSeekWriter::GetSparseRankMatrix(CDataPair &Dat,
+	vector< map<utype,unsigned short> > &umat, 
 	int maxRank, //1000
 	const vector<string> &vecstrGenes){
 
 	veciGenes.clear();
 	veciGenes.resize(vecstrGenes.size());
 	for( i = 0; i < vecstrGenes.size( ); ++i )
-		veciGenes[ i ] = Dat.GetGene( vecstrGenes[i] );
+		veciGenes[ i ] = Dat.GetGeneIndex( vecstrGenes[i] );
 	umat.resize(vecstrGenes.size());
+	for(i=0; i<vecstrGenes.size(); i++){
+		umat[i] = map<utype, unsigned short>();
+	}
 
+	fprintf(stderr, "Start reading DAB...\n");
 	for(i=0; i<vecstrGenes.size(); i++){
-		CSeekTools::InitVector(umat[i], vecstrGenes.size(), nullValue);
 		utype s = veciGenes[i];
 		if(CSeekTools::IsNaN(s)) continue;
+		if(i%1000==0)
+			fprintf(stderr, "Start reading gene %d...\n", i);
 
+		//float *v = Dat.GetRowSeek(s);
 		float *v = Dat.GetFullRow(s);
 		vector<AResultFloat> vv;
 		vv.resize(vecstrGenes.size());
+
 		for(j=0; j<vecstrGenes.size(); j++){
 			utype t = veciGenes[j];
 			vv[j].i = j;
-			if(CSeekTools::IsNaN(t) || CMeta::IsNaN(v[t])){
-				vv[j].f = -9999;
-				continue;
-			}
-			vv[j].f = v[t];
+			vv[j].f = -9999;
+			if(CSeekTools::IsNaN(t)) continue;
+			float d = v[t];
+			if(CMeta::IsNaN(d)) continue;
+			vv[j].f = d;
 		}
+
 		nth_element(vv.begin(), vv.begin()+maxRank, vv.end());
 		sort(vv.begin(), vv.begin()+maxRank);
+
 		for(j=0; j<vecstrGenes.size(); j++){
 			if(j<maxRank){
-				umat[i][vv[j].i] = j;
-			}else if(vv[j].f!=-9999){
-				umat[i][vv[j].i] = maxRank;
+				utype first = i;
+				utype second = vv[j].i;
+				if(i >= vv[j].i){
+					first = vv[j].i;
+					second = i;
+				}
+				map<utype,unsigned short>::iterator it;
+				if((it=umat[first].find(second))==umat[first].end())
+					umat[first][second] = (unsigned short) j;
+				else
+					umat[first][second] = std::min(it->second, (unsigned short) j);
 			}
 		}
 		free(v);
 	}
-	for(i=0; i<vecstrGenes.size(); i++){
-		utype s = veciGenes[i];
-		if(CSeekTools::IsNaN(s)) continue;
-		for(j=i+1; j<vecstrGenes.size(); j++){
-			utype t = veciGenes[j];
-			if(CSeekTools::IsNaN(t)) continue;
-			umat[i][j] = std::min(umat[i][j], umat[j][i]);
-			umat[j][i] = umat[i][j];
-		}
-	}
+	fprintf(stderr, "Finished reading DAB\n");
 	return true;
 }
 
-bool CSeekWriter::RankNormalizeDAB(CDat &Dat,
-	const vector<string> &vecstrGenes, int max_rank, float rbp_p){
-
-	utype i, j;
-	vector<utype> veciGenes;
-	veciGenes.clear();
-	veciGenes.resize(vecstrGenes.size());
-	for( i = 0; i < vecstrGenes.size( ); ++i )
-		veciGenes[ i ] = Dat.GetGene( vecstrGenes[i] );
-
-	vector<float> vecSum;
-	vector<int> vecNum;
-	CSeekTools::InitVector(vecSum, vecstrGenes.size(), CMeta::GetNaN());
-	CSeekTools::InitVector(vecNum, vecstrGenes.size(), (int)-9999);
-
-	vector<vector<float> > mat;
-	mat.resize(vecstrGenes.size());
-	int max = max_rank;
-	//float rbp_p = 0.99;
-
-	bool expTransform = true;
-	for(i=0; i<vecstrGenes.size(); i++){
-		CSeekTools::InitVector(mat[i], vecstrGenes.size(), CMeta::GetNaN());
-
-		utype s = veciGenes[i];
-		if(CSeekTools::IsNaN(s)) continue;
-		float *v = Dat.GetFullRow(s);
-		vector<AResultFloat> vv;
-		vv.resize(vecstrGenes.size());
-		int numV = 0;		
-
-		for(j=0; j<vecstrGenes.size(); j++){
-			utype t = veciGenes[j];
-			vv[j].i = j;
-			if(CSeekTools::IsNaN(t) || CMeta::IsNaN(v[t])){
-				vv[j].f = -9999;
-				continue;
-			}
-			vv[j].f = v[t];
-			numV++;
-		}
-
-		if(expTransform){
-			nth_element(vv.begin(), vv.begin()+max, vv.end());
-			sort(vv.begin(), vv.begin()+max);
-			for(j=0; j<vecstrGenes.size(); j++){
-				if(j<max){
-					float rank = (1.0 - rbp_p) * pow(rbp_p, j);
-					mat[i][vv[j].i] = rank;
-				}else if(vv[j].f!=-9999){
-					mat[i][vv[j].i] = 0;
-				}
-			}
-		}else{
-			sort(vv.begin(), vv.end());
-			for(j=0; j<vecstrGenes.size(); j++){
-				if(vv[j].f!=-9999){
-					mat[i][vv[j].i] = numV - j;
-				}
-			}
-		}
-
-		free(v);
-	}
-
-	for(i=0; i<vecstrGenes.size(); i++){
-		utype s = veciGenes[i];
-		if(CSeekTools::IsNaN(s)) continue;
-		for(j=i+1; j<vecstrGenes.size(); j++){
-			utype t = veciGenes[j];
-			if(CSeekTools::IsNaN(t)) continue;
-			if(CMeta::IsNaN(mat[i][j]) || CMeta::IsNaN(mat[j][i])){
-				fprintf(stderr, "%.3e %.3e\n", mat[i][j], mat[j][i]);
-			}
-			mat[i][j] = std::max(mat[i][j], mat[j][i]);
-			mat[j][i] = mat[i][j];
-		}
-	}
-
-	for(i=0; i<vecstrGenes.size(); i++){
-		utype s = veciGenes[i];
-		if(CSeekTools::IsNaN(s)) continue;
-		vecSum[i] = 0;
-		vecNum[i] = 0;
-		for(j=0; j<vecstrGenes.size(); j++){
-			utype t = veciGenes[j];
-			if(CSeekTools::IsNaN(t)) continue;
-			if(CMeta::IsNaN(mat[i][j])) continue;
-			vecSum[i] += mat[i][j];
-			vecNum[i]++;
-		}
-		//fprintf(stderr, "%.3e\n", vecSum[i]);
-	}
-
-	for(i=0; i<vecstrGenes.size(); i++){
-		utype s = veciGenes[i];
-		if(CSeekTools::IsNaN(s)) continue;
-		for(j=0; j<vecstrGenes.size(); j++){
-			utype t = veciGenes[j];
-			if(CSeekTools::IsNaN(t)) continue;
-			//fprintf(stderr, "%.3e %.3e\n", vecSum[i], vecSum[j]);
-			float r = mat[i][j] / sqrtf(vecSum[i]) / sqrtf(vecSum[j]);
-			Dat.Set(s, t, r);
-		}
-	}
-	
-	return true;
-}
-
-
-
-bool CSeekWriter::NormalizeDAB(CDat &Dat,
+bool CSeekWriter::NormalizeDAB(CDataPair &Dat,
 	const vector<string> &vecstrGenes,
 	bool cutoff, bool expTransform, bool divideNorm, bool subtractNorm){
 
 	for(i=0; i<vecstrGenes.size(); i++){
 		utype s = veciGenes[i];
 		if(CSeekTools::IsNaN(s)) continue;
-		float *v = Dat.GetFullRow(s);
 		float sum = 0;
 		int num = 0;
 		vector<float> all;
 		for(j=0; j<vecstrGenes.size(); j++){
 			utype t = veciGenes[j];
+			float d = Dat.Get(s,t);
 			if(CSeekTools::IsNaN(t)) continue;
-			if(CMeta::IsNaN(v[t])) continue;
+			if(CMeta::IsNaN(d)) continue;
 			if(cutoff){
-				if(v[t]>0){
+				if(d>0){
 					if(expTransform)
-						all.push_back(expf(-1.0*v[t]*v[t]/2.0));
+						all.push_back(expf(-1.0*d*d/2.0));
 					else
-						all.push_back(v[t]);
+						all.push_back(d);
 				}
 			}
 			else{
 				//fprintf(stderr, "Warning: Negative Z-Scores");
 				if(expTransform)
-					all.push_back(expf(-1.0*v[t]*v[t]/2.0));
+					all.push_back(expf(-1.0*d*d/2.0));
 				else
-					all.push_back(v[t]);
+					all.push_back(d);
 			}	
 		}
 
 		}
 		vecSum[i] = sum;
 		vecNum[i] = num;
-		free(v);
 	}
 
 	for(i=0; i<vecstrGenes.size(); i++){
 		utype s = veciGenes[i];
 		if(CSeekTools::IsNaN(s)) continue;
-		float *v = Dat.GetFullRow(s);
-
 		for(j=0; j<vecstrGenes.size(); j++){
 			utype t = veciGenes[j];
+			float d = Dat.Get(s, t);
 			if(CSeekTools::IsNaN(t)) continue;
-			if(CMeta::IsNaN(v[t])) continue;
+			if(CMeta::IsNaN(d)) continue;
 			if(cutoff){
-				if(v[t]>0){
+				if(d>0){
 					if(expTransform){
 						if(divideNorm){
-							float r = expf(-1.0*v[t]*v[t]/2.0) / sqrtf(vecSum[i]) / sqrtf(vecSum[j]);
+							float r = expf(-1.0*d*d/2.0) / sqrtf(vecSum[i]) / sqrtf(vecSum[j]);
 							Dat.Set(s, t, r);
 						}else if(subtractNorm){
-							float r = expf(-1.0*v[t]*v[t]/2.0) - vecSum[i] / vecNum[i] - vecSum[j] / vecNum[j];
+							float r = expf(-1.0*d*d/2.0) - vecSum[i] / vecNum[i] - vecSum[j] / vecNum[j];
 							Dat.Set(s, t, r);
 						}
 					}else{
 						if(divideNorm){
-							float r = v[t] / sqrtf(vecSum[i]) / sqrtf(vecSum[j]);
+							float r = d / sqrtf(vecSum[i]) / sqrtf(vecSum[j]);
 							Dat.Set(s, t, r);
 						}else if(subtractNorm){
-							float r = v[t] - vecSum[i] / vecNum[i] - vecSum[j] / vecNum[j];
+							float r = d - vecSum[i] / vecNum[i] - vecSum[j] / vecNum[j];
 							Dat.Set(s, t, r);
 						}
 					}
 			else{
 				if(expTransform){
 					if(divideNorm){
-						float r = expf(-1.0*v[t]*v[t]/2.0) / sqrtf(vecSum[i]) / sqrtf(vecSum[j]);
+						float r = expf(-1.0*d*d/2.0) / sqrtf(vecSum[i]) / sqrtf(vecSum[j]);
 						Dat.Set(s, t, r);
 					}else if(subtractNorm){
-						float r = expf(-1.0*v[t]*v[t]/2.0) - vecSum[i] / vecNum[i] - vecSum[j] / vecNum[j];
+						float r = expf(-1.0*d*d/2.0) - vecSum[i] / vecNum[i] - vecSum[j] / vecNum[j];
 						Dat.Set(s, t, r);
 					}
 				}else{
 							fprintf(stderr, "Warning, Dangerous, divide sqrt(z), where z could be negative\n");
 							r = 0;
 						}else{
-							r = v[t] / sqrtf(vecSum[i]) / sqrtf(vecSum[j]);
+							r = d / sqrtf(vecSum[i]) / sqrtf(vecSum[j]);
 						}
 						Dat.Set(s, t, r);
 					}else if(subtractNorm){
-						float r = v[t] - vecSum[i] / vecNum[i] - vecSum[j] / vecNum[j];
+						float r = d - vecSum[i] / vecNum[i] - vecSum[j] / vecNum[j];
 						Dat.Set(s, t, r);
 					}
 
 				}
 			}
 		}
-		free(v);
 	}
 
 	return true;
 class CSeekWriter{
 public:
 	static bool ReadSparseMatrix(const char *fileName, 
-		vector<vector<float> > &mat, 
+		vector<map<utype,float> > &mat, 
 		CSeekIntIntMap &m, const int maxRank, const float rbp_p,
 		const vector<string> &vecstrGenes);
 
-	static bool ProductNorm(const vector<vector<float> > &mat1,
-		const vector<vector<float> > &mat2, const CSeekIntIntMap &m1, 
-		const CSeekIntIntMap &m2, vector<vector<float> > &re);
+	static bool ProductNorm(const vector<map<utype,float> > &mat1,
+		const vector<map<utype,float> > &mat2, const CSeekIntIntMap &m1, 
+		const CSeekIntIntMap &m2, vector<map<utype,float> > &re);
 
-	static bool WriteSparseMatrix(vector<vector<unsigned short> > &umat,
+	static bool WriteSparseMatrix(CDataPair &Dat,
+		vector<map<utype,unsigned short> > &umat,
 		int maxRank, const vector<string> &vecstrGenes, const char *fileName);
 
-	static bool GetSparseRankMatrix(CDat &Dat,
-		vector<vector<unsigned short> > &umat, const unsigned short nullValue,
+	static bool GetSparseRankMatrix(CDataPair &Dat,
+		vector<map<utype,unsigned short> > &umat,
 		int maxRank, const vector<string> &vecstrGenes);
 
-	static bool RankNormalizeDAB(CDat &Dat,
-		const vector<string> &vecstrGenes, int max_rank, float rbp_p);
-
-	static bool NormalizeDAB(CDat &Dat,
+	static bool NormalizeDAB(CDataPair &Dat,
 		const vector<string> &vecstrGenes,
 		bool cutoff, bool expTransform, bool divideNorm, bool subtractNorm);
 

tools/SeekPrep/SeekPrep.cpp

 	} else if(sArgs.dab_flag==1){
 		
 		if(sArgs.norm_flag==1){
-			CDat Dat;
+			CDataPair Dat;
 			char outFile[1024];
-			if(!Dat.Open(sArgs.dabinput_arg, false, 2, false, false, false)){
+			fprintf(stderr, "Opening file...\n");
+			//if(!Dat.Open(sArgs.dabinput_arg, false, 2, false, false, true)){
+			if(!Dat.Open(sArgs.dabinput_arg, false, false, 2, false, false)){
 				cerr << "error opening file" << endl;
 				return 1;
 			}
+			fprintf(stderr, "Finished opening file\n");
 			string fileName = CMeta::Basename(sArgs.dabinput_arg);
 			string fileStem = CMeta::Deextension(fileName);
 			sprintf(outFile, "%s/%s.2.dab", sArgs.dir_out_arg,
 				fileStem.c_str());
-			int max_rank = 1000;
+			int max_rank = 3000;
 			float rbp_p = 0.99;
 			//cutoff, expTransform, divideNorm, subtractNorm
 			//CSeekWriter::NormalizeDAB(Dat, vecstrGenes, true, false, true, false);
 			//CSeekWriter::RankNormalizeDAB(Dat, vecstrGenes, max_rank, rbp_p);
 			//Dat.Save(outFile);
-			vector<vector<unsigned short> > umat;
-			CSeekWriter::GetSparseRankMatrix(Dat, umat, 65535, max_rank, 
-				vecstrGenes);
-			CSeekWriter::WriteSparseMatrix(umat, max_rank, vecstrGenes, 
-				outFile);
+			vector<map<utype,unsigned short> > umat;
+			CSeekWriter::GetSparseRankMatrix(Dat, umat, max_rank, vecstrGenes);
+			CSeekWriter::WriteSparseMatrix(Dat, umat, max_rank, vecstrGenes, outFile);
 			/*fprintf(stderr, "Begin\n");
 			vector<unsigned short> l;
 			vector<vector<float> > mat;