Qian Zhu avatar Qian Zhu committed 4fd46bd

working SEEK copy with cross-validated weighting

Comments (0)

Files changed (16)

 #      AC_DEFINE([SMILEXML_LIB], [1])
 #      SMILEXML_LIB="-lsmilexml"
 #    fi
-         SMILE_CFLAGS="-I $SMILE_INCLUDE_DIR -fopenmp"
+         SMILE_CFLAGS="-I $SMILE_INCLUDE_DIR -fopenmp -pg"
 #         SMILE_LIBS="-L $SMILE_LIB_DIR $SMILEXML_LIB -lsmile"
-         SMILE_LIBS="-L $SMILE_LIB_DIR -lsmile -fopenmp"
+         SMILE_LIBS="-L $SMILE_LIB_DIR -lsmile -fopenmp -pg"
         ],                                                        dnl and found in specified path
 	[],                                                       dnl not found
         [smile_state=no],                                         dnl and not found installed
 		 tools/Counter/Makefile \
 		 tools/Data2DB/Makefile \
 		 tools/SeekReader/Makefile \
+		 tools/SeekMiner/Makefile \
 		 tools/SeekPrep/Makefile \
          tools/DBCombiner/Makefile \
 		 tools/DSLConverter/Makefile \
 				DBCombiner => ['SMILE'],
 				SeekReader => ['SMILE'],
 				SeekPrep => ['SMILE'],
+				SeekMiner => ['SMILE'],
 			    Dab2Dad  => ['SMILE'],
 			    Dab2DB  => ['SMILE'],
 			    Data2Svm => ['SVM_PERF'],
 	svmperf.cpp					\
 	vwb.cpp					\
 	seekmap.cpp				\
+	seekdataset.cpp			\
+	seekweight.cpp			\
+	seekevaluate.cpp		\
+	seekquery.cpp			\
+	seekreader.cpp			\
+	seekwriter.cpp			\
 include_HEADERS			=	\
+	seekreader.h			\
+	seekwriter.h			\
 	seekquery.h				\
 	seekevaluate.h			\
 	seekweight.h			\
 	seekreader.h			\
 	seekwriter.h			\
+	seekdataset.h			\
 	seekmap.h				\
 	annotation.h			\
 	annotationi.h			\

src/seekdataset.cpp

+/*****************************************************************************
+* This file is provided under the Creative Commons Attribution 3.0 license.
+*
+* You are free to share, copy, distribute, transmit, or adapt this work
+* PROVIDED THAT you attribute the work to the authors listed below.
+* For more information, please see the following web page:
+* http://creativecommons.org/licenses/by/3.0/
+*
+* This file is a component of the Sleipnir library for functional genomics,
+* authored by:
+* Curtis Huttenhower (chuttenh@princeton.edu)
+* Mark Schroeder
+* Maria D. Chikina
+* Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
+*
+* If you use this library, the included executable tools, or any related
+* code in your work, please cite the following publication:
+* Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
+* Olga G. Troyanskaya.
+* "The Sleipnir library for computational functional genomics"
+*****************************************************************************/
+#include "seekmap.h"
+#include "seekreader.h"
+#include "seekdataset.h"
+#include "stdafx.h"
+#include "datapair.h"
+
+
+namespace Sleipnir {
+
+/* Constructs an empty dataset: nothing is allocated, the per-gene vectors
+ * are empty and the cached dataset weight is marked "not computed" (-1). */
+CSeekDataset::CSeekDataset(){
+	/* Every pointer member starts out NULL so that the NULL test in
+	 * DeleteQuery() and the deletes in the destructor are well defined even
+	 * when the matching Read/Initialize call was never made. */
+	r = NULL;
+	rData = NULL;
+	geneMap = NULL;
+	queryMap = NULL;
+	geneAverage.clear();
+	geneVariance.clear();
+	genePresence.clear();
+	m_fDsetAverage = CMeta::GetNaN();
+	m_fDsetStdev = CMeta::GetNaN();
+	iQuerySize = 0;
+	iNumGenes = 0;
+	m_bIsNibble = false;
+	weight.clear();
+	sum_weight = -1;
+}
+
+/* Releases the quantized query matrix and the gene-presence map.
+ * queryMap and rData are not touched here: they are released by
+ * DeleteQuery() and FreeFloatMatrix() respectively. */
+CSeekDataset::~CSeekDataset(){
+	if(r!=NULL){
+		delete r;
+	}
+	/* geneMap is allocated by ReadGenePresence() and freed nowhere else;
+	 * deleting NULL is a no-op when no presence file was read. */
+	delete geneMap;
+	geneAverage.clear();
+	geneVariance.clear();
+	genePresence.clear();
+}
+
+/* Loads the per-gene average vector from the binary file strFileName.
+ * Returns whatever CSeekTools::ReadArray reports. */
+bool CSeekDataset::ReadGeneAverage(const string &strFileName){
+	const char *szPath = strFileName.c_str();
+	return CSeekTools::ReadArray(szPath, geneAverage);
+}
+
+/* Loads the per-gene variance vector from the binary file strFileName.
+ * Returns whatever CSeekTools::ReadArray reports. */
+bool CSeekDataset::ReadGeneVariance(const string &strFileName){
+	const char *szPath = strFileName.c_str();
+	return CSeekTools::ReadArray(szPath, geneVariance);
+}
+
+/* Loads the gene presence vector from strFileName and, on success, builds
+ * geneMap from it. Returns false (leaving geneMap untouched) when the
+ * array could not be read. */
+bool CSeekDataset::ReadGenePresence(const string &strFileName){
+	if(CSeekTools::ReadArray(strFileName.c_str(), genePresence)){
+		geneMap = new CSeekIntIntMap(genePresence);
+		return true;
+	}
+	return false;
+}
+
+/* Requires the presence vector (geneMap) to have been read.
+ * Builds queryMap from the genes that are both flagged in `query` and
+ * present in this dataset, then allocates the quantized matrix r
+ * (query genes x all genes) with every cell set to 255. When no query
+ * gene is present, r is not allocated and the dataset is reported as
+ * skipped. Always returns true. */
+bool CSeekDataset::InitializeQuery(vector<char> &query){
+	size_t iTotal = query.size();
+	queryMap = new CSeekIntIntMap(iTotal);
+
+	/* walk the genes present in this dataset, keep the queried ones */
+	for(size_t iPresent=0; iPresent<geneMap->GetNumSet(); iPresent++){
+		size_t iGene = geneMap->GetReverse(iPresent);
+		if(query[iGene]!=0){
+			queryMap->Add(iGene);
+		}
+	}
+	iQuerySize = queryMap->GetNumSet();
+	iNumGenes = iTotal;
+
+	if(iQuerySize!=0){
+		r = new CFullMatrix<unsigned char>();
+		r->Initialize(iQuerySize, iNumGenes);
+		size_t iRow, iCol;
+		for(iRow=0; iRow<iQuerySize; iRow++){
+			for(iCol=0; iCol<iNumGenes; iCol++){
+				r->Set(iRow, iCol, 255);
+			}
+		}
+	}else{
+		cerr << "Dataset will be skipped" << endl;
+	}
+	return true;
+}
+
+/* Releases the query map and the quantized query matrix and resets the
+ * query/gene counts. Always returns true. */
+bool CSeekDataset::DeleteQuery(){
+	if(queryMap!=NULL){
+		delete queryMap;
+		/* reset so a repeated call does not free the map again */
+		queryMap = NULL;
+	}
+	iQuerySize = 0;
+	iNumGenes = 0;
+	if(r!=NULL){
+		delete r;
+		/* the destructor also deletes r; without the reset the matrix
+		 * would be freed a second time there */
+		r = NULL;
+	}
+	return true;
+}
+
+/* Stores c at (row of query gene i, column j) of the quantized matrix.
+ * Returns false when gene i is not in queryMap. */
+bool CSeekDataset::SetQuery(size_t &i, size_t &j, unsigned char &c){
+	size_t iRow = queryMap->GetForward(i);
+	if(iRow!=-1){
+		r->Set(iRow, j, c);
+		return true;
+	}
+	return false;
+}
+
+/* Stores c at (i, j) of the quantized matrix; i is used as the row index
+ * directly, without going through queryMap. Always returns true. */
+bool CSeekDataset::SetQueryNoMapping(size_t &i, size_t &j, unsigned char &c){
+	this->r->Set(i, j, c);
+	return true;
+}
+
+/* Copies the whole vector c into the matrix row of query gene i
+ * (c[k] goes to column k). Returns false when gene i is not in queryMap. */
+bool CSeekDataset::SetQuery(size_t &i, vector<unsigned char> &c){
+	size_t iRow = queryMap->GetForward(i);
+	if(iRow!=-1){
+		size_t iCol = 0;
+		while(iCol<c.size()){
+			r->Set(iRow, iCol, c[iCol]);
+			iCol++;
+		}
+		return true;
+	}
+	return false;
+}
+
+/* Returns the float matrix built by InitializeFloatMatrix(). The pointer
+ * is handed out as stored; the dataset keeps ownership. */
+CFullMatrix<float>* CSeekDataset::GetFloatMatrix(){
+	return this->rData;
+}
+
+/* Expands the quantized query matrix r (queries x genes) into the float
+ * matrix rData, stored transposed (genes x queries).
+ *
+ * Each quantized code x is looked up in a hard-coded table running from
+ * -5.0 in steps of 0.1. Cells holding the missing code 255, or a code
+ * beyond the end of the table, are stored as -50.0. When bSubtractAvg is
+ * set, the gene's average is subtracted from every valid cell, and genes
+ * whose average is NaN/inf get -50.0 in every cell.
+ *
+ * Returns false (with rData set to NULL) when there is no quantized
+ * matrix to expand, true otherwise. */
+bool CSeekDataset::InitializeFloatMatrix(bool bSubtractAvg){
+	/* value stored for cells without a usable measurement */
+	const float fMissing = -50.0;
+	/* code InitializeQuery() fills the quantized matrix with */
+	const unsigned char cMissing = 255;
+
+	/* r stays NULL when InitializeQuery() found no query gene here */
+	if(r==NULL){
+		rData = NULL;
+		return false;
+	}
+
+	//hard coded quant file
+	vector<float> quant;
+	float w = -5.0;
+	while(w<5.01){
+		quant.push_back(w);
+		w+=0.1;
+	}
+	rData = new CFullMatrix<float>();
+
+	/* transpose */
+	/* numGenes * numQueries */
+	rData->Initialize(r->GetColumns(), r->GetRows());
+
+	size_t i,j;
+	/* numGenes */
+	for(i=0; i<rData->GetRows(); i++){
+		float fOffset = 0;
+		if(bSubtractAvg){
+			fOffset = GetGeneAverage(i);
+			if(isnan(fOffset) || isinf(fOffset)){
+				for(j=0; j<rData->GetColumns(); j++){
+					rData->Set(i, j, fMissing);
+				}
+				continue;
+			}
+		}
+		/* numQueries */
+		for(j=0; j<rData->GetColumns(); j++){
+			unsigned char x = r->Get(j, i);
+			/* the table is much shorter than 256 entries, so both the
+			 * missing code and any oversized code must not be used as
+			 * an index */
+			if(x==cMissing || x>=quant.size()){
+				rData->Set(i, j, fMissing);
+			}else if(bSubtractAvg){
+				rData->Set(i, j, quant[x] - fOffset);
+			}else{
+				rData->Set(i, j, quant[x]);
+			}
+		}
+	}
+	return true;
+}
+
+/* Releases the float matrix built by InitializeFloatMatrix().
+ * Always returns true. */
+bool CSeekDataset::FreeFloatMatrix(){
+	delete rData;
+	/* reset so GetFloatMatrix() cannot hand out a dangling pointer and a
+	 * repeated call does not free the matrix twice */
+	rData = NULL;
+	return true;
+}
+
+/* Returns the quantized query matrix allocated by InitializeQuery();
+ * NULL on a freshly constructed dataset. The dataset keeps ownership. */
+CFullMatrix<unsigned char>* CSeekDataset::GetMatrix(){
+	return this->r;
+}
+
+/* Returns the gene-presence map built by ReadGenePresence(). */
+CSeekIntIntMap* CSeekDataset::GetGeneMap(){
+	return this->geneMap;
+}
+
+/* Returns the query map built by InitializeQuery(). */
+CSeekIntIntMap* CSeekDataset::GetQueryMap(){
+	return this->queryMap;
+}
+
+/* Returns entry i of the vector loaded by ReadGeneVariance().
+ * The index is not range-checked. */
+float CSeekDataset::GetGeneVariance(size_t i){
+	return this->geneVariance[i];
+}
+
+/* Returns entry i of the vector loaded by ReadGeneAverage().
+ * The index is not range-checked. */
+float CSeekDataset::GetGeneAverage(size_t i){
+	return this->geneAverage[i];
+}
+
+/* Returns the gene count recorded by InitializeQuery() (the size of the
+ * query vector it was given); DeleteQuery() resets it to 0. */
+size_t CSeekDataset::GetNumGenes(){
+	return this->iNumGenes;
+}
+
+/* Resets the cross-validation weights to i zero-valued entries.
+ * Always returns true. */
+bool CSeekDataset::InitializeCVWeight(size_t i){
+	weight.clear();
+	weight.resize(i);
+	/* GetDatasetSumWeight() caches its result in sum_weight; drop the
+	 * cache so a weight from an earlier round is not returned for the
+	 * new set of weights */
+	sum_weight = -1;
+	return true;
+}
+
+/* Stores f as the weight of cross-validation fold i.
+ * Returns false when i is outside the range set up by
+ * InitializeCVWeight(), true otherwise. */
+bool CSeekDataset::SetCVWeight(size_t i, float f){
+	if(i>=weight.size()){
+		return false;
+	}
+	weight[i] = f;
+	/* the value cached by GetDatasetSumWeight() no longer reflects the
+	 * stored weights */
+	sum_weight = -1;
+	return true;
+}
+
+/* Returns the dataset weight: the mean of all fold weights that are not
+ * -1, or -1 when no such weight exists. The result is cached in
+ * sum_weight and only recomputed while the cache holds -1. */
+float CSeekDataset::GetDatasetSumWeight(){
+	if(sum_weight!=-1){
+		return sum_weight;
+	}
+
+	size_t iValid = 0;
+	sum_weight = 0;
+	for(vector<float>::const_iterator it=weight.begin(); it!=weight.end(); ++it){
+		/* -1 marks a fold that produced no weight */
+		if(*it==-1) continue;
+		sum_weight+=*it;
+		iValid++;
+	}
+	if(iValid==0){
+		sum_weight = -1;
+	}else{
+		sum_weight/=(float)iValid;
+	}
+	return sum_weight;
+}
+
+
+}

src/seekdataset.h

+/*****************************************************************************
+* This file is provided under the Creative Commons Attribution 3.0 license.
+*
+* You are free to share, copy, distribute, transmit, or adapt this work
+* PROVIDED THAT you attribute the work to the authors listed below.
+* For more information, please see the following web page:
+* http://creativecommons.org/licenses/by/3.0/
+*
+* This file is a component of the Sleipnir library for functional genomics,
+* authored by:
+* Curtis Huttenhower (chuttenh@princeton.edu)
+* Mark Schroeder
+* Maria D. Chikina
+* Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
+*
+* If you use this library, the included executable tools, or any related
+* code in your work, please cite the following publication:
+* Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
+* Olga G. Troyanskaya.
+* "The Sleipnir library for computational functional genomics"
+*****************************************************************************/
+#ifndef SEEKDATASET_H
+#define SEEKDATASET_H
+
+#include "seekmap.h"
+#include "stdafx.h"
+#include "datapair.h"
+
+
+namespace Sleipnir {
+
+/* One dataset as seen by a query: per-gene statistics read from disk, the
+ * gene-presence and query index maps, the quantized query matrix and its
+ * float expansion, plus the cross-validation weights of the dataset. */
+class CSeekDataset{
+public:
+	CSeekDataset();
+	~CSeekDataset();
+
+	/* binary readers for the per-gene average / variance / presence */
+	bool ReadGeneAverage(const string &);
+	bool ReadGeneVariance(const string &);
+	bool ReadGenePresence(const string &);
+
+	/* builds the query map and the quantized matrix; needs presence */
+	bool InitializeQuery(vector<char> &);
+	bool DeleteQuery();
+
+	/* write single cells or whole rows of the quantized matrix */
+	bool SetQuery(size_t &, size_t &, unsigned char &);
+	bool SetQueryNoMapping(size_t &, size_t &, unsigned char &);
+	bool SetQuery(size_t &, vector<unsigned char> &);
+
+	/* float expansion of the quantized matrix (genes x queries) */
+	CFullMatrix<float> *GetFloatMatrix();
+	bool InitializeFloatMatrix(bool=true);
+	bool FreeFloatMatrix();
+
+	CFullMatrix<unsigned char> *GetMatrix();
+	CSeekIntIntMap* GetGeneMap();
+	CSeekIntIntMap* GetQueryMap();
+	float GetGeneVariance(size_t);
+	float GetGeneAverage(size_t);
+	size_t GetNumGenes();
+
+	/* per-fold cross-validation weights and their mean */
+	bool InitializeCVWeight(size_t);
+	bool SetCVWeight(size_t, float);
+	float GetDatasetSumWeight();
+
+private:
+	/* The destructor deletes r, so a copied object would free the same
+	 * matrix twice. Copying is therefore disabled: both members are
+	 * declared but never defined. */
+	CSeekDataset(const CSeekDataset &);
+	CSeekDataset& operator=(const CSeekDataset &);
+
+	string strName;
+	string strPlatform;
+	/* quantized query matrix, queries x genes */
+	CFullMatrix<unsigned char> *r;
+	vector<float> geneAverage;
+	vector<float> geneVariance;
+
+	vector<char> genePresence;
+	CSeekIntIntMap *geneMap;
+	CSeekIntIntMap *queryMap;
+
+	/* previously known as sinfo file */
+	float m_fDsetAverage;
+	float m_fDsetStdev;
+
+	size_t iQuerySize;
+	size_t iNumGenes;
+
+	/* one weight per cross-validation fold; -1 marks "no weight" */
+	vector<float> weight;
+	/* cached result of GetDatasetSumWeight(); -1 means not computed */
+	float sum_weight;
+	/* float expansion of r, genes x queries */
+	CFullMatrix<float> *rData;
+	bool m_bIsNibble;
+
+};
+
+
+
+}
+#endif

src/seekevaluate.cpp

+/*****************************************************************************
+* This file is provided under the Creative Commons Attribution 3.0 license.
+*
+* You are free to share, copy, distribute, transmit, or adapt this work
+* PROVIDED THAT you attribute the work to the authors listed below.
+* For more information, please see the following web page:
+* http://creativecommons.org/licenses/by/3.0/
+*
+* This file is a component of the Sleipnir library for functional genomics,
+* authored by:
+* Curtis Huttenhower (chuttenh@princeton.edu)
+* Mark Schroeder
+* Maria D. Chikina
+* Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
+*
+* If you use this library, the included executable tools, or any related
+* code in your work, please cite the following publication:
+* Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
+* Olga G. Troyanskaya.
+* "The Sleipnir library for computational functional genomics"
+*****************************************************************************/
+#include "seekevaluate.h"
+#include "stdafx.h"
+#include "seekmap.h"
+
+namespace Sleipnir {
+
+/* Counts the genes of mapG whose rank is strictly above fThreshold and
+ * stores the sum of those ranks in fSum (reset on entry). */
+static int CountAboveThreshold(vector<float> &rank, CSeekIntIntMap &mapG,
+	float fThreshold, float &fSum){
+	int numGenesD = mapG.GetNumSet();
+	int ii, i;
+	int numAbove = 0;
+	fSum = 0;
+	for(ii=0; ii<numGenesD; ii++){
+		i = mapG.GetReverse(ii);
+		if(rank[i]<=fThreshold) continue;
+		fSum += rank[i];
+		numAbove++;
+	}
+	return numAbove;
+}
+
+/* Collects the top-ranked genes of mapG into `a`, sorted with
+ * AResult::operator<.
+ *
+ * Sorting every gene is avoided by raising a cut-off: starting at 0, the
+ * cut-off is repeatedly replaced by the mean of the ranks above it for as
+ * long as at least 1000 genes stay above the new value. Only the genes
+ * above the final cut-off are copied and sorted.
+ *
+ * rank: score per gene, indexed by gene id
+ * mapG: the genes to consider
+ * a:    output, cleared first
+ * Returns false (and reports it) when no gene has a rank above 0. */
+bool CSeekPerformanceMeasure::SortRankVector(vector<float> &rank,
+	CSeekIntIntMap &mapG, vector<AResult> &a){
+	/* 1000 is adjustable, this is the top number of items to sort */
+	const int iTopItems = 1000;
+
+	a.clear();
+	int numGenesD = mapG.GetNumSet();
+	int ii, i, jj;
+
+	/* accepted cut-off, and count / sum of the ranks above it */
+	float old_target = 0;
+	float fSum = 0;
+	int numNonZero = CountAboveThreshold(rank, mapG, old_target, fSum);
+
+	if(numNonZero==0){
+		cerr << "This dataset is all zero!" << endl;
+		return false;
+	}
+
+	/* With fewer than iTopItems positive ranks the loop is skipped and
+	 * all of them are kept. */
+	while(numNonZero>=iTopItems){
+		/* the sum is recomputed from scratch for every cut-off, so the
+		 * candidate is the true mean of the ranks above the current one */
+		float new_target = fSum / (float) numNonZero;
+		if(new_target == old_target){
+			break;
+		}
+		float fNextSum = 0;
+		int numNext = CountAboveThreshold(rank, mapG, new_target, fNextSum);
+		if(numNext<iTopItems){
+			/* raising the cut-off would leave too few items */
+			break;
+		}
+		old_target = new_target;
+		numNonZero = numNext;
+		fSum = fNextSum;
+	}
+
+	a.resize(numNonZero);
+	jj = 0;
+	for(ii=0; ii<numGenesD; ii++){
+		i = mapG.GetReverse(ii);
+		if(rank[i]<=old_target) continue;
+		a[jj].i = i;
+		a[jj].f = rank[i];
+		jj++;
+	}
+	sort(a.begin(), a.end());
+	return true;
+}
+
+/* designed specifically for a CSeekDataset */
+/* Computes rank-biased precision, (1-rate) * sum of rate^position over
+ * the gold-standard genes, where position counts the non-masked genes in
+ * the sorted ranking starting at 0.
+ *
+ * rate: persistence parameter
+ * rank: score per gene, indexed by gene id
+ * rbp:  output; set to -1 when the ranking could not be sorted
+ * mask: the query genes which are not included in RBP calculation
+ *       (1 = skip the gene without consuming a position)
+ * gold: 1 for genes counted as hits
+ * mapG: the genes to consider
+ * Returns false when SortRankVector fails, true otherwise. */
+bool CSeekPerformanceMeasure::RankBiasedPrecision(float rate, vector<float> &rank, float &rbp,
+	vector<char> &mask, vector<char> &gold, CSeekIntIntMap &mapG){
+
+	vector<AResult> sing;
+	bool ret = CSeekPerformanceMeasure::SortRankVector(rank, mapG, sing);
+	if(!ret){
+		rbp = -1;
+		return false;
+	}
+
+	float x = 0;
+	int numNonZero = sing.size();
+	int i;
+	/* position among the non-masked genes; must start at 0 so that the
+	 * first gene is weighted rate^0 */
+	int jj = 0;
+	for(i=0; i<numNonZero; i++){
+		/* stop at the first non-positive score */
+		if(sing[i].f<=0) break;
+		if(mask[sing[i].i]==1) continue;
+		if(gold[sing[i].i]==1){
+			x+=pow(rate, jj);
+		}
+		jj++;
+	}
+	x*=(1.0-rate);
+	rbp = x;
+	return true;
+}
+
+
+}

src/seekevaluate.h

 	int i;
 	float f;
 	bool operator<(const AResult& val) const{
-		if(f < val.f){
+		if(f <= val.f){
 			return false;
 		}else{
 			return true;
 
 class CSeekPerformanceMeasure{
 public:
-	static bool SortRankVector(const vector<float> &rank, CSeekIntIntMap &mapG, vector<AResult> &a){
-		a.clear();
-		int numGenesD = mapG.GetNumSet();
-		float old_target = 0;
-		float new_target = 0;
-		float prev_target = 0;
-		int prev_numNonZero = 0;
-		int numNonZero = 0;
-		int ii, i, jj;
-
-		while(1){
-			numNonZero = 0;
-			for(ii=0; ii<numGenesD; ii++){
-				i = mapG.GetReverse(ii);
-				if(rank[i]<=old_target) continue;
-				new_target += rank[i];
-				numNonZero++;
-			}
-			/* 1000 is adjustable, this is the top number of items to sort */
-			if(numNonZero==0 || numNonZero<1000){
-				old_target = prev_target;
-				numNonZero = prev_numNonZero;
-				break;
-			}
-			new_target /= (float) numNonZero;
-			if(new_target == old_target){
-				break;
-			}
-			prev_target = old_target;
-			old_target = new_target;
-			prev_numNonZero = numNonZero;
-		}
-
-		if(numNonZero==0){
-			return a;
-		}
-
-		a.resize(numNonZero);
-		jj = 0;
-		for(ii=0; ii<numGenesD; ii++){
-			i = mapG.GetReverse(ii);
-			if(rank[i]<=old_target) continue;
-			a[jj].i = i;
-			a[jj].f = rank[i];
-			jj++;
-		}
-		sort(a.begin(), a.end());
-		return true;
-	}
-
+	static bool SortRankVector(vector<float> &rank,
+		CSeekIntIntMap &mapG, vector<AResult> &a);
 	/* designed specifically for a CSeekDataset */
 	/* mask: the query genes which are not included in RBP calculation */
-	static bool RankBiasedPrecision(const float rate, const vector<float> &rank, float &rbp,
-			const vector<char> &mask, const vector<char> &gold, CSeekIntIntMap &mapG){
-
-		int i, ii, j, jj;
-		vector<AResult> sing;
-		CSeekPerformanceMeasure::SortRankVector(rank, mapG, sing);
-
-		float x = 0;
-		int numNonZero = sing.size();
-		for(i=0; i<numNonZero; i++){
-			if(sing[i].f<=0) break;
-			if(mask[sing[i].i]==1) continue;
-			if(gold[sing[i].i]==1){
-				x+=pow(rate, jj);
-			}
-			jj++;
-		}
-		x*=(1.0-rate);
-		rbp = x;
-		return true;
-	}
+	static bool RankBiasedPrecision(float rate, vector<float> &rank, float &rbp,
+		vector<char> &mask, vector<char> &gold, CSeekIntIntMap &mapG);
 };
 
 

src/seekquery.cpp

+/*****************************************************************************
+* This file is provided under the Creative Commons Attribution 3.0 license.
+*
+* You are free to share, copy, distribute, transmit, or adapt this work
+* PROVIDED THAT you attribute the work to the authors listed below.
+* For more information, please see the following web page:
+* http://creativecommons.org/licenses/by/3.0/
+*
+* This file is a component of the Sleipnir library for functional genomics,
+* authored by:
+* Curtis Huttenhower (chuttenh@princeton.edu)
+* Mark Schroeder
+* Maria D. Chikina
+* Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
+*
+* If you use this library, the included executable tools, or any related
+* code in your work, please cite the following publication:
+* Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
+* Olga G. Troyanskaya.
+* "The Sleipnir library for computational functional genomics"
+*****************************************************************************/
+#include "stdafx.h"
+#include "seekquery.h"
+
+
+namespace Sleipnir {
+
+/* Constructs an empty query: no genes, no cross-validation partitions. */
+CSeekQuery::CSeekQuery(){
+	iNumFold = 0;
+	iFoldSize = 0;
+	queryGenes.clear();
+	crossValGenes = NULL;
+}
+/* Releases the partition array and clears the query. */
+CSeekQuery::~CSeekQuery(){
+	/* delete[] on NULL is a no-op, so no explicit check is needed */
+	delete[] crossValGenes;
+	iFoldSize = 0;
+	iNumFold = 0;
+	queryGenes.clear();
+}
+
+/* Appends to queryGenes the index of every entry of `query` that equals
+ * 1. Always returns true. */
+bool CSeekQuery::InitializeQuery(vector<char> query){
+	size_t iGene = 0;
+	for(vector<char>::const_iterator it=query.begin(); it!=query.end(); ++it, ++iGene){
+		if(*it!=1) continue;
+		queryGenes.push_back(iGene);
+	}
+	return true;
+}
+
+/* Returns the number of folds set up by CreateCVPartitions()
+ * (0 on a freshly constructed query). */
+size_t CSeekQuery::GetNumFold(){
+	return this->iNumFold;
+}
+
+/* Returns the query gene indices collected by InitializeQuery(). */
+vector<int>& CSeekQuery::GetQuery(){
+	return this->queryGenes;
+}
+
+/* Returns the genes of cross-validation fold i. The index is not
+ * range-checked; CreateCVPartitions() must have been called. */
+vector<int>& CSeekQuery::GetCVQuery(size_t i){
+	return this->crossValGenes[i];
+}
+
+/* Splits the (shuffled) query genes into cross-validation folds.
+ *
+ * rnd:   GSL random number generator used for the shuffle
+ * p:     LEAVE_ONE_IN     - one gene per fold, as many folds as genes
+ *        LEAVE_ONE_OUT    - every fold holds all genes but one
+ *        CUSTOM_PARTITION - iFold folds of (nearly) equal size
+ * iFold: number of folds; must be left at -1 for the two LEAVE_ONE modes
+ *        and must be given (>= 1) for CUSTOM_PARTITION
+ *
+ * Partitions from an earlier call are discarded. Returns false, leaving
+ * existing partitions untouched, when mode and iFold do not fit together.
+ */
+bool CSeekQuery::CreateCVPartitions(gsl_rng *rnd, enum PartitionMode p, size_t iFold){
+	//must have run initializequery beforehand
+	if(p!=LEAVE_ONE_IN && p!=LEAVE_ONE_OUT && p!=CUSTOM_PARTITION){
+		cerr << "Error, unknown partition mode" << endl;
+		return false;
+	}
+	qSize = queryGenes.size();
+	const size_t iQuery = queryGenes.size();
+	size_t fold_size = 0;
+	if(iFold==(size_t)-1){
+		if(p==LEAVE_ONE_IN){
+			iFold = iQuery;
+			fold_size = 1;
+		}else if(p==LEAVE_ONE_OUT){
+			iFold = iQuery;
+			/* guard the empty query: 0 - 1 would wrap around */
+			fold_size = (iQuery>0) ? iQuery-1 : 0;
+		}else{
+			cerr << "Error, must specify number of folds if CustomPartition mode" << endl;
+			return false;
+		}
+	}else{
+		if(p==LEAVE_ONE_IN || p==LEAVE_ONE_OUT){
+			cerr << "Error, specified number of folds, so this must NOT be LEAVE_ONE_OUT or LEAVE_ONE_IN" << endl;
+			return false;
+		}
+		if(iFold==0){
+			/* would divide by zero below */
+			cerr << "Error, number of folds must be at least 1" << endl;
+			return false;
+		}
+		fold_size = iQuery / iFold;
+		if(iQuery % iFold > 0){
+			fold_size++;
+		}
+	}
+
+	/* drop partitions left over from an earlier call (NULL after
+	 * construction, which delete[] accepts) */
+	delete[] crossValGenes;
+	crossValGenes = NULL;
+
+	iNumFold = iFold;
+	iFoldSize = fold_size;
+	crossValGenes = new vector<int>[iNumFold];
+
+	/* shuffle a copy so that queryGenes keeps its order */
+	vector<int> q_b(queryGenes);
+	if(!q_b.empty()){
+		gsl_ran_shuffle(rnd, &q_b[0], q_b.size(), sizeof(int));
+	}
+
+	size_t i, j, k;
+	if(p==LEAVE_ONE_IN || p==CUSTOM_PARTITION){
+		/* deal the shuffled genes out fold by fold; the last fold may
+		 * end up shorter (or empty) once the genes run out */
+		k = 0;
+		for(i=0; i<iFold; i++){
+			for(j=0; j<iFoldSize && k<q_b.size(); j++){
+				crossValGenes[i].push_back(q_b[k]);
+				k++;
+			}
+		}
+	}else if(p==LEAVE_ONE_OUT){
+		/* fold i takes iFoldSize consecutive genes starting at i,
+		 * wrapping around, which leaves exactly one gene out */
+		for(i=0; i<iFold; i++){
+			for(j=0; j<iFoldSize; j++){
+				crossValGenes[i].push_back(q_b[(i+j) % q_b.size()]);
+			}
+		}
+	}
+
+	return true;
+}
+}
 
 class CSeekQuery{
 public:
-	CSeekQuery(){
-		crossValGenes = NULL;
-		queryGenes.clear();
-		iNumFold = 0;
-		iFoldSize = 0;
-	}
-	~CSeekQuery(){
-		if(crossValGenes!=NULL){
-			delete[] crossValGenes;
-		}
-		queryGenes.clear();
-		iNumFold = 0;
-		iFoldSize = 0;
-	}
+	CSeekQuery();
+	~CSeekQuery();
 
-	bool InitializeQuery(vector<char> query){
-		size_t i;
-		for(i=0; i<query.size(); i++){
-			if(query[i]==1){
-				queryGenes.push_back(query[i]);
-			}
-		}
-		queryGenes.resize(queryGenes.size());
-		return true;
-	}
-
-	size_t GetNumFold(){
-		return iNumFold;
-	}
-
-	vector<int>& GetQuery(){
-		return queryGenes;
-	}
-
-	vector<int>& GetCVQuery(size_t i){
-		return crossValGenes[i];
-	}
-
-	bool CreateCVPartitions(gsl_rng *rnd, enum PartitionMode p, size_t iFold=-1){
-		//must have run initializequery beforehand
-		if(p!=LEAVE_ONE_IN && p!=LEAVE_ONE_OUT && p!=CUSTOM_PARTITION){
-			cerr << "Error, unknown partition mode" << endl;
-			return false;
-		}
-		qSize = queryGenes.size();
-		size_t fold_size = 0;
-		if(iFold==-1){
-			if(p==LEAVE_ONE_IN){
-				iFold = qSize;
-				fold_size = 1;
-			}else if(p==LEAVE_ONE_OUT){
-				iFold = qSize;
-				fold_size = qSize-1;
-			}else{
-				cerr << "Error, must specify number of folds if CustomPartition mode" << endl;
-				return false;
-			}
-		}else{
-			if(p==LEAVE_ONE_IN || p==LEAVE_ONE_OUT){
-				cerr << "Error, specified number of folds, so this must NOT be LEAVE_ONE_OUT or LEAVE_ONE_IN" << endl;
-				return false;
-			}
-			fold_size = qSize / iFold;
-			if(qSize % iFold > 0){
-				fold_size++;
-			}
-
-		}
-		iNumFold = iFold;
-		iFoldSize = fold_size;
-		crossValGenes = new vector<int>[iNumFold];
-
-		size_t i, j, k;
-		int *q_b = (int*)malloc(qSize*sizeof(int));
-		for(i=0; i<qSize; i++){
-			q_b[i] = queryGenes[i];
-		}
-
-		gsl_ran_shuffle(rnd, q_b, qSize, sizeof(int));
-
-		if(p==LEAVE_ONE_IN || p==CUSTOM_PARTITION){
-			k = 0;
-			for(i=0; i<iFold; i++){
-				for(j=0; j<iFoldSize; j++){
-					if(k==qSize) continue;
-					crossValGenes[i].push_back(q_b[k]);
-					k++;
-				}
-				crossValGenes[i].resize(crossValGenes[i].size());
-			}
-		}else if(p==LEAVE_ONE_OUT){
-			int current_index = -1;
-			for(i=0; i<iFold; i++){
-				for(j=0; j<iFoldSize; j++){
-					current_index = (i+j) % qSize;
-					crossValGenes[i].push_back(q_b[current_index]);
-				}
-				crossValGenes[i].resize(crossValGenes[i].size());
-			}
-		}
-
-		free(q_b);
-		return true;
-	}
+	bool InitializeQuery(vector<char>);
+	size_t GetNumFold();
+	vector<int>& GetQuery();
+	vector<int>& GetCVQuery(size_t);
+	bool CreateCVPartitions(gsl_rng*, enum PartitionMode, size_t=-1);
 
 private:
 	vector<int> queryGenes;

src/seekreader.cpp

+/*****************************************************************************
+* This file is provided under the Creative Commons Attribution 3.0 license.
+*
+* You are free to share, copy, distribute, transmit, or adapt this work
+* PROVIDED THAT you attribute the work to the authors listed below.
+* For more information, please see the following web page:
+* http://creativecommons.org/licenses/by/3.0/
+*
+* This file is a component of the Sleipnir library for functional genomics,
+* authored by:
+* Curtis Huttenhower (chuttenh@princeton.edu)
+* Mark Schroeder
+* Maria D. Chikina
+* Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
+*
+* If you use this library, the included executable tools, or any related
+* code in your work, please cite the following publication:
+* Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
+* Olga G. Troyanskaya.
+* "The Sleipnir library for computational functional genomics"
+*****************************************************************************/
+#include "seekmap.h"
+#include "stdafx.h"
+#include "datapair.h"
+#include "seekdataset.h"
+#include "seekreader.h"
+#include "database.h"
+
+namespace Sleipnir {
+
+/* Turns a list of indices into a 0/1 presence vector.
+ *
+ * srcData:  indices to mark
+ * destData: output; resized to iSize, 1 at every listed index, 0 elsewhere
+ * iSize:    length of the presence vector
+ * Returns false when at least one index lies outside [0, iSize); such
+ * indices are reported and skipped, the valid ones are still marked. */
+bool CSeekTools::CreatePresenceVector(vector<int> &srcData, vector<char> &destData, size_t iSize){
+	destData.assign(iSize, (char) 0);
+
+	bool bAllValid = true;
+	size_t i;
+	for(i=0; i<srcData.size(); i++){
+		if(srcData[i]<0 || (size_t) srcData[i]>=iSize){
+			/* writing through this index would land outside destData */
+			cerr << "Error, index " << srcData[i] << " is outside the presence vector" << endl;
+			bAllValid = false;
+			continue;
+		}
+		destData[srcData[i]] = 1;
+	}
+	return bAllValid;
+}
+
+/* Opens the database and prepares one CSeekDataset per dataset for the
+ * given query.
+ *
+ * DB:                    database, opened here on strInputDirectory
+ * strPrepInputDirectory: directory holding <dataset>.gavg / .gpres files
+ * cQuery:                output, 0/1 vector over all genes, 1 = query gene
+ * vecstrQuery:           query gene names
+ * vecstrDatasets:        dataset names, in database order
+ * vc:                    output, one newly allocated dataset per database
+ *                        dataset, owned by the caller (also when false is
+ *                        returned; entries not reached are left NULL)
+ *
+ * Returns false when there are fewer dataset names than datasets or when
+ * a presence file cannot be read, true otherwise. */
+bool CSeekTools::LoadDatabase(CDatabase &DB, string &strInputDirectory, string &strPrepInputDirectory, 
+	vector<char> &cQuery, vector<string> &vecstrQuery, vector<string> &vecstrDatasets, 
+	vector<CSeekDataset*> &vc){
+		
+	DB.Open(strInputDirectory);
+	size_t iDatasets = DB.GetDatasets();
+	size_t iGenes = DB.GetGenes();
+	size_t i, j, k;
+
+	/* vecstrDatasets is indexed by database dataset number below */
+	if(vecstrDatasets.size()<iDatasets){
+		cerr << "Error, fewer dataset names than datasets in the database" << endl;
+		return false;
+	}
+
+	vc.clear();
+	vc.resize(iDatasets);
+	for(i=0; i<iDatasets; i++){
+		vc[i] = new CSeekDataset();
+		string strFileStem = vecstrDatasets[i];
+		//string strFileStem = CMeta::Deextension(CMeta::Basename(vecstrDatasets[i].c_str()));
+		string strAvgPath = strPrepInputDirectory + "/" + strFileStem + ".gavg";
+		string strPresencePath = strPrepInputDirectory + "/" + strFileStem + ".gpres";
+		if(!vc[i]->ReadGeneAverage(strAvgPath)){
+			/* only needed when averages are subtracted later on, so
+			 * this is reported but does not abort the load */
+			cerr << "Warning, cannot read " << strAvgPath << endl;
+		}
+		if(!vc[i]->ReadGenePresence(strPresencePath)){
+			/* InitializeQuery() below depends on the presence map */
+			cerr << "Error, cannot read " << strPresencePath << endl;
+			return false;
+		}
+	}
+
+	CSeekTools::InitVector(cQuery, iGenes, (char) 0);
+
+	/* database index of every query gene, (size_t)-1 when unknown;
+	 * looked up once and reused below */
+	vector<size_t> veciQuery(vecstrQuery.size());
+	for(i=0; i<vecstrQuery.size(); i++){
+		veciQuery[i] = DB.GetGene(vecstrQuery[i]);
+		if(veciQuery[i]==(size_t)-1) continue;
+		cQuery[veciQuery[i]] = 1;
+	}
+	for(i=0; i<iDatasets; i++){
+		vc[i]->InitializeQuery(cQuery);
+	}
+
+	/* raw database row of every query gene */
+	vector<vector<unsigned char> > Q(vecstrQuery.size());
+
+	for(i=0; i<vecstrQuery.size(); i++){
+		if(!DB.GetGene(vecstrQuery[i], Q[i])){
+			cerr << "Gene does not exist" << endl;
+		}
+	}
+
+	for(i=0; i<vecstrQuery.size(); i++){
+		size_t m = veciQuery[i];
+		if(m==(size_t)-1){
+			continue;
+		}
+		/* the row is read at k*iDatasets + j below, so it must hold one
+		 * byte per (gene, dataset) pair */
+		if(Q[i].size()<iGenes*iDatasets){
+			cerr << "Error, incomplete database row for " << vecstrQuery[i] << endl;
+			continue;
+		}
+		for(j=0; j<iDatasets; j++){
+			CSeekIntIntMap *qu = vc[j]->GetQueryMap();
+			size_t query = qu->GetForward(m);
+			/* query gene not present in this dataset */
+			if(query==(size_t)-1) continue;
+			for(k=0; k<iGenes; k++){
+				unsigned char c = Q[i][k*iDatasets + j];
+				vc[j]->SetQueryNoMapping(query, k, c);
+			}
+		}
+	}
+
+	return true;
+}
+
+}
 #include "seekmap.h"
 #include "stdafx.h"
 #include "datapair.h"
-
+#include "seekdataset.h"
+#include "database.h"
 
 namespace Sleipnir {
 
 class CSeekTools{
 public:
+	/* binary */
 	template<class tType>
 	static bool ReadArray(const char *fileName, vector<tType> &vData){
 		FILE *f = fopen(fileName, "rb");
 		return true;
 	}
 
+	/* binary */
 	template<class tType>
 	static bool WriteArray(const char *fileName, vector<tType> &vData){
 		FILE *f = fopen(fileName, "wb");
 		return true;
 	}
 
-	static bool CreatePresenceVector(vector<int> &srcData, vector<char> &destData, size_t iSize){
-		size_t i;
-		destData.clear();
-		destData.resize(iSize);
-		for(i=0; i<iSize; i++){
-			destData[i] = 0;
-		}
-		for(i=0; i<srcData.size(); i++){
-			destData[srcData[i]] = 1;
-		}
-		return true;
-	}
+	static bool CreatePresenceVector(vector<int> &srcData, vector<char> &destData, size_t iSize);
+	static bool LoadDatabase(CDatabase &DB, string &strInputDirectory, 
+	string &strPrepInputDirectory, vector<char> &cQuery, 
+	vector<string> &vecstrQuery, vector<string> &vecstrDatasets, 
+	vector<CSeekDataset*> &vc);
 
 };
 
-class CSeekDataset{
-public:
-	CSeekDataset(){
-		r = NULL;
-		geneAverage.clear();
-		geneVariance.clear();
-		genePresence.clear();
-		m_fDsetAverage = CMeta::GetNaN();
-		m_fDsetStdev = CMeta::GetNaN();
-	}
-	~CSeekDataset(){
-		if(r!=NULL){
-			delete r;
-		}
-		geneAverage.clear();
-		geneVariance.clear();
-		genePresence.clear();
-	}
-	bool ReadGeneAverage(const string &strFileName){
-		return CSeekTools::ReadArray(strFileName.c_str(), geneAverage);
-	}
-	bool ReadGeneVariance(const string &strFileName){
-		return CSeekTools::ReadArray(strFileName.c_str(), geneVariance);
-	}
-	bool ReadGenePresence(const string &strFileName){
-		bool ret = CSeekTools::ReadArray(strFileName.c_str(), genePresence);
-		if(!ret) return ret;
-		geneMap = new CSeekIntIntMap(genePresence);
-		return true;
-	}
-
-	/* requires presence vector */
-	bool InitializeQuery(vector<char> &query){
-		size_t iSize = query.size();
-		size_t i, j;
-		queryMap = new CSeekIntIntMap(iSize);
-		for(i=0; i<geneMap->GetNumSet(); i++){
-			size_t j = geneMap->GetReverse(i);
-			if(query[j]==0) continue;
-			queryMap->Add(j);
-		}
-		iQuerySize = queryMap->GetNumSet();
-		iNumGenes = iSize;
-
-		if(iQuerySize==0){
-			cerr << "Dataset will be skipped" << endl;
-			return true;
-		}
-		r = new CFullMatrix<unsigned char>();
-		r->Initialize(iQuerySize, iNumGenes);
-		for(i=0; i<iQuerySize; i++){
-			for(j=0; j<iNumGenes; j++){
-				r->Set(i, j, 255);
-			}
-		}
-
-		return true;
-	}
-
-	bool DeleteQuery(){
-		if(queryMap!=NULL){
-			delete queryMap;
-		}
-		iQuerySize = 0;
-		iNumGenes = 0;
-		if(r!=NULL){
-			delete r;
-		}
-		return true;
-	}
-
-	bool SetQuery(size_t &i, size_t &j, unsigned char &c){
-		size_t query = queryMap->GetForward(i);
-		if(query==-1){
-			return false;
-		}
-		r->Set(query, j, c);
-		return true;
-	}
-
-	bool SetQueryNoMapping(size_t &i, size_t &j, unsigned char &c){
-		r->Set(i, j, c);
-		return true;
-	}
-
-	bool SetQuery(size_t &i, vector<unsigned char> &c){
-		size_t query = queryMap->GetForward(i);
-		if(query==-1){
-			return false;
-		}
-		size_t j = 0;
-		for(j=0; j<c.size(); j++){
-			r->Set(query, j, c[j]);
-		}
-		return true;
-	}
-
-	CFullMatrix<float> *GetFloatMatrix(){
-		return rData;
-	}
-
-	bool InitializeFloatMatrix(bool bSubtractAvg = true){
-		//hard coded quant file
-		vector<float> quant;
-		float w = -5.0;
-		while(w<5.01){
-			quant.push_back(w);
-			w+=1.0;
-		}
-		quant.resize(quant.size());
-		rData = new CFullMatrix<float>();
-		rData->Initialize(r->GetRows(), r->GetColumns());
-		size_t i,j;
-		if(bSubtractAvg){
-			for(i=0; i<rData->GetRows(); i++){
-				for(j=0; j<rData->GetColumns(); j++){
-					float a = GetGeneAverage(j);
-					rData->Set(i, j, quant[r->Get(i, j)] - a);
-				}
-			}
-		}else{
-			for(i=0; i<rData->GetRows(); i++){
-				for(j=0; j<rData->GetColumns(); j++){
-					rData->Set(i, j, quant[r->Get(i, j)]);
-				}
-			}
-		}
-		return true;
-	}
-
-	bool FreeFloatMatrix(){
-		delete rData;
-		return true;
-	}
-
-	CFullMatrix<unsigned char> *GetMatrix(){
-		return r;
-	}
-
-	CSeekIntIntMap* GetGeneMap(){
-		return geneMap;
-	}
-
-	CSeekIntIntMap* GetQueryMap(){
-		return queryMap;
-	}
-
-	float GetGeneVariance(size_t i){
-		return geneVariance[i];
-	}
-
-	float GetGeneAverage(size_t i){
-		return geneAverage[i];
-	}
-
-	size_t GetNumGenes(){
-		return iNumGenes;
-	}
-
-	bool InitializeCVWeight(size_t i){
-		weight.clear();
-		weight.resize(i);
-		return true;
-	}
-
-	bool SetCVWeight(size_t i, float f){
-		weight[i] = f;
-		return true;
-	}
-
-
-private:
-	string strName;
-	string strPlatform;
-	CFullMatrix<unsigned char> *r;
-	vector<float> geneAverage;
-	vector<float> geneVariance;
-
-	vector<char> genePresence;
-	CSeekIntIntMap *geneMap;
-	CSeekIntIntMap *queryMap;
-
-	/* previously known as sinfo file */
-	float m_fDsetAverage;
-	float m_fDsetStdev;
-
-	size_t iQuerySize;
-	size_t iNumGenes;
-
-	vector<float> weight;
-	CFullMatrix<float> *rData;
-
-	bool m_bIsNibble;
-};
-
-
 
 }
 #endif

src/seekweight.cpp

+/*****************************************************************************
+* This file is provided under the Creative Commons Attribution 3.0 license.
+*
+* You are free to share, copy, distribute, transmit, or adapt this work
+* PROVIDED THAT you attribute the work to the authors listed below.
+* For more information, please see the following web page:
+* http://creativecommons.org/licenses/by/3.0/
+*
+* This file is a component of the Sleipnir library for functional genomics,
+* authored by:
+* Curtis Huttenhower (chuttenh@princeton.edu)
+* Mark Schroeder
+* Maria D. Chikina
+* Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
+*
+* If you use this library, the included executable tools, or any related
+* code in your work, please cite the following publication:
+* Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
+* Olga G. Troyanskaya.
+* "The Sleipnir library for computational functional genomics"
+*****************************************************************************/
+
+#include "stdafx.h"
+#include "seekweight.h"
+#include "seekreader.h"
+#include "seekquery.h"
+#include "seekevaluate.h"
+
+namespace Sleipnir {
+
+/* Scores every gene present in the dataset by the equally weighted sum of
+ * its float-matrix values against the query genes in cv_query.
+ *
+ * rank:     output, one score per gene (reset to 0 first). Genes absent
+ *           from the dataset keep the initial value, which is 1/|query|
+ *           for query genes and 0 otherwise.
+ * cv_query: query gene ids; a gene is never scored against itself
+ * sDataset: dataset whose float matrix (genes x queries) is read
+ *
+ * Returns true; with an empty cv_query this is reported and rank is left
+ * untouched. */
+bool CSeekWeighter::LinearCombine(vector<float> &rank, vector<int> &cv_query,
+	CSeekDataset &sDataset){
+	if(cv_query.size()==0){
+		cerr << "cv_query empty" << endl;
+		return true;
+	}
+	size_t iNumGenes = sDataset.GetNumGenes();
+
+	vector<float> new_rank;
+	CSeekTools::InitVector(rank, iNumGenes, (float)0);
+	CSeekTools::InitVector(new_rank, iNumGenes, (float)0);
+	size_t i, j;
+
+	int q_size = cv_query.size();
+	for(i=0; i<q_size; i++){
+		rank[cv_query[i]] = 1.0 / q_size;
+	}
+
+	CSeekIntIntMap *mapG = sDataset.GetGeneMap();
+	CSeekIntIntMap *mapQ = sDataset.GetQueryMap();
+
+	CFullMatrix<float> *f = sDataset.GetFloatMatrix();
+
+	/* matrix column of every query gene; it does not depend on the gene
+	 * being scored, so it is looked up once instead of once per pair */
+	vector<size_t> veciColumn(q_size);
+	for(j=0; j<q_size; j++){
+		veciColumn[j] = mapQ->GetForward(cv_query[j]);
+	}
+
+	size_t iGenesPresent = mapG->GetNumSet();
+	for(i=0; i<iGenesPresent; i++){
+		size_t g = mapG->GetReverse(i);
+		for(j=0; j<q_size; j++){
+			int qq = cv_query[j];
+			if(g==qq) continue;
+			/* a query gene without a column in this dataset cannot
+			 * contribute and must not be used as an index */
+			if(veciColumn[j]==(size_t)-1) continue;
+			new_rank[g] += rank[qq] * f->Get(g, veciColumn[j]);
+		}
+	}
+
+	/* the scores are accumulated separately because rank still holds the
+	 * query weights while the sums are built */
+	for(i=0; i<iGenesPresent; i++){
+		size_t g = mapG->GetReverse(i);
+		rank[g] = new_rank[g];
+	}
+
+	return true;
+}
+
+
+/*
+ * Computes one cross-validation weight per fold of sQuery and stores it in
+ * sDataset through SetCVWeight.
+ *
+ * For every fold, the fold's genes that are found in the dataset's query map
+ * form the query; the remaining genes of the full query that are found in
+ * the query map form the gold standard. The genes are ranked with
+ * LinearCombine and the ranking is scored with
+ * CSeekPerformanceMeasure::RankBiasedPrecision (first argument 0.95); the
+ * resulting value becomes the fold's weight.
+ * A weight of -1 is stored when the query or the gold standard is empty, or
+ * when ranking or scoring reports failure.
+ *
+ * Always returns true.
+ */
+bool CSeekWeighter::CVWeighting(CSeekQuery &sQuery, CSeekDataset &sDataset){
+	size_t iFold = sQuery.GetNumFold();
+	sDataset.InitializeCVWeight(iFold);
+
+	size_t i;
+	int qi;
+
+	vector<char> is_query_cross, is_gold;
+	CSeekTools::InitVector(is_query_cross, sDataset.GetNumGenes(), (char) 0);
+	CSeekTools::InitVector(is_gold, sDataset.GetNumGenes(), (char) 0);
+
+	CSeekIntIntMap *mapG = sDataset.GetGeneMap();
+	CSeekIntIntMap *mapQ = sDataset.GetQueryMap();
+	/* the full query is the same for every fold, so fetch it once */
+	const vector<int> &allQ = sQuery.GetQuery();
+
+	for(qi=0; qi<(int)iFold; qi++){
+		const vector<int> &vi = sQuery.GetCVQuery(qi);
+		vector<int> cv_query;
+		int num_v = 0;
+
+		/* Set query: genes of this fold that the dataset contains */
+		for(i=0; i<vi.size(); i++){
+			if(mapQ->GetForward(vi[i])==-1) continue;
+			is_query_cross[vi[i]] = 1;
+			cv_query.push_back(vi[i]);
+		}
+
+		/* Set gold standard: remaining query genes the dataset contains */
+		for(i=0; i<allQ.size(); i++){
+			if(mapQ->GetForward(allQ[i])==-1) continue;
+			if(is_query_cross[allQ[i]]==1) continue;
+			is_gold[allQ[i]] = 1;
+			num_v++;
+		}
+
+		if(cv_query.empty() || num_v==0){
+			sDataset.SetCVWeight(qi, -1);
+		}else{
+			/* actual weighting; a failure of either step invalidates the fold */
+			vector<float> rank;
+			float w = 0;
+			const bool ret = LinearCombine(rank, cv_query, sDataset) &&
+				CSeekPerformanceMeasure::RankBiasedPrecision(0.95,
+				rank, w, is_query_cross, is_gold, *mapG);
+			if(!ret){
+				sDataset.SetCVWeight(qi, -1);
+			}else{
+				sDataset.SetCVWeight(qi, w);
+			}
+		}
+
+		/* Reset query and gold standard for the next fold; cv_query holds
+		 * exactly the genes whose query flag was raised above */
+		for(i=0; i<cv_query.size(); i++){
+			is_query_cross[cv_query[i]] = 0;
+		}
+		for(i=0; i<allQ.size(); i++){
+			if(mapQ->GetForward(allQ[i])==-1) continue;
+			is_gold[allQ[i]] = 0;
+		}
+
+	}
+	return true;
+}
+
+}
 #include "stdafx.h"
 #include "seekreader.h"
 #include "seekquery.h"
+#include "seekevaluate.h"
 
 namespace Sleipnir {
 
 
 class CSeekWeighter{
 public:
-	CSeekWeighter(){
-
-	}
-	~CSeekWeighter(){
-
-	}
-	static bool CVWeighting(CSeekQuery &sQuery, CSeekDataset &sDataset){
-		sDataset.InitializeFloatMatrix();
-		size_t iFold = sQuery.GetNumFold();
-		sDataset.InitializeCVWeight(iFold);
-
-		int i, j, qi, qj;
-
-		vector<char> is_query_cross, is_gold;
-		CSeekTools::InitVector(is_query_cross, sDataset.GetNumGenes(), (char) 0);
-		CSeekTools::InitVector(is_gold, sDataset.GetNumGenes(), (char) 0);
-
-		for(qi=0; qi<iFold; qi++){
-			vector<int> vi = sQuery.GetCVQuery(qi);
-			CSeekIntIntMap *mapQ = sDataset.GetQueryMap();
-			int num_q = 0;
-			int num_v = 0;
-			for(i=0; i<vi.size(); i++){
-				if(mapQ->GetForward(vi[i])==-1) continue;
-				is_query_cross[vi[i]] = 1;
-				num_q++;
-			}
-			vector<int> allQ = sQuery.GetQuery();
-			for(i=0; i<allQ.size(); i++){
-				if(mapQ->GetForward(allQ[i])==-1) continue;
-				if(is_query_cross[allQ[i]]==1) continue;
-				is_gold[allQ[i]] = 1;
-				num_v++;
-			}
-			if(num_q==0 || num_v==0){
-				sDataset.SetCVWeight(qi, 0);
-				continue;
-			}
-
-		}
-		sDataset.FreeFloatMatrix();
-		return true;
-	}
-
-
-
+	/* LinearCombine: re-initialises rank to sDataset.GetNumGenes() entries
+	 * and stores, for each gene in the dataset's gene map, the equally
+	 * weighted sum of its float-matrix values against the query genes other
+	 * than itself. Every gene in cv_query must be present in sDataset.
+	 *
+	 * CVWeighting: stores one weight per cross-validation fold of sQuery in
+	 * sDataset through SetCVWeight; -1 is stored for a fold that cannot be
+	 * scored. */
+	static bool LinearCombine(vector<float> &rank, vector<int> &cv_query,
+			CSeekDataset &sDataset);
+	static bool CVWeighting(CSeekQuery &sQuery, CSeekDataset &sDataset);
 };
 
 

src/seekwriter.cpp

+/*****************************************************************************
+* This file is provided under the Creative Commons Attribution 3.0 license.
+*
+* You are free to share, copy, distribute, transmit, or adapt this work
+* PROVIDED THAT you attribute the work to the authors listed below.
+* For more information, please see the following web page:
+* http://creativecommons.org/licenses/by/3.0/
+*
+* This file is a component of the Sleipnir library for functional genomics,
+* authored by:
+* Curtis Huttenhower (chuttenh@princeton.edu)
+* Mark Schroeder
+* Maria D. Chikina
+* Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
+*
+* If you use this library, the included executable tools, or any related
+* code in your work, please cite the following publication:
+* Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
+* Olga G. Troyanskaya.
+* "The Sleipnir library for computational functional genomics"
+*****************************************************************************/
+#include "seekmap.h"
+#include "stdafx.h"
+#include "datapair.h"
+#include "seekwriter.h"
+#include "seekreader.h"
+
+namespace Sleipnir {
+
+/*
+ * For every gene name in vecstrGenes, averages the non-NaN values of that
+ * gene's row in Dat, looking only at the columns of the listed genes that
+ * Dat contains.
+ *
+ * vecResult is re-initialised to vecstrGenes.size() entries; the entry of a
+ * gene that Dat does not contain stays NaN.
+ * Dat must already be opened by the caller. Always returns true.
+ */
+bool CSeekWriter::GetGeneAverage(CDataPair &Dat, vector<string> &vecstrGenes,
+		vector<float> &vecResult){
+	const size_t iCount = vecstrGenes.size();
+	/* value the index comparison sees when Dat.GetGene finds nothing */
+	const size_t iMissing = (size_t) -1;
+	size_t iRow, iCol;
+
+	/* translate the gene names into indices of Dat */
+	vector<size_t> veciGenes(iCount);
+	for(iRow=0; iRow<iCount; ++iRow)
+		veciGenes[iRow] = Dat.GetGene(vecstrGenes[iRow]);
+
+	CSeekTools::InitVector(vecResult, iCount, CMeta::GetNaN());
+	for(iRow=0; iRow<iCount; ++iRow){
+		const size_t iGene = veciGenes[iRow];
+		if(iGene==iMissing) continue;
+
+		float *adRow = Dat.GetFullRow(iGene);
+		float fTotal = 0;
+		int iUsed = 0;
+		for(iCol=0; iCol<iCount; ++iCol){
+			const size_t iOther = veciGenes[iCol];
+			if(iOther==iMissing) continue;
+			if(CMeta::IsNaN(adRow[iOther])) continue;
+			fTotal += adRow[iOther];
+			++iUsed;
+		}
+		vecResult[iRow] = fTotal / (float) iUsed;
+		/* the row buffer is released here, once per processed gene */
+		free(adRow);
+	}
+	return true;
+}
+
+/*
+ * Flags which of the named genes Dat contains.
+ *
+ * vecResult is re-initialised to vecstrGenes.size() entries; entry i is 1
+ * when Dat.GetGene finds vecstrGenes[i] and 0 otherwise.
+ * Dat must already be opened by the caller. Always returns true.
+ */
+bool CSeekWriter::GetGenePresence(CDataPair &Dat, vector<string> &vecstrGenes,
+		vector<char> &vecResult){
+	const size_t iCount = vecstrGenes.size();
+	size_t iGene;
+
+	/* translate the gene names into indices of Dat */
+	vector<size_t> veciGenes(iCount);
+	for(iGene=0; iGene<iCount; ++iGene)
+		veciGenes[iGene] = Dat.GetGene(vecstrGenes[iGene]);
+
+	CSeekTools::InitVector(vecResult, iCount, (char) 0);
+
+	/* an index other than -1 means the gene was found */
+	for(iGene=0; iGene<iCount; ++iGene){
+		if(veciGenes[iGene]!=(size_t) -1)
+			vecResult[iGene]=1;
+	}
+	return true;
+}
+
+}
 class CSeekWriter{ /* static helpers that summarise named genes of an already opened CDataPair */
 public:
 	static bool GetGeneAverage(CDataPair &Dat, vector<string> &vecstrGenes,
-			vector<float> &vecResult){
-		/* assume datapair is already opened */
-		size_t i, j;
-		vector<size_t> veciGenes;
-		veciGenes.clear();
-		veciGenes.resize(vecstrGenes.size());
-		for( i = 0; i < vecstrGenes.size( ); ++i )
-			veciGenes[ i ] = Dat.GetGene( vecstrGenes[i] );
-
-		CSeekTools::InitVector(vecResult, vecstrGenes.size(), CMeta::GetNaN());
-		for(i=0; i<vecstrGenes.size(); i++){
-			size_t s = veciGenes[i];
-			if(s==-1) continue;
-			float *v = Dat.GetFullRow(s);
-			float sum = 0;
-			int num = 0;
-			for(j=0; j<vecstrGenes.size(); j++){
-				size_t t = veciGenes[j];
-				if(t==-1) continue;
-				if(CMeta::IsNaN(v[t])) continue;
-				sum+=v[t];
-				num++;
-			}
-			vecResult[i] = sum / (float) num;
-			free(v);
-		}
-		return true;
-	}
+			vector<float> &vecResult); /* vecResult[i] becomes the mean of the non-NaN
+			 * entries of gene i's row in Dat, taken over the listed genes
+			 * that Dat contains; it stays NaN when Dat lacks gene i.
+			 * Always returns true. */
 	static bool GetGenePresence(CDataPair &Dat, vector<string> &vecstrGenes,
-			vector<char> &vecResult){
-		/* assume datapair is already opened */
-		size_t i, j;
-		vector<size_t> veciGenes;
-		veciGenes.clear();
-		veciGenes.resize(vecstrGenes.size());
-		for( i = 0; i < vecstrGenes.size( ); ++i )
-			veciGenes[ i ] = Dat.GetGene( vecstrGenes[i] );
-
-		CSeekTools::InitVector(vecResult, vecstrGenes.size(), (char) 0);
-
-		for(i=0; i<vecstrGenes.size(); i++){
-			if(veciGenes[i]==-1) continue;
-			vecResult[i]=1;
-		}
-		return true;
-	}
+			vector<char> &vecResult); /* vecResult[i] becomes 1 when Dat contains
+			 * vecstrGenes[i] and 0 otherwise. Always returns true. */
 
 };
 

tools/Makefile.am

       DBCombiner \
       SeekPrep \
       SeekReader \
+      SeekMiner \
 	  DSLConverter \
 	  Dab2Dad \
 	  Edges2Posteriors \
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.