Commits

Qian Zhu committed 95549fe

added tools for calculating/reading/saving z-score avg and gene-presence vector
added testing tool SeekReader (for reading db files)
combined changes from 895b9754a89f

  • Participants
  • Parent commits b434c8c
  • Branches search_project

Comments (0)

Files changed (25)

 		 tools/Overlapper/Makefile \
 		 tools/PCLPlotter/Makefile \
 		 tools/PCL2Bin/Makefile \
-		 tools/Dat2PCL/Makefile \
 		 tools/Randomizer/Makefile \
 		 tools/Seqs2Ngrams/Makefile \
 		 tools/SMRF/Makefile \
 		 tools/Contexter/Makefile \
 		 tools/Counter/Makefile \
 		 tools/Data2DB/Makefile \
+		 tools/SeekReader/Makefile \
+		 tools/SeekPrep/Makefile \
          tools/DBCombiner/Makefile \
 		 tools/DSLConverter/Makefile \
 		 tools/Dab2Dad/Makefile \
 			    DSLConverter  => ['SMILE'],
 			    Data2DB  => ['SMILE'],
 				DBCombiner => ['SMILE'],
+				SeekReader => ['SMILE'],
+				SeekPrep => ['SMILE'],
 			    Dab2Dad  => ['SMILE'],
 			    Dab2DB  => ['SMILE'],
 			    Data2Svm => ['SVM_PERF'],
 	stdafx.cpp				\
 	svm.cpp					\
 	svmperf.cpp					\
-	vwb.cpp
+	vwb.cpp					\
+	seekmap.cpp				\
 include_HEADERS			=	\
+	seekreader.h			\
+	seekwriter.h			\
+	seekmap.h				\
 	annotation.h			\
 	annotationi.h			\
 	bayesnetfni.h			\
 	if( fBuffer ) {
 		//iBaseGenes: gene id of first gene in each databaselet
 		//iDataset: dataset id
+		//printf("Number: %d %d %d %d\n", GetSizeGene(), GetSizePair(), iBaseGenes, iBaseDatasets);
 		abImage = new unsigned char[ iSize = ( GetSizeGene( ) * m_vecstrGenes.size( ) ) ];
 		m_fstm.seekg( m_iHeader, ios_base::beg );
 		m_fstm.read( (char*)abImage, iSize );
 
 	return true; }
 
-/* 	A faster and simpler writing method for the matrix.
-	takes UcharFullMatrix
-	and requires buffering to be enabled, and works only with byte output
-*/
-bool CDatabaselet::OpenFast( const vector<CUcharFullMatrix>& vecData, size_t iBaseGenes, size_t iBaseDatasets, bool fBuffer) {
-	if(fBuffer){
-		cerr << "Requires buferring to be enabled." << endl;
+bool CDatabase::GetGene(string strGene, vector<unsigned char> &vecbData){
+	size_t iGene = this->GetGene(strGene);
+	if(iGene==-1){
+		cerr << "Gene not found" << endl;
 		return false;
 	}
-	if(m_useNibble){
-		cerr << "Requires byte." << endl;
+	m_vecpDBs[ iGene % m_vecpDBs.size( ) ]->Get( iGene / m_vecpDBs.size( ), vecbData);
+	return true;
+}
+
+
+bool CDatabaselet::Get(size_t iOne, vector<unsigned char>& vecbData){
+	size_t iSize;
+	size_t i, j;
+	size_t offset1 = GetOffset(iOne);
+	if(!this->m_fstm.is_open()){
+		cerr << "file not opened" << endl;
 		return false;
 	}
 
-	unsigned char*	abImage;
-	size_t			iSize, iDatum, iGeneOne, iGeneTwo;
-	unsigned char	bOne, bTwo;
+	this->m_fstm.seekg(offset1, ios_base::beg);
+	unsigned char *abImage = (unsigned char*)malloc( iSize = GetSizeGene());
+	this->m_fstm.read((char*)abImage, iSize);
 
-	abImage = new unsigned char[ iSize = ( GetSizeGene( ) * m_vecstrGenes.size( ) ) ];
-	m_fstm.seekg( m_iHeader );
-	m_fstm.read( (char*)abImage, iSize );
+	if(m_useNibble==false){
+		vecbData.clear();
+		vecbData.resize(iSize);
+		for(i=0; i<iSize; i++){
+			vecbData[i] = abImage[i];
+		}
+	}else{
+		vecbData.clear();
+		vecbData.resize(m_iDatasets*m_iGenes);
 
-	for( iDatum = 0; iDatum  < vecData.size( ); iDatum ++ ){
-		for( iGeneOne = 0; iGeneOne < GetGenes( ); ++iGeneOne ){
-			size_t index = vecData[iDatum].GetGeneIndex(GetGene(iGeneOne));
-			size_t iOffset = (GetSizeGene() * iGeneOne) + iBaseDatasets + iDatum;
+		for(j=0; j<m_iGenes; j++){
+			for(i=0; i<GetSizePair(); i++){
+				size_t offset = GetOffset(iOne, j) - m_iHeader;
+				unsigned char b = abImage[offset + i];
+				unsigned char bValue = -1;
+				if( ( bValue = ( b & 0xF ) ) == 0xF ){
+					bValue = -1;
+				}
+				vecbData[ j * m_iDatasets + 2 * i ] = bValue;
 
-			for( iGeneTwo = 0; iGeneTwo < vecData[iDatum].GetColumns(); ++iGeneTwo ){
-				if( bOne = vecData[ iDatum].Get(index, iGeneTwo ) ){
-					abImage[ iOffset + GetSizePair() * iGeneTwo ] = bOne - 1;
+				if( ( bValue = ( ( b >> 4 ) & 0xF ) ) == 0xF ){
+					bValue = -1;
 				}
+
+				if((2 * i + 1)==m_iDatasets){
+					break;
+				}
+				vecbData[ j * m_iDatasets + (2 * i) + 1 ] = bValue;
 			}
 		}
+
 	}
-
-	m_fstm.seekp( m_iHeader );
-	m_fstm.write( (char*)abImage, iSize );
-	delete[] abImage;
-
+	free(abImage);
 	return true;
 }
 
+
+
 bool CDatabaselet::Get( size_t iOne, size_t iTwo,
 		vector<unsigned char>& vecbData, unsigned char *charImage){
 	size_t	i;
 	Works for both nibble and byte
 */
 bool CDatabaselet::Combine(std::vector<CDatabaselet*>& vecDatabaselet,
-		std::string strOutDirectory, bool bSplit){
+		std::string strOutDirectory, vector<string> &vecstrGenes, bool bSplit){
 
 	/* for checking on consistency of databaselets */
 	bool bIsConsistent = true;
 		}
 	}
 
+	map<string, size_t> mapstrintGenes;
+	for(i=0; i<vecstrGenes.size(); i++){
+		mapstrintGenes[vecstrGenes[i]] = i;
+	}
+
+	char acNumber[16];
+
 	/* splitting to one gene per file after combine */
 	if(bSplit){
 
 
 			/* open a new Databaselet containing only one gene */
 			string thisGene = first->GetGene(i);
-			string path = strOutDirectory + "/" + thisGene + ".db";
+			size_t iGeneID = mapstrintGenes[thisGene];
+			sprintf(acNumber, "%08u", iGeneID);
+
+			string path = strOutDirectory + "/" + acNumber + ".db";
 			vector<string> vecstrThisGene;
 			vecstrThisGene.push_back(thisGene);
 
 	vecbData.resize( i + GetSizePair( ) );
 	pthread_mutex_lock( m_pmutx );
 
-	m_fstm.seekg( GetOffset( iOne, iTwo ) );
+	m_fstm.seekg( GetOffset( iOne, iTwo ), ios_base::beg);
 	m_fstm.read( (char*)&vecbData[ i ], GetSizePair( ) );
 	pthread_mutex_unlock( m_pmutx );
 
 	vecbData.resize( i + GetSizeGene( ) );
 	pthread_mutex_lock( m_pmutx );
 
-	m_fstm.seekg( GetOffset( iGene ) );
+	m_fstm.seekg( GetOffset( iGene ), ios_base::beg);
 	m_fstm.read( (char*)&vecbData[ i ], GetSizeGene( ) );
 	pthread_mutex_unlock( m_pmutx );
 
 	vecbData.resize( iOffset + ( veciGenes.size( ) * GetSizePair( ) ) );
 	pthread_mutex_lock( m_pmutx );
 	for( i = 0; i < veciGenes.size( ); ++i,iOffset += GetSizePair( ) ) {
-		m_fstm.seekg( GetOffset( iGene, veciGenes[ i ] ) );
+		m_fstm.seekg( GetOffset( iGene, veciGenes[ i ] ), ios_base::beg);
 		m_fstm.read( (char*)&vecbData[ iOffset ], GetSizePair( ) );
 	}
 	pthread_mutex_unlock( m_pmutx );
 	if( vecstrNodes.size( ) )
 		vecstrNodes.resize( vecstrNodes.size( ) - 1 );
 	m_vecpDBs.resize( iFiles );
+	int iNumFilesOpen = 1000;
 	for( i = 0; i < m_vecpDBs.size( ); ++i ) {
-		m_vecpDBs[ i ] = new CDatabaselet( m_useNibble);
+		m_vecpDBs[ i ] = new CDatabaselet( m_useNibble );
+	}
+
+	size_t k;
+	for( i = 0; i < m_vecpDBs.size( ); ++i ) {
+		if(i%iNumFilesOpen==0 && i>0){
+			for(k=0; k<iNumFilesOpen; k++){
+				m_vecpDBs[i-k-1]->CloseFile();
+			}
+		}
 		vecstrSubset.clear( );
 		for( j = i; j < vecstrGenes.size( ); j += m_vecpDBs.size( ) )
 			vecstrSubset.push_back( vecstrGenes[ j ] );
 			g_CatSleipnir( ).error( "CDatabase::Open( %s, %d ) could not open file %s",
 				strOutputDirectory.c_str( ), iFiles, strFile.c_str( ) );
 			return false; } }
+
+	for( i = 0; i < m_vecpDBs.size( ); ++i ){
+		m_vecpDBs[i]->CloseFile();
+	}
+
 	for( i = 0; i < vecstrGenes.size( ); ++i )
 		m_mapstriGenes[ m_vecpDBs[ i % m_vecpDBs.size( ) ]->GetGene( i / m_vecpDBs.size( ) ) ] = i;
 
 		for( j = i; j < vecstrGenes.size( ); j += m_vecpDBs.size( ) )
 			vecstrSubset.push_back( vecstrGenes[ j ] ); //contains index for 1000, 2000, 3000th genes
 		sprintf( acNumber, "%08u", i );
-		if(iFiles>=vecstrGenes.size()){
-			//if one gene per file, let databaselet filename be gene-name
-			strFile = strOutputDirectory + '/' + vecstrSubset[0] + c_acExtension;
-		}else{
-			strFile = strOutputDirectory + '/' + acNumber + c_acExtension;
-		}
+		strFile = strOutputDirectory + '/' + acNumber + c_acExtension;
 
 		if( !( i % 100 ) )
 			g_CatSleipnir( ).notice( "CDatabase::Open( %s, %d ) initializing file %d/%d",
 	float			d;
 
 	/* define number of threads to concurrently process datasets */
-	//omp_set_num_threads(4);
+	omp_set_num_threads(4);
 
 	veciGenes.resize( vecstrGenes.size( ) );
 	iOutBlock = ( m_iBlockOut == -1 ) ? m_vecpDBs.size( ) : m_iBlockOut;
 
 		for( iInBase = 0; iInBase < vecstrFiles.size( ); iInBase += iInBlock ) {
 			vector<CCompactFullMatrix>	vecData;
-
 			vecData.resize( ( ( iInBase + iInBlock ) > vecstrFiles.size( ) ) ?
 				( vecstrFiles.size( ) - iInBase ) : iInBlock );
 			for( iInOffset = 0; iInOffset < vecData.size( ); ++iInOffset ) {
 						size_t s = veciGenes[j];
 						if(s == -1) continue;
 						if(s == t) continue;
+						if(CMeta::IsNaN(d_array[s])) continue;
 						vecData[iInOffset].Set(i,j,Dat.Quantize(d_array[s])+1);
 					}
 					free(d_array);
 class CDatabase : CDatabaseImpl {
 public:
 
-	struct ArrayULInt{
-		size_t iX;
-		unsigned long int v;
-	};
-
-	int ULIntComp(const void * a, const void* b){
-		if ( *(unsigned long int*) a > *(unsigned long int*) b){
-			return(1);
-		}
-		if ( *(unsigned long int*) a < *(unsigned long int*) b){
-			return(-1);
-		}
-		return(0);
-	}
-
 	/*!
 	 * \brief
 	 * Construct a new database over the given genes from the given datasets and Bayes net.
 	CDatabase(bool isNibble) : CDatabaseImpl(isNibble){
 	}
 
+	bool GetGene(string, vector<unsigned char>&);
+
 	/*!
 	 * \brief
 	 * Open an existing database from subfiles in the given directory.
 	bool Get( size_t, size_t, std::vector<unsigned char>& ) const;
 	bool Get( size_t, std::vector<unsigned char>&, bool ) const;
 	bool Get( size_t, const std::vector<size_t>&, std::vector<unsigned char>&, bool ) const;
+	bool Get(size_t, vector<unsigned char>&);
 
 	static bool Combine(std::vector<CDatabaselet*>& vecDatabaselet,
-			std::string strOutDirectory, bool bSplit = true);
+			std::string strOutDirectory, vector<string> &vecstrGenes, bool bSplit = true);
 
 	size_t GetGenes( ) const {
 
 		if(m_useNibble){
 			if( !fBoth ) {
 				unsigned char	b;
-				m_fstm.seekg( iOffset );
+				m_fstm.seekg( iOffset, ios_base::beg );
 				b = m_fstm.get( );
 				bValue = ( iDataset % 2 ) ? ( ( b & 0xF ) | ( bValue << 4 ) ) :
 						( ( b & 0xF0 ) | ( bValue & 0xF ) ); 
 				}
 		}
 
-		m_fstm.seekp( iOffset );
+		m_fstm.seekp( iOffset, ios_base::beg);
 		m_fstm.put( bValue );
 	}
 
 
 	void Clear( ) {
 		size_t	i;
-
 		m_mapstriGenes.clear( );
 		for( i = 0; i < m_vecpDBs.size( ); ++i )
 			delete m_vecpDBs[ i ];
 	bool OpenMemmap( const unsigned char* );
 	void FilterGenesGraph( const CGenes&, std::vector<bool>&, size_t, float, bool, bool, const std::vector<float>* );
 
+	struct size_t_comp {
+	    bool operator ()(size_t const& a, size_t const& b) const {
+	    	return (a<b);
+	    }
+	} size_t_comp;
+
+
 	float* GetFullRow(size_t iY){
 		float *d_array = m_Data.GetFullRow(iY);
 		d_array[iY] = CMeta::GetNaN();
 	iSize = i;
 }
 
-CSeekPresence::CSeekPresence(CSeekPresence& cP){
-	p = (char*)malloc(cP.iSize);
-	memcpy(p, cP.p, cP.iSize);
-	iSize = cP.iSize;
+CSeekPresence::CSeekPresence(CSeekPresence* cP){
+	p = (char*)malloc(cP->iSize);
+	memcpy(p, cP->p, cP->iSize);
+	iSize = cP->iSize;
 }
 
 CSeekPresence::~CSeekPresence(){
 /*
  * IntIntMap Data Structure
  */
-CSeekIntIntMap::CSeekIntIntMap(size_t iSize){
+void CSeekIntIntMap::Initialize(size_t iSize){
 	m_iF = (int*)malloc(iSize * sizeof(int));
 	m_iR = (int*)malloc(iSize * sizeof(int));
 	m_iSize = iSize;
 	Clear();
 }
+CSeekIntIntMap::CSeekIntIntMap(size_t iSize){
+	Initialize(iSize);
+}
 
-CSeekIntIntMap::CSeekIntIntMap(CSeekPresence &cP, bool bReverse){
-	m_iSize = cP.GetSize();
-	m_iF = (int*)malloc(m_iSize * sizeof(int));
-	m_iR = (int*)malloc(m_iSize * sizeof(int));
-	Clear();
+CSeekIntIntMap::CSeekIntIntMap(CSeekPresence *cP, bool bReverse){
+	Initialize(cP->GetSize());
 	Reset(cP, bReverse);
 }
 
+CSeekIntIntMap::CSeekIntIntMap(vector<char> &cP, bool bReverse){
+	Initialize(cP.size());
+	Reset(cP, bReverse);
+}
+
+
+CSeekIntIntMap::CSeekIntIntMap(char *cP, size_t iSize, bool bReverse){
+	Initialize(iSize);
+	Reset(cP, bReverse);
+}
+
+
 CSeekIntIntMap::~CSeekIntIntMap(){
 	free(m_iF);
 	free(m_iR);
 	m_iNumSet = 0;
 }
 
-void CSeekIntIntMap::Reset(CSeekPresence &cP, bool bReverse){
+int CSeekIntIntMap::GetNumSet(){
+	return m_iNumSet;
+}
+
+void CSeekIntIntMap::Reset(CSeekPresence *cP, bool bReverse){
 	int i;
 	if(bReverse==false){
 		int j = 0;
 		for(i=0; i<m_iSize; i++){
-			if(cP.Check(i)==true){
+			if(cP->Check(i)==true){
 				Add(i);
 			}
 		}
 	}else{
 		int j = 0;
 		for(i=0; i<m_iSize; i++){
-			if(cP.Check(i)==false){
+			if(cP->Check(i)==false){
 				Add(i);
 			}
 		}
 	}
 }
 
+void CSeekIntIntMap::Reset(char *cP, bool bReverse){
+	int i;
+	if(bReverse==false){
+		int j = 0;
+		for(i=0; i<m_iSize; i++){
+			if(cP[i]==1){
+				Add(i);
+			}
+		}
+	}else{
+		int j = 0;
+		for(i=0; i<m_iSize; i++){
+			if(cP[i]==0){
+				Add(i);
+			}
+		}
+	}
+}
+
+void CSeekIntIntMap::Reset(vector<char> &cP, bool bReverse){
+	int i;
+	if(bReverse==false){
+		int j = 0;
+		for(i=0; i<m_iSize; i++){
+			if(cP[i]==1){
+				Add(i);
+			}
+		}
+	}else{
+		int j = 0;
+		for(i=0; i<m_iSize; i++){
+			if(cP[i]==0){
+				Add(i);
+			}
+		}
+	}
+}
+
+
+
 /*
  * StrIntMap Data Structure
  */
 	return m_mapintstr.size();
 }
 
-vector<string>& CSeekStrIntMap::GetAllString(){
+vector<string> CSeekStrIntMap::GetAllString(){
 	vector<string> vecStr;
 	vecStr.clear();
 	vecStr.resize(GetSize());
 	size_t i = 0;
 	for(iter = m_mapstrint.begin(); iter!=m_mapstrint.end(); iter++){
 		vecStr[i] = iter->first;
+		i++;
 	}
 	return vecStr;
 }
 
-vector<int>& CSeekStrIntMap::GetAllInteger(){
+vector<int> CSeekStrIntMap::GetAllInteger(){
 	vector<int> vecInt;
 	vecInt.clear();
 	vecInt.resize(GetSize());
 	size_t i = 0;
 	for(iter = m_mapintstr.begin(); iter!=m_mapintstr.end(); iter++){
 		vecInt[i] = iter->first;
+		i++;
 	}
 	return vecInt;
 }
+}
 
 public:
 	CSeekPresence(size_t);
 	CSeekPresence(char*, size_t);
-	CSeekPresence(CSeekPresence&);
+	CSeekPresence(CSeekPresence*);
 	~CSeekPresence();
 	void Clear();
 	bool Check(size_t);
 class CSeekIntIntMap{
 public:
 	CSeekIntIntMap(size_t);
-	CSeekIntIntMap(CSeekPresence&, bool=false);
+	CSeekIntIntMap(CSeekPresence*, bool=false);
+	CSeekIntIntMap(vector<char>&, bool=false);
+	CSeekIntIntMap(char*, size_t, bool=false);
+	void Initialize(size_t);
+
 	~CSeekIntIntMap();
 	int GetForward(int);
 	int GetReverse(int);
 	void Add(int);
 	void Clear();
-	void Reset(CSeekPresence&, bool=false);
+	void Reset(CSeekPresence*, bool=false);
+	void Reset(vector<char>&, bool=false);
+	void Reset(char*, bool=false);
+	int GetNumSet();
 
 private:
 	int *m_iF;
 	int Get(string);
 	size_t GetSize();
 	string Get(int);
-	vector<string>& GetAllString();
-	vector<int>& GetAllInteger();
+	vector<string> GetAllString();
+	vector<int> GetAllInteger();
 private:
 	map<string, int> m_mapstrint;
 	map<int, string> m_mapintstr;
+/*****************************************************************************
+* This file is provided under the Creative Commons Attribution 3.0 license.
+*
+* You are free to share, copy, distribute, transmit, or adapt this work
+* PROVIDED THAT you attribute the work to the authors listed below.
+* For more information, please see the following web page:
+* http://creativecommons.org/licenses/by/3.0/
+*
+* This file is a component of the Sleipnir library for functional genomics,
+* authored by:
+* Curtis Huttenhower (chuttenh@princeton.edu)
+* Mark Schroeder
+* Maria D. Chikina
+* Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
+*
+* If you use this library, the included executable tools, or any related
+* code in your work, please cite the following publication:
+* Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
+* Olga G. Troyanskaya.
+* "The Sleipnir library for computational functional genomics"
+*****************************************************************************/
+#ifndef SEEKREADER_H
+#define SEEKREADER_H
+
+#include "seekmap.h"
+#include "stdafx.h"
+#include "datapair.h"
+
+namespace Sleipnir {
+
+class CSeekTools{
+public:
+	template<class tType>
+	static bool ReadArray(const char *fileName, vector<tType> &vData){
+		FILE *f = fopen(fileName, "rb");
+		if(f==NULL){
+			cerr << "File not found" << endl;
+			return false;
+		}
+		size_t iSize;
+		int ret;
+		ret = fread((char*) (&iSize), 1, sizeof(iSize), f);
+		vData.clear();
+		vData.resize(iSize);
+		tType *m_Data = (tType*)malloc(iSize*sizeof(tType));
+		ret = fread((char*)m_Data, 1, iSize*sizeof(tType), f);
+		size_t i;
+		for(i=0; i<iSize; i++){
+			vData[i] = m_Data[i];
+		}
+		free(m_Data);
+		fclose(f);
+		return true;
+	}
+
+	template<class tType>
+	static bool WriteArray(const char *fileName, vector<tType> &vData){
+		FILE *f = fopen(fileName, "wb");
+		if(f==NULL){
+			cerr << "File not found" << endl;
+			return false;
+		}
+		size_t i;
+		tType *m_Data = (tType*)malloc(vData.size()*sizeof(tType));
+		for(i=0; i<vData.size(); i++){
+			m_Data[i] = vData[i];
+		}
+		size_t iSize = vData.size();
+		fwrite((char*) (&iSize), 1, sizeof(iSize), f);
+		fwrite((char*) (m_Data), 1, iSize*sizeof(tType), f);
+		free(m_Data);
+		fclose(f);
+		return true;
+	}
+
+	template<class tType>
+	static bool InitVector(vector<tType> &vData, size_t iSize, tType tValue){
+		size_t i;
+		vData.clear();
+		vData.resize(iSize);
+		for(i=0; i<iSize; i++){
+			vData[i] = tValue;
+		}
+		return true;
+	}
+
+};
+
+
+class CSeekDataset{
+public:
+	CSeekDataset(){
+		r = NULL;
+		geneAverage.clear();
+		geneVariance.clear();
+		genePresence.clear();
+		m_fDsetAverage = CMeta::GetNaN();
+		m_fDsetStdev = CMeta::GetNaN();
+	}
+	~CSeekDataset(){
+		if(r!=NULL){
+			delete r;
+		}
+		geneAverage.clear();
+		geneVariance.clear();
+		genePresence.clear();
+	}
+	bool ReadGeneAverage(const string &strFileName){
+		return CSeekTools::ReadArray(strFileName.c_str(), geneAverage);
+	}
+	bool ReadGeneVariance(const string &strFileName){
+		return CSeekTools::ReadArray(strFileName.c_str(), geneVariance);
+	}
+	bool ReadGenePresence(const string &strFileName){
+		bool ret = CSeekTools::ReadArray(strFileName.c_str(), genePresence);
+		if(!ret) return ret;
+		geneMap = new CSeekIntIntMap(genePresence);
+		return true;
+	}
+
+	/* requires presence vector */
+	bool InitializeQuery(vector<char> &query){
+		size_t iSize = query.size();
+		size_t i, j;
+		queryMap = new CSeekIntIntMap(iSize);
+		for(i=0; i<geneMap->GetNumSet(); i++){
+			size_t j = geneMap->GetReverse(i);
+			if(query[j]==0) continue;
+			queryMap->Add(j);
+		}
+		iQuerySize = queryMap->GetNumSet();
+		iNumGenes = iSize;
+
+		if(iQuerySize==0){
+			cerr << "Dataset will be skipped" << endl;
+			return true;
+		}
+		r = new CFullMatrix<unsigned char>();
+		r->Initialize(iQuerySize, iNumGenes);
+		for(i=0; i<iQuerySize; i++){
+			for(j=0; j<iNumGenes; j++){
+				r->Set(i, j, 255);
+			}
+		}
+
+		return true;
+	}
+
+	bool DeleteQuery(){
+		if(queryMap!=NULL){
+			delete queryMap;
+		}
+		iQuerySize = 0;
+		iNumGenes = 0;
+		if(r!=NULL){
+			delete r;
+		}
+		return true;
+	}
+
+	bool SetQuery(size_t &i, size_t &j, unsigned char &c){
+		size_t query = queryMap->GetForward(i);
+		if(query==-1){
+			return false;
+		}
+		r->Set(query, j, c);
+		return true;
+	}
+
+	bool SetQueryNoMapping(size_t &i, size_t &j, unsigned char &c){
+		r->Set(i, j, c);
+		return true;
+	}
+
+	bool SetQuery(size_t &i, vector<unsigned char> &c){
+		size_t query = queryMap->GetForward(i);
+		if(query==-1){
+			return false;
+		}
+		size_t j = 0;
+		for(j=0; j<c.size(); j++){
+			r->Set(query, j, c[j]);
+		}
+		return true;
+	}
+
+	CFullMatrix<unsigned char> *GetMatrix(){
+		return r;
+	}
+
+	CSeekIntIntMap* GetGeneMap(){
+		return geneMap;
+	}
+
+	CSeekIntIntMap* GetQueryMap(){
+		return queryMap;
+	}
+
+	float GetGeneVariance(size_t i){
+		return geneVariance[i];
+	}
+
+	float GetGeneAverage(size_t i){
+		return geneAverage[i];
+	}
+
+
+private:
+	string strName;
+	string strPlatform;
+	CFullMatrix<unsigned char> *r;
+	vector<float> geneAverage;
+	vector<float> geneVariance;
+
+	vector<char> genePresence;
+	CSeekIntIntMap *geneMap;
+	CSeekIntIntMap *queryMap;
+
+	/* previously known as sinfo file */
+	float m_fDsetAverage;
+	float m_fDsetStdev;
+
+	size_t iQuerySize;
+	size_t iNumGenes;
+
+	bool m_bIsNibble;
+};
+
+
+
+}
+#endif
+/*****************************************************************************
+* This file is provided under the Creative Commons Attribution 3.0 license.
+*
+* You are free to share, copy, distribute, transmit, or adapt this work
+* PROVIDED THAT you attribute the work to the authors listed below.
+* For more information, please see the following web page:
+* http://creativecommons.org/licenses/by/3.0/
+*
+* This file is a component of the Sleipnir library for functional genomics,
+* authored by:
+* Curtis Huttenhower (chuttenh@princeton.edu)
+* Mark Schroeder
+* Maria D. Chikina
+* Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
+*
+* If you use this library, the included executable tools, or any related
+* code in your work, please cite the following publication:
+* Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
+* Olga G. Troyanskaya.
+* "The Sleipnir library for computational functional genomics"
+*****************************************************************************/
+#ifndef SEEKWRITER_H
+#define SEEKWRITER_H
+
+#include "seekmap.h"
+#include "stdafx.h"
+#include "datapair.h"
+
+namespace Sleipnir {
+
+class CSeekWriter{
+public:
+	static bool GetGeneAverage(CDataPair &Dat, vector<string> &vecstrGenes,
+			vector<float> &vecResult){
+		/* assume datapair is already opened */
+		size_t i, j;
+		vector<size_t> veciGenes;
+		veciGenes.clear();
+		veciGenes.resize(vecstrGenes.size());
+		for( i = 0; i < vecstrGenes.size( ); ++i )
+			veciGenes[ i ] = Dat.GetGene( vecstrGenes[i] );
+
+		CSeekTools::InitVector(vecResult, vecstrGenes.size(), CMeta::GetNaN());
+		for(i=0; i<vecstrGenes.size(); i++){
+			size_t s = veciGenes[i];
+			if(s==-1) continue;
+			float *v = Dat.GetFullRow(s);
+			float sum = 0;
+			int num = 0;
+			for(j=0; j<vecstrGenes.size(); j++){
+				size_t t = veciGenes[j];
+				if(t==-1) continue;
+				if(CMeta::IsNaN(v[t])) continue;
+				sum+=v[t];
+				num++;
+			}
+			vecResult[i] = sum / (float) num;
+			free(v);
+		}
+		return true;
+	}
+	static bool GetGenePresence(CDataPair &Dat, vector<string> &vecstrGenes,
+			vector<char> &vecResult){
+		/* assume datapair is already opened */
+		size_t i, j;
+		vector<size_t> veciGenes;
+		veciGenes.clear();
+		veciGenes.resize(vecstrGenes.size());
+		for( i = 0; i < vecstrGenes.size( ); ++i )
+			veciGenes[ i ] = Dat.GetGene( vecstrGenes[i] );
+
+		CSeekTools::InitVector(vecResult, vecstrGenes.size(), (char) 0);
+
+		for(i=0; i<vecstrGenes.size(); i++){
+			if(veciGenes[i]==-1) continue;
+			vecResult[i]=1;
+		}
+		return true;
+	}
+
+};
+
+}
+#endif

tools/DBCombiner/DBCombiner.cpp

 	    }
 		//printf("Finished reading DBS"); getchar();
 
-	    CDatabaselet::Combine(DBS, sArgs.dir_out_arg, fSplit);
+	    CDatabaselet::Combine(DBS, sArgs.dir_out_arg, vecstrGenes, fSplit);
 	    for(i=0; i<vecstrDBs.size(); i++){
 	    	free(DBS[i]);
 	    }

tools/Makefile.am

 	  Counter \
 	  Data2DB \
       DBCombiner \
+      SeekPrep \
+      SeekReader \
 	  DSLConverter \
 	  Dab2Dad \
 	  Edges2Posteriors \

tools/SeekPrep/SeekPrep.cpp

+/*****************************************************************************
+* This file is provided under the Creative Commons Attribution 3.0 license.
+*
+* You are free to share, copy, distribute, transmit, or adapt this work
+* PROVIDED THAT you attribute the work to the authors listed below.
+* For more information, please see the following web page:
+* http://creativecommons.org/licenses/by/3.0/
+*
+* This file is a component of the Sleipnir library for functional genomics,
+* authored by:
+* Curtis Huttenhower (chuttenh@princeton.edu)
+* Mark Schroeder
+* Maria D. Chikina
+* Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
+*
+* If you use this library, the included executable tools, or any related
+* code in your work, please cite the following publication:
+* Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
+* Olga G. Troyanskaya.
+* "The Sleipnir library for computational functional genomics"
+*****************************************************************************/
+#include "stdafx.h"
+#include "cmdline.h"
+
+
+int main( int iArgs, char** aszArgs ) {
+	static const size_t	c_iBuffer	= 1024;
+#ifdef WIN32
+	pthread_win32_process_attach_np( );
+#endif // WIN32
+	gengetopt_args_info	sArgs;
+	ifstream			ifsm;
+	istream*			pistm;
+	vector<string>		vecstrLine, vecstrGenes, vecstrDBs, vecstrQuery;
+	char				acBuffer[ c_iBuffer ];
+	size_t				i;
+
+	if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
+		cmdline_parser_print_help( );
+		return 1; }
+
+	if( sArgs.input_arg ) {
+		ifsm.open( sArgs.input_arg );
+		pistm = &ifsm; }
+	else
+		pistm = &cin;
+	while( !pistm->eof( ) ) {
+		pistm->getline( acBuffer, c_iBuffer - 1 );
+		acBuffer[ c_iBuffer - 1 ] = 0;
+		vecstrLine.clear( );
+		CMeta::Tokenize( acBuffer, vecstrLine );
+		if( vecstrLine.size( ) < 2 ) {
+			cerr << "Ignoring line: " << acBuffer << endl;
+			continue; }
+		if( !( i = atoi( vecstrLine[ 0 ].c_str( ) ) ) ) {
+			cerr << "Illegal gene ID: " << vecstrLine[ 0 ] << " for " << vecstrLine[ 1 ] << endl;
+			return 1; }
+		i--;
+		if( vecstrGenes.size( ) <= i )
+			vecstrGenes.resize( i + 1 );
+		vecstrGenes[ i ] = vecstrLine[ 1 ]; }
+	if( sArgs.input_arg )
+		ifsm.close( );
+
+	if(sArgs.dab_arg){
+		CDataPair Dat;
+		char outFile[125];
+		if(!Dat.Open(sArgs.dab_arg, false, false)){
+			cerr << "error opening file" << endl;
+			return 1;
+		}
+
+		if(sArgs.gavg_flag==1){
+			vector<float> vecGeneAvg;
+			string fileName = CMeta::Basename(sArgs.dab_arg);
+			string fileStem = CMeta::Deextension(fileName);
+			sprintf(outFile, "%s/%s.gavg", sArgs.dir_out_arg, fileStem.c_str());
+			CSeekWriter::GetGeneAverage(Dat, vecstrGenes, vecGeneAvg);
+			CSeekTools::WriteArray(outFile, vecGeneAvg);
+		}
+
+		if(sArgs.gpres_flag==1){
+			vector<char> vecGenePresence;
+			string fileName = CMeta::Basename(sArgs.dab_arg);
+			string fileStem = CMeta::Deextension(fileName);
+			sprintf(outFile, "%s/%s.gpres", sArgs.dir_out_arg, fileStem.c_str());
+			CSeekWriter::GetGenePresence(Dat, vecstrGenes, vecGenePresence);
+			CSeekTools::WriteArray(outFile, vecGenePresence);
+		}
+
+	}else{
+		cerr << "Must give a dab." << endl;
+		return 1;
+
+	}
+
+#ifdef WIN32
+	pthread_win32_process_detach_np( );
+#endif // WIN32
+	return 0; }

tools/SeekPrep/SeekPrep.ggo

+package	"SeekPrep"
+version	"1.0"
+purpose	"Preprocess datasets for Seek"
+
+section "Main"
+option	"dab"				x	"Input dataset (.dab)"
+								string typestr="filename"	yes
+option	"sinfo"				s	"Generates sinfo file (with dataset wide mean and stdev)"
+								flag	off
+option	"gavg"				a	"Generates gene-average file"
+								flag	off
+option	"gpres"				p	"Generates gene-presence file"
+								flag	off
+option	"gvar"				v	"Generates gene-variance file"
+								flag	off
+option	"dir_out"			D	"Output directory"
+								string typestr="directory"	yes
+option	"input"				i	"Gene mapping file"
+								string typestr="filename"	yes

tools/SeekPrep/cmdline.c

+/*
+  File autogenerated by gengetopt version 2.22.5
+  generated with the following command:
+  gengetopt -iSeekPrep.ggo --default-optional -u -N -e 
+
+  The developers of gengetopt consider the fixed text that goes in all
+  gengetopt output files to be in the public domain:
+  we make no copyright claims on it.
+*/
+
+/* If we use autoconf.  */
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifndef FIX_UNUSED
+#define FIX_UNUSED(X) (void) (X) /* avoid warnings for unused params */
+#endif
+
+#include <getopt.h>
+
+#include "cmdline.h"
+
+const char *gengetopt_args_info_purpose = "Preprocess datasets for Seek";
+
+const char *gengetopt_args_info_usage = "Usage: SeekPrep [OPTIONS]... [FILES]...";
+
+const char *gengetopt_args_info_description = "";
+
+const char *gengetopt_args_info_help[] = {
+  "  -h, --help               Print help and exit",
+  "  -V, --version            Print version and exit",
+  "\nMain:",
+  "  -x, --dab=filename       Input dataset (.dab)",
+  "  -s, --sinfo              Generates sinfo file (with dataset wide mean and \n                             stdev)  (default=off)",
+  "  -a, --gavg               Generates gene-average file  (default=off)",
+  "  -p, --gpres              Generates gene-presence file  (default=off)",
+  "  -v, --gvar               Generates gene-variance file  (default=off)",
+  "  -D, --dir_out=directory  Output directory",
+  "  -i, --input=filename     Gene mapping file",
+    0
+};
+
+typedef enum {ARG_NO
+  , ARG_FLAG
+  , ARG_STRING
+} cmdline_parser_arg_type;
+
+static
+void clear_given (struct gengetopt_args_info *args_info);
+static
+void clear_args (struct gengetopt_args_info *args_info);
+
+static int
+cmdline_parser_internal (int argc, char **argv, struct gengetopt_args_info *args_info,
+                        struct cmdline_parser_params *params, const char *additional_error);
+
+static int
+cmdline_parser_required2 (struct gengetopt_args_info *args_info, const char *prog_name, const char *additional_error);
+
+static char *
+gengetopt_strdup (const char *s);
+
+static
+void clear_given (struct gengetopt_args_info *args_info)
+{
+  args_info->help_given = 0 ;
+  args_info->version_given = 0 ;
+  args_info->dab_given = 0 ;
+  args_info->sinfo_given = 0 ;
+  args_info->gavg_given = 0 ;
+  args_info->gpres_given = 0 ;
+  args_info->gvar_given = 0 ;
+  args_info->dir_out_given = 0 ;
+  args_info->input_given = 0 ;
+}
+
+static
+void clear_args (struct gengetopt_args_info *args_info)
+{
+  FIX_UNUSED (args_info);
+  args_info->dab_arg = NULL;
+  args_info->dab_orig = NULL;
+  args_info->sinfo_flag = 0;
+  args_info->gavg_flag = 0;
+  args_info->gpres_flag = 0;
+  args_info->gvar_flag = 0;
+  args_info->dir_out_arg = NULL;
+  args_info->dir_out_orig = NULL;
+  args_info->input_arg = NULL;
+  args_info->input_orig = NULL;
+  
+}
+
+static
+void init_args_info(struct gengetopt_args_info *args_info)
+{
+
+
+  args_info->help_help = gengetopt_args_info_help[0] ;
+  args_info->version_help = gengetopt_args_info_help[1] ;
+  args_info->dab_help = gengetopt_args_info_help[3] ;
+  args_info->sinfo_help = gengetopt_args_info_help[4] ;
+  args_info->gavg_help = gengetopt_args_info_help[5] ;
+  args_info->gpres_help = gengetopt_args_info_help[6] ;
+  args_info->gvar_help = gengetopt_args_info_help[7] ;
+  args_info->dir_out_help = gengetopt_args_info_help[8] ;
+  args_info->input_help = gengetopt_args_info_help[9] ;
+  
+}
+
+void
+cmdline_parser_print_version (void)
+{
+  printf ("%s %s\n",
+     (strlen(CMDLINE_PARSER_PACKAGE_NAME) ? CMDLINE_PARSER_PACKAGE_NAME : CMDLINE_PARSER_PACKAGE),
+     CMDLINE_PARSER_VERSION);
+}
+
+static void print_help_common(void) {
+  cmdline_parser_print_version ();
+
+  if (strlen(gengetopt_args_info_purpose) > 0)
+    printf("\n%s\n", gengetopt_args_info_purpose);
+
+  if (strlen(gengetopt_args_info_usage) > 0)
+    printf("\n%s\n", gengetopt_args_info_usage);
+
+  printf("\n");
+
+  if (strlen(gengetopt_args_info_description) > 0)
+    printf("%s\n\n", gengetopt_args_info_description);
+}
+
+void
+cmdline_parser_print_help (void)
+{
+  int i = 0;
+  print_help_common();
+  while (gengetopt_args_info_help[i])
+    printf("%s\n", gengetopt_args_info_help[i++]);
+}
+
+void
+cmdline_parser_init (struct gengetopt_args_info *args_info)
+{
+  clear_given (args_info);
+  clear_args (args_info);
+  init_args_info (args_info);
+
+  args_info->inputs = 0;
+  args_info->inputs_num = 0;
+}
+
+void
+cmdline_parser_params_init(struct cmdline_parser_params *params)
+{
+  if (params)
+    { 
+      params->override = 0;
+      params->initialize = 1;
+      params->check_required = 1;
+      params->check_ambiguity = 0;
+      params->print_errors = 1;
+    }
+}
+
+struct cmdline_parser_params *
+cmdline_parser_params_create(void)
+{
+  struct cmdline_parser_params *params = 
+    (struct cmdline_parser_params *)malloc(sizeof(struct cmdline_parser_params));
+  cmdline_parser_params_init(params);  
+  return params;
+}
+
+static void
+free_string_field (char **s)
+{
+  if (*s)
+    {
+      free (*s);
+      *s = 0;
+    }
+}
+
+
+static void
+cmdline_parser_release (struct gengetopt_args_info *args_info)
+{
+  unsigned int i;
+  free_string_field (&(args_info->dab_arg));
+  free_string_field (&(args_info->dab_orig));
+  free_string_field (&(args_info->dir_out_arg));
+  free_string_field (&(args_info->dir_out_orig));
+  free_string_field (&(args_info->input_arg));
+  free_string_field (&(args_info->input_orig));
+  
+  
+  for (i = 0; i < args_info->inputs_num; ++i)
+    free (args_info->inputs [i]);
+
+  if (args_info->inputs_num)
+    free (args_info->inputs);
+
+  clear_given (args_info);
+}
+
+
+static void
+write_into_file(FILE *outfile, const char *opt, const char *arg, const char *values[])
+{
+  FIX_UNUSED (values);
+  if (arg) {
+    fprintf(outfile, "%s=\"%s\"\n", opt, arg);
+  } else {
+    fprintf(outfile, "%s\n", opt);
+  }
+}
+
+
+int
+cmdline_parser_dump(FILE *outfile, struct gengetopt_args_info *args_info)
+{
+  int i = 0;
+
+  if (!outfile)
+    {
+      fprintf (stderr, "%s: cannot dump options to stream\n", CMDLINE_PARSER_PACKAGE);
+      return EXIT_FAILURE;
+    }
+
+  if (args_info->help_given)
+    write_into_file(outfile, "help", 0, 0 );
+  if (args_info->version_given)
+    write_into_file(outfile, "version", 0, 0 );
+  if (args_info->dab_given)
+    write_into_file(outfile, "dab", args_info->dab_orig, 0);
+  if (args_info->sinfo_given)
+    write_into_file(outfile, "sinfo", 0, 0 );
+  if (args_info->gavg_given)
+    write_into_file(outfile, "gavg", 0, 0 );
+  if (args_info->gpres_given)
+    write_into_file(outfile, "gpres", 0, 0 );
+  if (args_info->gvar_given)
+    write_into_file(outfile, "gvar", 0, 0 );
+  if (args_info->dir_out_given)
+    write_into_file(outfile, "dir_out", args_info->dir_out_orig, 0);
+  if (args_info->input_given)
+    write_into_file(outfile, "input", args_info->input_orig, 0);
+  
+
+  i = EXIT_SUCCESS;
+  return i;
+}
+
+int
+cmdline_parser_file_save(const char *filename, struct gengetopt_args_info *args_info)
+{
+  FILE *outfile;
+  int i = 0;
+
+  outfile = fopen(filename, "w");
+
+  if (!outfile)
+    {
+      fprintf (stderr, "%s: cannot open file for writing: %s\n", CMDLINE_PARSER_PACKAGE, filename);
+      return EXIT_FAILURE;
+    }
+
+  i = cmdline_parser_dump(outfile, args_info);
+  fclose (outfile);
+
+  return i;
+}
+
+void
+cmdline_parser_free (struct gengetopt_args_info *args_info)
+{
+  cmdline_parser_release (args_info);
+}
+
+/** @brief replacement of strdup, which is not standard */
+char *
+gengetopt_strdup (const char *s)
+{
+  char *result = 0;
+  if (!s)
+    return result;
+
+  result = (char*)malloc(strlen(s) + 1);
+  if (result == (char*)0)
+    return (char*)0;
+  strcpy(result, s);
+  return result;
+}
+
+int
+cmdline_parser (int argc, char **argv, struct gengetopt_args_info *args_info)
+{
+  return cmdline_parser2 (argc, argv, args_info, 0, 1, 1);
+}
+
+int
+cmdline_parser_ext (int argc, char **argv, struct gengetopt_args_info *args_info,
+                   struct cmdline_parser_params *params)
+{
+  int result;
+  result = cmdline_parser_internal (argc, argv, args_info, params, 0);
+
+  return result;
+}
+
+int
+cmdline_parser2 (int argc, char **argv, struct gengetopt_args_info *args_info, int override, int initialize, int check_required)
+{
+  int result;
+  struct cmdline_parser_params params;
+  
+  params.override = override;
+  params.initialize = initialize;
+  params.check_required = check_required;
+  params.check_ambiguity = 0;
+  params.print_errors = 1;
+
+  result = cmdline_parser_internal (argc, argv, args_info, &params, 0);
+
+  return result;
+}
+
+int
+cmdline_parser_required (struct gengetopt_args_info *args_info, const char *prog_name)
+{
+  int result = EXIT_SUCCESS;
+
+  if (cmdline_parser_required2(args_info, prog_name, 0) > 0)
+    result = EXIT_FAILURE;
+
+  return result;
+}
+
+int
+cmdline_parser_required2 (struct gengetopt_args_info *args_info, const char *prog_name, const char *additional_error)
+{
+  int error = 0;
+  FIX_UNUSED (additional_error);
+
+  /* checks for required options */
+  if (! args_info->dab_given)
+    {
+      fprintf (stderr, "%s: '--dab' ('-x') option required%s\n", prog_name, (additional_error ? additional_error : ""));
+      error = 1;
+    }
+  
+  if (! args_info->dir_out_given)
+    {
+      fprintf (stderr, "%s: '--dir_out' ('-D') option required%s\n", prog_name, (additional_error ? additional_error : ""));
+      error = 1;
+    }
+  
+  if (! args_info->input_given)
+    {
+      fprintf (stderr, "%s: '--input' ('-i') option required%s\n", prog_name, (additional_error ? additional_error : ""));
+      error = 1;
+    }
+  
+  
+  /* checks for dependences among options */
+
+  return error;
+}
+
+
+static char *package_name = 0;
+
+/**
+ * @brief updates an option
+ * @param field the generic pointer to the field to update
+ * @param orig_field the pointer to the orig field
+ * @param field_given the pointer to the number of occurrence of this option
+ * @param prev_given the pointer to the number of occurrence already seen
+ * @param value the argument for this option (if null no arg was specified)
+ * @param possible_values the possible values for this option (if specified)
+ * @param default_value the default value (in case the option only accepts fixed values)
+ * @param arg_type the type of this option
+ * @param check_ambiguity @see cmdline_parser_params.check_ambiguity
+ * @param override @see cmdline_parser_params.override
+ * @param no_free whether to free a possible previous value
+ * @param multiple_option whether this is a multiple option
+ * @param long_opt the corresponding long option
+ * @param short_opt the corresponding short option (or '-' if none)
+ * @param additional_error possible further error specification
+ */
+static
+int update_arg(void *field, char **orig_field,
+               unsigned int *field_given, unsigned int *prev_given, 
+               char *value, const char *possible_values[],
+               const char *default_value,
+               cmdline_parser_arg_type arg_type,
+               int check_ambiguity, int override,
+               int no_free, int multiple_option,
+               const char *long_opt, char short_opt,
+               const char *additional_error)
+{
+  char *stop_char = 0;
+  const char *val = value;
+  int found;
+  char **string_field;
+  FIX_UNUSED (field);
+
+  stop_char = 0;
+  found = 0;
+
+  if (!multiple_option && prev_given && (*prev_given || (check_ambiguity && *field_given)))
+    {
+      if (short_opt != '-')
+        fprintf (stderr, "%s: `--%s' (`-%c') option given more than once%s\n", 
+               package_name, long_opt, short_opt,
+               (additional_error ? additional_error : ""));
+      else
+        fprintf (stderr, "%s: `--%s' option given more than once%s\n", 
+               package_name, long_opt,
+               (additional_error ? additional_error : ""));
+      return 1; /* failure */
+    }
+
+  FIX_UNUSED (default_value);
+    
+  if (field_given && *field_given && ! override)
+    return 0;
+  if (prev_given)
+    (*prev_given)++;
+  if (field_given)
+    (*field_given)++;
+  if (possible_values)
+    val = possible_values[found];
+
+  switch(arg_type) {
+  case ARG_FLAG:
+    *((int *)field) = !*((int *)field);
+    break;
+  case ARG_STRING:
+    if (val) {
+      string_field = (char **)field;
+      if (!no_free && *string_field)
+        free (*string_field); /* free previous string */
+      *string_field = gengetopt_strdup (val);
+    }
+    break;
+  default:
+    break;
+  };
+
+
+  /* store the original value */
+  switch(arg_type) {
+  case ARG_NO:
+  case ARG_FLAG:
+    break;
+  default:
+    if (value && orig_field) {
+      if (no_free) {
+        *orig_field = value;
+      } else {
+        if (*orig_field)
+          free (*orig_field); /* free previous string */
+        *orig_field = gengetopt_strdup (value);
+      }
+    }
+  };
+
+  return 0; /* OK */
+}
+
+
+int
+cmdline_parser_internal (
+  int argc, char **argv, struct gengetopt_args_info *args_info,
+                        struct cmdline_parser_params *params, const char *additional_error)
+{
+  int c;	/* Character of the parsed option.  */
+
+  int error = 0;
+  struct gengetopt_args_info local_args_info;
+  
+  int override;
+  int initialize;
+  int check_required;
+  int check_ambiguity;
+  
+  package_name = argv[0];
+  
+  override = params->override;
+  initialize = params->initialize;
+  check_required = params->check_required;
+  check_ambiguity = params->check_ambiguity;
+
+  if (initialize)
+    cmdline_parser_init (args_info);
+
+  cmdline_parser_init (&local_args_info);
+
+  optarg = 0;
+  optind = 0;
+  opterr = params->print_errors;
+  optopt = '?';
+
+  while (1)
+    {
+      int option_index = 0;
+
+      static struct option long_options[] = {
+        { "help",	0, NULL, 'h' },
+        { "version",	0, NULL, 'V' },
+        { "dab",	1, NULL, 'x' },
+        { "sinfo",	0, NULL, 's' },
+        { "gavg",	0, NULL, 'a' },
+        { "gpres",	0, NULL, 'p' },
+        { "gvar",	0, NULL, 'v' },
+        { "dir_out",	1, NULL, 'D' },
+        { "input",	1, NULL, 'i' },
+        { 0,  0, 0, 0 }
+      };
+
+      c = getopt_long (argc, argv, "hVx:sapvD:i:", long_options, &option_index);
+
+      if (c == -1) break;	/* Exit from `while (1)' loop.  */
+
+      switch (c)
+        {
+        case 'h':	/* Print help and exit.  */
+          cmdline_parser_print_help ();
+          cmdline_parser_free (&local_args_info);
+          exit (EXIT_SUCCESS);
+
+        case 'V':	/* Print version and exit.  */
+        
+        
+          if (update_arg( 0 , 
+               0 , &(args_info->version_given),
+              &(local_args_info.version_given), optarg, 0, 0, ARG_NO,
+              check_ambiguity, override, 0, 0,
+              "version", 'V',
+              additional_error))
+            goto failure;
+          cmdline_parser_free (&local_args_info);
+          return 0;
+        
+          break;
+        case 'x':	/* Input dataset (.dab).  */
+        
+        
+          if (update_arg( (void *)&(args_info->dab_arg), 
+               &(args_info->dab_orig), &(args_info->dab_given),
+              &(local_args_info.dab_given), optarg, 0, 0, ARG_STRING,
+              check_ambiguity, override, 0, 0,
+              "dab", 'x',
+              additional_error))
+            goto failure;
+        
+          break;
+        case 's':	/* Generates sinfo file (with dataset wide mean and stdev).  */
+        
+        
+          if (update_arg((void *)&(args_info->sinfo_flag), 0, &(args_info->sinfo_given),
+              &(local_args_info.sinfo_given), optarg, 0, 0, ARG_FLAG,
+              check_ambiguity, override, 1, 0, "sinfo", 's',
+              additional_error))
+            goto failure;
+        
+          break;
+        case 'a':	/* Generates gene-average file.  */
+        
+        
+          if (update_arg((void *)&(args_info->gavg_flag), 0, &(args_info->gavg_given),
+              &(local_args_info.gavg_given), optarg, 0, 0, ARG_FLAG,
+              check_ambiguity, override, 1, 0, "gavg", 'a',
+              additional_error))
+            goto failure;
+        
+          break;
+        case 'p':	/* Generates gene-presence file.  */
+        
+        
+          if (update_arg((void *)&(args_info->gpres_flag), 0, &(args_info->gpres_given),
+              &(local_args_info.gpres_given), optarg, 0, 0, ARG_FLAG,
+              check_ambiguity, override, 1, 0, "gpres", 'p',
+              additional_error))
+            goto failure;
+        
+          break;
+        case 'v':	/* Generates gene-variance file.  */
+        
+        
+          if (update_arg((void *)&(args_info->gvar_flag), 0, &(args_info->gvar_given),
+              &(local_args_info.gvar_given), optarg, 0, 0, ARG_FLAG,
+              check_ambiguity, override, 1, 0, "gvar", 'v',
+              additional_error))
+            goto failure;
+        
+          break;
+        case 'D':	/* Output directory.  */
+        
+        
+          if (update_arg( (void *)&(args_info->dir_out_arg), 
+               &(args_info->dir_out_orig), &(args_info->dir_out_given),
+              &(local_args_info.dir_out_given), optarg, 0, 0, ARG_STRING,
+              check_ambiguity, override, 0, 0,
+              "dir_out", 'D',
+              additional_error))
+            goto failure;
+        
+          break;
+        case 'i':	/* Gene mapping file.  */
+        
+        
+          if (update_arg( (void *)&(args_info->input_arg), 
+               &(args_info->input_orig), &(args_info->input_given),
+              &(local_args_info.input_given), optarg, 0, 0, ARG_STRING,
+              check_ambiguity, override, 0, 0,
+              "input", 'i',
+              additional_error))
+            goto failure;
+        
+          break;
+
+        case 0:	/* Long option with no short option */
+        case '?':	/* Invalid option.  */
+          /* `getopt_long' already printed an error message.  */
+          goto failure;
+
+        default:	/* bug: option not considered.  */
+          fprintf (stderr, "%s: option unknown: %c%s\n", CMDLINE_PARSER_PACKAGE, c, (additional_error ? additional_error : ""));
+          abort ();
+        } /* switch */
+    } /* while */
+
+
+
+  if (check_required)
+    {
+      error += cmdline_parser_required2 (args_info, argv[0], additional_error);
+    }
+
+  cmdline_parser_release (&local_args_info);
+
+  if ( error )
+    return (EXIT_FAILURE);
+
+  if (optind < argc)
+    {
+      int i = 0 ;
+      int found_prog_name = 0;
+      /* whether program name, i.e., argv[0], is in the remaining args
+         (this may happen with some implementations of getopt,
+          but surely not with the one included by gengetopt) */
+
+      i = optind;
+      while (i < argc)
+        if (argv[i++] == argv[0]) {
+          found_prog_name = 1;
+          break;
+        }
+      i = 0;
+
+      args_info->inputs_num = argc - optind - found_prog_name;
+      args_info->inputs =
+        (char **)(malloc ((args_info->inputs_num)*sizeof(char *))) ;
+      while (optind < argc)
+        if (argv[optind++] != argv[0])
+          args_info->inputs[ i++ ] = gengetopt_strdup (argv[optind-1]) ;
+    }
+
+  return 0;
+
+failure:
+  
+  cmdline_parser_release (&local_args_info);
+  return (EXIT_FAILURE);
+}

tools/SeekPrep/cmdline.h

+/** @file cmdline.h
+ *  @brief The header file for the command line option parser
+ *  generated by GNU Gengetopt version 2.22.5
+ *  http://www.gnu.org/software/gengetopt.
+ *  DO NOT modify this file, since it can be overwritten
+ *  @author GNU Gengetopt by Lorenzo Bettini */
+
+#ifndef CMDLINE_H
+#define CMDLINE_H
+
+/* If we use autoconf.  */
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h> /* for FILE */
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+#ifndef CMDLINE_PARSER_PACKAGE
+/** @brief the program name (used for printing errors) */
+#define CMDLINE_PARSER_PACKAGE "SeekPrep"
+#endif
+
+#ifndef CMDLINE_PARSER_PACKAGE_NAME
+/** @brief the complete program name (used for help and version) */
+#define CMDLINE_PARSER_PACKAGE_NAME "SeekPrep"
+#endif
+
+#ifndef CMDLINE_PARSER_VERSION
+/** @brief the program version */
+#define CMDLINE_PARSER_VERSION "1.0"
+#endif
+
+/** @brief Where the command line options are stored */
+struct gengetopt_args_info
+{
+  const char *help_help; /**< @brief Print help and exit help description.  */
+  const char *version_help; /**< @brief Print version and exit help description.  */
+  char * dab_arg;	/**< @brief Input dataset (.dab).  */
+  char * dab_orig;	/**< @brief Input dataset (.dab) original value given at command line.  */
+  const char *dab_help; /**< @brief Input dataset (.dab) help description.  */
+  int sinfo_flag;	/**< @brief Generates sinfo file (with dataset wide mean and stdev) (default=off).  */
+  const char *sinfo_help; /**< @brief Generates sinfo file (with dataset wide mean and stdev) help description.  */
+  int gavg_flag;	/**< @brief Generates gene-average file (default=off).  */
+  const char *gavg_help; /**< @brief Generates gene-average file help description.  */
+  int gpres_flag;	/**< @brief Generates gene-presence file (default=off).  */
+  const char *gpres_help; /**< @brief Generates gene-presence file help description.  */
+  int gvar_flag;	/**< @brief Generates gene-variance file (default=off).  */
+  const char *gvar_help; /**< @brief Generates gene-variance file help description.  */
+  char * dir_out_arg;	/**< @brief Output directory.  */
+  char * dir_out_orig;	/**< @brief Output directory original value given at command line.  */
+  const char *dir_out_help; /**< @brief Output directory help description.  */
+  char * input_arg;	/**< @brief Gene mapping file.  */
+  char * input_orig;	/**< @brief Gene mapping file original value given at command line.  */
+  const char *input_help; /**< @brief Gene mapping file help description.  */
+  
+  unsigned int help_given ;	/**< @brief Whether help was given.  */
+  unsigned int version_given ;	/**< @brief Whether version was given.  */
+  unsigned int dab_given ;	/**< @brief Whether dab was given.  */
+  unsigned int sinfo_given ;	/**< @brief Whether sinfo was given.  */
+  unsigned int gavg_given ;	/**< @brief Whether gavg was given.  */
+  unsigned int gpres_given ;	/**< @brief Whether gpres was given.  */
+  unsigned int gvar_given ;	/**< @brief Whether gvar was given.  */
+  unsigned int dir_out_given ;	/**< @brief Whether dir_out was given.  */
+  unsigned int input_given ;	/**< @brief Whether input was given.  */
+
+  char **inputs ; /**< @brief unamed options (options without names) */
+  unsigned inputs_num ; /**< @brief unamed options number */
+} ;
+
+/** @brief The additional parameters to pass to parser functions */
+struct cmdline_parser_params
+{
+  int override; /**< @brief whether to override possibly already present options (default 0) */
+  int initialize; /**< @brief whether to initialize the option structure gengetopt_args_info (default 1) */
+  int check_required; /**< @brief whether to check that all required options were provided (default 1) */
+  int check_ambiguity; /**< @brief whether to check for options already specified in the option structure gengetopt_args_info (default 0) */
+  int print_errors; /**< @brief whether getopt_long should print an error message for a bad option (default 1) */
+} ;
+
+/** @brief the purpose string of the program */
+extern const char *gengetopt_args_info_purpose;
+/** @brief the usage string of the program */
+extern const char *gengetopt_args_info_usage;
+/** @brief all the lines making the help output */
+extern const char *gengetopt_args_info_help[];
+
+/**
+ * The command line parser
+ * @param argc the number of command line options
+ * @param argv the command line options
+ * @param args_info the structure where option information will be stored
+ * @return 0 if everything went fine, NON 0 if an error took place
+ */
+int cmdline_parser (int argc, char **argv,
+  struct gengetopt_args_info *args_info);
+
+/**
+ * The command line parser (version with additional parameters - deprecated)
+ * @param argc the number of command line options
+ * @param argv the command line options
+ * @param args_info the structure where option information will be stored
+ * @param override whether to override possibly already present options
+ * @param initialize whether to initialize the option structure my_args_info
+ * @param check_required whether to check that all required options were provided
+ * @return 0 if everything went fine, NON 0 if an error took place
+ * @deprecated use cmdline_parser_ext() instead
+ */
+int cmdline_parser2 (int argc, char **argv,
+  struct gengetopt_args_info *args_info,
+  int override, int initialize, int check_required);
+
+/**
+ * The command line parser (version with additional parameters)
+ * @param argc the number of command line options
+ * @param argv the command line options
+ * @param args_info the structure where option information will be stored
+ * @param params additional parameters for the parser
+ * @return 0 if everything went fine, NON 0 if an error took place
+ */
+int cmdline_parser_ext (int argc, char **argv,
+  struct gengetopt_args_info *args_info,
+  struct cmdline_parser_params *params);
+
+/**
+ * Save the contents of the option struct into an already open FILE stream.
+ * @param outfile the stream where to dump options
+ * @param args_info the option struct to dump
+ * @return 0 if everything went fine, NON 0 if an error took place
+ */
+int cmdline_parser_dump(FILE *outfile,
+  struct gengetopt_args_info *args_info);
+
+/**
+ * Save the contents of the option struct into a (text) file.
+ * This file can be read by the config file parser (if generated by gengetopt)
+ * @param filename the file where to save
+ * @param args_info the option struct to save
+ * @return 0 if everything went fine, NON 0 if an error took place
+ */
+int cmdline_parser_file_save(const char *filename,
+  struct gengetopt_args_info *args_info);
+
+/**
+ * Print the help
+ */
+void cmdline_parser_print_help(void);
+/**
+ * Print the version
+ */
+void cmdline_parser_print_version(void);
+
+/**
+ * Initializes all the fields a cmdline_parser_params structure 
+ * to their default values
+ * @param params the structure to initialize
+ */
+void cmdline_parser_params_init(struct cmdline_parser_params *params);
+
+/**
+ * Allocates dynamically a cmdline_parser_params structure and initializes
+ * all its fields to their default values
+ * @return the created and initialized cmdline_parser_params structure
+ */
+struct cmdline_parser_params *cmdline_parser_params_create(void);
+
+/**
+ * Initializes the passed gengetopt_args_info structure's fields
+ * (also set default values for options that have a default)
+ * @param args_info the structure to initialize
+ */
+void cmdline_parser_init (struct gengetopt_args_info *args_info);
+/**
+ * Deallocates the string fields of the gengetopt_args_info structure
+ * (but does not deallocate the structure itself)
+ * @param args_info the structure to deallocate
+ */
+void cmdline_parser_free (struct gengetopt_args_info *args_info);
+
+/**
+ * Checks that all the required options were specified
+ * @param args_info the structure to check
+ * @param prog_name the name of the program that will be used to print
+ *   possible errors
+ * @return
+ */
+int cmdline_parser_required (struct gengetopt_args_info *args_info,
+  const char *prog_name);
+
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+#endif /* CMDLINE_H */

tools/SeekPrep/stdafx.cpp

+/*****************************************************************************
+* This file is provided under the Creative Commons Attribution 3.0 license.
+*
+* You are free to share, copy, distribute, transmit, or adapt this work
+* PROVIDED THAT you attribute the work to the authors listed below.
+* For more information, please see the following web page:
+* http://creativecommons.org/licenses/by/3.0/
+*
+* This file is a component of the Sleipnir library for functional genomics,
+* authored by:
+* Curtis Huttenhower (chuttenh@princeton.edu)
+* Mark Schroeder
+* Maria D. Chikina
+* Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
+*
+* If you use this library, the included executable tools, or any related
+* code in your work, please cite the following publication:
+* Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
+* Olga G. Troyanskaya.
+* "The Sleipnir library for computational functional genomics"
+*****************************************************************************/
+#include "stdafx.h"
+
+/*!
+ * \page DBCombiner DBCombiner
+ * 
+ * 
+ * \section sec_usage Usage
+ * 
+ * \subsection ssec_usage_basic Basic Usage
+ * 
+ * \code
+ * DBCombiner -i <genes.txt> -x <db list> -d <input directory> -D <output_dir>
+ * \endcode
+ * 
+ * 
+ * \subsection ssec_usage_detailed Detailed Usage
+ * 
+ * \include DBCombiner/DBCombiner.ggo
+ * 
+ * <table><tr>
+ *	<th>Flag</th>
+ *	<th>Default</th>
+ *	<th>Type</th>
+ *	<th>Description</th>
+ * </tr><tr>
+ *	<td>-i</td>
+ *	<td>stdin</td>
+ *	<td>Text file</td>
+ *	<td>Tab-delimited text file containing two columns, numerical gene IDs (one-based) and unique gene
+ *		names (matching those in the input DAT/DAB files).</td>
+ * </tr><tr>
+ *	<td>-d</td>
+ *	<td>.</td>
+ *	<td>Directory</td>
+ *	<td>Input directory containing DB files</td>
+ * </tr><tr>
+ *	<td>-D</td>
+ *	<td>.</td>
+ *	<td>Directory</td>
+ *	<td>Output directory in which database files will be stored.</td>
+ * </tr><tr>
+ *	<td>-x</td>
+ *	<td>.</td>
+ *	<td>Text file</td>
+ *	<td>Input file containing list of CDatabaselets to combine</td>
+ * </tr></table>
+ */

tools/SeekPrep/stdafx.h

+/*****************************************************************************
+* This file is provided under the Creative Commons Attribution 3.0 license.
+*
+* You are free to share, copy, distribute, transmit, or adapt this work
+* PROVIDED THAT you attribute the work to the authors listed below.
+* For more information, please see the following web page:
+* http://creativecommons.org/licenses/by/3.0/
+*
+* This file is a component of the Sleipnir library for functional genomics,
+* authored by:
+* Curtis Huttenhower (chuttenh@princeton.edu)
+* Mark Schroeder
+* Maria D. Chikina
+* Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
+*
+* If you use this library, the included executable tools, or any related
+* code in your work, please cite the following publication:
+* Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
+* Olga G. Troyanskaya.
+* "The Sleipnir library for computational functional genomics"
+*****************************************************************************/
+#ifndef STDAFX_H
+#define STDAFX_H
+
+#define __STDC_LIMIT_MACROS
+
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <string>
+#include <omp.h>
+using namespace std;
+
+#include <pthread.h>
+
+#include "bayesnet.h"
+#include "seekmap.h"
+#include "seekreader.h"
+#include "seekwriter.h"
+
+#include "meta.h"
+using namespace Sleipnir;
+
+#endif // STDAFX_H

tools/SeekReader/SeekReader.cpp

+/*****************************************************************************
+* This file is provided under the Creative Commons Attribution 3.0 license.
+*
+* You are free to share, copy, distribute, transmit, or adapt this work
+* PROVIDED THAT you attribute the work to the authors listed below.
+* For more information, please see the following web page:
+* http://creativecommons.org/licenses/by/3.0/
+*
+* This file is a component of the Sleipnir library for functional genomics,
+* authored by:
+* Curtis Huttenhower (chuttenh@princeton.edu)
+* Mark Schroeder
+* Maria D. Chikina
+* Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
+*
+* If you use this library, the included executable tools, or any related
+* code in your work, please cite the following publication:
+* Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
+* Olga G. Troyanskaya.
+* "The Sleipnir library for computational functional genomics"
+*****************************************************************************/
+#include "stdafx.h"
+#include "cmdline.h"
+
+
+int main( int iArgs, char** aszArgs ) {
+	static const size_t	c_iBuffer	= 1024;
+#ifdef WIN32
+	pthread_win32_process_attach_np( );
+#endif // WIN32
+	gengetopt_args_info	sArgs;
+	ifstream			ifsm;
+	istream*			pistm;
+	vector<string>		vecstrLine, vecstrGenes, vecstrDatasets, vecstrQuery;
+	char				acBuffer[ c_iBuffer ];
+	size_t				i;
+
+	if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
+		cmdline_parser_print_help( );
+		return 1; }
+
+	if( sArgs.input_arg ) {
+		ifsm.open( sArgs.input_arg );
+		pistm = &ifsm; }
+	else
+		pistm = &cin;
+	while( !pistm->eof( ) ) {
+		pistm->getline( acBuffer, c_iBuffer - 1 );
+		acBuffer[ c_iBuffer - 1 ] = 0;
+		vecstrLine.clear( );
+		CMeta::Tokenize( acBuffer, vecstrLine );
+		if( vecstrLine.size( ) < 2 ) {
+			cerr << "Ignoring line: " << acBuffer << endl;
+			continue; }
+		if( !( i = atoi( vecstrLine[ 0 ].c_str( ) ) ) ) {
+			cerr << "Illegal gene ID: " << vecstrLine[ 0 ] << " for " << vecstrLine[ 1 ] << endl;
+			return 1; }
+		i--;
+		if( vecstrGenes.size( ) <= i )
+			vecstrGenes.resize( i + 1 );
+		vecstrGenes[ i ] = vecstrLine[ 1 ]; }
+	if( sArgs.input_arg )
+		ifsm.close( );