Commits

Qian Zhu committed 7006cae

Fixed an important bug in seekevaluate.cpp (the counter jj is now reset to 0 before the loop over sing).
Added flexibility in database.cpp to allow reading a directory that contains only a subset
of the *.db files (rather than requiring the entire directory to be present).
Parallelized the main dataset weighting step with OpenMP.

  • Participants
  • Parent commits cc4bd28
  • Branches search_project

Comments (0)

Files changed (24)

File configure.ac

 #      AC_DEFINE([SMILEXML_LIB], [1])
 #      SMILEXML_LIB="-lsmilexml"
 #    fi
-         SMILE_CFLAGS="-I $SMILE_INCLUDE_DIR -fopenmp -pg"
+         SMILE_CFLAGS="-I $SMILE_INCLUDE_DIR -fopenmp"
 #         SMILE_LIBS="-L $SMILE_LIB_DIR $SMILEXML_LIB -lsmile"
-         SMILE_LIBS="-L $SMILE_LIB_DIR -lsmile -fopenmp -pg"
+         SMILE_LIBS="-L $SMILE_LIB_DIR -lsmile -fopenmp"
         ],                                                        dnl and found in specified path
 	[],                                                       dnl not found
         [smile_state=no],                                         dnl and not found installed

File src/Makefile.am

 	seekquery.cpp			\
 	seekreader.cpp			\
 	seekwriter.cpp			\
+	seekplatform.cpp		\
 include_HEADERS			=	\
+	seekplatform.h			\
 	seekreader.h			\
 	seekwriter.h			\
 	seekquery.h				\

File src/database.cpp

 		abImage = NULL;
 
 	//vecData: # of genes in databaselet x # of genes user's list
-
 	//if this is not the first dataset in the dataset block
 	if( iBaseDatasets % 2 ){
 		//iGeneOne: iterate over all genes in this databaselet (# of genes in each databaselet)
 }
 
 
+/* mainly used by SeekMinder */
 bool CDatabaselet::Get(size_t iOne, vector<unsigned char>& vecbData){
 	size_t iSize;
 	size_t i, j;
 	size_t offset1 = GetOffset(iOne);
-	if(!this->m_fstm.is_open()){
-		cerr << "file not opened" << endl;
-		return false;
+	if(this->m_fstm.is_open()){
+		cerr << "file is already opened" << endl;
+	}else{
+		m_fstm.clear( );
+		m_fstm.open( this->strFileName.c_str( ), ios_base::binary | ios_base::in );
 	}
 
 	this->m_fstm.seekg(offset1, ios_base::beg);
 	unsigned char *abImage = (unsigned char*)malloc( iSize = GetSizeGene());
 	this->m_fstm.read((char*)abImage, iSize);
 
+	m_fstm.close();
+
 	if(m_useNibble==false){
 		vecbData.clear();
 		vecbData.resize(iSize);
 	/* load all Databaselets into memory, for efficiency */
 	unsigned char **charImages =
 			(unsigned char**)malloc(vecDatabaselet.size()*sizeof(unsigned char*));
-	size_t iImageSize = iDatasets * iGenes * first->m_iGenes;
-	charImages[0] = (unsigned char*)malloc(iImageSize*sizeof(unsigned char));
+	size_t totSize = 0;
+	size_t numPairs = first->m_iGenes * iGenes;
+	for(i=0; i<vecDatabaselet.size(); i++){
+		totSize += vecDatabaselet[i]->GetSizePair() * numPairs;
+	}
+	charImages[0] = (unsigned char*)malloc(totSize*sizeof(unsigned char));
 	for(i=1; i<vecDatabaselet.size(); i++){
-		charImages[i] = charImages[i-1] + vecDatabaselet[i-1]->m_iDatasets * first->m_iGenes * iGenes;
+		charImages[i] = charImages[i-1] + vecDatabaselet[i-1]->GetSizePair() * numPairs;
 	}
 
 	/* read databaselet into charImages */
 	for(i=0; i<vecDatabaselet.size(); i++){
 		CDatabaselet *current = vecDatabaselet[i];
-		if(current->m_fstm.is_open()){
-			current->m_fstm.seekg(current->m_iHeader, ios_base::beg);
-			current->m_fstm.read((char*) charImages[i], iImageSize);
-		}else{
-			cerr << "CDatabaselet is not open." << endl;
-			free(charImages[0]);
-			free(charImages);
-			return false;
+		if(!current->m_fstm.is_open()){
+			cerr << "CDatabaselet is not opened. Opening..." << endl;
+			current->m_fstm.open( current->strFileName.c_str( ), ios_base::binary | ios_base::in );
 		}
+		current->m_fstm.seekg(current->m_iHeader, ios_base::beg);
+		current->m_fstm.read((char*) charImages[i], vecDatabaselet[i]->GetSizePair() * numPairs);
 	}
 
 	map<string, size_t> mapstrintGenes;
 	if(bSplit){
 
 		for(i=0; i<iGenes; i++){
-
 			/* open a new Databaselet containing only one gene */
 			string thisGene = first->GetGene(i);
 			size_t iGeneID = mapstrintGenes[thisGene];
 				}
 			}else{
 				size_t j;
-				unsigned char *abImage2 = (unsigned char*)
-					malloc(iDatasets);
+				unsigned char *abImage2 = (unsigned char*)malloc(iDatasets);
 
 				/* m_iGenes is all the genes in the genome */
 				for( iGeneTwo = 0; iGeneTwo < first->m_iGenes; ++iGeneTwo ){
 			}
 
 			/* close fstream */
-			if(DBS.m_fstm.is_open()){
-				DBS.m_fstm.seekp( DBS.m_iHeader, ios_base::beg );
-				DBS.m_fstm.write( (char*)abImage, iSize );
-				DBS.m_fstm.close();
-			}else{
-				cerr << "CDatabaselet is not opened." << endl;
-				free(abImage);
-				free(charImages[0]);
-				free(charImages);
-				return false;
+			if(!DBS.m_fstm.is_open()){
+				cerr << "CDatabaselet is not opened. Opening..." << endl;
+				DBS.m_fstm.open(DBS.strFileName.c_str( ), ios_base::binary | ios_base::in);
 			}
-
+			DBS.m_fstm.seekp( DBS.m_iHeader, ios_base::beg );
+			DBS.m_fstm.write( (char*)abImage, iSize );
+			DBS.m_fstm.close();
 			free(abImage);
 
 		}
 		}
 
 		/* close the databaselet */
-		if(DBS.m_fstm.is_open()){
-			DBS.m_fstm.seekp( DBS.m_iHeader, ios_base::beg );
-			DBS.m_fstm.write( (char*)abImage, iSize );
-			DBS.m_fstm.close();
-		}else{
+		if(!DBS.m_fstm.is_open()){
 			cerr << "CDatabaselet is not opened." << endl;
-			free(abImage);
-			free(charImages[0]);
-			free(charImages);
-			return false;
+			DBS.m_fstm.open(DBS.strFileName.c_str( ), ios_base::binary | ios_base::in);
 		}
-
+		DBS.m_fstm.seekp( DBS.m_iHeader, ios_base::beg );
+		DBS.m_fstm.write( (char*)abImage, iSize );
+		DBS.m_fstm.close();
 		free(abImage);
 	}
 
 	return true;
 }
 
+/* Record the total gene count, the dataset count and this databaselet's gene
+ * subset, then recompute the header length from them. Always returns true. */
+bool CDatabaselet::Set(uint32_t &iGenes, uint32_t &iDatasets, vector<string> &vecstrSubsetGenes){
+	m_iGenes = iGenes;
+	m_iDatasets = iDatasets;
+	m_vecstrGenes = vecstrSubsetGenes;
+
+	/* fixed part of the header: four 32-bit fields */
+	uint32_t iNameCount = m_vecstrGenes.size( );
+	m_iHeader = sizeof(m_iHeader) + sizeof(m_iGenes) + sizeof(m_iDatasets) + sizeof(iNameCount);
+
+	/* variable part: every gene name plus one extra byte per name
+	 * (presumably a terminator -- confirm against the file writer) */
+	vector<string>::const_iterator iterGene;
+	for( iterGene = m_vecstrGenes.begin( ); iterGene != m_vecstrGenes.end( ); ++iterGene )
+		m_iHeader += iterGene->size( ) + 1;
+
+	return true;
+}
+
+
+
+/*!
+ * \brief
+ * Set up the database from a gene list and a databaselet count.
+ *
+ * \param strDBDirectory  Directory in which the databaselet files are expected.
+ * \param vecstrGenes     All genes; gene i is assigned to databaselet (i % iNumDBs).
+ * \param iDatasets       Number of datasets recorded in every databaselet.
+ * \param iNumDBs         Number of databaselets; must be positive.
+ * \returns true on success, false if iNumDBs is 0.
+ *
+ * \remarks
+ * Each databaselet only receives its metadata (Set) and its file name (SetFile)
+ * here. The file name is strDBDirectory + '/' + the zero-padded 8-digit
+ * databaselet index + c_acExtension.
+ */
+bool CDatabase::Open(string &strDBDirectory,
+		vector<string> &vecstrGenes, size_t &iDatasets, size_t &iNumDBs){
+	size_t i, j;
+	char acNumber[ 16 ];
+
+	Clear();
+	/* the gene-to-databaselet assignment below divides by the databaselet count */
+	if( iNumDBs == 0 ){
+		cerr << "Number of databaselets must be positive" << endl;
+		return false;
+	}
+	m_vecpDBs.resize(iNumDBs);
+
+	for( i = 0; i < m_vecpDBs.size( ); ++i ) {
+		m_vecpDBs[ i ] = new CDatabaselet( m_useNibble );
+	}
+
+	for(i=0; i<iNumDBs; i++){
+		/* genes i, i + iNumDBs, i + 2*iNumDBs, ... belong to databaselet i */
+		vector<string> vecstrSubset;
+		for( j = i; j < vecstrGenes.size( ); j += m_vecpDBs.size( ) )
+			vecstrSubset.push_back( vecstrGenes[ j ] );
+
+		/* i is a size_t: cast it to match the %u conversion, and bound the write */
+		snprintf( acNumber, sizeof(acNumber), "%08u", (unsigned int)i );
+		string strFile = strDBDirectory + '/' + acNumber + c_acExtension;
+
+		uint32_t iGenes = vecstrGenes.size();
+		uint32_t iDset = iDatasets;
+		m_vecpDBs[i]->Set(iGenes, iDset, vecstrSubset);
+		m_vecpDBs[i]->SetFile(strFile);
+	}
+
+	/* map every gene name back to its global index */
+	for( i = 0; i < vecstrGenes.size( ); ++i ){
+		m_mapstriGenes[ m_vecpDBs[ i % m_vecpDBs.size( ) ]->GetGene( i / m_vecpDBs.size( ) ) ] = i;
+	}
+
+	return true;
+}
 
 }

File src/database.h

 
 		return ( m_vecpDBs.empty( ) ? 0 : m_vecpDBs[ 0 ]->GetDatasets( ) ); }
 
+
+	bool Open(string &, vector<string> &, size_t &, size_t &);
+
 	/*!
 	 * \brief
 	 * Set memory mapping behavior when opening DAB files.

File src/databasei.h

 	bool Get( size_t, const std::vector<size_t>&, std::vector<unsigned char>&, bool ) const;
 	bool Get(size_t, vector<unsigned char>&);
 
+	bool Set(uint32_t&, uint32_t&, vector<string>&);
+
 	static bool Combine(std::vector<CDatabaselet*>& vecDatabaselet,
 			std::string strOutDirectory, vector<string> &vecstrGenes, bool bSplit = true);
 
 		strFileName = std;
 	}
 
+
+	/* Read GetSizeGenes() bytes, starting at offset m_iHeader of the open file
+	 * stream, into a newly malloc'ed buffer.
+	 * Returns NULL if the stream is not open or the buffer cannot be allocated;
+	 * otherwise the caller owns the returned buffer and must free() it. */
+	unsigned char* GetCharImage(){
+		/* check the stream first so nothing is allocated on the error path */
+		if(!m_fstm.is_open()){
+			cerr << "CDatabaselet is not open." << endl;
+			return NULL;
+		}
+
+		size_t iImageSize = GetSizeGenes();
+		unsigned char *charImage = (unsigned char*)malloc(iImageSize*sizeof(unsigned char));
+		if(charImage == NULL){
+			/* malloc(0) may legitimately return NULL; only report real failures */
+			if(iImageSize > 0){
+				cerr << "CDatabaselet: unable to allocate image buffer." << endl;
+			}
+			return NULL;
+		}
+
+		/* read databaselet into charImage */
+		m_fstm.seekg(m_iHeader, ios_base::beg);
+		m_fstm.read((char*) charImage, iImageSize);
+
+		return charImage;
+	}
+
+	size_t GetImageSize(){ // Size in bytes of the buffer that GetCharImage() allocates, i.e. GetSizeGenes().
+		return GetSizeGenes();
+	}
+
+
 private:
+	size_t GetSizeGenes( ) const { // Per-gene size (GetSizeGene) multiplied by the number of gene names stored in this databaselet.
+		return ( GetSizeGene( ) * m_vecstrGenes.size( ) ); }
+
 
 	size_t GetOffsetDataset( size_t iDataset ) const {
 		if(m_useNibble){
 
 	}
 
-	size_t GetSizeGenes( ) const {
-
-		return ( GetSizeGene( ) * m_vecstrGenes.size( ) ); }
-
 	size_t GetSizeGene( ) const {
 
 		return ( GetSizePair( ) * m_iGenes ); }
 
 		return ( GetOffset( iOne, iTwo ) + GetOffsetDataset( iDataset ) ); }
 
-	uint32_t					m_iHeader;
 	uint32_t					m_iGenes;
 	uint32_t					m_iDatasets;
 	std::vector<std::string>	m_vecstrGenes;
 	std::string					strFileName;
 
+	mutable std::fstream		m_fstm;
+	uint32_t					m_iHeader;
+
 	bool						m_useNibble;
-	mutable std::fstream		m_fstm;
 	mutable pthread_mutex_t*	m_pmutx;
 };
 

File src/seekdataset.cpp

 	size_t i, j;
 	queryMap = new CSeekIntIntMap(iSize);
 	for(i=0; i<geneMap->GetNumSet(); i++){
-		size_t j = geneMap->GetReverse(i);
+		j = geneMap->GetReverse(i);
 		if(query[j]==0) continue;
 		queryMap->Add(j);
 	}
 	iNumGenes = iSize;
 
 	if(iQuerySize==0){
-		cerr << "Dataset will be skipped" << endl;
+		//cerr << "Dataset will be skipped" << endl;
+		r = NULL;
 		return true;
 	}
 	r = new CFullMatrix<unsigned char>();
 bool CSeekDataset::DeleteQuery(){
 	if(queryMap!=NULL){
 		delete queryMap;
+		queryMap = NULL;
 	}
 	iQuerySize = 0;
 	iNumGenes = 0;
 	if(r!=NULL){
 		delete r;
+		r = NULL;
 	}
 	return true;
 }
 	return rData;
 }
 
-bool CSeekDataset::InitializeFloatMatrix(bool bSubtractAvg){
+bool CSeekDataset::InitializeFloatMatrix(bool bSubtractAvg,
+	bool bSubtractPlatformAvg){
+	/* assume platform is already set */
+
 	//hard coded quant file
 	vector<float> quant;
 	float w = -5.0;
 
 	size_t i,j;
 	if(bSubtractAvg){
+		float *platform_avg = new float[rData->GetColumns()];
+		float *platform_stdev = new float[rData->GetColumns()];
+
+		if(bSubtractPlatformAvg){
+			for(j=0; j<rData->GetColumns(); j++){
+				size_t jj = queryMap->GetReverse(j);
+				platform_avg[j] = platform->GetPlatformAvg(jj);
+				platform_stdev[j] = platform->GetPlatformStdev(jj);
+			}
+		}
+
 		/* numGenes */
 		for(i=0; i<rData->GetRows(); i++){
 			float a = GetGeneAverage(i);
 				}
 				continue;
 			}
+
 			/* numQueries */
 			for(j=0; j<rData->GetColumns(); j++){
 				unsigned char x = r->Get(j, i);
+
 				if(x==255){
 					rData->Set(i, j, -50.0);
 					//printf("Bad %.5f %d\n", x, r->Get(j, i));
 					/*}else if(x>=quant.size()){
 					printf("Bad oversize %d\n", x);
 					getchar();*/
-				}else{
-					rData->Set(i, j, quant[x] - a);
+					continue;
 				}
+
+				float v = quant[x] - a;
+				if(bSubtractPlatformAvg){
+					/*if(CMeta::IsNaN(platform_avg[j]) ||
+						CMeta::IsNaN(platform_stdev[j])){
+						printf("platform average or stdev is NaN\n");
+						getchar();
+						continue;
+					}*/
+					rData->Set(i, j, (v - platform_avg[j] / platform_stdev[j]));
+					continue;
+				}
+				rData->Set(i, j, v);
 			}
+
 		}
-	}else{
-		/* numGenes */
-		for(i=0; i<rData->GetRows(); i++){
-			/* numQueries */
-			for(j=0; j<rData->GetColumns(); j++){
-				rData->Set(i, j, quant[r->Get(j, i)]);
-			}
+
+		delete[] platform_avg;
+		delete[] platform_stdev;
+
+		return true;
+	}
+
+	/* numGenes */
+	for(i=0; i<rData->GetRows(); i++){
+		/* numQueries */
+		for(j=0; j<rData->GetColumns(); j++){
+			rData->Set(i, j, quant[r->Get(j, i)]);
 		}
 	}
+
 	return true;
 }
 
 	return sum_weight;
 }
 
+void CSeekDataset::SetPlatform(CSeekPlatform &cp){ // Stores the address of cp (no copy is made), so cp must stay alive while this dataset uses it.
+	platform = &cp;
+}
+
+CSeekPlatform& CSeekDataset::GetPlatform(){ // Returns the platform stored by SetPlatform; the pointer is dereferenced without a null check.
+	return *platform;
+}
+
+
+
 
 }

File src/seekdataset.h

 #include "seekmap.h"
 #include "stdafx.h"
 #include "datapair.h"
+#include "seekplatform.h"
 
 
 namespace Sleipnir {
 	bool SetQueryNoMapping(size_t &, size_t &, unsigned char &);
 	bool SetQuery(size_t &, vector<unsigned char> &);
 	CFullMatrix<float> *GetFloatMatrix();
-	bool InitializeFloatMatrix(bool=true);
+	bool InitializeFloatMatrix(bool=true, bool=true);
 	bool FreeFloatMatrix();
 	CFullMatrix<unsigned char> *GetMatrix();
 	CSeekIntIntMap* GetGeneMap();
 	bool InitializeCVWeight(size_t);
 	bool SetCVWeight(size_t, float);
 	float GetDatasetSumWeight();
+	void SetPlatform(CSeekPlatform &);
+	CSeekPlatform& GetPlatform();
 
 private:
 	string strName;
-	string strPlatform;
+	CSeekPlatform *platform;
 	CFullMatrix<unsigned char> *r;
 	vector<float> geneAverage;
 	vector<float> geneVariance;

File src/seekevaluate.cpp

 	}
 
 	float x = 0;
+	jj = 0;
 	int numNonZero = sing.size();
 	for(i=0; i<numNonZero; i++){
 		if(sing[i].f<=0) break;

File src/seekmap.cpp

 	}
 }
 
-
-
 /*
  * StrIntMap Data Structure
  */
 
 CSeekStrIntMap::~CSeekStrIntMap(){}
 
-void CSeekStrIntMap::Set(string s, int i){
+void CSeekStrIntMap::Clear(){ // Empties both directions of the mapping (string -> index and index -> string).
+	m_mapstrint.clear();
+	m_mapintstr.clear();
+}
+
+/* Replace the whole mapping: afterwards s[k] maps to k and k maps to s[k]
+ * for every position k of s. Any previous content is discarded. */
+void CSeekStrIntMap::SetAll(vector<string> &s){
+	m_mapstrint.clear();
+	m_mapintstr.clear();
+
+	size_t iPos = 0;
+	vector<string>::const_iterator iterStr = s.begin();
+	while(iterStr != s.end()){
+		m_mapstrint[*iterStr] = iPos;
+		m_mapintstr[iPos] = *iterStr;
+		++iterStr;
+		++iPos;
+	}
+}
+
+void CSeekStrIntMap::Set(string s, size_t i){ // Records the pair in both directions, overwriting any existing entry for s or for i.
 	m_mapstrint[s] = i;
 	m_mapintstr[i] = s;
 }
 
+map<string, size_t>& CSeekStrIntMap::GetMapForward(){ // Returns a non-const reference to the internal string -> index map.
+	return m_mapstrint;
+}
+
+map<size_t, string>& CSeekStrIntMap::GetMapReverse(){ // Returns a non-const reference to the internal index -> string map.
+	return m_mapintstr;
+}
+
+
 int CSeekStrIntMap::Get(string s){ // Looks up the index for s. NOTE(review): operator[] inserts a zero entry when s is absent, and the stored size_t is narrowed to int on return.
 	return m_mapstrint[s];
 }
 
-string CSeekStrIntMap::Get(int i){
+string CSeekStrIntMap::Get(size_t i){ // Looks up the string for index i. NOTE(review): operator[] inserts an empty string when i is absent.
 	return m_mapintstr[i];
 }
 
 	vector<string> vecStr;
 	vecStr.clear();
 	vecStr.resize(GetSize());
-	map<string, int>::iterator	iter;
+	map<string, size_t>::iterator	iter;
 	size_t i = 0;
 	for(iter = m_mapstrint.begin(); iter!=m_mapstrint.end(); iter++){
 		vecStr[i] = iter->first;
 	return vecStr;
 }
 
-vector<int> CSeekStrIntMap::GetAllInteger(){
-	vector<int> vecInt;
+vector<size_t> CSeekStrIntMap::GetAllInteger(){
+	vector<size_t> vecInt;
 	vecInt.clear();
 	vecInt.resize(GetSize());
-	map<int, string>::iterator	iter;
+	map<size_t, string>::iterator	iter;
 	size_t i = 0;
 	for(iter = m_mapintstr.begin(); iter!=m_mapintstr.end(); iter++){
 		vecInt[i] = iter->first;

File src/seekmap.h

 public:
 	CSeekStrIntMap();
 	~CSeekStrIntMap();
-	void Set(string, int);
+	void Clear();
+	void Set(string, size_t);
+	void SetAll(vector<string>&);
 	int Get(string);
+	map<string, size_t>& GetMapForward();
+	map<size_t, string>& GetMapReverse();
 	size_t GetSize();
-	string Get(int);
+	string Get(size_t);
 	vector<string> GetAllString();
-	vector<int> GetAllInteger();
+	vector<size_t> GetAllInteger();
 private:
-	map<string, int> m_mapstrint;
-	map<int, string> m_mapintstr;
+	map<string, size_t> m_mapstrint;
+	map<size_t, string> m_mapintstr;
 };
 
 }

File src/seekplatform.cpp

+/*****************************************************************************
+* This file is provided under the Creative Commons Attribution 3.0 license.
+*
+* You are free to share, copy, distribute, transmit, or adapt this work
+* PROVIDED THAT you attribute the work to the authors listed below.
+* For more information, please see the following web page:
+* http://creativecommons.org/licenses/by/3.0/
+*
+* This file is a component of the Sleipnir library for functional genomics,
+* authored by:
+* Curtis Huttenhower (chuttenh@princeton.edu)
+* Mark Schroeder
+* Maria D. Chikina
+* Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
+*
+* If you use this library, the included executable tools, or any related
+* code in your work, please cite the following publication:
+* Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
+* Olga G. Troyanskaya.
+* "The Sleipnir library for computational functional genomics"
+*****************************************************************************/
+#include "stdafx.h"
+#include "seekplatform.h"
+#include "seekreader.h"
+
+namespace Sleipnir {
+
+/* Construct an empty platform: no genes, no statistics, empty name, id 0. */
+CSeekPlatform::CSeekPlatform(){
+	m_iNumGenes = 0;
+	/* give the id a defined value; it was previously left uninitialized */
+	m_iPlatformID = 0;
+	m_vecfPlatformAvg.clear();
+	m_vecfPlatformStdev.clear();
+	m_strPlatformName = "";
+}
+
+CSeekPlatform::~CSeekPlatform(){ // Nothing to release explicitly; the body is empty.
+}
+
+void CSeekPlatform::InitializePlatform(const size_t &numGenes, string &strPlatformName){ // Stores the gene count and the name, and (re)initializes both statistic vectors.
+	m_iNumGenes = numGenes;
+	CSeekTools::InitVector(m_vecfPlatformAvg, numGenes, (float) 0); // presumably sizes the vector to numGenes zeros -- InitVector is defined elsewhere, confirm
+	CSeekTools::InitVector(m_vecfPlatformStdev, numGenes, (float) 0);
+	m_strPlatformName = strPlatformName;
+}
+
+void CSeekPlatform::SetPlatformAvg(const size_t &i, float val){ // Writes the average at index i; i is not range-checked.
+	m_vecfPlatformAvg[i] = val;
+}
+	
+void CSeekPlatform::SetPlatformStdev(const size_t &i, float val){ // Writes the standard deviation at index i; i is not range-checked.
+	m_vecfPlatformStdev[i] = val;
+}
+	
+float CSeekPlatform::GetPlatformAvg(const size_t &i){ // Reads the average at index i; i is not range-checked.
+	return m_vecfPlatformAvg[i];
+}
+
+float CSeekPlatform::GetPlatformStdev(const size_t &i){ // Reads the standard deviation at index i; i is not range-checked.
+	return m_vecfPlatformStdev[i];
+}
+
+void CSeekPlatform::ResetPlatform(){ // Returns the gene count, both statistic vectors and the name to their empty state.
+	m_iNumGenes = 0;
+	m_vecfPlatformAvg.clear();
+	m_vecfPlatformStdev.clear();
+	m_strPlatformName = "";
+}
+
+}

File src/seekplatform.h

+/*****************************************************************************
+* This file is provided under the Creative Commons Attribution 3.0 license.
+*
+* You are free to share, copy, distribute, transmit, or adapt this work
+* PROVIDED THAT you attribute the work to the authors listed below.
+* For more information, please see the following web page:
+* http://creativecommons.org/licenses/by/3.0/
+*
+* This file is a component of the Sleipnir library for functional genomics,
+* authored by:
+* Curtis Huttenhower (chuttenh@princeton.edu)
+* Mark Schroeder
+* Maria D. Chikina
+* Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
+*
+* If you use this library, the included executable tools, or any related
+* code in your work, please cite the following publication:
+* Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
+* Olga G. Troyanskaya.
+* "The Sleipnir library for computational functional genomics"
+*****************************************************************************/
+#ifndef SEEKPLATFORM_H
+#define SEEKPLATFORM_H
+
+#include "stdafx.h"
+
+namespace Sleipnir {
+
+class CSeekPlatform{ // Holds one platform's name plus two float vectors (average and standard deviation) addressed by index.
+public:
+	CSeekPlatform();
+	~CSeekPlatform();
+
+	void InitializePlatform(const size_t &, string &); // (number of genes, platform name)
+	void SetPlatformAvg(const size_t &, float); // (index, average value)
+	void SetPlatformStdev(const size_t &, float); // (index, standard deviation value)
+	float GetPlatformAvg(const size_t &); // average stored at the given index
+	float GetPlatformStdev(const size_t &); // standard deviation stored at the given index
+	void ResetPlatform(); // back to the empty state
+
+private:
+	vector<float> m_vecfPlatformAvg; // averages, one entry per index
+	vector<float> m_vecfPlatformStdev; // standard deviations, one entry per index
+	size_t m_iPlatformID; // numeric platform identifier; no accessor is declared for it in this class
+	string m_strPlatformName; // name passed to InitializePlatform
+	size_t m_iNumGenes; // gene count passed to InitializePlatform
+};
+
+}
+#endif

File src/seekreader.cpp

 	return true;
 }
 
-bool CSeekTools::LoadDatabase(CDatabase &DB, string &strInputDirectory, string &strPrepInputDirectory, 
+bool CSeekTools::LoadDatabase(CDatabase &DB, string &strPrepInputDirectory,
 	vector<char> &cQuery, vector<string> &vecstrQuery, vector<string> &vecstrDatasets, 
-	vector<CSeekDataset*> &vc){
+	map<string, string> &mapstrstrDatasetPlatform, map<string, size_t> &mapstriPlatform,
+	vector<CSeekPlatform> &vp, vector<CSeekDataset*> &vc){
 		
-	DB.Open(strInputDirectory);
 	size_t iDatasets = DB.GetDatasets();
 	size_t iGenes = DB.GetGenes();
-	size_t i, j,k;
+	size_t i, j, k;
 	vc.clear();
 	vc.resize(iDatasets);
 	for(i=0; i<iDatasets; i++){
 		string strPresencePath = strPrepInputDirectory + "/" + strFileStem + ".gpres";
 		vc[i]->ReadGeneAverage(strAvgPath);
 		vc[i]->ReadGenePresence(strPresencePath);
+		string strPlatform = mapstrstrDatasetPlatform[strFileStem];
+		size_t platform_id = mapstriPlatform[strPlatform];
+		//printf("Platform id %s %d\n", strPlatform.c_str(), platform_id);
+		vc[i]->SetPlatform(vp[platform_id]);
 	}
 
 	CSeekTools::InitVector(cQuery, iGenes, (char) 0);
 		}
 		size_t m = DB.GetGene(vecstrQuery[i]);
 		size_t l = 0;
+
 		for(j=0; j<iDatasets; j++){
 			CSeekIntIntMap *qu = vc[j]->GetQueryMap();
 			size_t query = qu->GetForward(m);
 	return true;
 }
 
+/*!
+ * \brief
+ * Load the per-platform average / standard deviation tables and platform names.
+ *
+ * \param strPlatformDirectory  Directory holding all_platforms.gplatavg,
+ *                              all_platforms.gplatstdev and all_platforms.gplatorder.
+ * \param plat                  Receives one CSeekPlatform per row of the average matrix.
+ * \param vecstrPlatforms       Receives the platform names, one per line of the order file.
+ * \param mapstriPlatforms      Receives platform name -> line index in the order file.
+ * \returns false if the order file cannot be opened, if the two matrices differ
+ *          in size, or if the order file names fewer platforms than the average
+ *          matrix has rows; true otherwise.
+ *
+ * \remarks
+ * Reading of the order file stops at the first empty line or at end of file.
+ * All three output containers are cleared before anything is read.
+ */
+bool CSeekTools::ReadPlatforms(string &strPlatformDirectory, vector<CSeekPlatform> &plat,
+		vector<string> &vecstrPlatforms, map<string, size_t> &mapstriPlatforms){
+
+	string strAvgFile = strPlatformDirectory + "/" + "all_platforms.gplatavg";
+	string strStdevFile = strPlatformDirectory + "/" + "all_platforms.gplatstdev";
+	string strPlatformOrderFile = strPlatformDirectory + "/" + "all_platforms.gplatorder";
+
+	CFullMatrix<float> plat_avg;
+	plat_avg.Open(strAvgFile.c_str());
+	CFullMatrix<float> plat_stdev;
+	plat_stdev.Open(strStdevFile.c_str());
+	size_t i, j;
+
+	plat.clear();
+	vecstrPlatforms.clear();
+	mapstriPlatforms.clear();
+
+	/* every (i, j) read below is done on both matrices */
+	if(plat_stdev.GetRows() != plat_avg.GetRows() ||
+		plat_stdev.GetColumns() != plat_avg.GetColumns()){
+		cerr << "Size mismatch between " << strAvgFile << " and " << strStdevFile << endl;
+		return false;
+	}
+
+	ifstream ifsm;
+	ifsm.open(strPlatformOrderFile.c_str());
+	if(!ifsm.is_open()){
+		cerr << "Error opening file " << strPlatformOrderFile << endl;
+		return false;
+	}
+
+	/* one platform name per line; the line number is the platform's index */
+	string strLine;
+	while(getline(ifsm, strLine)){
+		if(strLine.empty()){
+			break;
+		}
+		mapstriPlatforms[strLine] = vecstrPlatforms.size();
+		vecstrPlatforms.push_back(strLine);
+	}
+	ifsm.close();
+
+	/* each matrix row needs a name from the order file */
+	if(vecstrPlatforms.size() < plat_avg.GetRows()){
+		cerr << "Too few platform names in " << strPlatformOrderFile << endl;
+		return false;
+	}
+
+	plat.resize(plat_avg.GetRows());
+	for(i=0; i<plat_avg.GetRows(); i++){
+		plat[i].InitializePlatform(plat_avg.GetColumns(), vecstrPlatforms[i]);
+		for(j=0; j<plat_avg.GetColumns(); j++){
+			plat[i].SetPlatformAvg(j, plat_avg.Get(i, j));
+			plat[i].SetPlatformStdev(j, plat_stdev.Get(i, j));
+		}
+	}
+
+	return true;
 }
+
+/*!
+ * \brief
+ * Read a two-column text file into two parallel lists.
+ *
+ * \param strFile      Path of the file to read.
+ * \param vecstrList1  Receives the first token of every line (cleared once the file is open).
+ * \param vecstrList2  Receives the second token of every line (cleared once the file is open).
+ * \returns false if the file cannot be opened or a line has fewer than two
+ *          tokens; true otherwise.
+ *
+ * \remarks
+ * Reading stops at the first empty line or at end of file. Lines are split by
+ * CMeta::Tokenize with its default arguments.
+ */
+bool CSeekTools::ReadListTwoColumns(string &strFile, vector<string> &vecstrList1, vector<string> &vecstrList2){
+	ifstream ifsm;
+	ifsm.open(strFile.c_str());
+	if(!ifsm.is_open()){
+		cerr << "Error opening file " << strFile << endl;
+		return false;
+	}
+	vecstrList1.clear();
+	vecstrList2.clear();
+
+	string strLine;
+	vector<string> tok;
+	while(getline(ifsm, strLine)){
+		if(strLine.empty()){
+			break;
+		}
+		tok.clear();
+		CMeta::Tokenize(strLine.c_str(), tok);
+		/* both tok[0] and tok[1] are read below */
+		if(tok.size() < 2){
+			cerr << "Expected two columns in " << strFile << ", got: " << strLine << endl;
+			ifsm.close();
+			return false;
+		}
+		vecstrList1.push_back(tok[0]);
+		vecstrList2.push_back(tok[1]);
+	}
+	ifsm.close();
+	return true;
+}
+
+/*!
+ * \brief
+ * Read a one-entry-per-line text file into a list and a string <-> index map.
+ *
+ * \param strFile      Path of the file to read.
+ * \param vecstrList   Receives every line, in file order (cleared once the file is open).
+ * \param mapstriList  Receives line text <-> line index through Set(); it is
+ *                     not cleared here, so earlier entries are kept.
+ * \returns false if the file cannot be opened, true otherwise.
+ *
+ * \remarks
+ * Reading stops at the first empty line or at end of file. Lines of any length
+ * are accepted.
+ */
+bool CSeekTools::ReadListOneColumn(string &strFile, vector<string> &vecstrList, CSeekStrIntMap &mapstriList){
+	ifstream ifsm;
+	ifsm.open(strFile.c_str());
+	if(!ifsm.is_open()){
+		cerr << "Error opening file " << strFile << endl;
+		return false;
+	}
+	vecstrList.clear();
+
+	string strLine;
+	while(getline(ifsm, strLine)){
+		if(strLine.empty()){
+			break;
+		}
+		/* the index of a line is its position in the list */
+		mapstriList.Set(strLine, vecstrList.size());
+		vecstrList.push_back(strLine);
+	}
+	ifsm.close();
+	return true;
+}
+
+
+}

File src/seekreader.h

 #include "stdafx.h"
 #include "datapair.h"
 #include "seekdataset.h"
+#include "seekplatform.h"
 #include "database.h"
 
 namespace Sleipnir {
 		return true;
 	}
 
-	static bool CreatePresenceVector(vector<int> &srcData, vector<char> &destData, size_t iSize);
-	static bool LoadDatabase(CDatabase &DB, string &strInputDirectory, 
-	string &strPrepInputDirectory, vector<char> &cQuery, 
-	vector<string> &vecstrQuery, vector<string> &vecstrDatasets, 
-	vector<CSeekDataset*> &vc);
+	template<class tType>
+	static tType** Init2DArray(size_t iSize1, size_t iSize2, tType tValue){ // Builds an iSize1 x iSize2 table with every cell set to tValue; Free2DArray releases it.
+		tType **f = (tType**)malloc(iSize1*sizeof(tType*)); // array of row pointers
+		f[0] = (tType*)malloc(iSize1*iSize2*sizeof(tType)); // one contiguous block for all rows. NOTE(review): neither malloc result is checked, and f[0] is written even when iSize1 is 0
+		size_t i, j;
+		for(i=1; i<iSize1; i++){
+			f[i] = f[i-1] + iSize2; // row i starts iSize2 elements after row i-1
+		}
+		for(i=0; i<iSize1; i++){
+			for(j=0; j<iSize2; j++){
+				f[i][j] = tValue;
+			}
+		}
+		return f;
+	}
+
+	template<class tType>
+	static void Free2DArray(tType** f){ // Releases a table built by Init2DArray. NOTE(review): f is dereferenced without a null check.
+		free(f[0]); // the contiguous data block
+		free(f); // the row-pointer array
+	}
+
+	static bool CreatePresenceVector(vector<int> &, vector<char> &, size_t);
+	static bool LoadDatabase(CDatabase &, string &, vector<char> &,
+	vector<string> &, vector<string> &, map<string, string> &, map<string, size_t> &,
+	vector<CSeekPlatform> &, vector<CSeekDataset*> &);
+
+	static bool ReadPlatforms(string &strPlatformDirectory, vector<CSeekPlatform> &plat,
+			vector<string> &vecstrPlatforms, map<string, size_t> &mapstriPlatforms);
+
+	static bool ReadListOneColumn(string &strFile, vector<string> &vecstrList, CSeekStrIntMap &mapstriList);
+
+	static bool ReadListTwoColumns(string &strFile, vector<string> &list1, vector<string> &list2);
 
 };
 

File src/seekweight.cpp

 	}
 	size_t iNumGenes = sDataset.GetNumGenes();
 
-	vector<float> new_rank;
+	//vector<float> new_rank;
 	CSeekTools::InitVector(rank, iNumGenes, (float)0);
-	CSeekTools::InitVector(new_rank, iNumGenes, (float)0);
+	//CSeekTools::InitVector(new_rank, iNumGenes, (float)0);
 	size_t i, j, k;
 
 	int q_size = cv_query.size();
-	for(i=0; i<q_size; i++){
+	/*for(i=0; i<q_size; i++){
 		rank[cv_query[i]] = 1.0 / q_size;
-	}
+	}*/
 
 	/*if(q_size==0){
 		printf("Bad!\n");
 				printf("Bad %.5f\n", f->Get(g,q));
 				getchar();
 			}*/
-			new_rank[g] += rank[qq] * f->Get(g, q);
+			rank[g] += f->Get(g, q);
 		}
+		rank[g] /= (float) q_size;
 	}
 
-	for(i=0; i<iGenesPresent; i++){
-		size_t g = mapG->GetReverse(i);
-		rank[g] = new_rank[g];
+	//for(i=0; i<iGenesPresent; i++){
+	//	size_t g = mapG->GetReverse(i);
+	//	rank[g] = new_rank[g];
 		//printf("Gene %d %.5f\n", g, rank[g]);
-	}
+	//}
 
 	//getchar();
 

File tools/SeekMiner/SeekMiner.cpp

 	gengetopt_args_info	sArgs;
 	ifstream			ifsm;
 	istream*			pistm;
-	vector<string>		vecstrLine, vecstrGenes, vecstrDatasets, vecstrQuery;
+	vector<string>		vecstrGenes;
 	char				acBuffer[ c_iBuffer ];
 	size_t				i;
 
 		return 1; }
 
 	if( sArgs.input_arg ) {
-		ifsm.open( sArgs.input_arg );
-		pistm = &ifsm; }
-	else
-		pistm = &cin;
-	while( !pistm->eof( ) ) {
-		pistm->getline( acBuffer, c_iBuffer - 1 );
-		acBuffer[ c_iBuffer - 1 ] = 0;
-		vecstrLine.clear( );
-		CMeta::Tokenize( acBuffer, vecstrLine );
-		if( vecstrLine.size( ) < 2 ) {
-			cerr << "Ignoring line: " << acBuffer << endl;
-			continue; }
-		if( !( i = atoi( vecstrLine[ 0 ].c_str( ) ) ) ) {
-			cerr << "Illegal gene ID: " << vecstrLine[ 0 ] << " for " << vecstrLine[ 1 ] << endl;
-			return 1; }
-		i--;
-		if( vecstrGenes.size( ) <= i )
-			vecstrGenes.resize( i + 1 );
-		vecstrGenes[ i ] = vecstrLine[ 1 ]; }
-	if( sArgs.input_arg )
-		ifsm.close( );
+		string strGeneInput = sArgs.input_arg;
+		vector<string> vecstrGeneID;
+		if(!CSeekTools::ReadListTwoColumns(strGeneInput, vecstrGeneID, vecstrGenes)){
+			return false;
+		}
+	}
 
 	bool useNibble = false;
 	if(sArgs.is_nibble_flag==1){
 	CDatabase DB(useNibble);
 
 	if(sArgs.db_arg){
-		ifsm.open(sArgs.db_arg);
-		while(!pistm->eof()){
-			pistm->getline(acBuffer, c_iBuffer -1);
-			if(acBuffer[0]==0){
-				break;
-			}
-			acBuffer[c_iBuffer-1] = 0;
-			vecstrDatasets.push_back(acBuffer);
+		string strDBInput = sArgs.db_arg;
+		vector<string> vecstrDatasets, vecstrDP;
+		if(!CSeekTools::ReadListTwoColumns(strDBInput, vecstrDatasets, vecstrDP)){
+			return false;
 		}
-		vecstrDatasets.resize(vecstrDatasets.size());
-		ifsm.close();
+		map<string, string> mapstrstrDatasetPlatform;
+		for(i=0; i<vecstrDatasets.size(); i++){
+			mapstrstrDatasetPlatform[vecstrDatasets[i]] = vecstrDP[i];
+		}
 
-		ifsm.open(sArgs.query_arg);
-		while(!pistm->eof()){
-			pistm->getline(acBuffer, c_iBuffer -1);
-			if(acBuffer[0]==0){
-				break;
-			}
-			acBuffer[c_iBuffer-1] = 0;
-			vecstrQuery.push_back(acBuffer);
+		string strQueryInput = sArgs.query_arg;
+		vector<string> vecstrQuery;
+		CSeekStrIntMap mapstriQuery;
+		if(!CSeekTools::ReadListOneColumn(strQueryInput, vecstrQuery, mapstriQuery)){
+			return false;
 		}
-		vecstrQuery.resize(vecstrQuery.size());
-		ifsm.close();
+
+		string strPlatformDirectory = sArgs.dir_platform_arg;
+		vector<CSeekPlatform> vp;
+		map<string, size_t> mapstriPlatform;
+		vector<string> vecstrPlatforms;
+		CSeekTools::ReadPlatforms(strPlatformDirectory, vp, vecstrPlatforms,
+				mapstriPlatform);
+
+		//printf("Done reading"); getchar();
 
 		string strInputDirectory = sArgs.dir_in_arg;
 		string strPrepInputDirectory = sArgs.dir_prep_in_arg;
+		size_t iNumDBs = sArgs.num_db_arg;
+		size_t iDatasets = vecstrDatasets.size();
+		size_t iGenes = vecstrGenes.size();
+
+		DB.Open(strInputDirectory, vecstrGenes, iDatasets, iNumDBs);
+
 		vector<CSeekDataset*> vc;
 		vector<char> cQuery;
-		CSeekTools::LoadDatabase(DB, strInputDirectory, strPrepInputDirectory,
-			cQuery, vecstrQuery, vecstrDatasets, vc);
-		size_t iDatasets = DB.GetDatasets();
-		size_t iGenes = DB.GetGenes();
+		CSeekTools::LoadDatabase(DB, strPrepInputDirectory, cQuery, vecstrQuery,
+			vecstrDatasets, mapstrstrDatasetPlatform, mapstriPlatform, vp, vc);
 
 		/*
 		DB.Open(strInputDirectory);
 			vc[i]->InitializeQuery(cQuery);
 		}
 
-		vector<unsigned char> *Q =
-			new vector<unsigned char>[vecstrQuery.size()];
+		vector<unsigned char> *Q = new vector<unsigned char>[vecstrQuery.size()];
 
 		for(i=0; i<vecstrQuery.size(); i++){
 			if(!DB.GetGene(vecstrQuery[i], Q[i])){
 			    }
 			}
 		}
-
 		delete[] Q;
 		*/
 		size_t j;
 		T = gsl_rng_default;
 		rnd = gsl_rng_alloc(T);
 
-		size_t d;
+		int d;
+		omp_set_num_threads(8);
+		int numThreads = omp_get_max_threads();
 
 		for(i=0; i<1; i++){
 			CSeekQuery query;
 			vector<int> counts;
 			CSeekTools::InitVector(counts, iGenes, (int) 0);
 
+			float **master_rank_threads = CSeekTools::Init2DArray(numThreads, iGenes, (float) 0);
+			float **sum_weight_threads = CSeekTools::Init2DArray(numThreads, iGenes, (float) 0);
+			int **counts_threads = CSeekTools::Init2DArray(numThreads, iGenes, (int) 0);
+
 			printf("Entering search\n");
+
+			#pragma omp parallel for \
+			shared(vc) \
+			private(d, j) \
+			firstprivate(iDatasets) \
+			schedule(static)
+
 			for(d=0; d<iDatasets; d++){
-				printf("Dataset %d\n", d);
+				int tid = omp_get_thread_num();
+				//printf("Dataset %d\n", d);
 				CSeekIntIntMap *mapQ = vc[d]->GetQueryMap();
 				CSeekIntIntMap *mapG = vc[d]->GetGeneMap();
 
 				}
 
 				if(mapQ->GetNumSet()==0){
-					printf("This dataset is skipped\n");
+					//printf("This dataset is skipped\n");
 					continue;
 				}
 
-				printf("Initializing\n");
+				//printf("Initializing\n");
 				vc[d]->InitializeFloatMatrix();
-				printf("Weighting dataset\n");
+				//printf("Weighting dataset\n");
 				CSeekWeighter::CVWeighting(query, *vc[d]);
 				float w = vc[d]->GetDatasetSumWeight();
 				if(w==-1){
-					printf("Bad weight\n"); 
+					//printf("Bad weight\n");
 					vc[d]->FreeFloatMatrix();
 					continue;
 					//getchar();
 				}
 				vector<float> rank_normal;
-				printf("Doing linear combination\n");
+				//printf("Doing linear combination\n");
 				CSeekWeighter::LinearCombine(rank_normal, this_q, *vc[d]);
 				/*for(j=0; j<1000; j++){
 					size_t g = mapG->GetReverse(j);
 				}*/
 				vc[d]->FreeFloatMatrix();
 
-				printf("Adding contribution of dataset to master ranking: %.5f\n", w);
+				//printf("Adding contribution of dataset to master ranking: %.5f\n", w);
 				for(j=0; j<mapG->GetNumSet(); j++){
 					size_t g = mapG->GetReverse(j);
-					master_rank[g] += rank_normal[g] * w;
-					counts[g]++;
-					sum_weight[g] += w;
+					master_rank_threads[tid][g] += rank_normal[g] * w;
+					counts_threads[tid][g]++;
+					sum_weight_threads[tid][g] += w;
 				}
 			}
 
+			for(j=0; j<numThreads; j++){
+				size_t k;
+				for(k=0; k<iGenes; k++){
+					master_rank[k] += master_rank_threads[j][k];
+					counts[k] += counts_threads[j][k];
+					sum_weight[k]+=sum_weight_threads[j][k];
+				}
+			}
+
+			CSeekTools::Free2DArray(master_rank_threads);
+			CSeekTools::Free2DArray(counts_threads);
+			CSeekTools::Free2DArray(sum_weight_threads);
+
+
 			printf("Aggregating genes\n");
 			for(j=0; j<iGenes; j++){
 				if(counts[j]<(int)(0.5*iDatasets)){
 				}else{
 					master_rank[j] /= sum_weight[j];
 				}
-				printf("Gene %d %.5f\n", j, master_rank[j]);
+				//printf("Gene %d %.5f\n", j, master_rank[j]);
 			}
 
 			printf("Sorting genes\n");
 				printf("%d %.5f\n", a[ii].i, a[ii].f);
 				jj++;
 			}
-
-
 		}
 
-
-
 		/*for(i=0; i<iDatasets; i++){
 			printf("Dataset %ld\n", i);
 			CSeekMatrix<unsigned char> *cm = vc[i]->GetMatrix();

File tools/SeekMiner/SeekMiner.ggo

 								string	typestr="directory"	yes
 option	"dir_prep_in"		p	"Prep directory (containing .gavg, .gpres files)"
 								string	typestr="directory"	yes				
+option	"dir_platform"		P	"Platform directory (containing .gplatavg, .gplatstdev, .gplatorder files)"
+								string	typestr="directory"	yes
 option	"is_nibble"			N	"Whether the input DB is nibble type"
 								flag	off
+option	"num_db"			n	"Number of databaselets in database"
+								int	default="1000"	yes

File tools/SeekMiner/cmdline.c

 /*
   File autogenerated by gengetopt version 2.22.5
   generated with the following command:
-  gengetopt -iSeekMiner.ggo --default-optional -u -N -e 
+  /usr/local/bin/gengetopt -iSeekMiner.ggo --default-optional -u -N -e 
 
   The developers of gengetopt consider the fixed text that goes in all
   gengetopt output files to be in the public domain:
 const char *gengetopt_args_info_description = "";
 
 const char *gengetopt_args_info_help[] = {
-  "  -h, --help                   Print help and exit",
-  "  -V, --version                Print version and exit",
+  "  -h, --help                    Print help and exit",
+  "  -V, --version                 Print version and exit",
   "\nMain:",
-  "  -x, --db=filename            Input a set of datasets",
-  "  -i, --input=filename         Input gene mapping",
-  "  -q, --query=filename         Query gene list",
-  "  -d, --dir_in=directory       Database directory",
-  "  -p, --dir_prep_in=directory  Prep directory (containing .gavg, .gpres files)",
-  "  -N, --is_nibble              Whether the input DB is nibble type  \n                                 (default=off)",
+  "  -x, --db=filename             Input a set of datasets",
+  "  -i, --input=filename          Input gene mapping",
+  "  -q, --query=filename          Query gene list",
+  "  -d, --dir_in=directory        Database directory",
+  "  -p, --dir_prep_in=directory   Prep directory (containing .gavg, .gpres files)",
+  "  -P, --dir_platform=directory  Platform directory (containing .gplatavg, \n                                  .gplatstdev, .gplatorder files)",
+  "  -N, --is_nibble               Whether the input DB is nibble type  \n                                  (default=off)",
+  "  -n, --num_db=INT              Number of databaselets in database  \n                                  (default=`1000')",
     0
 };
 
 typedef enum {ARG_NO
   , ARG_FLAG
   , ARG_STRING
+  , ARG_INT
 } cmdline_parser_arg_type;
 
 static
   args_info->query_given = 0 ;
   args_info->dir_in_given = 0 ;
   args_info->dir_prep_in_given = 0 ;
+  args_info->dir_platform_given = 0 ;
   args_info->is_nibble_given = 0 ;
+  args_info->num_db_given = 0 ;
 }
 
 static
   args_info->dir_in_orig = NULL;
   args_info->dir_prep_in_arg = NULL;
   args_info->dir_prep_in_orig = NULL;
+  args_info->dir_platform_arg = NULL;
+  args_info->dir_platform_orig = NULL;
   args_info->is_nibble_flag = 0;
+  args_info->num_db_arg = 1000;
+  args_info->num_db_orig = NULL;
   
 }
 
   args_info->query_help = gengetopt_args_info_help[5] ;
   args_info->dir_in_help = gengetopt_args_info_help[6] ;
   args_info->dir_prep_in_help = gengetopt_args_info_help[7] ;
-  args_info->is_nibble_help = gengetopt_args_info_help[8] ;
+  args_info->dir_platform_help = gengetopt_args_info_help[8] ;
+  args_info->is_nibble_help = gengetopt_args_info_help[9] ;
+  args_info->num_db_help = gengetopt_args_info_help[10] ;
   
 }
 
   free_string_field (&(args_info->dir_in_orig));
   free_string_field (&(args_info->dir_prep_in_arg));
   free_string_field (&(args_info->dir_prep_in_orig));
+  free_string_field (&(args_info->dir_platform_arg));
+  free_string_field (&(args_info->dir_platform_orig));
+  free_string_field (&(args_info->num_db_orig));
   
   
   for (i = 0; i < args_info->inputs_num; ++i)
     write_into_file(outfile, "dir_in", args_info->dir_in_orig, 0);
   if (args_info->dir_prep_in_given)
     write_into_file(outfile, "dir_prep_in", args_info->dir_prep_in_orig, 0);
+  if (args_info->dir_platform_given)
+    write_into_file(outfile, "dir_platform", args_info->dir_platform_orig, 0);
   if (args_info->is_nibble_given)
     write_into_file(outfile, "is_nibble", 0, 0 );
+  if (args_info->num_db_given)
+    write_into_file(outfile, "num_db", args_info->num_db_orig, 0);
   
 
   i = EXIT_SUCCESS;
       error = 1;
     }
   
+  if (! args_info->dir_platform_given)
+    {
+      fprintf (stderr, "%s: '--dir_platform' ('-P') option required%s\n", prog_name, (additional_error ? additional_error : ""));
+      error = 1;
+    }
+  
+  if (! args_info->num_db_given)
+    {
+      fprintf (stderr, "%s: '--num_db' ('-n') option required%s\n", prog_name, (additional_error ? additional_error : ""));
+      error = 1;
+    }
+  
   
   /* checks for dependences among options */
 
   case ARG_FLAG:
     *((int *)field) = !*((int *)field);
     break;
+  case ARG_INT:
+    if (val) *((int *)field) = strtol (val, &stop_char, 0);
+    break;
   case ARG_STRING:
     if (val) {
       string_field = (char **)field;
     break;
   };
 
+  /* check numeric conversion */
+  switch(arg_type) {
+  case ARG_INT:
+    if (val && !(stop_char && *stop_char == '\0')) {
+      fprintf(stderr, "%s: invalid numeric value: %s\n", package_name, val);
+      return 1; /* failure */
+    }
+    break;
+  default:
+    ;
+  };
 
   /* store the original value */
   switch(arg_type) {
         { "query",	1, NULL, 'q' },
         { "dir_in",	1, NULL, 'd' },
         { "dir_prep_in",	1, NULL, 'p' },
+        { "dir_platform",	1, NULL, 'P' },
         { "is_nibble",	0, NULL, 'N' },
+        { "num_db",	1, NULL, 'n' },
         { 0,  0, 0, 0 }
       };
 
-      c = getopt_long (argc, argv, "hVx:i:q:d:p:N", long_options, &option_index);
+      c = getopt_long (argc, argv, "hVx:i:q:d:p:P:Nn:", long_options, &option_index);
 
       if (c == -1) break;	/* Exit from `while (1)' loop.  */
 
             goto failure;
         
           break;
+        case 'P':	/* Platform directory (containing .gplatavg, .gplatstdev, .gplatorder files).  */
+        
+        
+          if (update_arg( (void *)&(args_info->dir_platform_arg), 
+               &(args_info->dir_platform_orig), &(args_info->dir_platform_given),
+              &(local_args_info.dir_platform_given), optarg, 0, 0, ARG_STRING,
+              check_ambiguity, override, 0, 0,
+              "dir_platform", 'P',
+              additional_error))
+            goto failure;
+        
+          break;
         case 'N':	/* Whether the input DB is nibble type.  */
         
         
             goto failure;
         
           break;
+        case 'n':	/* Number of databaselets in database.  */
+        
+        
+          if (update_arg( (void *)&(args_info->num_db_arg), 
+               &(args_info->num_db_orig), &(args_info->num_db_given),
+              &(local_args_info.num_db_given), optarg, 0, "1000", ARG_INT,
+              check_ambiguity, override, 0, 0,
+              "num_db", 'n',
+              additional_error))
+            goto failure;
+        
+          break;
 
         case 0:	/* Long option with no short option */
         case '?':	/* Invalid option.  */

File tools/SeekMiner/cmdline.h

   char * dir_prep_in_arg;	/**< @brief Prep directory (containing .gavg, .gpres files).  */
   char * dir_prep_in_orig;	/**< @brief Prep directory (containing .gavg, .gpres files) original value given at command line.  */
   const char *dir_prep_in_help; /**< @brief Prep directory (containing .gavg, .gpres files) help description.  */
+  char * dir_platform_arg;	/**< @brief Platform directory (containing .gplatavg, .gplatstdev, .gplatorder files).  */
+  char * dir_platform_orig;	/**< @brief Platform directory (containing .gplatavg, .gplatstdev, .gplatorder files) original value given at command line.  */
+  const char *dir_platform_help; /**< @brief Platform directory (containing .gplatavg, .gplatstdev, .gplatorder files) help description.  */
   int is_nibble_flag;	/**< @brief Whether the input DB is nibble type (default=off).  */
   const char *is_nibble_help; /**< @brief Whether the input DB is nibble type help description.  */
+  int num_db_arg;	/**< @brief Number of databaselets in database (default='1000').  */
+  char * num_db_orig;	/**< @brief Number of databaselets in database original value given at command line.  */
+  const char *num_db_help; /**< @brief Number of databaselets in database help description.  */
   
   unsigned int help_given ;	/**< @brief Whether help was given.  */
   unsigned int version_given ;	/**< @brief Whether version was given.  */
   unsigned int query_given ;	/**< @brief Whether query was given.  */
   unsigned int dir_in_given ;	/**< @brief Whether dir_in was given.  */
   unsigned int dir_prep_in_given ;	/**< @brief Whether dir_prep_in was given.  */
+  unsigned int dir_platform_given ;	/**< @brief Whether dir_platform was given.  */
   unsigned int is_nibble_given ;	/**< @brief Whether is_nibble was given.  */
+  unsigned int num_db_given ;	/**< @brief Whether num_db was given.  */
 
   char **inputs ; /**< @brief unamed options (options without names) */
   unsigned inputs_num ; /**< @brief unamed options number */

File tools/SeekMiner/stdafx.h

 #include "seekweight.h"
 #include "seekdataset.h"
 #include "seekevaluate.h"
+#include "seekplatform.h"
 #include "seekreader.h"
 #include "seekwriter.h"
 #include "seekquery.h"

File tools/SeekPrep/SeekPrep.cpp

 #include "stdafx.h"
 #include "cmdline.h"
 
+/*
+ * Allocates one CSeekDataset per dataset and loads its prep files.
+ *
+ * For every index i below iDatasets, a new CSeekDataset is created and asked
+ * to read "<strPrepInputDirectory>/<vecstrDatasets[i]>.gavg" (gene average)
+ * and "<strPrepInputDirectory>/<vecstrDatasets[i]>.gpres" (gene presence).
+ *
+ * iDatasets             number of datasets to create
+ * vecstrDatasets        dataset file stems; needs at least iDatasets entries
+ * strPrepInputDirectory directory holding the .gavg / .gpres files
+ * vc                    output; cleared, then filled with iDatasets pointers.
+ *                       The objects are allocated with new and never released
+ *                       here. Pointers already stored in vc are dropped by
+ *                       clear() without being deleted.
+ *
+ * Returns false (leaving vc untouched) when iDatasets exceeds the number of
+ * dataset names, true otherwise.
+ */
+bool InitializeDataset(size_t &iDatasets, vector<string> &vecstrDatasets,
+	string &strPrepInputDirectory, vector<CSeekDataset*> &vc){
+	/* every index used below must name an existing dataset */
+	if(iDatasets > vecstrDatasets.size()){
+		cerr << "Requested " << iDatasets << " datasets but only "
+			<< vecstrDatasets.size() << " dataset names are available" << endl;
+		return false;
+	}
+
+	vc.clear();
+	vc.resize(iDatasets);
+
+	for(size_t i=0; i<iDatasets; i++){
+		string strBase = strPrepInputDirectory + "/" + vecstrDatasets[i];
+		string strAvgPath = strBase + ".gavg";
+		string strPresencePath = strBase + ".gpres";
+
+		vc[i] = new CSeekDataset();
+		vc[i]->ReadGeneAverage(strAvgPath);
+		vc[i]->ReadGenePresence(strPresencePath);
+	}
+	return true;
+}
+
+
+/*
+ * Loads the contents of one databaselet into the query matrices of all
+ * datasets.
+ *
+ * Step 1: every gene stored in DBL is looked up in vecstrGenes and flagged in
+ *         a presence vector of length iGenes.
+ * Step 2: each dataset in vc is given that presence vector through
+ *         InitializeQuery().
+ * Step 3: for each databaselet gene, its row is fetched with DBL.Get() and the
+ *         value for (gene k, dataset j), read from position k*iDatasets + j,
+ *         is stored with SetQueryNoMapping() in every dataset whose query map
+ *         contains the gene.
+ *
+ * Genes of the databaselet that are absent from vecstrGenes are reported and
+ * skipped. Looking them up with map::operator[] would silently file them
+ * under gene index 0.
+ *
+ * Returns false when vc holds fewer than iDatasets entries, when DBL.Get()
+ * fails, or when a fetched row is shorter than iGenes * iDatasets values;
+ * true otherwise.
+ */
+bool InitializeDB(size_t &iDatasets, size_t &iGenes, vector<string> &vecstrGenes,
+	vector<CSeekDataset*> &vc, CDatabaselet &DBL){
+
+	size_t i, j, k;
+	/* marker for "no index"; also the value the original code compared
+	 * GetForward() against (-1 converted to size_t) */
+	const size_t c_iNone = (size_t) -1;
+
+	if(vc.size() < iDatasets){
+		cerr << "Expected " << iDatasets << " datasets but only "
+			<< vc.size() << " are initialized" << endl;
+		return false;
+	}
+
+	/* gene name -> global gene index */
+	map<string, size_t> mapstrGenes;
+	for(i=0; i<vecstrGenes.size(); i++){
+		mapstrGenes[vecstrGenes[i]] = i;
+	}
+
+	vector<char> cQuery;
+	CSeekTools::InitVector(cQuery, iGenes, (char) 0);
+
+	/* global index of each databaselet gene, c_iNone when unknown */
+	vector<size_t> veciGlobal;
+	for(i=0; i<DBL.GetGenes(); i++){
+		string strQuery = DBL.GetGene(i);
+		map<string, size_t>::const_iterator iterGene = mapstrGenes.find(strQuery);
+		if(iterGene==mapstrGenes.end() || iterGene->second>=cQuery.size()){
+			cerr << "Gene " << strQuery
+				<< " is not in the gene mapping, skipping" << endl;
+			veciGlobal.push_back(c_iNone);
+			continue;
+		}
+		veciGlobal.push_back(iterGene->second);
+		cQuery[iterGene->second] = 1;
+	}
+
+	for(i=0; i<iDatasets; i++){
+		vc[i]->InitializeQuery(cQuery);
+	}
+
+	for(i=0; i<DBL.GetGenes(); i++){
+		size_t m = veciGlobal[i];
+		if(m==c_iNone) continue;
+
+		vector<unsigned char> Q;
+		/* expanded */
+		if(!DBL.Get(i, Q)){
+			cerr << "Could not read gene " << i << " from databaselet" << endl;
+			return false;
+		}
+
+		for(j=0; j<iDatasets; j++){
+			CSeekIntIntMap *qu = vc[j]->GetQueryMap();
+			size_t query = qu->GetForward(m);
+			if(query==c_iNone) continue;
+
+			/* largest position read below is iGenes*iDatasets - 1 */
+			if(Q.size() < iGenes * iDatasets){
+				cerr << "Databaselet row " << i << " holds " << Q.size()
+					<< " values, expected " << (iGenes * iDatasets) << endl;
+				return false;
+			}
+			for(k=0; k<iGenes; k++){
+				unsigned char c = Q[k*iDatasets + j];
+				vc[j]->SetQueryNoMapping(query, k, c);
+			}
+		}
+	}
+	return true;
+}
+
+/*
+ * Processes one databaselet (.db) file and records, for every gene it
+ * contains, the per-platform mean and standard deviation of that gene's
+ * gene-average-corrected values.
+ *
+ * For each gene in the databaselet and each dataset whose query map contains
+ * it, every stored value uc is turned into quant[uc] - GetGeneAverage(j) and
+ * accumulated under the dataset's platform (mapiPlatform[dataset index]).
+ * Platforms with at least one value get mean and stdev written to
+ * platform_avg / platform_stdev at (platform, global gene index).
+ *
+ * DBFile          path of the databaselet; must have the ".db" extension
+ * useNibble       passed to the CDatabaselet constructor
+ * iDatasets       number of datasets in vc
+ * m_iGenes        number of genes per dataset row
+ * vecstrGenes     global gene names; index = global gene id
+ * mapiPlatform    dataset index -> platform index
+ * quant           value for each stored byte; 255 marks a missing entry
+ * vc              datasets; their queries are filled by InitializeDB() and
+ *                 released with DeleteQuery() before returning
+ * platform_avg,
+ * platform_stdev  outputs, indexed (platform, gene); the platform count is
+ *                 taken from platform_avg.GetRows()
+ * vecstrQuery     output; the name of every gene in the databaselet is
+ *                 appended
+ *
+ * Differences from a naive lookup, all guarding against memory corruption:
+ *  - a dataset whose platform id is outside [0, platforms) is reported and
+ *    skipped instead of being used as an index;
+ *  - genes missing from vecstrGenes are reported and skipped rather than
+ *    being counted as gene 0;
+ *  - stored bytes with no entry in quant are treated like missing values;
+ *  - a variance that comes out slightly negative from rounding is clamped to
+ *    0 so the stdev is 0 rather than NaN.
+ *
+ * Returns false on a wrong extension or when InitializeDB() fails, true
+ * otherwise.
+ */
+bool OpenDB(string &DBFile, bool &useNibble, size_t &iDatasets, size_t &m_iGenes,
+	vector<string> &vecstrGenes, map<int, int> &mapiPlatform,
+	vector<float> &quant, vector<CSeekDataset*> &vc,
+	CFullMatrix<float> &platform_avg, CFullMatrix<float> &platform_stdev,
+	vector<string> &vecstrQuery){
+
+	string fileName = CMeta::Basename(DBFile.c_str());
+	if(!CMeta::IsExtension(fileName, ".db")){
+		cerr << "Wrong extension." << endl;
+		return false;
+	}
+
+	size_t i, j, k;
+	const size_t c_iNone = (size_t) -1;
+
+	CDatabaselet CD(useNibble);
+	/* NOTE(review): the result of Open() is not checked because its return
+	 * type is not visible from here; confirm and add a check */
+	CD.Open(DBFile);
+	/* NOTE(review): the buffer is handed to free() below, so it is presumably
+	 * malloc'ed and owned by the caller; confirm against CDatabaselet */
+	unsigned char *charImage = CD.GetCharImage();
+	if(!InitializeDB(iDatasets, m_iGenes, vecstrGenes, vc, CD)){
+		cerr << "Failed to load databaselet " << DBFile << endl;
+		free(charImage);
+		return false;
+	}
+
+	size_t numPlatforms = platform_avg.GetRows();
+
+	/* gene name -> global gene index */
+	map<string, size_t> mapstriGenes;
+	for(i=0; i<vecstrGenes.size(); i++){
+		mapstriGenes[vecstrGenes[i]] = i;
+	}
+
+	/* per-platform accumulators, reset for every gene */
+	vector<float> sum, sq_sum;
+	vector<int> num;
+
+	for(i=0; i<CD.GetGenes(); i++){
+		sum.assign(numPlatforms, 0.0f);
+		sq_sum.assign(numPlatforms, 0.0f);
+		num.assign(numPlatforms, 0);
+
+		string thisGene = CD.GetGene(i);
+		vecstrQuery.push_back(thisGene);
+
+		map<string, size_t>::const_iterator iterGene = mapstriGenes.find(thisGene);
+		if(iterGene==mapstriGenes.end()){
+			cerr << "Gene " << thisGene
+				<< " is not in the gene mapping, skipping" << endl;
+			continue;
+		}
+		size_t geneID = iterGene->second;
+
+		for(k=0; k<iDatasets; k++){
+			CSeekIntIntMap *mapQ = vc[k]->GetQueryMap();
+			CFullMatrix<unsigned char> *f = vc[k]->GetMatrix();
+			size_t iQ = mapQ->GetForward(geneID);
+			if(iQ==c_iNone){
+				continue;
+			}
+			int platform_id = mapiPlatform[k];
+			if(platform_id<0 || (size_t) platform_id>=numPlatforms){
+				cerr << "Error: dataset " << k << " has platform id "
+					<< platform_id << " but only " << numPlatforms
+					<< " platforms exist, skipping dataset" << endl;
+				continue;
+			}
+			for(j=0; j<m_iGenes; j++){
+				unsigned char uc = f->Get(iQ, j);
+				/* 255 = missing; anything beyond quant has no value either */
+				if(uc==255 || uc>=quant.size()){
+					continue;
+				}
+				float v = quant[uc] - vc[k]->GetGeneAverage(j);
+				sum[platform_id] += v;
+				num[platform_id]++;
+				sq_sum[platform_id] += v*v;
+			}
+		}
+
+		for(k=0; k<numPlatforms; k++){
+			if(num[k]==0){
+				continue;
+			}
+			float fMean = sum[k] / (float) num[k];
+			float fVar = sq_sum[k] / (float) num[k] - fMean * fMean;
+			/* E[x^2] - E[x]^2 can dip below 0 through rounding */
+			if(fVar<0){
+				fVar = 0;
+			}
+			float fStdev = sqrt(fVar);
+			platform_avg.Set(k, geneID, fMean);
+			platform_stdev.Set(k, geneID, fStdev);
+		}
+	}
+
+	for(i=0; i<iDatasets; i++){
+		vc[i]->DeleteQuery();
+	}
+
+	free(charImage);
+	return true;
+}
+
 
 int main( int iArgs, char** aszArgs ) {
 	static const size_t	c_iBuffer	= 1024;
 	istream*			pistm;
 	vector<string>		vecstrLine, vecstrGenes, vecstrDBs, vecstrQuery;
 	char				acBuffer[ c_iBuffer ];
-	size_t				i;
+	size_t				i, j;
 
 	if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
 		cmdline_parser_print_help( );
 		pistm = &ifsm; }
 	else
 		pistm = &cin;
+
+	map<string, size_t> mapstriGenes;
 	while( !pistm->eof( ) ) {
 		pistm->getline( acBuffer, c_iBuffer - 1 );
 		acBuffer[ c_iBuffer - 1 ] = 0;
 		CMeta::Tokenize( acBuffer, vecstrLine );
 		if( vecstrLine.size( ) < 2 ) {
 			cerr << "Ignoring line: " << acBuffer << endl;
-			continue; }
+			continue;
+		}
 		if( !( i = atoi( vecstrLine[ 0 ].c_str( ) ) ) ) {
 			cerr << "Illegal gene ID: " << vecstrLine[ 0 ] << " for " << vecstrLine[ 1 ] << endl;
-			return 1; }
+			return 1;
+		}
 		i--;
 		if( vecstrGenes.size( ) <= i )
 			vecstrGenes.resize( i + 1 );
-		vecstrGenes[ i ] = vecstrLine[ 1 ]; }
+		vecstrGenes[ i ] = vecstrLine[ 1 ];
+		mapstriGenes[vecstrGenes[i]] = i;
+	}
+
+	vector<float> quant;
+	float w = -5.0;
+	while(w<5.01){
+		quant.push_back(w);
+		w+=0.1;
+	}
+	quant.resize(quant.size());
+
 	if( sArgs.input_arg )
 		ifsm.close( );
 
 	if(sArgs.dab_arg){
-		CDataPair Dat;
-		char outFile[125];
-		if(!Dat.Open(sArgs.dab_arg, false, false)){
-			cerr << "error opening file" << endl;
-			return 1;
+
+		if(sArgs.gplat_flag==1){
+			map<string, int> mapstrPlatform;
+			map<int, int> mapiPlatform;
+			map<int, string> mapistrPlatform;
+
+			vector<string> vecstrDatasets;
+
+			ifsm.open(sArgs.dset_arg);
+			i = 0;
+			while(!pistm->eof()){
+				pistm->getline(acBuffer, c_iBuffer -1);
+				if(acBuffer[0]==0){
+					break;
+				}
+				acBuffer[c_iBuffer-1] = 0;
+				vecstrLine.clear();
+				CMeta::Tokenize( acBuffer, vecstrLine );
+				/* read dataset name */
+				vecstrDatasets.push_back(vecstrLine[0]);
+				/* just read the platform information */
+				string pl = vecstrLine[1];
+				map< string, int >::const_iterator	iter;
+				iter = mapstrPlatform.find(pl);
+				if(iter== mapstrPlatform.end()){
+					int s = mapstrPlatform.size();
+					mapstrPlatform[pl] = s;
+					mapistrPlatform[s] = pl;
+				}
+				int platform_id = mapstrPlatform[pl];
+				mapiPlatform[i] = platform_id;
+				i++;
+			}
+			ifsm.close();
+
+			vector<string> dblist;
+			ifsm.open(sArgs.dab_arg);
+			i = 0;
+			while(!pistm->eof()){
+				pistm->getline(acBuffer, c_iBuffer -1);
+				if(acBuffer[0]==0){
+					break;
+				}
+				acBuffer[c_iBuffer-1] = 0;
+				dblist.push_back(acBuffer);
+			}
+			dblist.resize(dblist.size());
+			ifsm.close();
+
+			bool useNibble = false;
+			if(sArgs.useNibble_flag==1){
+				useNibble = true;
+			}
+
+			size_t numPlatforms = mapstrPlatform.size();
+			size_t iDatasets = vecstrDatasets.size();
+			size_t m_iGenes = vecstrGenes.size();
+			CFullMatrix<float> platform_avg, platform_stdev;
+			platform_avg.Initialize(numPlatforms, m_iGenes);
+			platform_stdev.Initialize(numPlatforms, m_iGenes);
+
+			for(i=0; i<numPlatforms; i++){
+				for(j=0; j<m_iGenes; j++){
+					platform_avg.Set(i, j, CMeta::GetNaN());
+					platform_stdev.Set(i, j, CMeta::GetNaN());
+				}
+			}
+
+			string strPrepInputDirectory = sArgs.dir_prep_in_arg;
+			vector<CSeekDataset*> vc;
+			InitializeDataset(iDatasets, vecstrDatasets, strPrepInputDirectory, vc);
+
+			//printf("Dataset initialized"); getchar();
+			vector<string> vecstrQuery;
+
+			for(i=0; i<dblist.size(); i++){
+				string DBFile = dblist[i];
+				printf("opening db file %s\n", DBFile.c_str()); //getchar();
+				OpenDB(DBFile, useNibble, iDatasets, m_iGenes,
+				vecstrGenes, mapiPlatform, quant, vc, platform_avg,
+				platform_stdev, vecstrQuery);
+				printf("finished opening db file %s\n", DBFile.c_str()); //getchar();
+			}
+
+			for(i=0; i<numPlatforms; i++){
+				printf("Platform %s\n", mapistrPlatform[i].c_str());
+				/*for(j=0; j<vecstrQuery.size(); j++){
+					size_t iGene = mapstriGenes[vecstrQuery[j]];
+					printf("Gene %s %.5f %.5f\n", vecstrQuery[j].c_str(), platform_avg.Get(i, iGene),
+						platform_stdev.Get(i,iGene));
+				}*/
+			}
+
+			platform_avg.Save("all_platform.gplatavg");
+			platform_stdev.Save("all_platform.gplatstdev");
+
 		}
 
-		if(sArgs.gavg_flag==1){
+		else if(sArgs.gavg_flag==1){
+			CDataPair Dat;
+			char outFile[125];
+			if(!Dat.Open(sArgs.dab_arg, false, false)){
+				cerr << "error opening file" << endl;
+				return 1;
+			}
 			vector<float> vecGeneAvg;
 			string fileName = CMeta::Basename(sArgs.dab_arg);
 			string fileStem = CMeta::Deextension(fileName);
 			CSeekTools::WriteArray(outFile, vecGeneAvg);
 		}
 
-		if(sArgs.gpres_flag==1){
+		else if(sArgs.gpres_flag==1){
+			CDataPair Dat;
+			char outFile[125];
+			if(!Dat.Open(sArgs.dab_arg, false, false)){
+				cerr << "error opening file" << endl;
+				return 1;
+			}
 			vector<char> vecGenePresence;
 			string fileName = CMeta::Basename(sArgs.dab_arg);
 			string fileStem = CMeta::Deextension(fileName);

File tools/SeekPrep/SeekPrep.ggo

 purpose	"Preprocess datasets for Seek"
 
 section "Main"
-option	"dab"				x	"Input dataset (.dab)"
+option	"dab"				x	"Input dataset (.dab) or databaselet (.db)"
 								string typestr="filename"	yes
 option	"sinfo"				s	"Generates sinfo file (with dataset wide mean and stdev)"
 								flag	off
 option	"gavg"				a	"Generates gene-average file"
 								flag	off
+option	"gplat"				P	"Generates gene-platform average + stdev file (requires .db input)"
+								flag	off
 option	"gpres"				p	"Generates gene-presence file"
 								flag	off
 option	"gvar"				v	"Generates gene-variance file"
 								string typestr="directory"	yes
 option	"input"				i	"Gene mapping file"
 								string typestr="filename"	yes
+option	"dir_prep_in"		I	"Prep input directory"
+								string typestr="directory"
+option	"dset"				A	"Dataset ordering file (with platform info) (required for -P)"
+								string typestr="filename"	yes
+option	"useNibble"			N	"Is DB file nibble? (required for -P)"
+								flag	off

File tools/SeekPrep/cmdline.c

 /*
   File autogenerated by gengetopt version 2.22.5
   generated with the following command:
-  gengetopt -iSeekPrep.ggo --default-optional -u -N -e 
+  /usr/local/bin/gengetopt -iSeekPrep.ggo --default-optional -u -N -e 
 
   The developers of gengetopt consider the fixed text that goes in all
   gengetopt output files to be in the public domain:
   "  -h, --help               Print help and exit",
   "  -V, --version            Print version and exit",
   "\nMain:",
-  "  -x, --dab=filename       Input dataset (.dab)",
+  "  -x, --dab=filename       Input dataset (.dab) or databaselet (.db)",
   "  -s, --sinfo              Generates sinfo file (with dataset wide mean and \n                             stdev)  (default=off)",
   "  -a, --gavg               Generates gene-average file  (default=off)",
+  "  -P, --gplat              Generates gene-platform average + stdev file \n                             (requires .db input)  (default=off)",
   "  -p, --gpres              Generates gene-presence file  (default=off)",
   "  -v, --gvar               Generates gene-variance file  (default=off)",
   "  -D, --dir_out=directory  Output directory",
   "  -i, --input=filename     Gene mapping file",
+  "  -A, --dset=filename      Dataset ordering file (with platform info) (required \n                             for -P)",
+  "  -N, --useNibble          Is DB file nibble? (required for -P)  (default=off)",
     0
 };
 
   args_info->dab_given = 0 ;
   args_info->sinfo_given = 0 ;
   args_info->gavg_given = 0 ;
+  args_info->gplat_given = 0 ;
   args_info->gpres_given = 0 ;
   args_info->gvar_given = 0 ;
   args_info->dir_out_given = 0 ;
   args_info->input_given = 0 ;
+  args_info->dset_given = 0 ;
+  args_info->useNibble_given = 0 ;
 }
 
 static
   args_info->dab_orig = NULL;
   args_info->sinfo_flag = 0;
   args_info->gavg_flag = 0;
+  args_info->gplat_flag = 0;
   args_info->gpres_flag = 0;
   args_info->gvar_flag = 0;
   args_info->dir_out_arg = NULL;
   args_info->dir_out_orig = NULL;
   args_info->input_arg = NULL;
   args_info->input_orig = NULL;
+  args_info->dset_arg = NULL;
+  args_info->dset_orig = NULL;
+  args_info->useNibble_flag = 0;
   
 }
 
   args_info->dab_help = gengetopt_args_info_help[3] ;
   args_info->sinfo_help = gengetopt_args_info_help[4] ;
   args_info->gavg_help = gengetopt_args_info_help[5] ;
-  args_info->gpres_help = gengetopt_args_info_help[6] ;
-  args_info->gvar_help = gengetopt_args_info_help[7] ;
-  args_info->dir_out_help = gengetopt_args_info_help[8] ;
-  args_info->input_help = gengetopt_args_info_help[9] ;
+  args_info->gplat_help = gengetopt_args_info_help[6] ;
+  args_info->gpres_help = gengetopt_args_info_help[7] ;
+  args_info->gvar_help = gengetopt_args_info_help[8] ;
+  args_info->dir_out_help = gengetopt_args_info_help[9] ;
+  args_info->input_help = gengetopt_args_info_help[10] ;
+  args_info->dset_help = gengetopt_args_info_help[11] ;
+  args_info->useNibble_help = gengetopt_args_info_help[12] ;
   
 }
 
   free_string_field (&(args_info->dir_out_orig));
   free_string_field (&(args_info->input_arg));
   free_string_field (&(args_info->input_orig));
+  free_string_field (&(args_info->dset_arg));
+  free_string_field (&(args_info->dset_orig));
   
   
   for (i = 0; i < args_info->inputs_num; ++i)
     write_into_file(outfile, "sinfo", 0, 0 );
   if (args_info->gavg_given)
     write_into_file(outfile, "gavg", 0, 0 );
+  if (args_info->gplat_given)
+    write_into_file(outfile, "gplat", 0, 0 );
   if (args_info->gpres_given)
     write_into_file(outfile, "gpres", 0, 0 );
   if (args_info->gvar_given)
     write_into_file(outfile, "dir_out", args_info->dir_out_orig, 0);
   if (args_info->input_given)
     write_into_file(outfile, "input", args_info->input_orig, 0);
+  if (args_info->dset_given)
+    write_into_file(outfile, "dset", args_info->dset_orig, 0);
+  if (args_info->useNibble_given)
+    write_into_file(outfile, "useNibble", 0, 0 );
   
 
   i = EXIT_SUCCESS;
       error = 1;
     }
   
+  if (! args_info->dset_given)
+    {
+      fprintf (stderr, "%s: '--dset' ('-A') option required%s\n", prog_name, (additional_error ? additional_error : ""));
+      error = 1;
+    }
+  
   
   /* checks for dependences among options */
 
         { "dab",	1, NULL, 'x' },
         { "sinfo",	0, NULL, 's' },
         { "gavg",	0, NULL, 'a' },
+        { "gplat",	0, NULL, 'P' },
         { "gpres",	0, NULL, 'p' },
         { "gvar",	0, NULL, 'v' },
         { "dir_out",	1, NULL, 'D' },
         { "input",	1, NULL, 'i' },
+        { "dset",	1, NULL, 'A' },
+        { "useNibble",	0, NULL, 'N' },
         { 0,  0, 0, 0 }
       };
 
-      c = getopt_long (argc, argv, "hVx:sapvD:i:", long_options, &option_index);
+      c = getopt_long (argc, argv, "hVx:saPpvD:i:A:N", long_options, &option_index);
 
       if (c == -1) break;	/* Exit from `while (1)' loop.  */
 
           return 0;
         
           break;
-        case 'x':	/* Input dataset (.dab).  */
+        case 'x':	/* Input dataset (.dab) or databaselet (.db).  */
         
         
           if (update_arg( (void *)&(args_info->dab_arg), 
             goto failure;
         
           break;
+        case 'P':	/* Generates gene-platform average + stdev file (requires .db input).  */
+        
+        
+          if (update_arg((void *)&(args_info->gplat_flag), 0, &(args_info->gplat_given),
+              &(local_args_info.gplat_given), optarg, 0, 0, ARG_FLAG,
+              check_ambiguity, override, 1, 0, "gplat", 'P',
+              additional_error))
+            goto failure;
+        
+          break;
         case 'p':	/* Generates gene-presence file.  */
         
         
             goto failure;
         
           break;
+        case 'A':	/* Dataset ordering file (with platform info) (required for -P).  */
+        
+        
+          if (update_arg( (void *)&(args_info->dset_arg), 
+               &(args_info->dset_orig), &(args_info->dset_given),
+              &(local_args_info.dset_given), optarg, 0, 0, ARG_STRING,
+              check_ambiguity, override, 0, 0,
+              "dset", 'A',
+              additional_error))
+            goto failure;
+        
+          break;
+        case 'N':	/* Is DB file nibble? (required for -P).  */
+        
+        
+          if (update_arg((void *)&(args_info->useNibble_flag), 0, &(args_info->useNibble_given),
+              &(local_args_info.useNibble_given), optarg, 0, 0, ARG_FLAG,
+              check_ambiguity, override, 1, 0, "useNibble", 'N',
+              additional_error))
+            goto failure;
+        
+          break;
 
         case 0:	/* Long option with no short option */
         case '?':	/* Invalid option.  */

File tools/SeekPrep/cmdline.h

 {
   const char *help_help; /**< @brief Print help and exit help description.  */
   const char *version_help; /**< @brief Print version and exit help description.  */
-  char * dab_arg;	/**< @brief Input dataset (.dab).  */
-  char * dab_orig;	/**< @brief Input dataset (.dab) original value given at command line.  */
-  const char *dab_help; /**< @brief Input dataset (.dab) help description.  */
+  char * dab_arg;	/**< @brief Input dataset (.dab) or databaselet (.db).  */
+  char * dab_orig;	/**< @brief Input dataset (.dab) or databaselet (.db) original value given at command line.  */
+  const char *dab_help; /**< @brief Input dataset (.dab) or databaselet (.db) help description.  */
   int sinfo_flag;	/**< @brief Generates sinfo file (with dataset wide mean and stdev) (default=off).  */
   const char *sinfo_help; /**< @brief Generates sinfo file (with dataset wide mean and stdev) help description.  */
   int gavg_flag;	/**< @brief Generates gene-average file (default=off).  */
   const char *gavg_help; /**< @brief Generates gene-average file help description.  */
+  int gplat_flag;	/**< @brief Generates gene-platform average + stdev file (requires .db input) (default=off).  */
+  const char *gplat_help; /**< @brief Generates gene-platform average + stdev file (requires .db input) help description.  */
   int gpres_flag;	/**< @brief Generates gene-presence file (default=off).  */
   const char *gpres_help; /**< @brief Generates gene-presence file help description.  */
   int gvar_flag;	/**< @brief Generates gene-variance file (default=off).  */
   char * input_arg;	/**< @brief Gene mapping file.  */