YOUNG-SUK LEE avatar YOUNG-SUK LEE committed ed214a9 Merge

merge

Comments (0)

Files changed (60)

         fi
     ],
         [smile_state=check])
-        LOCAL_CHECK_LIB([smile], [smile], [DSL_isFinite(1.0)],
 
+LOCAL_CHECK_LIB([smile], [smile], [DSL_isFinite(1.0)],
     [smile_state=ok],                                         dnl found
     [
             SMILE_LIBS="-lsmile"
          fi
         ],
         [boost_state=check])
-LOCAL_CHECK_LIB([boost_graph], [boost], [],
+prev_boost_state=$boost_state
+AC_LANG_PUSH([C++])
+LOCAL_CHECK_LIB([boost_graph], [boost],
+		[using namespace boost; typedef adjacency_list< vecS, vecS, directedS, property< vertex_color_t, default_color_type >, property< edge_weight_t, double > > Graph],
         [boost_state=ok],                                         dnl found
-        [],                                                       dnl and found installed
-        [],                                                       dnl and found in specified path
+        [BOOST_LIBS="-lboost_graph"], 				              dnl and found installed
+        [
+         BOOST_CFLAGS="-I $BOOST_INCLUDE_DIR"
+         BOOST_LIBS="-L $BOOST_LIB_DIR -lboost_graph"
+        ],                                                        dnl and found in specified path
         [],                                                       dnl not found
         [boost_state=no],                                         dnl and not found installed
         [boost_state=warn],                                       dnl and not found in specified path
         [],                                                       dnl requested without
         [#include "boost/graph/graphviz.hpp"],
-        [],
+		[],
         [-L $BOOST_LIB_DIR -I $BOOST_INCLUDE_DIR])
+AC_LANG_POP
+
 if test "x$boost_state" = "xok"; then
-LOCAL_CHECK_LIB([boost_regex], [boost], [],
+boost_state=$prev_boost_state
+AC_LANG_PUSH([C++])
+LOCAL_CHECK_LIB([boost_regex], [boost], 
+		[boost::regex e("(\\d{4}[- ]){3}\\d{4}")],
         [boost_state=ok],                                         dnl found
         [BOOST_LIBS="-lboost_graph -lboost_regex"],               dnl and found installed
         [
         [boost_state=no],                                         dnl and not found installed
         [boost_state=warn],                                       dnl and not found in specified path
         [],                                                       dnl requested without
-        [#include "boost/graph/graphviz.hpp"],
-        [],
+        [#include "boost/regex.hpp"],
+		[],
         [-L $BOOST_LIB_DIR -I $BOOST_INCLUDE_DIR])
+AC_LANG_POP
 fi
 if test "x$boost_state" != "xok"; then
   AC_DEFINE([NO_BOOST], [1])

configure.incendio

 	--with-svm-perf=/Genomics/fgrid/function/sleipnir-extlib/svm_perf/ \
 	--with-boost-includes=/Genomics/fgrid/function/sleipnir-extlib/boost/ \
 	--with-libsvm=/Genomics/Users/youngl/lib/libsvm-3.17/ \
-    --prefix=/home/youngl/software/sleipnir/
+    --prefix=/home/youngl/software/sleipnir/ \ 
 	LDFLAGS=-static
 # CXXFLAGS=-fno-threadsafe-statics
-./configure --prefix=/memex/qzhu/sleipnir_build --with-log4cpp=/memex/qzhu/usr --with-smile=/memex/qzhu/packages/sleipnir/smile --with-svm-perf=/memex/qzhu/packages/sleipnir/svmperf
+./configure --prefix=/memex/qzhu/sleipnir_build --with-log4cpp=/memex/qzhu/usr --with-smile=/memex/qzhu/packages/sleipnir/smile --with-svm-perf=/memex/qzhu/packages/sleipnir/svmperf --with-boost=/memex/qzhu/usr
 lib_LIBRARIES			= libSleipnir.a
 
-AM_CPPFLAGS = $(GSL_CFLAGS) $(LOG4CPP_CFLAGS) $(SMILE_CFLAGS) $(SVM_PERF_CFLAGS) $(SVM_MULTICLASS_CFLAGS) $(SVM_HIERARCHY_CFLAGS) $(PTHREAD_CFLAGS) $(VOWPAL_WABBIT_CFLAGS) $(LIBSVM_CFLAGS)
+AM_CPPFLAGS = $(BOOST_CFLAGS) $(GSL_CFLAGS) $(LOG4CPP_CFLAGS) $(SMILE_CFLAGS) $(SVM_PERF_CFLAGS) $(SVM_MULTICLASS_CFLAGS) $(SVM_HIERARCHY_CFLAGS) $(PTHREAD_CFLAGS) $(VOWPAL_WABBIT_CFLAGS) $(LIBSVM_CFLAGS)
 
 #LDADD = $(LOG4CPP_LIBS) $(SMILE_LIBS) $(SVM_PERF_LIBS) $(SVM_MULTICLASS_LIBS) $(SVM_HIERARCHY_LIBS) $(PTHREAD_LIBS)
 
 	$(libSleipnir_LIBSVM_SOURCES) \
 	$(libSleipnir_SVM_MULTICLASS_SOURCES)\
 	$(libSleipnir_SVM_HIERARCHY_SOURCES)\
-	vwb.cpp
+	vwb.cpp					\
+	strassen.cpp
 include_HEADERS			=	\
 	annotation.h			\
 	annotationi.h			\
 	statistics.h			\
 	statisticsi.h			\
 	stdafx.h				\
+	strassen.h				\
 	$(libSleipnir_SVM_INCLUDES)  \
 	$(libSleipnir_GSL_SOURCES) \
 	$(libSleipnir_LIBSVM_INCLUDES) \
 const CColor	CColor::c_Red		= CColor( 0xFF, 0x00, 0x00 );
 const CColor	CColor::c_White		= CColor( 0xFF, 0xFF, 0xFF );
 const CColor	CColor::c_Yellow	= CColor( 0xFF, 0xFF, 0x00 );
+const CColor	CColor::c_Blue		= CColor( 0x00, 0x00, 0xFF );
+const CColor	CColor::c_DarkGreen	= CColor( 0x00, 0x64, 0x00 );
+const CColor	CColor::c_Orange	= CColor( 0xFF, 0xA5, 0x00 );
 
 /*!
  * \brief
 	 */
 	static const CColor	c_Yellow;
 
+	static const CColor	c_Blue;
+
+	static const CColor	c_DarkGreen;
+
+	static const CColor	c_Orange;
+
 	static CColor Interpolate( float dValue, const CColor& ColorMinimum, const CColor& ColorMedium,
 		const CColor& ColorMaximum );
 
 				ostm << ", setlinewidth(" << (*pvecdBorders)[ i ] << ")";
 			ostm << "\"";
 			ostm << ", fillcolor = \"" << ( fHashes ? "#" : "" ) << ( pvecdColors ? CColor::Interpolate(
-				(*pvecdColors)[ i ], CColor::c_Cyan, CColor::c_White, CColor::c_Yellow ).ToRGB( ) :
+				(*pvecdColors)[ i ], 
+				CColor::c_Cyan, CColor::c_White, CColor::c_Yellow 
+				//CColor::c_White, CColor::c_Cyan, CColor::c_Yellow 
+				).ToRGB( ) :
 				"FFFFFF" ) << "\"";
 			if( fLabel )
 				ostm << ", label=\"" << strName << "\"";
 				d = 1.0f / ( 1 + exp( ( dAve - d ) / dStd ) );
 				ostm << vecstrNames[ i ] << " -- " << vecstrNames[ j ] << " [weight = " << d <<
 					", color = \"" << ( fHashes ? "#" : "" ) << CColor::Interpolate( d,
-					CColor::c_Green, CColor::c_Black, CColor::c_Red ).ToRGB( ) << "\"];" << endl; }
+					//CColor::c_Green, CColor::c_Black, CColor::c_Red ).ToRGB( ) 
+					CColor::c_Orange, CColor::c_DarkGreen, CColor::c_Blue ).ToRGB( ) 
+					<< "\"];" << endl; }
 
 	ostm << "}" << endl; }
 
 
 }
 
+bool CDatabaselet::Write(char* Qt, const size_t &iSize, const size_t offset){
+	m_fstm.seekg(m_iHeader + offset, ios_base::beg);
+	m_fstm.write(Qt, iSize);
+}
+
 /* simply opens the file without overwriting */
 bool CDatabaselet::OpenNoOverwrite() {
 	m_fstm.clear( );
 	return true;
 }
 
-
-
-/* mainly used by SeekMinder */
+/* mainly used by SeekMiner */
 bool CDatabaselet::Get(size_t iOne, vector<unsigned char>& vecbData){
 	size_t iSize;
 	size_t i, j;
 				vecbData[ j * m_iDatasets + (2 * i) + 1 ] = bValue;
 			}
 		}
-
 	}
 	free(abImage);
 	return true;
 }
 
-
-
 bool CDatabaselet::Get( size_t iOne, size_t iTwo,
 		vector<unsigned char>& vecbData, unsigned char *charImage){
 	size_t offset = GetOffset(iOne, iTwo) - m_iHeader;
 	return true;
 }
 
-
 /*	Static function: combines multiple databaselets (which must share the same genes, ie m_vecStrGenes),
	and outputs the result to a single file, or outputs one gene per file (if a databaselet contains multiple genes).
 	bSplit: whether or not to output one gene per file
 
 	size_t bb = 0;
 	size_t sofar = 0;
+	fprintf(stderr, "Number of times: %d\n", numTimes); 
 	for(bb=0; bb<numTimes; sofar+=sizes[bb], bb++){
+		fprintf(stderr, "bb sizes[bb] sofar: %d %d %d\n", bb, sizes[bb], sofar); 
 
 		fprintf(stderr, "Started allocated memory %lu\n", bb);
 		/* load all Databaselets into memory, for efficiency */
 			size_t iGeneOne, iGeneTwo;
 			size_t offset1, offset2, offset3;
 
+			fprintf(stderr, "GetSizeGene() %d\n", DBS.GetSizeGene());
+			fprintf(stderr, "GetSizePair() %d\n", DBS.GetSizePair());
+			fprintf(stderr, "first->m_iGenes %d\n", first->m_iGenes);
 			if(first->m_useNibble==false){
 				for(iGeneOne = 0; iGeneOne < sizes[bb]; ++iGeneOne){
 					offset1 = DBS.GetSizeGene() * iGeneOne;
 							vector<unsigned char> vc;
 							CDatabaselet *current = vecDatabaselet[iDatum];
 							//size_t offset_c = current->GetSizeGene() * iGeneOne + current->GetSizePair() * iGeneTwo;
-							current->Get( offset1 + offset2, vc, charImages[iDatum]);
+							//current->Get( offset1 + offset2, vc, charImages[iDatum]);
 							//current->Get( iGeneOne, iGeneTwo, vc, charImages[iDatum]);
+							current->Get(current->GetSizeGene() * iGeneOne + current->GetSizePair() * iGeneTwo, vc, charImages[iDatum]);
 							offset3 = offset1 + offset2 + totalSum;
 							for(j=0; j<vc.size(); j++){
 								abImage[offset3 + j] = vc[j];
 						for( iDatum = 0; iDatum  < vecDatabaselet.size(); iDatum ++ ){
 							vector<unsigned char> vc;
 							CDatabaselet *current = vecDatabaselet[iDatum];
-							current->Get( offset1 + offset2, vc, charImages[iDatum]);
+							current->Get(current->GetSizeGene()*iGeneOne + current->GetSizePair()*iGeneTwo, vc, charImages[iDatum]);
+							//current->Get( offset1 + offset2, vc, charImages[iDatum]);
 							//current->Get( iGeneOne, iGeneTwo, vc, charImages[iDatum]);
 							offset3 = totalSum;
 							for(j=0; j<vc.size(); j++){
 	return true;
 }
 
+//Create a copy of current CDatabase collection that has X number of CDatabaselets 
+bool CDatabase::Reorganize(const char *dest_db_dir, const size_t &num_db){
+	int dest_db = num_db;
+	size_t i, j, l;
+	const char c_acExtension[] = ".db";
+	char acNumber[16];
+
+	vector<string> vecstrG;
+	vecstrG.resize(m_mapstriGenes.size());
+
+	for(map<string,size_t>::iterator iter=m_mapstriGenes.begin();
+		iter!=m_mapstriGenes.end(); iter++){
+		string first = iter->first;
+		size_t second = iter->second;
+		vecstrG[second] = first;
+	}
+
+	for(i=0; i<dest_db; i++){
+		vector<string> vecstrSubset;
+		for(j=i; j<vecstrG.size(); j+=dest_db)
+			vecstrSubset.push_back(vecstrG[j]);
+		//size of this db
+		size_t iSize = GetGenes() * GetDatasets() * vecstrSubset.size();
+
+		unsigned char *Qt = (unsigned char*)malloc(iSize);
+		int tot = 0;
+		for(j=0; j<vecstrSubset.size(); j++){
+			size_t k = m_mapstriGenes.find(vecstrSubset[j])->second;
+			vector<unsigned char> Qi;
+			if(!GetGene(k, Qi)){
+				cerr << "Gene error" << endl;
+				continue;
+			}
+			for(l=0; l<Qi.size(); l++)
+				Qt[tot+l] = Qi[l];
+			tot+=Qi.size();
+		}
+		sprintf(acNumber, "%08lu", i);
+		string dest_dir = dest_db_dir;
+		string strFile = dest_dir + "/" + acNumber + c_acExtension;
+
+		CDatabaselet DBS(false);
+		DBS.Open(strFile.c_str(), vecstrSubset, vecstrG.size(), 
+			GetDatasets());
+		DBS.Write((char*) Qt, iSize, 0);
+		free(Qt);
+	}
+	return true;
+}
+
+//For SeekMiner
 bool CDatabase::Open(const string &strDBDirectory,
 		const vector<string> &vecstrGenes, const size_t &iDatasets, const size_t &iNumDBs){
 	return CDatabase::Open(strDBDirectory.c_str(), vecstrGenes, iDatasets, iNumDBs);
 }
 
-
-bool CDatabase::Open(const char *db_dir,
-		const vector<string> &vecstrGenes, const size_t &iDatasets, const size_t &iNumDBs){
+//For SeekMiner
+bool CDatabase::Open(const char *db_dir, const vector<string> &vecstrGenes, 
+	const size_t &iDatasets, const size_t &iNumDBs){
 	size_t i, j, k;
 	Clear();
 	m_vecpDBs.resize(iNumDBs);
 	char acNumber[ 16 ];
 
-	for( i = 0; i < m_vecpDBs.size( ); ++i ) {
+	for( i = 0; i < m_vecpDBs.size( ); ++i )
 		m_vecpDBs[ i ] = new CDatabaselet( m_useNibble );
-	}
 
 	string strDBDirectory = db_dir;
 
 	CDatabase(bool isNibble) : CDatabaseImpl(isNibble){
 	}
 
+	bool Reorganize(const char*, const size_t&);
+
+
 	bool GetGene(const string &, vector<unsigned char>&) const;
 	bool GetGene(const size_t &, vector<unsigned char>&) const;
 
 
 		return ( m_vecpDBs.empty( ) ? 0 : m_vecpDBs[ 0 ]->GetDatasets( ) ); }
 
-
+	//Used by SeekMiner and SeekServer
 	bool Open(const string &, const vector<string> &, const size_t &, const size_t &);
 	bool Open(const char *, const vector<string> &, const size_t &, const size_t &);
 
 
 	bool OpenNoOverwrite();
 
+	//directly write bytes to disk
+	bool Write(char* data, const size_t& iSize, const size_t offset = 0);
+
 	bool OpenWrite( unsigned char, size_t, ENibbles, unsigned char* );
 
 	/* Get pair by referring to memory cache (ie charImage) of the db file */
 	if (pMeasure->IsRank())
 		PCL.RankTransform();
 
+	g_CatSleipnir().info("Number of Experiments: %d", PCL.GetExperiments());
+
 	if ((iLimit != -1) && (PCL.GetGenes() > iLimit))
 		Dat.Open(PCL, pMeasure->Clone(), true);
 	else {
 	if (pMeasure->IsRank())
 		PCL.RankTransform();
 
+	g_CatSleipnir().info("Number of Experiments: %d", PCL.GetExperiments());
+
 	if ((iLimit != -1) && (PCL.GetGenes() > iLimit))
 		Dat.Open(PCL, pMeasure->Clone(), true);
 	else {
 		return;
 
 /*	if (fCDT)
-		ostm << c_szGID << '\t';
-	ostm << m_vecstrFeatures[0];*/
-	for (i = 1; i < m_vecstrFeatures.size(); ++i)
-		ostm << '\t' << m_vecstrFeatures[i];
+		ostm << c_szGID << '\t'; */
+	ostm << m_vecstrFeatures[0]; //Gene name
+//	for (i = 1; i < m_vecstrFeatures.size(); ++i)
+//		ostm << '\t' << m_vecstrFeatures[i];
 	for (i = 0; i < m_vecstrExperiments.size(); ++i)
 		ostm << '\t' << m_vecstrExperiments[i];
 	ostm << endl;

src/seekcentral.cpp

 
 	m_master_rank_threads = NULL;
 	m_sum_weight_threads = NULL;
+	m_sum_sq_weight_threads = NULL; //sum squared weight
 	m_counts_threads = NULL;
 	m_rank_normal_threads = NULL;
 	m_rank_threads = NULL;
 
 	m_master_rank.clear();
 	m_sum_weight.clear();
+	m_sum_sq_weight.clear();
 	m_weight.clear();
 	m_counts.clear();
 	m_mapLoadTime.clear();
 		CSeekTools::Free2DArray(m_sum_weight_threads);
 		m_sum_weight_threads = NULL;
 	}
+	if(m_sum_sq_weight_threads != NULL){
+		CSeekTools::Free2DArray(m_sum_sq_weight_threads);
+		m_sum_sq_weight_threads = NULL;
+	}
 	if(m_counts_threads !=NULL){
 		CSeekTools::Free2DArray(m_counts_threads);
 		m_counts_threads = NULL;
 
 	m_master_rank.clear();
 	m_sum_weight.clear();
+	m_sum_sq_weight.clear();
 	m_counts.clear();
 	m_weight.clear();
 	m_final.clear();
 		}
 	}
 	ss.clear();
+
 	//check
 	vector<char> vc;
 	utype tot = 0;
 		m_vc, src->m_vc, m_vp, src->m_vp, m_vecstrDatasets,
 		m_mapstrstrDatasetPlatform, m_mapstriPlatform);
 
-	if(!CalculateRestart())
+	if(!CalculateRestart()){
+		fprintf(stderr, "Error occurred during CalculateRestart()\n");
 		return false;
+	}
 
-	if(!EnableNetwork(iClient))
+	//fprintf(stderr, "Finished CalculateRestart()\n");
+
+	if(!EnableNetwork(iClient)){
+		fprintf(stderr, "Error occurred during EnableNetwork()\n");
 		return false;
+	}
 
-	if(!CheckDatasets(true)) //replace parameter is true
+	//fprintf(stderr, "Finished EnableNetworks()\n");
+
+	if(!CheckDatasets(true)){ //replace parameter is true
+		fprintf(stderr, "Error occurred during CheckDatasets()\n");
 		return false;
+	}
 
+	//fprintf(stderr, "Finished CheckDatasets()\n");
 	return true;
 }
 
 		vector<int> count;
 		CSeekTools::InitVector(count, m_vecstrAllQuery[l].size(), (int) 0);
 		bool isFirst = true;
+		//fprintf(stderr, "iUserDatasets %d\n", iUserDatasets);
 
 		for(dd=0; dd<iUserDatasets; dd++){
 			utype i = allRDatasets[dd];
 			ss << "|";
 		}
 
+		//fprintf(stderr, "ss %s\n", ss.str().c_str());
 		isFirst = true;		
 		for(j=0; j<m_vecstrAllQuery[l].size(); j++){
 			sq << m_vecstrAllQuery[l][j] << ":" << count[j];
 			sq << "|";
 		}
 
+		//fprintf(stderr, "sq %s\n", sq.str().c_str());
+		//fprintf(stderr, "aq %s\n", aq.str().c_str());
 	}
 
 	string refinedQuery = aq.str();
 			CMeta::Tokenize(sd[i].c_str(), m_vecstrSearchDatasets[i], " ", false);
 		}
 
+		//fprintf(stderr, "replace datasets %d %d\n", sd.size(), m_vecstrSearchDatasets[0].size());
+		//fprintf(stderr, "searchdsetMap size %d\n", m_searchdsetMap.size());
+
 		for(i=0; i<m_searchdsetMap.size(); i++){
 			delete m_searchdsetMap[i];
 		}
 		m_searchdsetMap.clear();
+		//fprintf(stderr, "cleared searchdsetMap\n");
 
 		m_searchdsetMap.resize(m_vecstrAllQuery.size());
 		for(i=0; i<m_vecstrAllQuery.size(); i++){
 		}
 	}
 
-	if(!CalculateRestart()) return false;
+	if(!CalculateRestart()){
+		fprintf(stderr, "Error occurred during CalculateRestart()\n");
+		return false;
+	}
+	//fprintf(stderr, "Finished CalculateRestart()\n");
 
 	return true;
 }
 	CSeekIntIntMap &dMap, vector<float> &weight){
 
 	assert(m_master_rank_threads==NULL && m_counts_threads==NULL &&
-		m_sum_weight_threads==NULL);
+		m_sum_weight_threads==NULL && m_sum_sq_weight_threads==NULL);
 	assert(m_rank_normal_threads==NULL && m_rank_threads==NULL);
 	assert(m_rData==NULL);
 
 		CSeekTools::Init2DArray(m_numThreads, m_iGenes, (float)0);
 	m_sum_weight_threads =
 		CSeekTools::Init2DArray(m_numThreads, m_iGenes, (float)0);
+	m_sum_sq_weight_threads =
+		CSeekTools::Init2DArray(m_numThreads, m_iGenes, (float)0);
 	m_counts_threads =
 		CSeekTools::Init2DArray(m_numThreads, m_iGenes, (utype)0);
 	
 	
 	CSeekTools::InitVector(m_master_rank, m_iGenes, (float) 0);
 	CSeekTools::InitVector(m_sum_weight, m_iGenes, (float) 0);
+	CSeekTools::InitVector(m_sum_sq_weight, m_iGenes, (float) 0);
 	CSeekTools::InitVector(m_counts, m_iGenes, (utype) 0);
 	CSeekTools::InitVector(weight, m_iDatasets, (float)0);
 	
 
 bool CSeekCentral::AggregateThreads(){
 	assert(m_master_rank_threads!=NULL && m_counts_threads!=NULL &&
-		m_sum_weight_threads!=NULL);
+		m_sum_sq_weight_threads!=NULL && m_sum_weight_threads!=NULL);
 	assert(m_rank_normal_threads!=NULL && m_rank_threads!=NULL);
 
-	//Aggregate into three vectors: m_master_rank, m_counts, m_sum_weight
+	//Aggregate into three vectors: m_master_rank, m_counts, m_sum_weight, m_sum_sq_weight
 	utype j, k;
 	for(j=0; j<m_numThreads; j++){
 		for(k=0; k<m_iGenes; k++){
 			m_master_rank[k] += m_master_rank_threads[j][k];
 			m_counts[k] += m_counts_threads[j][k];
 			m_sum_weight[k]+=m_sum_weight_threads[j][k];
+			m_sum_sq_weight[k]+=m_sum_sq_weight_threads[j][k];
 		}
 	}
 
 	CSeekTools::Free2DArray(m_master_rank_threads);
 	CSeekTools::Free2DArray(m_counts_threads);
 	CSeekTools::Free2DArray(m_sum_weight_threads);
+	CSeekTools::Free2DArray(m_sum_sq_weight_threads);
 	m_master_rank_threads=NULL;
 	m_counts_threads=NULL;
 	m_sum_weight_threads = NULL;
+	m_sum_sq_weight_threads = NULL;
 
 	for(j=0; j<m_numThreads; j++){
 		m_rank_normal_threads[j].clear();
 			m_master_rank[j] = -320;
 		else{
 			m_master_rank[j] =
-				(m_master_rank[j] / m_sum_weight[j] - 320) / 100.0;
+				//(m_master_rank[j] / m_sum_weight[j] - 320) / 100.0;
+				(m_master_rank[j] - 320 * m_sum_weight[j]) / 100.0 / m_sum_weight[j];
+				//(m_master_rank[j] - 320 * m_sum_weight[j]) / 100.0 / sqrt(m_sum_sq_weight[j]);
 			if(m_eDistMeasure==CSeekDataset::CORRELATION){
 				m_master_rank[j] = m_master_rank[j] / 3.0;
 			}
 	current_sm = sm;
 
 	for(i=0; i<m_vecstrAllQuery.size(); i++){
-
 		//simulated weight case ======================
 		/*if(simulateWeight && redoWithEqual>=1) //1 or 2 
 			current_sm = EQUAL;
 			}
 			fill(wc.begin(), wc.end(), (float)0);
 		}
-
-
 		//fprintf(stderr, "2 %lu\n", CMeta::GetMemoryUsage());
 
 		#pragma omp parallel for \
 		shared(customGoldStd) \
 		private(dd, d, j) \
-		firstprivate(iSearchDatasets, iQuery) \
+		firstprivate(i, iSearchDatasets, iQuery) \
 		schedule(dynamic)
 		for(dd=0; dd<iSearchDatasets; dd++){
 			d = allRDatasets[dd];
 					}
 				}
 			}
+			else if(current_sm==AVERAGE_Z){
+				CSeekWeighter::AverageWeighting(query, *m_vc[d],
+					m_fPercentQueryAfterScoreCutOff, m_bSquareZ, w);
+				if(w==-1) continue;
+			}
 			else if(current_sm==EQUAL && redoWithEqual==0){
 				w = 1.0;
 			}
 			vector<utype> &Rank_Normal = m_rank_normal_threads[tid];
 			float* Master_Rank = &m_master_rank_threads[tid][0];
 			float* Sum_Weight = &m_sum_weight_threads[tid][0];
+			float* Sum_Sq_Weight = &m_sum_sq_weight_threads[tid][0];
 			utype* Counts = &m_counts_threads[tid][0];
 
 			if(current_sm==ORDER_STATISTICS)
 					//if(Rank_Normal[*iterR]==0) continue;
 					Master_Rank[*iterR] += (float) Rank_Normal[*iterR] * w;
 					Sum_Weight[*iterR] += w;
+					Sum_Sq_Weight[*iterR] += w * w;
 					Counts[*iterR]++;
 				}
 
 	return CSeekCentral::Common(sm);
 }
 
+bool CSeekCentral::AverageWeightSearch(){
+	CSeekCentral::SearchMode sm = AVERAGE_Z;
+	return CSeekCentral::Common(sm, NULL, NULL, NULL, NULL);
+}
+
 bool CSeekCentral::VarianceWeightSearch(){
 	vector<vector<float> > weights;
 	weights.resize(m_vecstrAllQuery.size());

src/seekcentral.h

         CV_CUSTOM=3, /**< Cross-validated weighting, 
                       but instead of using the query genes to cross-validate, use the user 
                       supplied gene-sets to validate each query partition */
-        ORDER_STATISTICS=4 /**< MEM algorithm */
+        ORDER_STATISTICS=4, /**< MEM algorithm */
+        AVERAGE_Z=5 /**< Average z-scores between query, SPELL algorithm */
 	};
     
     /*!
 	bool VarianceWeightSearch();
 
 	/*!
+	 * \brief Run Seek with the SPELL search
+	 *
+	 * \remark Assumes that the CSeekCentral::Initialize() has been called.
+	 */
+	bool AverageWeightSearch();
+
+	/*!
 	 * \brief Run Seek with the order statistics dataset weighting algorithm
 	 *
 	 * \remark Assumes that the CSeekCentral::Initialize() has been called.
 	/* multi-threaded programming */
 	float **m_master_rank_threads;
 	float **m_sum_weight_threads;
+	float **m_sum_sq_weight_threads;
 	utype **m_counts_threads;
 	vector<utype> *m_rank_normal_threads;
 	vector<utype> *m_rank_threads;
 	/* Essential search results */
 	vector<float> m_master_rank;
 	vector<float> m_sum_weight;
+	vector<float> m_sum_sq_weight;
 	vector<utype> m_counts;
 
 	/* Holds results for all queries */

src/seekdataset.cpp

 	DeleteQuery();
 
 	if(iDBSize==0 || dbMap==NULL) return true;
-
 	//must require initializequeryBlock be executed first
 
 	queryMap = new CSeekIntIntMap(iNumGenes);

src/seekreader.cpp

 		return false;
 	}
 	
-	//fprintf(stderr, "Here\n");	
 	#pragma omp parallel for \
 	shared(allQ) private(i) schedule(dynamic)
 	for(i=0; i<iDatasets; i++){
 		return false;
 	}
 
-	size_t m;
-	size_t d;
+	size_t m, d;
 
 	for(i=0; i<allQ.size(); i++){
 		m = allQ[i];
-		for(d=0; d<DB.size(); d++){
+		for(d=0; d<DB.size(); d++){ //number of CDatabase collections
 			vector<unsigned char> Qi;
 			if(!DB[d]->GetGene(m, Qi)){
 				cerr << "Gene does not exist" << endl;
 	fprintf(stderr, "Finished reading query genes' correlations\n");
 	ret = system("date +%s%N 1>&2");
 	if(bNetwork && CSeekNetwork::Send(iClient, 
-		"Finished reading databaselets and query centric")==-1){
+		"Finished reading query genes' correlations")==-1){
 		fprintf(stderr, "Error sending client message\n");
 		return false;
 	}
 
 	vc.clear();
 	vc.resize(iDatasets);
-
 	vp.clear();
 	vp.resize(vp_src.size());
-	for(i=0; i<vp.size(); i++){
+
+	for(i=0; i<vp.size(); i++)
 		vp[i].Copy(vp_src[i]);
-	}
+
 	int ret; //system call returns
 
 	fprintf(stderr, "Start reading average and presence files\n");
 	return true;
 }
 
+bool CSeekTools::ReadMultipleNotQueries(const char *file,
+	vector<vector<vector<string> > > &qList, const int lineSize){
+	qList.clear();
+	FILE *infile;
+	if((infile=fopen(file, "r"))==NULL){
+		fprintf(stderr, "Error opening file %s\n", file);
+		return false;
+	}
+
+	char *acBuffer;
+	int MAX_CHAR_PER_LINE = lineSize;
+	int lineLen = MAX_CHAR_PER_LINE;
+	acBuffer = (char*)malloc(lineLen);
+	while(fgets(acBuffer, lineLen, infile)!=NULL){
+		while(strlen(acBuffer)==lineLen-1){
+			int len = strlen(acBuffer);
+			fseek(infile, -len, SEEK_CUR);
+			lineLen+=MAX_CHAR_PER_LINE;
+			acBuffer = (char*)realloc(acBuffer, lineLen);
+			char *ret = fgets(acBuffer, lineLen, infile);
+		}
+	}
+	rewind(infile);
+
+	while(fgets(acBuffer, lineLen, infile)!=NULL){
+		char *p = strtok(acBuffer, "\n");
+		vector<vector<string> > aQ;
+		vector<string> tok;
+		CMeta::Tokenize(p, tok, "|");
+		aQ.resize(tok.size());
+		int i;
+		for(i=0; i<tok.size(); i++){
+			vector<string> tmp;
+			CMeta::Tokenize(tok[i].c_str(), tmp, " ");
+			aQ[i] = tmp;
+		}
+		qList.push_back(aQ);
+	}
+	qList.resize(qList.size());
+	free(acBuffer);
+	fclose(infile);
+	return true;
+}
+
 bool CSeekTools::ReadMultiGeneOneLine(const string &strFile,
 	vector<string> &list, const int lineSize){
 	return CSeekTools::ReadMultiGeneOneLine(strFile.c_str(), list, lineSize);
 	static bool ReadArray(const char *fileName, vector<tType> &vData){
 		FILE *f = fopen(fileName, "rb");
 		if(f==NULL){
-			cerr << "File not found" << endl;
+			fprintf(stderr, "File not found %s\n", fileName);
 			return false;
 		}
 
 	static bool WriteArray(const char *fileName, const vector<tType> &vData){
 		FILE *f = fopen(fileName, "wb");
 		if(f==NULL){
-			cerr << "File not found" << endl;
+			fprintf(stderr, "File not found %s\n", fileName);
 			return false;
 		}
 		size_t i;
 	static bool ReadMultipleQueries(const string &strFile,
 		vector< vector<string> > &qList, const int lineSize = 1024);
 
+
+	static bool ReadMultipleNotQueries(const char *file,
+		vector<vector<vector<string> > > &qList, const int lineSize = 1024);
+
 	/*!
 	 * \brief Read a list of queries
 	 *

src/seekweight.cpp

 	return true;
 }
 
+bool CSeekWeighter::AverageWeighting(CSeekQuery &sQuery, CSeekDataset &sDataset,
+	const float &percent_required, const bool &bSquareZ, float &w){
+
+	CSeekIntIntMap *mapQ = sDataset.GetQueryMap();
+	if(mapQ==NULL) return true;
+
+	utype i, j, qi, qj;
+
+	const vector<utype> &allQ = sQuery.GetQuery();
+	vector<utype> presentQ;
+	utype num_q = 0;
+	for(qi=0; qi<allQ.size(); qi++){
+		if(CSeekTools::IsNaN(mapQ->GetForward(allQ[qi]))) continue;
+		num_q++;
+		presentQ.push_back(allQ[qi]);
+	}
+
+	w = 0;
+	const utype MIN_QUERY_REQUIRED =
+		max((utype) 2, (utype) (percent_required * allQ.size()));
+
+	if(num_q<MIN_QUERY_REQUIRED){
+		w = -1;
+		return true;
+	}
+
+	utype **f = sDataset.GetDataMatrix();
+	/* as long as rank[g] does not overflow, due to too many queries, we are fine
+	 * should control query size to be <100. */
+	vector<utype> queryPos;
+	queryPos.resize(num_q);
+	for(i=0; i<num_q; i++)
+		queryPos[i] = mapQ->GetForward(presentQ[i]);
+
+	int pairs = 0;
+	if(bSquareZ){
+		for(qi=0; qi<presentQ.size(); qi++){
+			for(qj=0; qj<presentQ.size(); qj++){
+				if(qi==qj) continue;
+				float t = (float) f[presentQ[qj]][queryPos[qi]];
+				t = (t-320)/100.0;
+				t = t * t;
+				w += t;
+				pairs++;
+			}
+		}
+		w /= (float) pairs;
+
+	}else{
+		for(qi=0; qi<presentQ.size(); qi++){
+			for(qj=0; qj<presentQ.size(); qj++){
+				if(qi==qj) continue;
+				w += (float) f[presentQ[qj]][queryPos[qi]];
+				pairs++;
+			}
+		}
+		w /= (float) pairs;
+		w /= (float) 640;
+	}
+	return true;
+}
+
 bool CSeekWeighter::CVWeighting(CSeekQuery &sQuery, CSeekDataset &sDataset,
 	const float &rate, const float &percent_required, const bool &bSquareZ,
 	vector<utype> *rrank, const CSeekQuery *goldStd){
 	static bool OneGeneWeighting(CSeekQuery&, 
 		CSeekDataset&, const float&, const float&, const bool&, 
 		vector<utype>*, const CSeekQuery*);
+
+	static bool AverageWeighting(CSeekQuery &sQuery, CSeekDataset &sDataset,
+		const float &percent_required, const bool &bSquareZ, float &w);
 };
 
 

src/seekwriter.cpp

 	return true;
 }
 
+bool CSeekWriter::TopologicalOverlap(CDataPair &Dat,
+const vector<string> &vecstrGenes){
+	size_t i, j;
+	vector<unsigned int> veciGenes;
+	veciGenes.resize(vecstrGenes.size());
+	for(i=0; i<vecstrGenes.size(); i++)
+		veciGenes[i] = (unsigned int) Dat.GetGeneIndex(vecstrGenes[i]);
+
+	unsigned int s,t;
+	float d;
+	CSeekIntIntMap m(vecstrGenes.size());
+	for(i=0; i<vecstrGenes.size(); i++){
+		if((s=veciGenes[i])==(unsigned int)-1) continue;
+		m.Add(i);
+	}
+
+	size_t trueSize = m.GetNumSet();
+	vector<float> inner(trueSize);
+	vector<vector<float> > fs(trueSize, inner);
+	
+	//float* fs = new float[trueSize*trueSize];
+	const vector<utype> &allRGenes = m.GetAllReverse();
+
+	for(i=0; i<m.GetNumSet(); i++){	
+		s = veciGenes[allRGenes[i]];
+		for(j=i+1; j<m.GetNumSet(); j++){
+			t = veciGenes[allRGenes[j]];
+			if(CMeta::IsNaN(d = Dat.Get(s,t))){
+				fs[i][j] = 0;
+				fs[j][i] = 0;
+				fprintf(stderr, "Warning i, j is NaN, set to 0!\n", i, j);
+			}else{
+				fs[i][j] = d;
+				fs[j][i] = d;
+			}
+		}
+		fs[i][i] = 0;
+	}
+
+	//first step: transform z-scores back to correlation 
+	//(exp(z*2) = (1+r)/(1-r) -> exp(2z) - exp(2z)*r = 1+r -> exp(2z) - 1 = exp(2z)*r + r = (exp(2z) + 1)*r->
+	//(exp(2z) - 1) / (exp(2z) + 1) = r
+	//second step: transform by abs, then take it to the exponent 9
+	for(i=0; i<m.GetNumSet(); i++){	
+		for(j=i+1; j<m.GetNumSet(); j++){
+			if((d = fs[i][j])==0) continue;
+			d = (expf(2.0*d) - 1.0) / (expf(2.0*d) + 1.0);
+			d = pow(abs(d), 9);
+			fs[i][j] = d;
+			fs[j][i] = d;
+			//fprintf(stderr, "%.3e\n", d);
+		}
+	}
+
+	fprintf(stderr, "Finished step 1: tranform z-score back to pearson\n");
+	vector<float> vecSum;
+	CSeekTools::InitVector(vecSum, trueSize, CMeta::GetNaN());
+	for(i=0; i<m.GetNumSet(); i++)
+		vecSum[i] = 0;
+
+	for(i=0; i<m.GetNumSet(); i++){	
+		for(j=i+1; j<m.GetNumSet(); j++){
+			vecSum[i] += fs[i][j];
+			vecSum[j] += fs[i][j];
+		}
+	}
+
+	//duplicate of fs
+	vector<vector<float> > fs2(trueSize, inner);
+	for(i=0; i<m.GetNumSet(); i++){	
+		for(j=i+1; j<m.GetNumSet(); j++){
+			fs2[i][j] = fs[i][j];
+			fs2[j][i] = fs[i][j];
+		}
+		fs2[i][i] = 0;
+	}
+
+
+	//temporary storage matrix
+	CHalfMatrix<float> res;
+	res.Initialize(trueSize);
+	for(i=0; i<m.GetNumSet(); i++){	
+		for(j=i+1; j<m.GetNumSet(); j++){
+			res.Set(i, j, 0);
+		}
+	}
+
+	//result of multiplication
+	fprintf(stderr, "Begin!\n");
+	vector<vector<float> > fs_result(trueSize, inner);
+	CStrassen::strassen(fs, fs2, fs_result, trueSize);
+	fprintf(stderr, "Done!\n");
+
+	size_t k;
+	unsigned int u;
+	//size_t ii = 0;
+	for(i=0; i<m.GetNumSet(); i++){	
+		for(j=i+1; j<m.GetNumSet(); j++){
+			float tsum = fs_result[i][j];
+			/*float *pi = &fs[i*trueSize];
+			float *pj = &fs[j*trueSize];
+			for(k=0; k<m.GetNumSet(); k++){
+				tsum += pi[k] * pj[k];
+			}*/
+			tsum -= fs[i][i] * fs[j][i];
+			tsum -= fs[i][j] * fs[j][j];
+			float tmin = 0;
+			if(vecSum[i] < vecSum[j])
+				tmin = vecSum[i];
+			else
+				tmin = vecSum[j];
+			float to = (tsum + d) / (tmin + 1.0 - d);
+			res.Set(i, j, (float) to); //temporary matrix to store the results
+			//fprintf(stderr, "%.3e\n", to);	
+		}
+		if(i%100==0){
+			fprintf(stderr, "Doing topological overlap calculation, current %d\n", i);
+		}
+	}
+
+	fprintf(stderr, "Finished step 2: topological overlap calculation\n");
+
+	for(i=0; i<m.GetNumSet(); i++){	
+		s = veciGenes[allRGenes[i]];
+		for(j=i+1; j<m.GetNumSet(); j++){
+			t = veciGenes[allRGenes[j]];
+			Dat.Set(s, t, res.Get(i, j));
+		}
+		Dat.Set(s, s, 1.0);
+	}
+
+}
+
+
 bool CSeekWriter::NormalizeDAB(CDataPair &Dat,
-	const vector<string> &vecstrGenes,
-	bool cutoff, bool expTransform, bool divideNorm, bool subtractNorm){
+const vector<string> &vecstrGenes, 
+//bool cutoff, float cutoff_val,
+bool expTransform, bool divideNorm, bool subtractNorm){
+	//default cutoff_val is 0
+	//Normalizes the pairwise scores stored in Dat in place.
+	//expTransform: transform each score d to exp(-d*d/2) before normalizing.
+	//divideNorm:   r = d / sqrt(rowSum_i) / sqrt(rowSum_j)
+	//subtractNorm: r = d minus the per-gene mean score(s)
+	//Exactly one of divideNorm/subtractNorm must be true (checked below).
 
-	utype i, j;
-	vector<utype> veciGenes;
+	size_t i, j;
+	vector<unsigned int> veciGenes;
 	veciGenes.clear();
 	veciGenes.resize(vecstrGenes.size());
-	for( i = 0; i < vecstrGenes.size( ); ++i )
-		veciGenes[ i ] = Dat.GetGene( vecstrGenes[i] );
+	for(i=0; i<vecstrGenes.size(); i++)
+		veciGenes[i] = (unsigned int) Dat.GetGeneIndex(vecstrGenes[i]);
 
 	vector<float> vecSum;
 	vector<int> vecNum;
 	CSeekTools::InitVector(vecSum, vecstrGenes.size(), CMeta::GetNaN());
 	CSeekTools::InitVector(vecNum, vecstrGenes.size(), (int)-9999);
 
+	unsigned int s,t;
+	//Genes absent from Dat (index == (unsigned int)-1) keep the sentinel
+	//values vecSum = NaN / vecNum = -9999 and are skipped in both passes.
 	for(i=0; i<vecstrGenes.size(); i++){
-		utype s = veciGenes[i];
-		if(CSeekTools::IsNaN(s)) continue;
-		float sum = 0;
-		int num = 0;
-		vector<float> all;
-		for(j=0; j<vecstrGenes.size(); j++){
-			utype t = veciGenes[j];
-			float d = Dat.Get(s,t);
-			if(CSeekTools::IsNaN(t)) continue;
-			if(CMeta::IsNaN(d)) continue;
-			if(cutoff){
-				if(d>0){
+		if((s=veciGenes[i])==(unsigned int)-1) continue;
+		vecSum[i] = 0;
+		vecNum[i] = 0;
+	}
+
+	if(divideNorm && subtractNorm){
+		fprintf(stderr, "Error: both divideNorm and subtractNorm are true\n");
+		return false;
+	}else if(!divideNorm && !subtractNorm){
+		fprintf(stderr, "Error: both divideNorm and subtractNorm are false\n");
+		return false;
+	}
+
+	//Pass 1: accumulate per-gene sums and counts of (optionally exp-transformed)
+	//scores over the upper triangle of the matrix.
+	float d = -1;
+	float r = -1;
+	for(i=0; i<vecstrGenes.size(); i++){
+		if((s=veciGenes[i])==(unsigned int)-1) continue;
+		for(j=i+1; j<vecstrGenes.size(); j++){
+			if((t=veciGenes[j])==(unsigned int)-1) continue;
+			if(CMeta::IsNaN(d = Dat.Get(s,t))) continue;
+			/*if(cutoff){
+				if(d>cutoff_val){
 					if(expTransform)
-						all.push_back(expf(-1.0*d*d/2.0));
+						r = expf(-1.0*d*d/2.0);
 					else
-						all.push_back(d);
+						r = d;
+					vecSum[i] += r;
+					vecSum[j] += r;
+					vecNum[i]++;
+					vecNum[j]++;
 				}
 			}
-			else{
+			else{*/
 				//fprintf(stderr, "Warning: Negative Z-Scores");
 				if(expTransform)
-					all.push_back(expf(-1.0*d*d/2.0));
+					r = expf(-1.0*d*d/2.0);
 				else
-					all.push_back(d);
-			}	
+					r = d;
+				vecSum[i] += r;
+				vecSum[j] += r;
+				vecNum[i]++;
+				vecNum[j]++;
+			//}	
 		}
-
-		for(j=0; j<all.size(); j++){
-			sum+=all[j];
-			num++;
-		}
-		vecSum[i] = sum;
-		vecNum[i] = num;
 	}
 
+	//Pass 2: rewrite each pairwise score using the per-gene statistics.
 	for(i=0; i<vecstrGenes.size(); i++){
-		utype s = veciGenes[i];
-		if(CSeekTools::IsNaN(s)) continue;
-		float *v = Dat.GetFullRow(s);
-
-		for(j=0; j<vecstrGenes.size(); j++){
-			utype t = veciGenes[j];
-			float d = v[t];
-			if(CSeekTools::IsNaN(t)) continue;
-			if(CMeta::IsNaN(d)) continue;
-			if(cutoff){
-				if(d>0){
+		if((s=veciGenes[i])==(unsigned int)-1) continue;
+		for(j=i+1; j<vecstrGenes.size(); j++){
+			if((t=veciGenes[j])==(unsigned int)-1) continue;
+			if(CMeta::IsNaN(d = Dat.Get(s,t))) continue;
+			/*if(cutoff){
+				if(d>cutoff_val){
 					if(expTransform){
-						if(divideNorm){
-							float r = expf(-1.0*d*d/2.0) / sqrtf(vecSum[i]) / sqrtf(vecSum[j]);
-							Dat.Set(s, t, r);
-						}else if(subtractNorm){
-							float r = expf(-1.0*d*d/2.0) - vecSum[i] / vecNum[i] - vecSum[j] / vecNum[j];
-							Dat.Set(s, t, r);
-						}
+						if(divideNorm)
+							r=expf(-1.0*d*d/2.0)/sqrtf(vecSum[i])/sqrtf(vecSum[j]);
+						else if(subtractNorm)
+							r=expf(-1.0*d*d/2.0)-vecSum[i]/vecNum[i]-vecSum[j]/vecNum[j];
 					}else{
-						if(divideNorm){
-							float r = d / sqrtf(vecSum[i]) / sqrtf(vecSum[j]);
-							Dat.Set(s, t, r);
-						}else if(subtractNorm){
-							float r = d - vecSum[i] / vecNum[i] - vecSum[j] / vecNum[j];
-							Dat.Set(s, t, r);
-						}
+						if(divideNorm)
+							r=d/sqrtf(vecSum[i])/sqrtf(vecSum[j]);
+						else if(subtractNorm)
+							r=d-vecSum[i]/vecNum[i]-vecSum[j]/vecNum[j];
 					}
 				}else{
-					Dat.Set(s, t, 0);
+					r=0; //default value
 				}
+				Dat.Set(s, t, r);
 			}
-			else{
+			else{*/
 				if(expTransform){
-					if(divideNorm){
-						float r = expf(-1.0*d*d/2.0) / sqrtf(vecSum[i]) / sqrtf(vecSum[j]);
-						Dat.Set(s, t, r);
-					}else if(subtractNorm){
-						float r = expf(-1.0*d*d/2.0) - vecSum[i] / vecNum[i] - vecSum[j] / vecNum[j];
-						Dat.Set(s, t, r);
-					}
+					if(divideNorm)
+						r=expf(-1.0*d*d/2.0)/sqrtf(vecSum[i])/sqrtf(vecSum[j]);
+					else if(subtractNorm)
+						r=expf(-1.0*d*d/2.0)-vecSum[i]/vecNum[i]-vecSum[j]/vecNum[j];
 				}else{
 					if(divideNorm){
-						float r = 0;
 						//DANGEROUS
+						//vecSum can be negative for raw z-scores; sqrt of a
+						//negative value would be NaN, hence this guard.
 						if(vecSum[i]<=0){
-							fprintf(stderr, "Warning, Dangerous, divide sqrt(z), where z could be negative\n");
-							r = 0;
-						}else{
-							r = d / sqrtf(vecSum[i]) / sqrtf(vecSum[j]);
-						}
-						Dat.Set(s, t, r);
+							fprintf(stderr, "Warning, divide sqrt(z), when z<=0\n");
+							r=0; //default value
+						}else
+							r=d/sqrtf(vecSum[i])/sqrtf(vecSum[j]);
 					}else if(subtractNorm){
-						float r = d - vecSum[i] / vecNum[i] - vecSum[j] / vecNum[j];
-						Dat.Set(s, t, r);
+						//NOTE(review): this branch subtracts the AVERAGE of the
+						//two gene means (0.5*(a+b)) while the expTransform
+						//branch above subtracts both means in full — confirm
+						//which formula is intended.
+						r=d-0.5*(vecSum[i]/vecNum[i]+vecSum[j]/vecNum[j]);
 					}
-
 				}
-			}
+				Dat.Set(s, t, r);
+			//}
 		}
-		free(v);
 	}
 
+	//Plot a distribution
+	/*vector<unsigned long> bins;
+	bins.resize(55);
+	float upper = 5.0; //assume z scores
+	float lower = -5.0;
+	float bin_size = (upper - lower) / 50;
+	for(i=0; i<55; i++)
+		bins[i] = 0;
+	for(i=0; i<Dat.GetGenes(); i++){
+		for(j=i+1; j<Dat.GetGenes(); j++){
+			d = Dat.Get(i,j);
+			if(CMeta::IsNaN(d)) continue;
+			int b = (int) ((d - lower) / bin_size);
+			if(b<0){
+				bins[0]++;
+				continue;
+			}
+			if(b>=55){
+				bins[54]++;
+				continue;
+			}
+			bins[b]++;
+		}
+	}
+	fprintf(stderr, 
+	"Distances: bin size: %.5f, num of bins: %d, min bin val: %.5f, max bin val: %.5f\n",
+	bin_size, 55, lower, upper);
+	for(i=0; i<55; i++){
+		fprintf(stderr, "%lu\t%lu\n", i, bins[i]);
+	}
+	*/
 	return true;
 }
 
 #include "sparsematrix.h"
 #include "datapair.h"
 #include "seekreader.h"
+#include "strassen.h"
 
 namespace Sleipnir {
 
 				ret = fread((char*)(&val),1,sizeof(val),f);
 				tType first = id;
 				tType second = id2;
+				//mat is a full matrix
 				mat.Add(first, second, rbp_score[val]);
 				mat.Add(second, first, rbp_score[val]);
 			}
 
 		fprintf(stderr, "Begin normalization using row sum\n");
 		float rv;
+		#pragma omp parallel for \
+		shared(m, mat, allRGenes, vecSqrtSum) \
+		private(ii, i, j, rv) schedule(dynamic)
 		for(ii=0; ii<m.GetNumSet(); ii++){
 			i = (size_t) allRGenes[ii];
 			vector<CPair<float> >::iterator row_it;
 
 	//compatibility
 	template<class tType>
-	static bool WriteSparseMatrix(CDataPair &Dat, vector<map<tType,unsigned short> > &umat,
-	int maxRank, const vector<string> &vecstrGenes, const char *fileName){
+	// Suppresses "dominant" (hub) genes in a sparse similarity matrix.
+	// For each gene i, finds the most-connected gene among i's TOP strongest
+	// neighbors and subtracts that gene's profile from row i (clamped at 0).
+	// The result is symmetrized by max and re-normalized by sqrt of row sums.
+	// mat is modified in place; m maps the genes present in mat.
+	static bool RemoveDominant(CSparseFlatMatrix<float> &mat, 
+	CSeekIntIntMap &m, const vector<string> &vecstrGenes){
+	
+		size_t i, j;
+		vector<vector<float> > tmp_mat, tmp2;
+		tmp_mat.resize(vecstrGenes.size());
+		tmp2.resize(vecstrGenes.size());
+		for(i=0; i<vecstrGenes.size(); i++){
+			tmp_mat[i].resize(vecstrGenes.size());
+			tmp2[i].resize(vecstrGenes.size());
+			for(j=0; j<vecstrGenes.size(); j++){
+				tmp_mat[i][j] = 0;
+				tmp2[i][j] = 0;
+			}
+		}
+
+		//densify the sparse matrix into tmp_mat for random access
+		size_t ii, jj;
+		const vector<utype> &allRGenes = m.GetAllReverse();
+		for(ii=0; ii<m.GetNumSet(); ii++){
+			i = (size_t) allRGenes[ii];
+			vector<CPair<float> >::iterator row_it;
+			for(row_it=mat.RowBegin(i); row_it!=mat.RowEnd(i); row_it++){
+				j = (size_t) row_it->i;
+				tmp_mat[i][j] = row_it->v;
+			}
+		}
+
+		int TOP = 100;
+		//guard: nth_element/sort past vp.end() is undefined when fewer
+		//genes are present than TOP
+		if((size_t) TOP > m.GetNumSet())
+			TOP = (int) m.GetNumSet();
+		fprintf(stderr, "Started removing dominant...\n");
+		for(ii=0; ii<m.GetNumSet(); ii++){
+			i = (size_t) allRGenes[ii];
+			vector<CPair<float> > vp;
+			vp.resize(m.GetNumSet());
+			for(jj=0; jj<m.GetNumSet(); jj++){
+				j = (size_t) allRGenes[jj];
+				vp[jj].i = (utype) j;
+				vp[jj].v = tmp_mat[i][j];
+			}
+			nth_element(vp.begin(), vp.begin()+TOP, vp.end(), CDescendingValue<float>());
+			sort(vp.begin(), vp.begin()+TOP, CDescendingValue<float>());
+
+			//among the top TOP neighbors, find the gene with the largest
+			//total connectivity to the rest of the top set (the dominant gene)
+			size_t k, l;
+			size_t max_g = (size_t) vp[0].i; //gene index of the dominant gene
+			float max_val = -1;
+			for(k=0; k<(size_t)TOP; k++){
+				size_t this_g = vp[k].i;
+				float v = 0;
+				for(l=0; l<(size_t)TOP; l++){
+					size_t other_g = vp[l].i;
+					if(this_g==other_g) continue;
+					v+=tmp_mat[this_g][other_g];
+				}
+				if(v>max_val){
+					max_val = v;
+					//bug fix: store the gene index (vp[k].i), not the rank
+					//position k, which was previously used to index tmp_mat
+					max_g = this_g;
+				}
+			}
+			//subtract the dominant gene's profile from row i, clamping at 0
+			for(jj=0; jj<m.GetNumSet(); jj++){
+				j = (size_t) allRGenes[jj];
+				tmp2[i][j] = tmp_mat[i][j] - tmp_mat[j][max_g];
+				if(tmp2[i][j]<0)	
+					tmp2[i][j] = 0;
+			}
+		}
+
+		fprintf(stderr, "Started re-normalizing matrix...\n");
+
+		//symmetrize by taking the max of (i,j) and (j,i)
+		for(ii=0; ii<m.GetNumSet(); ii++){
+			i = (size_t) allRGenes[ii];
+			for(jj=ii+1; jj<m.GetNumSet(); jj++){
+				j = (size_t) allRGenes[jj];
+				float mx = max(tmp2[i][j], tmp2[j][i]); //renamed: was shadowing parameter m
+				tmp2[i][j] = mx;
+				tmp2[j][i] = mx;
+			}
+		}
+
+		vector<float> vecSum;
+		CSeekTools::InitVector(vecSum, vecstrGenes.size(), (float) 0);
+		for(ii=0; ii<m.GetNumSet(); ii++){
+			i = (size_t) allRGenes[ii];
+			for(jj=0; jj<m.GetNumSet(); jj++){
+				j = (size_t) allRGenes[jj];
+				vecSum[i] += tmp2[i][j];
+			}
+		}
+
+		vector<float> vecSqrtSum;
+		CSeekTools::InitVector(vecSqrtSum, vecstrGenes.size(), (float) 0);
+		for(ii=0; ii<m.GetNumSet(); ii++){
+			i = (size_t) allRGenes[ii];
+			if(vecSum[i]==0) continue;
+			vecSqrtSum[i] = sqrtf(vecSum[i]);
+		}
+
+		//write the normalized values back into the sparse matrix
+		for(ii=0; ii<m.GetNumSet(); ii++){
+			i = (size_t) allRGenes[ii];
+			vector<CPair<float> >::iterator row_it;
+			for(row_it=mat.RowBegin(i); row_it!=mat.RowEnd(i); row_it++){
+				j = (size_t) row_it->i;
+				if(vecSqrtSum[i]==0 || vecSqrtSum[j]==0) continue;
+				row_it->v = tmp2[i][j] / vecSqrtSum[i] / vecSqrtSum[j];
+				//fprintf(stderr, "%d %d %.3e\n", i, j, row_it->v);
+			}
+		}
+		return true;
+	}
+
+	//compatibility
+	template<class tType>
+	static bool WriteSparseMatrix(CDataPair &Dat, 
+	vector<map<tType,unsigned short> > &umat, 
+	const vector<string> &vecstrGenes, const char *fileName){
 
 		FILE *f = fopen(fileName, "wb");
 		if(f==NULL){
 
 	//compatiblity
 	template<class tType>
-	static bool GetSparseRankMatrix(CDataPair &Dat, vector<map<tType,unsigned short> > &umat, 
-	int maxRank, const vector<string> &vecstrGenes){
+	static bool GetSparseRankMatrix(CDataPair &Dat, 
+	vector<map<tType,unsigned short> > &umat, int maxRank, 
+	const vector<string> &vecstrGenes){
 	
 		size_t i, j;
 		vector<tType> veciGenes;
 		return true;
 	}
 
+	//To be used after NormalizeDAB
+	//Builds a sparse representation of Dat: for every gene pair (i,j) whose
+	//normalized score r exceeds cutoff_val, stores (unsigned short)(r*100)
+	//in umat[i][j] (upper triangle only).
+	//NOTE(review): assumes scores fit in unsigned short after *100 — confirm
+	//that normalized scores never exceed ~655.
+	template<class tType>
+	static bool ConvertToSparseMatrix(CDataPair &Dat,
+	vector<map<tType,unsigned short> > &umat,
+	const vector<string> &vecstrGenes, const float cutoff_val){
+
+		size_t i, j;
+		vector<tType> veciGenes;
+		veciGenes.resize(vecstrGenes.size());
+		for( i = 0; i < vecstrGenes.size( ); ++i )
+			veciGenes[ i ] = (tType) Dat.GetGeneIndex( vecstrGenes[i] );
+		umat.resize(vecstrGenes.size());
+		for(i=0; i<vecstrGenes.size(); i++)
+			umat[i] = map<tType, unsigned short>();
+
+		tType s,t;
+		for(i=0; i<vecstrGenes.size(); i++){
+			if((s=veciGenes[i])==(tType)-1) continue; //gene absent from Dat
+			//bug fix: i is size_t but %d expects int — mismatched format
+			//specifier is undefined behavior; cast to match
+			if(i%1000==0) fprintf(stderr, "Start reading gene %d...\n", (int) i);
+
+			for(j=i+1; j<vecstrGenes.size(); j++){
+				if((t=veciGenes[j])==(tType)-1) continue;
+				float r = Dat.Get(s,t);
+				if(CMeta::IsNaN(r)) continue;
+				if(r > cutoff_val)
+					umat[i][j] = (unsigned short) (r * 100.0);
+			}
+		}
+		fprintf(stderr, "Finished reading DAB\n");
+		return true;
+	}
+
+	//to be used for sparse matrix created from cutting-off z-scores
+	//File format: [numPresent][numPresent gene IDs][numGenes], then for each
+	//gene: [id][numEntries] followed by numEntries (id2, z*100) pairs.
+	//Scores are decoded as val/100 and optionally raised to 'exponent'.
+	template<class tType>
+	static bool ReadSeekSparseMatrix(const char *fileName,
+	CSparseFlatMatrix<float> &mat, CSeekIntIntMap &m, 
+	const vector<string> &vecstrGenes, const int initialCapacity, 
+	const float exponent){
+	
+		if(exponent<1.0){
+			fprintf(stderr, "Exponent must be >=1.0\n");
+			return false;
+		}
+	
+		FILE *f = fopen(fileName, "rb");
+		if(f==NULL){
+			cerr << "File not found" << endl;
+			return false;
+		}
+
+		size_t i, j;
+		tType numGenes, numPresent, val;
+
+		mat.Initialize(vecstrGenes.size());
+		//bug fix: fread return values were previously ignored; a truncated
+		//file would have left these variables uninitialized (undefined
+		//behavior). Each read is now checked and fails cleanly.
+		if(fread((char*) (&numPresent), 1, sizeof(numPresent), f)!=sizeof(numPresent)){
+			cerr << "Truncated file" << endl;
+			fclose(f);
+			return false;
+		}
+		for(j=0; j<numPresent; j++){
+			if(fread((char*)(&val), 1, sizeof(val), f)!=sizeof(val)){ //val = gene ID
+				cerr << "Truncated file" << endl;
+				fclose(f);
+				return false;
+			}
+			m.Add((utype) val);
+			mat.InitializeRow(val, initialCapacity); //initial capacity
+		}
+		if(fread((char*) (&numGenes), 1, sizeof(numGenes), f)!=sizeof(numGenes)){
+			cerr << "Truncated file" << endl;
+			fclose(f);
+			return false;
+		}
+
+		for(i=0; i<numGenes; i++){
+			tType id, id2;  //gene ID
+			unsigned short numEntries, score; //z-scores * 100.0 (renamed: was shadowing outer val)
+			if(fread((char*)(&id), 1, sizeof(id), f)!=sizeof(id) ||
+				fread((char*)(&numEntries), 1, sizeof(numEntries), f)!=sizeof(numEntries)){
+				cerr << "Truncated file" << endl;
+				fclose(f);
+				return false;
+			}
+			for(j=0; j<numEntries; j++){
+				if(fread((char*)(&id2),1,sizeof(id2),f)!=sizeof(id2) ||
+					fread((char*)(&score),1,sizeof(score),f)!=sizeof(score)){
+					cerr << "Truncated file" << endl;
+					fclose(f);
+					return false;
+				}
+				tType first = id;
+				tType second = id2;
+				float fval = (float) score / 100.0;
+				if(exponent>1.0)
+					fval = pow(fval, exponent);
+				mat.Add(first, second, fval);
+				mat.Add(second, first, fval);
+			}
+		}
+		fclose(f);
+
+		mat.Organize();
+		return true;
+	}
+
 	//===============================================================
 	//not currently used
 	static bool ReadSparseMatrix(const char *fileName, 
 	static bool SumSparseMatrix(CSparseFlatMatrix<float> &mat1,
 		CHalfMatrix<float> &res, const CSeekIntIntMap &mi, const float w);
 
-	static bool NormalizeDAB(CDataPair &Dat,
-		const vector<string> &vecstrGenes,
-		bool cutoff, bool expTransform, bool divideNorm, bool subtractNorm);
+	static bool NormalizeDAB(CDataPair &Dat, const vector<string> &vecstrGenes,
+		//bool cutoff, float cutoff_val, 
+		bool expTransform, bool divideNorm, bool subtractNorm);
 
 	static bool GetGeneAverage(CDataPair &Dat,
 		const vector<string> &vecstrGenes,
 		vector<char> &vecResult);
 	static bool GetDatasetSinfo(CDataPair &Dat, float &mean,
 		float &stdev);
+
+	static bool TopologicalOverlap(CDataPair &Dat,
+		const vector<string> &vecstrGenes);
 };
 
 }
+#include "strassen.h"
+
+namespace Sleipnir {
+
+// Naive i-k-j matrix multiply: C += A*B for n x n matrices.
+// Used as the base case of strassenR below. Note that C is accumulated
+// into, not overwritten — callers pass a zero-initialized C.
+// NOTE(review): A and B are passed by value and copied on every call;
+// const references would avoid the copies (the declaration in strassen.h
+// would need to change as well).
+void CStrassen::ikjalgorithm(vector<vector<float> > A,
+vector<vector<float> > B, vector<vector<float> > &C, int n) {
+	for (int i = 0; i < n; i++)
+		for (int k = 0; k < n; k++)
+			for (int j = 0; j < n; j++)
+				C[i][j] += A[i][k] * B[k][j];
+}
+ 
+// Recursive Strassen multiply: C = A*B, where tam (the matrix size) is a
+// power of two (guaranteed by strassen(), which pads the inputs).
+// Falls back to the naive ikj multiply once tam <= leafsize.
+void CStrassen::strassenR(vector< vector<float> > &A,
+vector< vector<float> > &B,
+vector< vector<float> > &C, int tam) {
+	int leafsize = 64; //base-case size: below this the O(n^3) multiply is used
+
+	if (tam <= leafsize){
+		CStrassen::ikjalgorithm(A, B, C, tam);
+		return;
+	}
+	// other cases are treated here:
+	else {
+		//fprintf(stderr, "TAM: %d\n", tam);
+		int newTam = tam/2;
+		vector<float> inner (newTam);
+		vector<vector<float> >
+			a11(newTam,inner), a12(newTam,inner), a21(newTam,inner), a22(newTam,inner),
+			b11(newTam,inner), b12(newTam,inner), b21(newTam,inner), b22(newTam,inner),
+			c11(newTam,inner), c12(newTam,inner), c21(newTam,inner), c22(newTam,inner),
+			p1(newTam,inner), p2(newTam,inner), p3(newTam,inner), p4(newTam,inner),
+			p5(newTam,inner), p6(newTam,inner), p7(newTam,inner),
+			aResult(newTam,inner), bResult(newTam,inner);
+  
+		int i, j;
+		//dividing the matrices in 4 sub-matrices:
+		for (i = 0; i < newTam; i++) {
+			for (j = 0; j < newTam; j++) {
+				a11[i][j] = A[i][j];
+				a12[i][j] = A[i][j + newTam];
+				a21[i][j] = A[i + newTam][j];
+				a22[i][j] = A[i + newTam][j + newTam];
+
+				b11[i][j] = B[i][j];
+				b12[i][j] = B[i][j + newTam];
+				b21[i][j] = B[i + newTam][j];
+				b22[i][j] = B[i + newTam][j + newTam];
+			}
+		}
+  
+        // Calculating p1 to p7:
+		CStrassen::sum(a11, a22, aResult, newTam); // a11 + a22
+		CStrassen::sum(b11, b22, bResult, newTam); // b11 + b22
+		CStrassen::strassenR(aResult, bResult, p1, newTam); // p1 = (a11+a22) * (b11+b22)
+
+		CStrassen::sum(a21, a22, aResult, newTam); // a21 + a22
+		CStrassen::strassenR(aResult, b11, p2, newTam); // p2 = (a21+a22) * (b11)
+  
+		CStrassen::subtract(b12, b22, bResult, newTam); // b12 - b22
+		CStrassen::strassenR(a11, bResult, p3, newTam); // p3 = (a11) * (b12 - b22)
+  
+		CStrassen::subtract(b21, b11, bResult, newTam); // b21 - b11
+		CStrassen::strassenR(a22, bResult, p4, newTam); // p4 = (a22) * (b21 - b11)
+  
+		CStrassen::sum(a11, a12, aResult, newTam); // a11 + a12
+		CStrassen::strassenR(aResult, b22, p5, newTam); // p5 = (a11+a12) * (b22)  
+  
+		CStrassen::subtract(a21, a11, aResult, newTam); // a21 - a11
+		CStrassen::sum(b11, b12, bResult, newTam); // b11 + b12
+		CStrassen::strassenR(aResult, bResult, p6, newTam); // p6 = (a21-a11) * (b11+b12)
+  
+		CStrassen::subtract(a12, a22, aResult, newTam); // a12 - a22
+		CStrassen::sum(b21, b22, bResult, newTam); // b21 + b22
+		CStrassen::strassenR(aResult, bResult, p7, newTam); // p7 = (a12-a22) * (b21+b22)
+
+        // calculating c11, c12, c21 and c22:
+		CStrassen::sum(p3, p5, c12, newTam); // c12 = p3 + p5
+		CStrassen::sum(p2, p4, c21, newTam); // c21 = p2 + p4
+  
+		CStrassen::sum(p1, p4, aResult, newTam); // p1 + p4
+		CStrassen::sum(aResult, p7, bResult, newTam); // p1 + p4 + p7
+		CStrassen::subtract(bResult, p5, c11, newTam); // c11 = p1 + p4 - p5 + p7
+
+		CStrassen::sum(p1, p3, aResult, newTam); // p1 + p3
+		CStrassen::sum(aResult, p6, bResult, newTam); // p1 + p3 + p6
+		CStrassen::subtract(bResult, p2, c22, newTam); // c22 = p1 + p3 - p2 + p6
+
+		// Grouping the results obtained in a single matrix:
+		for (i = 0; i < newTam ; i++) {
+			for (j = 0 ; j < newTam ; j++) {
+				C[i][j] = c11[i][j];
+				C[i][j + newTam] = c12[i][j];
+				C[i + newTam][j] = c21[i][j];
+				C[i + newTam][j + newTam] = c22[i][j];
+			}
+		}
+	}
+}
+ 
+// Smallest power of two >= n, computed via floating-point log2/ceil.
+// NOTE(review): floating-point rounding could mis-round for very large n,
+// and n <= 0 is undefined (log2 of a non-positive value) — callers are
+// expected to pass matrix sizes >= 1.
+unsigned int CStrassen::nextPowerOfTwo(int n) {
+	return pow(2, int(ceil(log2(n))));
+}
+ 
+// Public entry point: C = A*B for n x n matrices.
+// Pads A and B with zeros up to the next power of two, runs the recursive
+// Strassen multiply, then copies the top-left n x n block back into C.
+// NOTE(review): C must already be sized n x n by the caller.
+void CStrassen::strassen(vector< vector<float> > &A,
+vector< vector<float> > &B,
+vector< vector<float> > &C, unsigned int n) {
+
+	//unsigned int n = tam;
+	unsigned int m = CStrassen::nextPowerOfTwo(n);
+	vector<float> inner(m);
+	vector< vector<float> > APrep(m, inner), BPrep(m, inner), CPrep(m, inner);
+ 
+	for(unsigned int i=0; i<n; i++) {
+		for (unsigned int j=0; j<n; j++) {
+			APrep[i][j] = A[i][j];
+			BPrep[i][j] = B[i][j];
+		}
+	}
+ 
+	CStrassen::strassenR(APrep, BPrep, CPrep, m);
+	for(unsigned int i=0; i<n; i++)
+		for (unsigned int j=0; j<n; j++)
+			C[i][j] = CPrep[i][j];
+}
+ 
+// Elementwise matrix addition: C = A + B for tam x tam matrices.
+void CStrassen::sum(vector< vector<float> > &A,
+vector< vector<float> > &B,
+vector< vector<float> > &C, int tam) {
+	int i, j;
+	for (i = 0; i < tam; i++)
+		for (j = 0; j < tam; j++)
+			C[i][j] = A[i][j] + B[i][j];
+}
+ 
+// Elementwise matrix subtraction: C = A - B for tam x tam matrices.
+void CStrassen::subtract(vector< vector<float> > &A,
+vector< vector<float> > &B,
+vector< vector<float> > &C, int tam) {
+	int i, j;
+	for (i = 0; i < tam; i++)
+		for (j = 0; j < tam; j++) 
+			C[i][j] = A[i][j] - B[i][j]; 
+}
+
+/* 
+int CStrassen::getMatrixSize(string filename) {
+	string line;
+	ifstream infile;
+	infile.open(filename.c_str());
+	getline(infile, line);
+	return count(line.begin(), line.end(), '\t') + 1;
+}*/
+ 
+/*
+void read(string filename, vector< vector<float> > &A, vector< vector<float> > &B) {
+	string line;
+	FILE* matrixfile = freopen(filename.c_str(), "r", stdin);
+     
+	if (matrixfile == 0) {
+        cerr << "Could not read file " << filename << endl;
+        return;
+    }
+ 
+    int i = 0, j, a;
+    while (getline(cin, line) && !line.empty()) {
+        istringstream iss(line);
+        j = 0;
+        while (iss >> a) {
+            A[i][j] = a;
+            j++;
+        }
+        i++;
+    }
+ 
+    i = 0;
+    while (getline(cin, line)) {
+        istringstream iss(line);
+        j = 0;
+        while (iss >> a) {
+            B[i][j] = a;
+            j++;
+        }
+        i++;
+    }
+ 
+    fclose (matrixfile);
+}
+ 
+void printMatrix(vector< vector<float> > matrix, int n) {
+    for (int i=0; i < n; i++) {
+        for (int j=0; j < n; j++) {
+            if (j != 0) {
+                cout << "\t";
+            }
+            cout << matrix[i][j];
+        }
+        cout << endl;
+    }
+}
+ 
+int main (int argc, char* argv[]) {
+    string filename;
+    if (argc < 3) {
+        filename = "2000.in";
+    } else {
+        filename = argv[2];
+    }
+ 
+    if (argc < 5) {
+        leafsize = 16;
+    } else {
+        leafsize = atoi(argv[4]);
+    }
+ 
+    int n = getMatrixSize(filename);
+    vector<float> inner (n);
+    vector< vector<float> > A(n, inner), B(n, inner), C(n, inner);
+    read (filename, A, B);
+    strassen(A, B, C, n);
+    printMatrix(C, n);
+    return 0;
+}*/
+
+}
+#ifndef STRASSEN_H
+#define STRASSEN_H
+
+#include "seekbasic.h"
+#include <sstream>
+#include <fstream>
+
+namespace Sleipnir {
+
+// Strassen multiplication of square float matrices (see strassen.cpp).
+// Only strassen() is public; the helpers implement the recursion.
+class CStrassen { 
+public:
+// Set LEAF_SIZE to 1 if you want to use the pure strassen algorithm
+// otherwise, the ikj-algorithm will be applied when the split
+// matrices are as small as LEAF_SIZE x LEAF_SIZE
+/*
+ * Implementation of the strassen algorithm, similar to
+ * http://en.wikipedia.org/w/index.php?title=Strassen_algorithm&oldid=498910018#Source_code_of_the_Strassen_algorithm_in_C_language
+ */
+// Computes C = A * B for tam x tam matrices; C must be pre-sized by the caller.
+static void strassen(vector<vector<float> > &A,
+vector<vector<float> > &B,
+vector<vector<float> > &C, unsigned int tam);
+
+private:
+
+//int leafsize = 64;
+// Naive O(n^3) i-k-j multiply used as the recursion base case.
+static void ikjalgorithm(vector<vector<float> > A,
+vector<vector<float> > B, vector<vector<float> > &C, int n);
+
+// Smallest power of two >= n.
+static unsigned int nextPowerOfTwo(int n);
+// Recursive Strassen step; tam must be a power of two.
+static void strassenR(vector<vector<float> > &A,
+vector<vector<float> > &B,
+vector<vector<float> > &C, int tam);
+// C = A + B, elementwise.
+static void sum(vector<vector<float> > &A,
+vector<vector<float> > &B,
+vector<vector<float> > &C, int tam);
+// C = A - B, elementwise.
+static void subtract(vector<vector<float> > &A,
+vector<vector<float> > &B,
+vector<vector<float> > &C, int tam);
+//void printMatrix(vector<vector<float> > matrix, int n);
+//void read(string filename, vector<vector<float> > &A, vector<vector<float> > &B);
+};
+
+}
+#endif

tools/DBCombiner/DBCombiner.cpp

 		useNibble = true;
 	}
 
-	CDatabase DB(useNibble);
+	if(sArgs.reorganize_flag==1){
+		vector<string> vecstrDataset;
+		ifstream ifsm2;
+		ifsm2.open(sArgs.dataset_arg);
+		while(!ifsm2.eof()){
+			ifsm2.getline(acBuffer, c_iBuffer-1);
+			if(acBuffer[0]==0) break;
+			acBuffer[c_iBuffer-1] = 0;
+			vector<string> vecstrLine;
+			CMeta::Tokenize(acBuffer, vecstrLine);
+			vecstrDataset.push_back(vecstrLine[0]);
+		}
+		ifsm2.close();
 
-	bool fSplit = false;
-	if(sArgs.split_flag==1){
-		fSplit = true;
+		if(useNibble){
+			fprintf(stderr, "The use of nibble flag is not supported for --reorganize mode\n");
+			return 1;
+		}
+		CDatabase db(false);
+		db.Open(sArgs.db_dir_arg, vecstrGenes, vecstrDataset.size(), 
+			sArgs.src_db_num_arg);
+		db.Reorganize(sArgs.dest_db_dir_arg, sArgs.dest_db_num_arg);
+		return 0;
 	}
 
-	if(sArgs.db_arg){
-		ifsm.open(sArgs.db_arg);
-		while(!pistm->eof()){
-			pistm->getline(acBuffer, c_iBuffer -1);
-			if(acBuffer[0]==0){
-				break;
+	if(sArgs.combine_flag==1){
+		CDatabase DB(useNibble);
+
+		bool fSplit = false;
+		if(sArgs.split_flag==1){
+			fSplit = true;
+		}
+
+		if(sArgs.db_arg){
+			ifsm.open(sArgs.db_arg);
+			while(!pistm->eof()){
+				pistm->getline(acBuffer, c_iBuffer -1);
+				if(acBuffer[0]==0){
+					break;
+				}
+				acBuffer[c_iBuffer-1] = 0;
+				vecstrDBs.push_back(acBuffer);
 			}
-			acBuffer[c_iBuffer-1] = 0;
-			vecstrDBs.push_back(acBuffer);
+			vecstrDBs.resize(vecstrDBs.size());
+			ifsm.close();
+
+			//printf("Reading DBS"); getchar();
+			vector<CDatabaselet*> DBS;
+			DBS.resize(vecstrDBs.size());
+			for(i=0; i<vecstrDBs.size(); i++){
+				DBS[i] = new CDatabaselet(useNibble);
+				DBS[i]->Open(vecstrDBs[i]);
+			}
+			//printf("Finished reading DBS"); getchar();
+
+			CDatabaselet::Combine(DBS, sArgs.dir_out_arg, vecstrGenes, fSplit);
+			for(i=0; i<vecstrDBs.size(); i++){
+				free(DBS[i]);
+			}
+
+		}else{
+			cerr << "Must give a db list." << endl;
+			return 1;
+
 		}
-		vecstrDBs.resize(vecstrDBs.size());
-		ifsm.close();
-
-		//printf("Reading DBS"); getchar();
-		vector<CDatabaselet*> DBS;
-		DBS.resize(vecstrDBs.size());
-		for(i=0; i<vecstrDBs.size(); i++){
-	    	DBS[i] = new CDatabaselet(useNibble);
-	    	DBS[i]->Open(vecstrDBs[i]);
-	    }
-		//printf("Finished reading DBS"); getchar();
-
-	    CDatabaselet::Combine(DBS, sArgs.dir_out_arg, vecstrGenes, fSplit);
-	    for(i=0; i<vecstrDBs.size(); i++){
-	    	free(DBS[i]);
-	    }
-
-	}else{
-		cerr << "Must give a db list." << endl;
-		return 1;
-
 	}
-
 #ifdef WIN32
 	pthread_win32_process_detach_np( );
 #endif // WIN32

tools/DBCombiner/DBCombiner.ggo

 version	"1.0"
 purpose	"Combines a list of DB files with the same gene content"
 
+section "Mode"
+option	"combine"			C	"Combine a set of DB's, each coming from a different dataset subset"
+								flag	off
+option	"reorganize"		R	"Reorganize a set of DB's, such as from 21000 DB files to 1000 DB files, i.e. expanding/shrinking the number of genes a DB contains"
+								flag	off
+
 section "Main"
+option	"input"				i	"Input gene mapping"
+								string	typestr="filename"	yes	
+
+section "Combine Mode"
 option	"db"				x	"Input a set of databaselet filenames (including path)"
 								string typestr="filename"
-option	"input"				i	"Input gene mapping"
-								string	typestr="filename"	
-option	"dir_out"			D	"Database directory"
+option	"dir_out"			D	"Output database directory"
 								string	typestr="directory"	default="."
 option	"is_nibble"			N	"Whether the input DB is nibble type"
 								flag	off
 option	"split"				s	"Split to one-gene per file"
-								flag	off
+								flag	off
+
+section "Reorganize Mode"
+option	"dataset"			A	"Dataset-platform mapping file"
+								string typestr="filename"
+option	"db_dir"			d	"Source DB collection directory"
+								string typestr="directory"
+option	"src_db_num"		n	"Source DB number of files"
+								int
+option	"dest_db_num"		b	"Destination DB number of files"
+								int
+option	"dest_db_dir"		B	"Destination DB directory"
+								string typestr="directory"

tools/DBCombiner/cmdline.c

 /*
   File autogenerated by gengetopt version 2.22
   generated with the following command:
-  gengetopt -iDBCombiner.ggo --default-optional -u -N -e 
+  /memex/qzhu/usr/bin/gengetopt -iDBCombiner.ggo --default-optional -u -N -e 
 
   The developers of gengetopt consider the fixed text that goes in all
   gengetopt output files to be in the public domain:
 const char *gengetopt_args_info_description = "";
 
 const char *gengetopt_args_info_help[] = {
-  "  -h, --help               Print help and exit",
-  "  -V, --version            Print version and exit",
+  "  -h, --help                   Print help and exit",
+  "  -V, --version                Print version and exit",
+  "\nMode:",
+  "  -C, --combine                Combine a set of DB's, each coming from a \n                                 different dataset subset  (default=off)",
+  "  -R, --reorganize             Reorganize a set of DB's, such as from 21000 DB \n                                 files to 1000 DB files, ie expanding/shrinking \n                                 the number of genes a DB contains  \n                                 (default=off)",
   "\nMain:",
-  "  -x, --db=filename        Input a set of databaselet filenames (including \n                             path)",
-  "  -i, --input=filename     Input gene mapping",
-  "  -D, --dir_out=directory  Database directory  (default=`.')",
-  "  -N, --is_nibble          Whether the input DB is nibble type  (default=off)",
-  "  -s, --split              Split to one-gene per file  (default=off)",
+  "  -i, --input=filename         Input gene mapping",
+  "\nCombine Mode:",
+  "  -x, --db=filename            Input a set of databaselet filenames (including \n                                 path)",
+  "  -D, --dir_out=directory      Output database directory  (default=`.')",
+  "  -N, --is_nibble              Whether the input DB is nibble type  \n                                 (default=off)",
+  "  -s, --split                  Split to one-gene per file  (default=off)",
+  "\nReorganize Mode:",
+  "  -A, --dataset=filename       Dataset-platform mapping file",
+  "  -d, --db_dir=directory       Source DB collection directory",
+  "  -n, --src_db_num=INT         Source DB number of files",
+  "  -b, --dest_db_num=INT        Destination DB number of files",
+  "  -B, --dest_db_dir=directory  Destination DB directory",
     0
 };
 
 typedef enum {ARG_NO
   , ARG_FLAG
   , ARG_STRING
+  , ARG_INT
 } cmdline_parser_arg_type;
 
 static
 cmdline_parser_internal (int argc, char * const *argv, struct gengetopt_args_info *args_info,
                         struct cmdline_parser_params *params, const char *additional_error);
 
+static int
+cmdline_parser_required2 (struct gengetopt_args_info *args_info, const char *prog_name, const char *additional_error);
 
 static char *
 gengetopt_strdup (const char *s);
 {
   args_info->help_given = 0 ;
   args_info->version_given = 0 ;
+  args_info->combine_given = 0 ;
+  args_info->reorganize_given = 0 ;
+  args_info->input_given = 0 ;
   args_info->db_given = 0 ;
-  args_info->input_given = 0 ;
   args_info->dir_out_given = 0 ;
   args_info->is_nibble_given = 0 ;
   args_info->split_given = 0 ;
+  args_info->dataset_given = 0 ;
+  args_info->db_dir_given = 0 ;
+  args_info->src_db_num_given = 0 ;
+  args_info->dest_db_num_given = 0 ;
+  args_info->dest_db_dir_given = 0 ;
 }
 
 static
 void clear_args (struct gengetopt_args_info *args_info)
 {
+  args_info->combine_flag = 0;
+  args_info->reorganize_flag = 0;
+  args_info->input_arg = NULL;
+  args_info->input_orig = NULL;
   args_info->db_arg = NULL;
   args_info->db_orig = NULL;
-  args_info->input_arg = NULL;
-  args_info->input_orig = NULL;
   args_info->dir_out_arg = gengetopt_strdup (".");
   args_info->dir_out_orig = NULL;
   args_info->is_nibble_flag = 0;
   args_info->split_flag = 0;
+  args_info->dataset_arg = NULL;
+  args_info->dataset_orig = NULL;
+  args_info->db_dir_arg = NULL;
+  args_info->db_dir_orig = NULL;
+  args_info->src_db_num_orig = NULL;
+  args_info->dest_db_num_orig = NULL;
+  args_info->dest_db_dir_arg = NULL;
+  args_info->dest_db_dir_orig = NULL;
   
 }
 
 
   args_info->help_help = gengetopt_args_info_help[0] ;
   args_info->version_help = gengetopt_args_info_help[1] ;
-  args_info->db_help = gengetopt_args_info_help[3] ;
-  args_info->input_help = gengetopt_args_info_help[4] ;
-  args_info->dir_out_help = gengetopt_args_info_help[5] ;
-  args_info->is_nibble_help = gengetopt_args_info_help[6] ;
-  args_info->split_help = gengetopt_args_info_help[7] ;
+  args_info->combine_help = gengetopt_args_info_help[3] ;
+  args_info->reorganize_help = gengetopt_args_info_help[4] ;
+  args_info->input_help = gengetopt_args_info_help[6] ;
+  args_info->db_help = gengetopt_args_info_help[8] ;
+  args_info->dir_out_help = gengetopt_args_info_help[9] ;
+  args_info->is_nibble_help = gengetopt_args_info_help[10] ;
+  args_info->split_help = gengetopt_args_info_help[11] ;
+  args_info->dataset_help = gengetopt_args_info_help[13] ;
+  args_info->db_dir_help = gengetopt_args_info_help[14] ;
+  args_info->src_db_num_help = gengetopt_args_info_help[15] ;
+  args_info->dest_db_num_help = gengetopt_args_info_help[16] ;
+  args_info->dest_db_dir_help = gengetopt_args_info_help[17] ;
   
 }
 
 cmdline_parser_release (struct gengetopt_args_info *args_info)
 {
   unsigned int i;
+  free_string_field (&(args_info->input_arg));
+  free_string_field (&(args_info->input_orig));
   free_string_field (&(args_info->db_arg));
   free_string_field (&(args_info->db_orig));
-  free_string_field (&(args_info->input_arg));
-  free_string_field (&(args_info->input_orig));
   free_string_field (&(args_info->dir_out_arg));
   free_string_field (&(args_info->dir_out_orig));
+  free_string_field (&(args_info->dataset_arg));
+  free_string_field (&(args_info->dataset_orig));
+  free_string_field (&(args_info->db_dir_arg));
+  free_string_field (&(args_info->db_dir_orig));
+  free_string_field (&(args_info->src_db_num_orig));
+  free_string_field (&(args_info->dest_db_num_orig));
+  free_string_field (&(args_info->dest_db_dir_arg));
+  free_string_field (&(args_info->dest_db_dir_orig));
   
   
   for (i = 0; i < args_info->inputs_num; ++i)
     write_into_file(outfile, "help", 0, 0 );
   if (args_info->version_given)
     write_into_file(outfile, "version", 0, 0 );
+  if (args_info->combine_given)
+    write_into_file(outfile, "combine", 0, 0 );
+  if (args_info->reorganize_given)
+    write_into_file(outfile, "reorganize", 0, 0 );
+  if (args_info->input_given)
+    write_into_file(outfile, "input", args_info->input_orig, 0);
   if (args_info->db_given)
     write_into_file(outfile, "db", args_info->db_orig, 0);
-  if (args_info->input_given)
-    write_into_file(outfile, "input", args_info->input_orig, 0);
   if (args_info->dir_out_given)
     write_into_file(outfile, "dir_out", args_info->dir_out_orig, 0);
   if (args_info->is_nibble_given)
     write_into_file(outfile, "is_nibble", 0, 0 );
   if (args_info->split_given)
     write_into_file(outfile, "split", 0, 0 );
+  if (args_info->dataset_given)
+    write_into_file(outfile, "dataset", args_info->dataset_orig, 0);
+  if (args_info->db_dir_given)
+    write_into_file(outfile, "db_dir", args_info->db_dir_orig, 0);
+  if (args_info->src_db_num_given)
+    write_into_file(outfile, "src_db_num", args_info->src_db_num_orig, 0);
+  if (args_info->dest_db_num_given)
+    write_into_file(outfile, "dest_db_num", args_info->dest_db_num_orig, 0);
+  if (args_info->dest_db_dir_given)
+    write_into_file(outfile, "dest_db_dir", args_info->dest_db_dir_orig, 0);
   
 
   i = EXIT_SUCCESS;
 int
 cmdline_parser_required (struct gengetopt_args_info *args_info, const char *prog_name)
 {
-  return EXIT_SUCCESS;
+  int result = EXIT_SUCCESS;
+
+  if (cmdline_parser_required2(args_info, prog_name, NULL) > 0)
+    result = EXIT_FAILURE;