Commits

Aaron Wong committed e57aedb

Adding two tools NetworkRanker and DataServer

  • Participants
  • Parent commits ae6e505

Comments (0)

Files changed (17)

File configure.ac

                  tools/Data2Features/Makefile \
                  tools/Data2Sql/Makefile \
                  tools/DataDumper/Makefile \
+                 tools/DataServer/Makefile \
                  tools/Distancer/Makefile \
                  tools/Explainer/Makefile \
                  tools/Filterer/Makefile \
                  tools/MIer/Makefile \
                  tools/MIed/Makefile \
                  tools/NetMiner/Makefile \
+                 tools/NetworkRanker/Makefile \
                  tools/Normalizer/Makefile \
                  tools/Orthologer/Makefile \
                  tools/Overlapper/Makefile \

File configure.local

 
 ./configure --with-gengetopt=/usr/bin/gengetopt\
 	--with-log4cpp=/usr/include/log4cpp/ \
-	--with-smile=/Genomics/fgrid/function/sleipnir-extlib/smile_1_1_linux64_gcc_4_1_2/ \
+	--with-smile=/Genomics/fgrid/function/sleipnir-extlib/smile_linux_x64_gcc_4_4_5/ \
 	--with-svm-perf=/Genomics/fgrid/function/sleipnir-extlib/svm_perf/ \
 	--with-boost-includes=/usr/include/boost/\
 	--with-boost-graph-lib=/usr/lib/libboost_graph.a \
-#	--with-vowpal-wabbit=../extlib/vowpal_wabbit_v4.1/ \
 	LDFLAGS=-static
 # CXXFLAGS=-fno-threadsafe-statics

File tools/DataServer/DataServer.cpp

+/*****************************************************************************
+* This file is provided under the Creative Commons Attribution 3.0 license.
+*
+* You are free to share, copy, distribute, transmit, or adapt this work
+* PROVIDED THAT you attribute the work to the authors listed below.
+* For more information, please see the following web page:
+* http://creativecommons.org/licenses/by/3.0/
+*
+* This file is a component of the Sleipnir library for functional genomics,
+* authored by:
+* Curtis Huttenhower (chuttenh@princeton.edu)
+* Mark Schroeder
+* Maria D. Chikina
+* Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
+*
+* If you use this library, the included executable tools, or any related
+* code in your work, please cite the following publication:
+* Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
+* Olga G. Troyanskaya.
+* "The Sleipnir library for computational functional genomics"
+*****************************************************************************/
+#include "stdafx.h"
+#include "cmdline.h"
+#include "DataServer.h"
+
+// Request handlers indexed by opcode; entry 0 is the dataset search.
+const CDataServer::TPFNProcessor CDataServer::c_apfnProcessors[] = {
+	&CDataServer::ProcessDatasetSearch
+};
+
+// Argument bundle handed to GetScoresThread through pthread_create.
+// Every member is a plain pointer; the struct has no destructor and frees nothing.
+struct SData {
+	vector<size_t>*		m_veciGenes;		// query genes passed to GetScores
+	vector<float>*		m_vecfDataWeights;	// per-dataset weights passed to GetScores
+	vector<float>*		m_vecfScores;		// output: summed weighted scores
+	vector<float>*		m_vecfTotal;		// output: summed weights
+	CDataServer*		m_Server;			// server whose GetScores is invoked
+	CFullMatrix<float>*	m_QueryCorScores;	// optional per-dataset score output (may be NULL)
+	CFullMatrix<float>*	m_QueryCorTotal;	// optional per-dataset weight output (may be NULL)
+};
+
+// Argument bundle handed to GetDataWeightsThread through pthread_create.
+struct SWeight {
+	size_t				m_iData;			// reference dataset (column of m_CorMat)
+	CFullMatrix<float>*	m_CorMat;			// correlation matrix read by GetDataWeights
+	float				m_cutoff;			// forwarded as the cutoff argument
+	float				m_base;				// forwarded as the exponent base argument
+	vector<float>*		m_vecfDataWeights;	// output: one weight per dataset
+	CDataServer*		m_Server;			// server whose GetDataWeights is invoked
+	size_t				m_iThread;			// worker index forwarded as iThread
+};
+
+/**
+ * Program entry point.
+ *
+ * Opens the database and the variance PCL, reads the dataset list, builds the
+ * database-to-PCL gene and dataset index maps, and then runs the server until
+ * Server.Start( ) returns.
+ *
+ * \return 0 once the server has stopped, 1 on any start-up error.
+ */
+int main( int iArgs, char** aszArgs ) {
+	static const size_t	c_iBuffer	= 1024;
+	gengetopt_args_info	sArgs;
+	char				acBuffer[ c_iBuffer ];
+	CServer				Server;
+	CDatabase			Database(false);
+	CPCL				VarPCL;
+	vector<string>		vecstrDatasets;
+	ifstream			ifsm;
+	vector<size_t>		veciPCLGeneIdx, veciPCLDataIdx;
+	size_t				i;
+
+	// Do not continue with half-parsed options.
+	if( cmdline_parser2( iArgs, aszArgs, &sArgs, 0, 1, 0 ) ) {
+		cmdline_parser_print_help( );
+		return 1; }
+	// Parsing runs with check_required == 0, so both mandatory inputs can still be NULL here.
+	if( !sArgs.database_arg || !sArgs.variances_arg ) {
+		cerr << "A database directory and a variances PCL are both required" << endl;
+		cmdline_parser_print_help( );
+		return 1; }
+
+	cerr << "Loading the database..." << endl;
+	if( !Database.Open( sArgs.database_arg ) ) {
+		cerr << "Could not open: " << sArgs.database_arg << endl;
+		return 1; }
+
+	if( !VarPCL.Open( sArgs.variances_arg, 0 ) ) {
+		cerr << "Could not open: " << sArgs.variances_arg << endl;
+		return 1; }
+
+	if( sArgs.datasets_arg ) {
+		ifsm.clear( );
+		ifsm.open( sArgs.datasets_arg );
+		if( !ifsm.is_open( ) ) {
+			cerr << "Could not open: " << sArgs.datasets_arg << endl;
+			return 1; }
+		// Reading stops at end of file or at the first empty line.
+		while( ifsm.getline( acBuffer, c_iBuffer ) ) {
+			vector<string>	vecstrTokens;
+
+			if( !acBuffer[ 0 ] )
+				break;
+			CMeta::Tokenize( acBuffer, vecstrTokens, " \t" );
+			// The dataset name is the second column; a short line would index past the end.
+			if( vecstrTokens.size( ) < 2 ) {
+				cerr << "Illegal dataset line (expected two columns): " << acBuffer << endl;
+				return 1; }
+			vecstrDatasets.push_back( vecstrTokens[ 1 ] ); }
+		// failbit without eofbit means a line overflowed the buffer or the read failed.
+		if( ifsm.fail( ) && !ifsm.eof( ) ) {
+			cerr << "Could not read: " << sArgs.datasets_arg << endl;
+			return 1; }
+		ifsm.close( ); }
+
+	// Map CDatabase genes to PCL genes
+	veciPCLGeneIdx.resize( Database.GetGenes( ) );
+	for( i = 0; i < veciPCLGeneIdx.size( ); i++ )
+		veciPCLGeneIdx[ i ] = VarPCL.GetGene( Database.GetGene( i ) );
+
+	// Map listed datasets to PCL experiments
+	veciPCLDataIdx.resize( vecstrDatasets.size( ) );
+	for( i = 0; i < veciPCLDataIdx.size( ); i++ )
+		veciPCLDataIdx[ i ] = VarPCL.GetExperiment( vecstrDatasets[ i ] );
+
+	// GetScores indexes the dataset map once per database dataset, so the list must cover them all.
+	if( veciPCLDataIdx.size( ) < Database.GetDatasets( ) ) {
+		cerr << "Dataset list has " << veciPCLDataIdx.size( ) << " entries, database has " <<
+			Database.GetDatasets( ) << " datasets" << endl;
+		return 1; }
+
+	SDataServerData	sData( Database, vecstrDatasets, VarPCL, veciPCLGeneIdx, veciPCLDataIdx, sArgs.threads_arg );
+	CDataServer	DataServer( 0, "", sData );
+
+	cerr << "Maximum number of threads: " << sArgs.threads_arg << endl;
+
+	Server.Initialize( sArgs.port_arg, sArgs.timeout_arg, &DataServer );
+#ifdef WIN32
+	pthread_win32_process_attach_np( );
+#endif // WIN32
+	Server.Start( );
+#ifdef WIN32
+	pthread_win32_process_detach_np( );
+#endif // WIN32
+
+	return 0; }
+
+
+// Stores the socket, the peer description and a reference to the shared server data.
+// An empty connection string produces no log line.
+CDataServer::CDataServer( SOCKET iSocket, const string& strConnection, const SDataServerData& sData ) :
+	m_iSocket(iSocket), m_sData(sData), m_strConnection(strConnection) {
+
+	if( !m_strConnection.empty( ) )
+		cerr << "New connection from: " << m_strConnection << endl; }
+
+// Nothing to release: the shared server data is held by reference only.
+CDataServer::~CDataServer( ) { }
+
+// Creates the handler for a newly accepted connection.
+// The peer is labelled "a.b.c.d:port" and the new handler shares this object's server data.
+// The returned object is heap-allocated; Destroy( ) deletes it.
+IServerClient* CDataServer::NewInstance( SOCKET iSocket, uint32_t iHost, uint16_t sPort ) {
+	char	acPort[ 16 ];
+	in_addr	sHost;
+
+	sHost.s_addr = htonl( iHost );
+#pragma warning(disable : 4996)
+	sprintf( acPort, "%hu", sPort );
+#pragma warning(default : 4996)
+
+	const string	strPeer	= (string)inet_ntoa( sHost ) + ":" + acPort;
+
+	return new CDataServer( iSocket, strPeer, m_sData ); }
+
+// Logs the disconnect and deletes this handler; the object must not be used afterwards.
+void CDataServer::Destroy( ) {
+	cerr << "Disconnected: " << m_strConnection << endl;
+	delete this;
+}
+
+// Walks a request buffer: each step reads one opcode byte, dispatches to the
+// matching handler, and skips the opcode plus the bytes the handler reports
+// as consumed. Returns false on an unknown opcode or when a handler returns -1.
+bool CDataServer::ProcessMessage( const vector<unsigned char>& vecbMessage ) {
+	size_t	iCursor	= 0;
+
+	while( iCursor < vecbMessage.size( ) ) {
+		const unsigned char	bOpcode	= vecbMessage[ iCursor ];
+		size_t				iConsumed;
+
+		cerr << "LOG	" << time( NULL ) << '\t' << m_strConnection << endl;
+		if( bOpcode >= ARRAYSIZE(c_apfnProcessors) ) {
+			cerr << m_strConnection << " unknown opcode: " << (int)bOpcode << endl;
+			return false; }
+		cerr << m_strConnection << " opcode: " << (int)bOpcode << endl;
+
+		iConsumed = (this->*c_apfnProcessors[ bOpcode ])( vecbMessage, iCursor + 1 );
+		if( iConsumed == (size_t)-1 )
+			return false;
+		iCursor += iConsumed + 1; }
+
+	return true; }
+
+
+// pthread entry point: unpacks an SData bundle and forwards it to CDataServer::GetScores.
+void* GetScoresThread( void* pData ) {
+	SData&	sJob	= *(SData*)pData;
+
+	sJob.m_Server->GetScores( *sJob.m_veciGenes, *sJob.m_vecfDataWeights, *sJob.m_vecfScores,
+		*sJob.m_vecfTotal, sJob.m_QueryCorScores, sJob.m_QueryCorTotal );
+
+	return NULL; }
+
+
+
+/**
+ * Accumulates variance-weighted scores between the query genes and every gene
+ * in the database.
+ *
+ * Each stored byte is mapped linearly from 0..255 onto -5..5; the value 0xFF is
+ * treated as missing and negative results are clamped to 0.
+ *
+ * \param veciGenes			Database indices of the query genes. Out-of-range
+ *							indices and genes absent from the variance PCL are
+ *							reported and skipped.
+ * \param vecfDataWeights	Per-dataset weights; only read in summary mode.
+ * \param vecfScores		Output, resized to the gene count and zeroed. In
+ *							summary mode receives the sum of score * weight.
+ * \param vecfTotal			Output, resized to the gene count and zeroed. In
+ *							summary mode receives the sum of weights.
+ * \param AllScores			Optional genes x datasets matrix. When both matrices
+ *							are given (per-dataset mode) it receives the sum of
+ *							score * variance and the vectors stay zero.
+ * \param AllCounts			Optional genes x datasets matrix receiving the sum of
+ *							variances in per-dataset mode.
+ */
+void CDataServer::GetScores( const vector<size_t>& veciGenes, const vector<float>& vecfDataWeights,
+		vector<float>& vecfScores, vector<float>& vecfTotal,
+		CFullMatrix<float>* AllScores = NULL, CFullMatrix<float>* AllCounts = NULL ) {
+	const size_t			iGenes		= GetDatabase().GetGenes();
+	const size_t			iDatasets	= GetDatabase().GetDatasets();
+	const bool				fPerDataset	= AllScores && AllCounts;
+	size_t					iGene, iGenePCL, iDataPCL, iOffset, q, i, j;
+	vector<unsigned char>	vecbData;
+	vector<float>			vecfWeight( iDatasets );
+	vector<char>			vecfUse( iDatasets );
+	float					v, t;
+
+	// Initialize data structures
+	vecfScores.assign( iGenes, 0 );
+	vecfTotal.assign( iGenes, 0 );
+	// Clear each matrix independently so a single non-NULL argument is never a NULL dereference.
+	if( AllScores )
+		AllScores->Clear();
+	if( AllCounts )
+		AllCounts->Clear();
+
+	// Both index maps are read once per dataset below; refuse to run past their ends.
+	if( GetPCLDataIdx().size() < iDatasets ) {
+		cerr << "Dataset map is smaller than the database" << endl;
+		return; }
+	if( !fPerDataset && ( vecfDataWeights.size() < iDatasets ) ) {
+		cerr << "Too few dataset weights" << endl;
+		return; }
+
+	// Iterate over query genes
+	for( q = 0; q < veciGenes.size(); q++ ) {
+		iGene = veciGenes[ q ];
+		// Validate before indexing: the index may come straight from a client request.
+		if( ( iGene >= iGenes ) || ( iGene >= GetPCLGeneIdx().size() ) ) {
+			cerr << "Missing gene: index " << iGene << endl;
+			continue; }
+		iGenePCL = GetPCLGeneIdx()[ iGene ];
+		if( iGenePCL == (size_t)-1 ) {
+			cerr << "Missing gene: " << GetDatabase().GetGene( iGene ) << endl;
+			continue; }
+		cerr << q << ": " << GetDatabase().GetGene( iGene ) << endl;
+
+		vecbData.clear();
+		GetDatabase().Get( iGene, vecbData );
+		// The row is indexed as gene * datasets + dataset below.
+		if( vecbData.size() < ( iGenes * iDatasets ) ) {
+			cerr << "Incomplete data for gene: " << GetDatabase().GetGene( iGene ) << endl;
+			continue; }
+
+		// The weight depends only on (query gene, dataset), so look it up once
+		// per query gene instead of once per gene pair.
+		for( j = 0; j < iDatasets; j++ ) {
+			vecfUse[ j ] = 0;
+			iDataPCL = GetPCLDataIdx()[ j ];
+			if( iDataPCL == (size_t)-1 )
+				continue;
+			t = GetVarPCL().Get( iGenePCL, iDataPCL );
+			if( !fPerDataset )
+				t *= vecfDataWeights[ j ];
+			if( CMeta::IsNaN( t ) )
+				continue;
+			vecfWeight[ j ] = t;
+			vecfUse[ j ] = 1; }
+
+		// Iterate over all genes
+		for( i = 0; i < iGenes; i++ ) {
+			iOffset = i * iDatasets;
+			for( j = 0; j < iDatasets; j++ ) {
+				if( !vecfUse[ j ] )
+					continue;
+				v = (float)vecbData[ iOffset + j ];
+				if( v == 0xFF )
+					continue;
+
+				// Convert to z-score from bin value, ignoring negative z-scores
+				v = -5 + v*(10.0/255.0);
+				if( v < 0 )
+					v = 0;
+
+				t = vecfWeight[ j ];
+				if( fPerDataset ) {
+					// Store all scores
+					AllScores->Set( i, j, AllScores->Get( i, j ) + ( v * t ) );
+					AllCounts->Set( i, j, AllCounts->Get( i, j ) + t ); }
+				else {
+					// Summarize scores
+					vecfScores[ i ] += v * t;
+					vecfTotal[ i ] += t; } } } } }
+	
+// pthread entry point: unpacks an SWeight bundle and forwards it to CDataServer::GetDataWeights.
+void* GetDataWeightsThread( void* pData ) {
+	SWeight&	sJob	= *(SWeight*)pData;
+
+	sJob.m_Server->GetDataWeights( sJob.m_iData, *sJob.m_CorMat, sJob.m_cutoff, sJob.m_base,
+		*sJob.m_vecfDataWeights, sJob.m_iThread );
+
+	return NULL; }
+
+
+/**
+ * Weights each dataset by the weighted Pearson correlation between its column
+ * of CorMat and the column of the reference dataset.
+ *
+ * Rows where either value is NaN, or where both values are below the cutoff,
+ * are left out. Each remaining value d contributes with weight pow( s, d ) - 1.
+ * Negative correlations, and datasets with no usable rows, get weight 0.
+ *
+ * \param iData				Column of the reference dataset in CorMat.
+ * \param CorMat			Genes x datasets matrix of correlation scores.
+ * \param cutoff			Rows with both values below this are ignored.
+ * \param s					Base of the exponential row weight.
+ * \param vecfDataWeights	Output; must already hold one entry per dataset.
+ *							Only the entries of the datasets handled by this
+ *							call are written.
+ * \param iThread			Worker index: dataset j is handled when
+ *							j % GetMaxThreads( ) == iThread. A negative value
+ *							handles every dataset.
+ */
+void CDataServer::GetDataWeights( size_t iData, CFullMatrix<float>& CorMat, float cutoff, float s,
+		vector<float>& vecfDataWeights, int iThread = -1 ) {
+	const size_t	iGenes		= GetDatabase().GetGenes();
+	const size_t	iDatasets	= GetDatabase().GetDatasets();
+	// A non-positive thread limit would make the stride test divide by zero.
+	const size_t	iStride		= ( GetMaxThreads() > 0 ) ? (size_t)GetMaxThreads() : 1;
+	size_t			i, j;
+	float			d1, d2, fSim;
+	CMeasurePearson	Pearson;
+	vector<float>	vecfRef( iGenes ), adOne, adTwo, adW1, adW2;
+
+	cerr << "Calculating correlations for thread: " << iThread << endl;
+
+	// The reference column may come from a client request; never read past the matrix.
+	if( iData >= iDatasets ) {
+		cerr << "Illegal reference dataset: " << iData << endl;
+		return; }
+	// The output may be shared between workers, so it is not resized here.
+	if( vecfDataWeights.size() < iDatasets ) {
+		cerr << "Too few dataset weights" << endl;
+		return; }
+
+	// The reference column is the same for every dataset; read it once.
+	for( i = 0; i < iGenes; i++ )
+		vecfRef[ i ] = CorMat.Get( i, iData );
+
+	for( j = 0; j < iDatasets; j++ ) {
+		if( ( iThread >= 0 ) && ( ( j % iStride ) != (size_t)iThread ) )
+			continue;
+
+		adOne.clear();
+		adTwo.clear();
+		adW1.clear();
+		adW2.clear();
+
+		for( i = 0; i < iGenes; i++ ) {
+			d1 = CorMat.Get( i, j );
+			d2 = vecfRef[ i ];
+
+			if( CMeta::IsNaN( d1 ) || CMeta::IsNaN( d2 ) )
+				continue;
+			if( ( d1 < cutoff ) && ( d2 < cutoff ) )
+				continue;
+
+			adOne.push_back( d1 );
+			adTwo.push_back( d2 );
+			adW1.push_back( pow( s, d1 ) - 1 );
+			adW2.push_back( pow( s, d2 ) - 1 ); }
+
+		fSim = 0;
+		// &ad[ 0 ] is only valid on a non-empty vector.
+		if( adOne.size() )
+			fSim = (float)Pearson.Measure( &adOne[0], adOne.size(), &adTwo[0], adTwo.size(),
+				IMeasure::EMapNone, &adW1[0], &adW2[0] );
+		if( fSim < 0 )
+			fSim = 0;
+
+		vecfDataWeights[ j ] = fSim; } }
+
+
+// Runs pfnWorker once per element of vecJobs, each on its own pthread, and
+// waits for all of them. If a thread cannot be created the job runs on the
+// calling thread instead, so no job is lost and no invalid handle is joined.
+template<class tJob>
+static void DataServerRunJobs( void* (*pfnWorker)( void* ), vector<tJob>& vecJobs ) {
+	vector<pthread_t>	vecpthdThreads( vecJobs.size() );
+	vector<char>		vecfStarted( vecJobs.size(), 0 );
+	size_t				i;
+
+	for( i = 0; i < vecJobs.size(); i++ ) {
+		cerr << "Creating " << i << endl;
+		if( !pthread_create( &vecpthdThreads[ i ], NULL, pfnWorker, &vecJobs[ i ] ) )
+			vecfStarted[ i ] = 1;
+		else {
+			cerr << "Could not create thread " << i << ", running it inline" << endl;
+			pfnWorker( &vecJobs[ i ] ); } }
+	for( i = 0; i < vecJobs.size(); i++ )
+		if( vecfStarted[ i ] )
+			pthread_join( vecpthdThreads[ i ], NULL ); }
+
+// Sends exactly iBytes bytes, retrying after short writes. Returns false if send fails.
+static bool DataServerSendAll( SOCKET iSocket, const void* pvData, size_t iBytes ) {
+	static const size_t	c_iChunk	= 1 << 20;
+	const char*			pc			= (const char*)pvData;
+
+	while( iBytes ) {
+		const size_t	iWant	= ( iBytes < c_iChunk ) ? iBytes : c_iChunk;
+		const long		iSent	= (long)send( iSocket, pc, (int)iWant, 0 );
+
+		if( iSent <= 0 )
+			return false;
+		pc += iSent;
+		iBytes -= (size_t)iSent; }
+
+	return true; }
+
+// Sends the contents of a float vector; an empty vector sends nothing.
+static bool DataServerSendFloats( SOCKET iSocket, const vector<float>& vecfData ) {
+
+	return ( vecfData.empty() ||
+		DataServerSendAll( iSocket, &vecfData[ 0 ], sizeof(float) * vecfData.size() ) ); }
+
+/**
+ * Handles a dataset search request.
+ *
+ * Request layout, starting at iOffset: uint32 reference dataset index, float
+ * cutoff, float exponent base, then uint32 gene indices up to the end of the
+ * message. Gene indices outside the database are reported and ignored.
+ *
+ * The search runs in three phases:
+ *   1. per-dataset scores of every gene against the query genes (GetScores
+ *      in per-dataset mode), averaged by the accumulated weights;
+ *   2. dataset weights from those scores (GetDataWeights);
+ *   3. gene scores using the dataset weights (GetScores in summary mode),
+ *      averaged by the accumulated weights.
+ *
+ * Response: a uint32 byte count, one float score per gene, then one float
+ * weight per dataset. Entries whose accumulated weight is zero are NaN.
+ *
+ * \param vecbMessage	Complete request buffer.
+ * \param iOffset		Offset of the first byte after the opcode.
+ * \return Number of bytes consumed, or -1 on a truncated or invalid request
+ *         or when the response cannot be sent.
+ */
+size_t CDataServer::ProcessDatasetSearch( const vector<unsigned char>& vecbMessage, size_t iOffset ) {
+	const size_t		iStart		= iOffset;
+	const size_t		iGenes		= GetDatabase().GetGenes();
+	const size_t		iDatasets	= GetDatabase().GetDatasets();
+	// A non-positive limit would otherwise turn into a huge unsigned worker count.
+	const size_t		iMaxThreads	= ( GetMaxThreads() > 0 ) ? (size_t)GetMaxThreads() : 1;
+	size_t				i, j, t, iThreads, iWorkers;
+	uint32_t			iGene, iDataset, iSize;
+	float				iCut, iExp;
+	vector<size_t>		veciGenes;
+	vector<float>		vecfDataWeights, vecfScores, vecfTotal;
+	vector<SData>		vecsData;
+	vector<SWeight>		vecsWeight;
+	CFullMatrix<float>	QueryCorScores, QueryCorTotal;
+	vector<vector<size_t> >	vecveciGenes;
+	vector<vector<float> >	vecvecfScores, vecvecfTotal;
+
+	// ------------------------------------------------------------------------/
+	// Parse the request
+	if( ( iOffset + sizeof(iDataset) ) > vecbMessage.size( ) )
+		return -1;
+	iDataset = *(uint32_t*)&vecbMessage[ iOffset ];
+
+	cerr << "Processing Search" << endl;
+	// The index comes from the client: check it before it is used as a subscript.
+	if( ( iDataset >= GetDatasetNames().size() ) || ( iDataset >= iDatasets ) ) {
+		cerr << "Unknown dataset index: " << iDataset << endl;
+		return -1; }
+	cerr << "Dataset: " << GetDatasetNames()[ iDataset ] << endl;
+
+	iOffset += sizeof(iDataset);
+	if( ( iOffset + sizeof(iCut) ) > vecbMessage.size( ) )
+		return -1;
+	iCut = *(float*)&vecbMessage[ iOffset ];
+
+	iOffset += sizeof(iCut);
+	if( ( iOffset + sizeof(iExp) ) > vecbMessage.size( ) )
+		return -1;
+	iExp = *(float*)&vecbMessage[ iOffset ];
+
+	cerr << "Parameters: " << iCut << ", " << iExp << endl;
+
+	for( iOffset += sizeof(iExp); ( iOffset + sizeof(iGene) ) <= vecbMessage.size();
+		iOffset += sizeof(iGene) ) {
+		iGene = *(uint32_t*)&vecbMessage[ iOffset ];
+		if( iGene >= iGenes ) {
+			cerr << "Ignoring unknown gene index: " << iGene << endl;
+			continue; }
+		veciGenes.push_back( iGene ); }
+	if( veciGenes.empty() ) {
+		cerr << "No valid query genes" << endl;
+		return -1; }
+
+	iThreads = ( veciGenes.size() < iMaxThreads ) ? veciGenes.size() : iMaxThreads;
+	cerr << "Setting threads to: " << iThreads << endl;
+
+	// ------------------------------------------------------------------------/
+	// Get correlations to query genes
+
+	// Initialize dataset weights to equal weighting
+	vecfDataWeights.assign( iDatasets, 1 );
+
+	QueryCorScores.Initialize( iGenes, iDatasets );
+	QueryCorTotal.Initialize( iGenes, iDatasets );
+	QueryCorScores.Clear();
+	QueryCorTotal.Clear();
+
+	// Every worker owns two full matrices, so this phase uses half the threads,
+	// but never zero: with a single thread iThreads / 2 would skip the phase.
+	iWorkers = ( iThreads > 1 ) ? ( iThreads / 2 ) : 1;
+	vecsData.resize( iWorkers );
+	vecveciGenes.assign( iWorkers, vector<size_t>() );
+	vecvecfScores.assign( iWorkers, vector<float>() );
+	vecvecfTotal.assign( iWorkers, vector<float>() );
+	for( i = 0; i < veciGenes.size(); i++ )
+		vecveciGenes[ i % iWorkers ].push_back( veciGenes[ i ] );
+	for( i = 0; i < iWorkers; i++ ) {
+		vecsData[ i ].m_vecfDataWeights = &vecfDataWeights;
+		vecsData[ i ].m_vecfScores = &vecvecfScores[ i ];
+		vecsData[ i ].m_vecfTotal = &vecvecfTotal[ i ];
+		vecsData[ i ].m_veciGenes = &vecveciGenes[ i ];
+		vecsData[ i ].m_Server = this;
+		vecsData[ i ].m_QueryCorScores = new CFullMatrix<float>();
+		vecsData[ i ].m_QueryCorTotal = new CFullMatrix<float>();
+		vecsData[ i ].m_QueryCorScores->Initialize( iGenes, iDatasets );
+		vecsData[ i ].m_QueryCorTotal->Initialize( iGenes, iDatasets ); }
+	DataServerRunJobs( GetScoresThread, vecsData );
+
+	// Collect results and release the per-worker matrices
+	for( t = 0; t < iWorkers; t++ ) {
+		for( i = 0; i < iGenes; i++ )
+			for( j = 0; j < iDatasets; j++ ) {
+				QueryCorScores.Set( i, j,
+					QueryCorScores.Get( i, j ) + vecsData[ t ].m_QueryCorScores->Get( i, j ) );
+				QueryCorTotal.Set( i, j,
+					QueryCorTotal.Get( i, j ) + vecsData[ t ].m_QueryCorTotal->Get( i, j ) ); }
+		delete vecsData[ t ].m_QueryCorScores;
+		delete vecsData[ t ].m_QueryCorTotal;
+		vecsData[ t ].m_QueryCorScores = NULL;
+		vecsData[ t ].m_QueryCorTotal = NULL; }
+
+	// Calculate average; cells without data become NaN (0 / 0), which
+	// GetDataWeights skips.
+	for( i = 0; i < iGenes; i++ )
+		for( j = 0; j < iDatasets; j++ )
+			QueryCorScores.Set( i, j, QueryCorScores.Get( i, j ) / QueryCorTotal.Get( i, j ) );
+
+	// ------------------------------------------------------------------------/
+	// Calculate dataset weights
+	vecfDataWeights.assign( iDatasets, 0 );
+	if( iMaxThreads < 2 )
+		GetDataWeights( iDataset, QueryCorScores, iCut, iExp, vecfDataWeights, -1 );
+	else {
+		// GetDataWeights assigns dataset j to worker j % GetMaxThreads( ), so
+		// exactly that many workers are needed to cover every dataset, however
+		// few query genes there are.
+		vecsWeight.resize( iMaxThreads );
+		for( i = 0; i < vecsWeight.size(); i++ ) {
+			vecsWeight[ i ].m_iData = iDataset;
+			vecsWeight[ i ].m_CorMat = &QueryCorScores;
+			vecsWeight[ i ].m_cutoff = iCut;
+			vecsWeight[ i ].m_base = iExp;
+			vecsWeight[ i ].m_vecfDataWeights = &vecfDataWeights;
+			vecsWeight[ i ].m_Server = this;
+			vecsWeight[ i ].m_iThread = i; }
+		DataServerRunJobs( GetDataWeightsThread, vecsWeight ); }
+
+	// ------------------------------------------------------------------------/
+	// Calculate weighted correlations
+	iWorkers = iThreads;
+	vecsData.resize( iWorkers );
+	vecveciGenes.assign( iWorkers, vector<size_t>() );
+	vecvecfScores.assign( iWorkers, vector<float>() );
+	vecvecfTotal.assign( iWorkers, vector<float>() );
+	for( i = 0; i < veciGenes.size(); i++ )
+		vecveciGenes[ i % iWorkers ].push_back( veciGenes[ i ] );
+	for( i = 0; i < iWorkers; i++ ) {
+		vecsData[ i ].m_vecfDataWeights = &vecfDataWeights;
+		vecsData[ i ].m_vecfScores = &vecvecfScores[ i ];
+		vecsData[ i ].m_vecfTotal = &vecvecfTotal[ i ];
+		vecsData[ i ].m_veciGenes = &vecveciGenes[ i ];
+		vecsData[ i ].m_Server = this;
+		vecsData[ i ].m_QueryCorScores = NULL;
+		vecsData[ i ].m_QueryCorTotal = NULL; }
+	DataServerRunJobs( GetScoresThread, vecsData );
+
+	// Collect results
+	vecfScores.assign( iGenes, 0 );
+	vecfTotal.assign( iGenes, 0 );
+	for( t = 0; t < iWorkers; t++ )
+		for( i = 0; i < iGenes; i++ ) {
+			vecfScores[ i ] += vecvecfScores[ t ].at( i );
+			vecfTotal[ i ] += vecvecfTotal[ t ].at( i ); }
+
+	// Calculate average
+	for( i = 0; i < vecfScores.size(); i++ )
+		vecfScores[ i ] /= vecfTotal[ i ];
+
+	// ------------------------------------------------------------------------/
+	// Send scores
+	iSize = (uint32_t)( sizeof(float) * ( vecfScores.size() + vecfDataWeights.size() ) );
+	if( !DataServerSendAll( m_iSocket, &iSize, sizeof(iSize) ) ||
+		!DataServerSendFloats( m_iSocket, vecfScores ) ||
+		!DataServerSendFloats( m_iSocket, vecfDataWeights ) ) {
+		cerr << m_strConnection << " could not send results" << endl;
+		return -1; }
+
+	return ( iOffset - iStart ); }
+
+
+
+

File tools/DataServer/DataServer.ggo

+package	"DataServer"
+version	"1.0"
+purpose	""
+
+section "Input"
+option  "database"  d   "Database directory"
+                        string  typestr="directory"
+option  "datasets"  I   "File of dataset names"   
+                        string typestr="filename"
+option  "dataname"  D   "Name of input dataset"
+                        string
+option  "variances" v   "PCL File of gene variances"
+                        string typestr="filename"
+
+section "Server"
+option	"port"			p	"Server port"
+	int	default="1234"
+option	"timeout"		t	"Server timeout"
+	int	default="100"
+option	"threads"		T	"Maximum number of threads"
+	int	default="1"
+
+

File tools/DataServer/DataServer.h

+/*****************************************************************************
+* This file is provided under the Creative Commons Attribution 3.0 license.
+*
+* You are free to share, copy, distribute, transmit, or adapt this work
+* PROVIDED THAT you attribute the work to the authors listed below.
+* For more information, please see the following web page:
+* http://creativecommons.org/licenses/by/3.0/
+*
+* This file is a component of the Sleipnir library for functional genomics,
+* authored by:
+* Curtis Huttenhower (chuttenh@princeton.edu)
+* Mark Schroeder
+* Maria D. Chikina
+* Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
+*
+* If you use this library, the included executable tools, or any related
+* code in your work, please cite the following publication:
+* Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
+* Olga G. Troyanskaya.
+* "The Sleipnir library for computational functional genomics"
+*****************************************************************************/
+#ifndef DATASERVER_H
+#define DATASERVER_H
+
+
+/**
+ * Read-only data shared by the CDataServer instances that are given it.
+ *
+ * The database and the variance PCL are held by reference and must outlive
+ * this object; the dataset names and both index maps are copied.
+ */
+struct SDataServerData {
+	const CDatabase&		m_Database;			// source of genes, datasets and values
+	const vector<string>	m_vecstrDatasets;	// dataset names
+	const CPCL&				m_VarPCL;			// variance PCL
+	const vector<size_t>	m_veciPCLGeneIdx;	// database gene index -> PCL gene index
+	const vector<size_t>	m_veciPCLDataIdx;	// dataset index -> PCL experiment index
+	const int				m_iThreads;			// maximum number of worker threads
+
+	// The index vectors are only copied, so they are taken by const reference;
+	// callers passing non-const vectors are unaffected.
+	SDataServerData( const CDatabase& Database,
+					const vector<string>& vecstrDatasets,
+					const CPCL& VarPCL,
+					const vector<size_t>& veciPCLGeneIdx,
+					const vector<size_t>& veciPCLDataIdx,
+					int iThreads ) :
+					m_Database(Database),
+					m_vecstrDatasets(vecstrDatasets),
+					m_VarPCL(VarPCL),
+					m_veciPCLGeneIdx(veciPCLGeneIdx),
+					m_veciPCLDataIdx(veciPCLDataIdx),
+					m_iThreads(iThreads)
+					 { }
+};
+
+/**
+ * Server client that answers dataset search requests.
+ *
+ * NewInstance( ) creates a further instance for a given socket; all instances
+ * created that way refer to the same SDataServerData.
+ */
+class CDataServer : public IServerClient {
+public:
+	CDataServer( SOCKET, const string&, const SDataServerData& );
+	~CDataServer( );
+
+	// IServerClient interface
+	IServerClient* NewInstance( SOCKET, uint32_t, uint16_t );
+	void Destroy( );
+	bool ProcessMessage( const std::vector<unsigned char>& );
+
+	// Accessors for the shared database
+	const CDatabase& GetDatabase( ) const {
+
+		return m_sData.m_Database; }
+
+	const size_t GetDatasets() const {
+
+		return GetDatabase().GetDatasets(); }
+
+	const size_t GetGenes() const {
+
+		return GetDatabase().GetGenes(); }
+
+	const vector<string>& GetDatasetNames( ) const {
+
+		return m_sData.m_vecstrDatasets; }
+
+	// Worker routines; public so the free pthread entry points can call them
+	void GetScores( const vector<size_t>&, const vector<float>&,
+		vector<float>&, vector<float>&, CFullMatrix<float>*, CFullMatrix<float>* );
+
+	void GetDataWeights( size_t, CFullMatrix<float>&, float, float, vector<float>&, int );
+
+private:
+	// Handler for opcode 0
+	size_t ProcessDatasetSearch( const vector<unsigned char>& , size_t );
+
+	const CPCL& GetVarPCL( ) const {
+
+		return m_sData.m_VarPCL; }
+
+	const vector<size_t>& GetPCLGeneIdx( ) const {
+
+		return m_sData.m_veciPCLGeneIdx; }
+
+	const vector<size_t>& GetPCLDataIdx( ) const {
+
+		return m_sData.m_veciPCLDataIdx; }
+
+	// Variance of a database gene in a dataset, looked up through both index maps
+	const float GetGeneVar( size_t iGene, size_t iData ) {
+
+		return m_sData.m_VarPCL.Get( m_sData.m_veciPCLGeneIdx[ iGene ],
+			m_sData.m_veciPCLDataIdx[ iData ] ); }
+
+	const int GetMaxThreads() const {
+
+		return m_sData.m_iThreads; }
+
+	// Signature shared by all opcode handlers: (message, offset) -> bytes consumed
+	typedef size_t (CDataServer::*TPFNProcessor)( const std::vector<unsigned char>&, size_t );
+	static const TPFNProcessor	c_apfnProcessors[];
+
+	SOCKET					m_iSocket;
+	const SDataServerData&	m_sData;
+	string					m_strConnection;
+};
+
+#endif // DATASERVER_H

File tools/DataServer/cmdline.c

+/*
+  File autogenerated by gengetopt version 2.22
+  generated with the following command:
+  /Genomics/fgrid/function/sleipnir-extlib/gengetopt/src/gengetopt -iDataServer.ggo --default-optional -C -N -e 
+
+  The developers of gengetopt consider the fixed text that goes in all
+  gengetopt output files to be in the public domain:
+  we make no copyright claims on it.
+*/
+
+/* If we use autoconf.  */
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "getopt.h"
+
+#include "cmdline.h"
+
+const char *gengetopt_args_info_purpose = "";
+
+const char *gengetopt_args_info_usage = "Usage: DataServer [OPTIONS]...";
+
+const char *gengetopt_args_info_description = "";
+
+const char *gengetopt_args_info_help[] = {
+  "  -h, --help                Print help and exit",
+  "  -V, --version             Print version and exit",
+  "\nInput:",
+  "  -d, --database=directory  Database directory",
+  "  -I, --datasets=filename   File of dataset names",
+  "  -D, --dataname=STRING     Name of input dataset",
+  "  -v, --variances=filename  PCL File of gene variances",
+  "\nServer:",
+  "  -p, --port=INT            Server port  (default=`1234')",
+  "  -t, --timeout=INT         Server timeout  (default=`100')",
+  "  -T, --threads=INT         Maximum number of threads  (default=`1')",
+    0
+};
+
+typedef enum {ARG_NO
+  , ARG_STRING
+  , ARG_INT
+} cmdline_parser_arg_type;
+
+static
+void clear_given (struct gengetopt_args_info *args_info);
+static
+void clear_args (struct gengetopt_args_info *args_info);
+
+static int
+cmdline_parser_internal (int argc, char * const *argv, struct gengetopt_args_info *args_info,
+                        struct cmdline_parser_params *params, const char *additional_error);
+
+struct line_list
+{
+  char * string_arg;
+  struct line_list * next;
+};
+
+static struct line_list *cmd_line_list = 0;
+static struct line_list *cmd_line_list_tmp = 0;
+
+static void
+free_cmd_list(void)
+{
+  /* free the list of a previous call */
+  if (cmd_line_list)
+    {
+      while (cmd_line_list) {
+        cmd_line_list_tmp = cmd_line_list;
+        cmd_line_list = cmd_line_list->next;
+        free (cmd_line_list_tmp->string_arg);
+        free (cmd_line_list_tmp);
+      }
+    }
+}
+
+
+static char *
+gengetopt_strdup (const char *s);
+
+static
+void clear_given (struct gengetopt_args_info *args_info)
+{
+  args_info->help_given = 0 ;
+  args_info->version_given = 0 ;
+  args_info->database_given = 0 ;
+  args_info->datasets_given = 0 ;
+  args_info->dataname_given = 0 ;
+  args_info->variances_given = 0 ;
+  args_info->port_given = 0 ;
+  args_info->timeout_given = 0 ;
+  args_info->threads_given = 0 ;
+}
+
+static
+void clear_args (struct gengetopt_args_info *args_info)
+{
+  args_info->database_arg = NULL;
+  args_info->database_orig = NULL;
+  args_info->datasets_arg = NULL;
+  args_info->datasets_orig = NULL;
+  args_info->dataname_arg = NULL;
+  args_info->dataname_orig = NULL;
+  args_info->variances_arg = NULL;
+  args_info->variances_orig = NULL;
+  args_info->port_arg = 1234;
+  args_info->port_orig = NULL;
+  args_info->timeout_arg = 100;
+  args_info->timeout_orig = NULL;
+  args_info->threads_arg = 1;
+  args_info->threads_orig = NULL;
+  
+}
+
+static
+void init_args_info(struct gengetopt_args_info *args_info)
+{
+
+
+  args_info->help_help = gengetopt_args_info_help[0] ;
+  args_info->version_help = gengetopt_args_info_help[1] ;
+  args_info->database_help = gengetopt_args_info_help[3] ;
+  args_info->datasets_help = gengetopt_args_info_help[4] ;
+  args_info->dataname_help = gengetopt_args_info_help[5] ;
+  args_info->variances_help = gengetopt_args_info_help[6] ;
+  args_info->port_help = gengetopt_args_info_help[8] ;
+  args_info->timeout_help = gengetopt_args_info_help[9] ;
+  args_info->threads_help = gengetopt_args_info_help[10] ;
+  
+}
+
+void
+cmdline_parser_print_version (void)
+{
+  printf ("%s %s\n", CMDLINE_PARSER_PACKAGE, CMDLINE_PARSER_VERSION);
+}
+
+static void print_help_common(void) {
+  cmdline_parser_print_version ();
+
+  if (strlen(gengetopt_args_info_purpose) > 0)
+    printf("\n%s\n", gengetopt_args_info_purpose);
+
+  if (strlen(gengetopt_args_info_usage) > 0)
+    printf("\n%s\n", gengetopt_args_info_usage);
+
+  printf("\n");
+
+  if (strlen(gengetopt_args_info_description) > 0)
+    printf("%s\n", gengetopt_args_info_description);
+}
+
+void
+cmdline_parser_print_help (void)
+{
+  int i = 0;
+  print_help_common();
+  while (gengetopt_args_info_help[i])
+    printf("%s\n", gengetopt_args_info_help[i++]);
+}
+
+void
+cmdline_parser_init (struct gengetopt_args_info *args_info)
+{
+  clear_given (args_info);
+  clear_args (args_info);
+  init_args_info (args_info);
+}
+
+void
+cmdline_parser_params_init(struct cmdline_parser_params *params)
+{
+  if (params)
+    { 
+      params->override = 0;
+      params->initialize = 1;
+      params->check_required = 1;
+      params->check_ambiguity = 0;
+      params->print_errors = 1;
+    }
+}
+
+struct cmdline_parser_params *
+cmdline_parser_params_create(void)
+{
+  struct cmdline_parser_params *params = 
+    (struct cmdline_parser_params *)malloc(sizeof(struct cmdline_parser_params));
+  cmdline_parser_params_init(params);  
+  return params;
+}
+
+static void
+free_string_field (char **s)
+{
+  if (*s)
+    {
+      free (*s);
+      *s = 0;
+    }
+}
+
+
+static void
+cmdline_parser_release (struct gengetopt_args_info *args_info)
+{
+
+  free_string_field (&(args_info->database_arg));
+  free_string_field (&(args_info->database_orig));
+  free_string_field (&(args_info->datasets_arg));
+  free_string_field (&(args_info->datasets_orig));
+  free_string_field (&(args_info->dataname_arg));
+  free_string_field (&(args_info->dataname_orig));
+  free_string_field (&(args_info->variances_arg));
+  free_string_field (&(args_info->variances_orig));
+  free_string_field (&(args_info->port_orig));
+  free_string_field (&(args_info->timeout_orig));
+  free_string_field (&(args_info->threads_orig));
+  
+  
+
+  clear_given (args_info);
+}
+
+
+static void
+write_into_file(FILE *outfile, const char *opt, const char *arg, char *values[])
+{
+  if (arg) {
+    fprintf(outfile, "%s=\"%s\"\n", opt, arg);
+  } else {
+    fprintf(outfile, "%s\n", opt);
+  }
+}
+
+
+int
+cmdline_parser_dump(FILE *outfile, struct gengetopt_args_info *args_info)
+{
+  int i = 0;
+
+  if (!outfile)
+    {
+      fprintf (stderr, "%s: cannot dump options to stream\n", CMDLINE_PARSER_PACKAGE);
+      return EXIT_FAILURE;
+    }
+
+  if (args_info->help_given)
+    write_into_file(outfile, "help", 0, 0 );
+  if (args_info->version_given)
+    write_into_file(outfile, "version", 0, 0 );
+  if (args_info->database_given)
+    write_into_file(outfile, "database", args_info->database_orig, 0);
+  if (args_info->datasets_given)
+    write_into_file(outfile, "datasets", args_info->datasets_orig, 0);
+  if (args_info->dataname_given)
+    write_into_file(outfile, "dataname", args_info->dataname_orig, 0);
+  if (args_info->variances_given)
+    write_into_file(outfile, "variances", args_info->variances_orig, 0);
+  if (args_info->port_given)
+    write_into_file(outfile, "port", args_info->port_orig, 0);
+  if (args_info->timeout_given)
+    write_into_file(outfile, "timeout", args_info->timeout_orig, 0);
+  if (args_info->threads_given)
+    write_into_file(outfile, "threads", args_info->threads_orig, 0);
+  
+
+  i = EXIT_SUCCESS;
+  return i;
+}
+
+int
+cmdline_parser_file_save(const char *filename, struct gengetopt_args_info *args_info)
+{
+  FILE *outfile;
+  int i = 0;
+
+  outfile = fopen(filename, "w");
+
+  if (!outfile)
+    {
+      fprintf (stderr, "%s: cannot open file for writing: %s\n", CMDLINE_PARSER_PACKAGE, filename);
+      return EXIT_FAILURE;
+    }
+
+  i = cmdline_parser_dump(outfile, args_info);
+  fclose (outfile);
+
+  return i;
+}
+
+void
+cmdline_parser_free (struct gengetopt_args_info *args_info)
+{
+  cmdline_parser_release (args_info);
+}
+
+/** @brief replacement of strdup, which is not standard */
+char *
+gengetopt_strdup (const char *s)
+{
+  char *result = NULL;
+  if (!s)
+    return result;
+
+  result = (char*)malloc(strlen(s) + 1);
+  if (result == (char*)0)
+    return (char*)0;
+  strcpy(result, s);
+  return result;
+}
+
+/* Default entry point: parse argv without overriding options that are
+ * already set, initialising args_info first and requesting the
+ * required-option check. */
+int
+cmdline_parser (int argc, char * const *argv, struct gengetopt_args_info *args_info)
+{
+  const int override = 0;
+  const int initialize = 1;
+  const int check_required = 1;
+
+  return cmdline_parser2 (argc, argv, args_info, override, initialize, check_required);
+}
+
+int
+cmdline_parser_ext (int argc, char * const *argv, struct gengetopt_args_info *args_info,
+                   struct cmdline_parser_params *params)
+{
+  int result;
+  result = cmdline_parser_internal (argc, argv, args_info, params, NULL);
+
+  return result;
+}
+
+/* Flag-based entry point: packs the three flags into a
+ * cmdline_parser_params (ambiguity checking off, error printing on) and
+ * runs the internal parser with no additional error text. */
+int
+cmdline_parser2 (int argc, char * const *argv, struct gengetopt_args_info *args_info, int override, int initialize, int check_required)
+{
+  struct cmdline_parser_params settings;
+
+  /* fixed for this entry point */
+  settings.check_ambiguity = 0;
+  settings.print_errors = 1;
+  /* chosen by the caller */
+  settings.override = override;
+  settings.initialize = initialize;
+  settings.check_required = check_required;
+
+  return cmdline_parser_internal (argc, argv, args_info, &settings, NULL);
+}
+
+/* Required-option check.  Nothing is verified here: both arguments are
+ * ignored and the call always reports success. */
+int
+cmdline_parser_required (struct gengetopt_args_info *args_info, const char *prog_name)
+{
+  (void) args_info;
+  (void) prog_name;
+  return EXIT_SUCCESS;
+}
+
+
+static char *package_name = 0; /* prefix of update_arg()'s error messages; set to argv[0] by cmdline_parser_internal() */
+
+/**
+ * @brief updates an option
+ * @param field the generic pointer to the field to update
+ * @param orig_field the pointer to the orig field
+ * @param field_given the pointer to the number of occurrence of this option
+ * @param prev_given the pointer to the number of occurrence already seen
+ * @param value the argument for this option (if null no arg was specified)
+ * @param possible_values the possible values for this option (if specified)
+ * @param default_value the default value (in case the option only accepts fixed values)
+ * @param arg_type the type of this option
+ * @param check_ambiguity @see cmdline_parser_params.check_ambiguity
+ * @param override @see cmdline_parser_params.override
+ * @param no_free whether to free a possible previous value
+ * @param multiple_option whether this is a multiple option
+ * @param long_opt the corresponding long option
+ * @param short_opt the corresponding short option (or '-' if none)
+ * @param additional_error possible further error specification
+ * @return 0 when the option was stored, or when it had already been given
+ *         and override is off (nothing is changed in that case); 1 when a
+ *         non-multiple option is repeated or an ARG_INT value does not
+ *         convert completely
+ *
+ * Note that the occurrence counters are incremented before the value is
+ * converted, so they are already updated when the numeric check fails.
+ * default_value is not read by this function.
+ */
+static
+int update_arg(void *field, char **orig_field,
+               unsigned int *field_given, unsigned int *prev_given, 
+               char *value, char *possible_values[], const char *default_value,
+               cmdline_parser_arg_type arg_type,
+               int check_ambiguity, int override,
+               int no_free, int multiple_option,
+               const char *long_opt, char short_opt,
+               const char *additional_error)
+{
+  char *stop_char = 0;
+  const char *val = value;
+  int found;
+  char **string_field;
+
+  stop_char = 0;
+  found = 0;
+
+  if (!multiple_option && prev_given && (*prev_given || (check_ambiguity && *field_given))) /* repeated single-occurrence option */
+    {
+      if (short_opt != '-')
+        fprintf (stderr, "%s: `--%s' (`-%c') option given more than once%s\n", 
+               package_name, long_opt, short_opt,
+               (additional_error ? additional_error : ""));
+      else
+        fprintf (stderr, "%s: `--%s' option given more than once%s\n", 
+               package_name, long_opt,
+               (additional_error ? additional_error : ""));
+      return 1; /* failure */
+    }
+
+    
+  if (field_given && *field_given && ! override) /* already set and not allowed to override: keep the old value */
+    return 0;
+  if (prev_given)
+    (*prev_given)++;
+  if (field_given)
+    (*field_given)++;
+  if (possible_values)
+    val = possible_values[found]; /* found is still 0 here, so this picks possible_values[0] */
+
+  switch(arg_type) {
+  case ARG_INT:
+    if (val) *((int *)field) = strtol (val, &stop_char, 0); /* base 0: strtol chooses the base from the prefix */
+    break;
+  case ARG_STRING:
+    if (val) {
+      string_field = (char **)field;
+      if (!no_free && *string_field)
+        free (*string_field); /* free previous string */
+      *string_field = gengetopt_strdup (val);
+    }
+    break;
+  default:
+    break;
+  };
+
+  /* check numeric conversion */
+  switch(arg_type) {
+  case ARG_INT:
+    if (val && !(stop_char && *stop_char == '\0')) { /* strtol must have consumed the whole string */
+      fprintf(stderr, "%s: invalid numeric value: %s\n", package_name, val);
+      return 1; /* failure */
+    }
+    break;
+  default:
+    ;
+  };
+
+  /* store the original value */
+  switch(arg_type) {
+  case ARG_NO:
+    break;
+  default:
+    if (value && orig_field) {
+      if (no_free) {
+        *orig_field = value; /* keep the caller's pointer, no copy is made */
+      } else {
+        if (*orig_field)
+          free (*orig_field); /* free previous string */
+        *orig_field = gengetopt_strdup (value);
+      }
+    }
+  };
+
+  return 0; /* OK */
+}
+
+/**
+ * @brief parses argv with getopt_long and stores the results in args_info
+ *
+ * package_name is set to argv[0] and args_info is initialised first when
+ * params->initialize is non-zero.  Every recognised option is recorded
+ * through update_arg(); local_args_info only counts how often an option
+ * was seen during this call.
+ *
+ * -h prints the help and exits the process.  -V records version_given and
+ * returns 0 immediately, without looking at the remaining arguments.
+ *
+ * @param argc number of entries in argv
+ * @param argv the arguments to parse (argv[0] is used as program name)
+ * @param args_info the structure that receives the option values
+ * @param params parser settings (override, initialize, check_ambiguity, print_errors)
+ * @param additional_error text appended to error messages, may be NULL
+ * @return 0 on success, EXIT_FAILURE when getopt_long reports an invalid
+ *         option or update_arg() fails
+ */
+int
+cmdline_parser_internal (int argc, char * const *argv, struct gengetopt_args_info *args_info,
+                        struct cmdline_parser_params *params, const char *additional_error)
+{
+  int c;	/* Character of the parsed option.  */
+
+  int error = 0; /* never changed in this function */
+  struct gengetopt_args_info local_args_info;
+  
+  int override;
+  int initialize;
+  int check_required;
+  int check_ambiguity;
+  
+  package_name = argv[0];
+  
+  override = params->override;
+  initialize = params->initialize;
+  check_required = params->check_required; /* stored but not consulted below */
+  check_ambiguity = params->check_ambiguity;
+
+  if (initialize)
+    cmdline_parser_init (args_info);
+
+  cmdline_parser_init (&local_args_info);
+
+  optarg = 0;
+  optind = 0; /* presumably makes getopt_long restart from the first argument - confirm for the getopt in use */
+  opterr = params->print_errors;
+  optopt = '?';
+
+  while (1)
+    {
+      int option_index = 0;
+
+      static struct option long_options[] = {
+        { "help",	0, NULL, 'h' },
+        { "version",	0, NULL, 'V' },
+        { "database",	1, NULL, 'd' },
+        { "datasets",	1, NULL, 'I' },
+        { "dataname",	1, NULL, 'D' },
+        { "variances",	1, NULL, 'v' },
+        { "port",	1, NULL, 'p' },
+        { "timeout",	1, NULL, 't' },
+        { "threads",	1, NULL, 'T' },
+        { NULL,	0, NULL, 0 }
+      };
+
+      c = getopt_long (argc, argv, "hVd:I:D:v:p:t:T:", long_options, &option_index);
+
+      if (c == -1) break;	/* Exit from `while (1)' loop.  */
+
+      switch (c)
+        {
+        case 'h':	/* Print help and exit.  */
+          cmdline_parser_print_help ();
+          cmdline_parser_free (&local_args_info);
+          exit (EXIT_SUCCESS);
+
+        case 'V':	/* Print version and exit.  */
+        
+        
+          if (update_arg( 0 , 
+               0 , &(args_info->version_given),
+              &(local_args_info.version_given), optarg, 0, 0, ARG_NO,
+              check_ambiguity, override, 0, 0,
+              "version", 'V',
+              additional_error))
+            goto failure;
+          cmdline_parser_free (&local_args_info);
+          return 0; /* nothing is printed here; only version_given is recorded and parsing stops */
+        
+          break; /* not reached */
+        case 'd':	/* Database directory.  */
+        
+        
+          if (update_arg( (void *)&(args_info->database_arg), 
+               &(args_info->database_orig), &(args_info->database_given),
+              &(local_args_info.database_given), optarg, 0, 0, ARG_STRING,
+              check_ambiguity, override, 0, 0,
+              "database", 'd',
+              additional_error))
+            goto failure;
+        
+          break;
+        case 'I':	/* File of dataset names.  */
+        
+        
+          if (update_arg( (void *)&(args_info->datasets_arg), 
+               &(args_info->datasets_orig), &(args_info->datasets_given),
+              &(local_args_info.datasets_given), optarg, 0, 0, ARG_STRING,
+              check_ambiguity, override, 0, 0,
+              "datasets", 'I',
+              additional_error))
+            goto failure;
+        
+          break;
+        case 'D':	/* Name of input dataset.  */
+        
+        
+          if (update_arg( (void *)&(args_info->dataname_arg), 
+               &(args_info->dataname_orig), &(args_info->dataname_given),
+              &(local_args_info.dataname_given), optarg, 0, 0, ARG_STRING,
+              check_ambiguity, override, 0, 0,
+              "dataname", 'D',
+              additional_error))
+            goto failure;
+        
+          break;
+        case 'v':	/* PCL File of gene variances.  */
+        
+        
+          if (update_arg( (void *)&(args_info->variances_arg), 
+               &(args_info->variances_orig), &(args_info->variances_given),
+              &(local_args_info.variances_given), optarg, 0, 0, ARG_STRING,
+              check_ambiguity, override, 0, 0,
+              "variances", 'v',
+              additional_error))
+            goto failure;
+        
+          break;
+        case 'p':	/* Server port.  */
+        
+        
+          if (update_arg( (void *)&(args_info->port_arg), 
+               &(args_info->port_orig), &(args_info->port_given),
+              &(local_args_info.port_given), optarg, 0, "1234", ARG_INT,
+              check_ambiguity, override, 0, 0,
+              "port", 'p',
+              additional_error))
+            goto failure;
+        
+          break;
+        case 't':	/* Server timeout.  */
+        
+        
+          if (update_arg( (void *)&(args_info->timeout_arg), 
+               &(args_info->timeout_orig), &(args_info->timeout_given),
+              &(local_args_info.timeout_given), optarg, 0, "100", ARG_INT,
+              check_ambiguity, override, 0, 0,
+              "timeout", 't',
+              additional_error))
+            goto failure;
+        
+          break;
+        case 'T':	/* Maximum number of threads.  */
+        
+        
+          if (update_arg( (void *)&(args_info->threads_arg), 
+               &(args_info->threads_orig), &(args_info->threads_given),
+              &(local_args_info.threads_given), optarg, 0, "1", ARG_INT,
+              check_ambiguity, override, 0, 0,
+              "threads", 'T',
+              additional_error))
+            goto failure;
+        
+          break;
+
+        case 0:	/* Long option with no short option */
+        case '?':	/* Invalid option.  */
+          /* `getopt_long' already printed an error message.  */
+          goto failure;
+
+        default:	/* bug: option not considered.  */
+          fprintf (stderr, "%s: option unknown: %c%s\n", CMDLINE_PARSER_PACKAGE, c, (additional_error ? additional_error : ""));
+          abort ();
+        } /* switch */
+    } /* while */
+
+
+
+
+  cmdline_parser_release (&local_args_info);
+
+  if ( error ) /* error is still 0 here, so this branch is never taken */
+    return (EXIT_FAILURE);
+
+  return 0;
+
+failure:
+  
+  cmdline_parser_release (&local_args_info);
+  return (EXIT_FAILURE);
+}
+
+#ifndef CONFIG_FILE_LINE_SIZE
+#define CONFIG_FILE_LINE_SIZE 2048 /* size of the buffer one configuration-file line is read into */
+#endif
+#define ADDITIONAL_ERROR " in configuration file " /* start of the text appended to parser errors; the file name is concatenated after it */
+
+#define CONFIG_FILE_LINE_BUFFER_SIZE (CONFIG_FILE_LINE_SIZE+3)
+/* 3 is for "--" and "=" */
+
+/**
+ * @brief reads one configuration file and queues its options on cmd_line_list
+ *
+ * Blank lines and lines starting with '#' are skipped.  Every other line of
+ * the form "opt", "opt arg" or "opt = arg" (arg optionally quoted with ' or
+ * ") is rewritten as "--opt=arg" ("-oarg" for one-letter names), pushed on
+ * the front of cmd_line_list and counted in *my_argc.  A line
+ * "include FILE" is processed recursively.
+ *
+ * Parsing stops at the first error: a line that does not fit the line
+ * buffer, an unterminated or malformed string, an include that fails or
+ * lacks a file name, or an allocation failure.
+ *
+ * @param filename the configuration file to read
+ * @param my_argc in/out counter of queued arguments
+ * @return 0 on success, EXIT_FAILURE on any of the errors above or when the
+ *         file cannot be opened
+ */
+static int
+_cmdline_parser_configfile (char * const filename, int *my_argc)
+{
+  FILE* file;
+  char my_argv[CONFIG_FILE_LINE_BUFFER_SIZE+1];
+  char linebuf[CONFIG_FILE_LINE_SIZE];
+  int line_num = 0;
+  int result = 0, equal;
+  char *fopt, *farg;
+  char *str_index;
+  size_t len, next_token;
+  char delimiter;
+
+  if ((file = fopen(filename, "r")) == NULL)
+    {
+      fprintf (stderr, "%s: Error opening configuration file '%s'\n",
+               CMDLINE_PARSER_PACKAGE, filename);
+      return EXIT_FAILURE;
+    }
+
+  while ((fgets(linebuf, CONFIG_FILE_LINE_SIZE, file)) != NULL)
+    {
+      ++line_num;
+      my_argv[0] = '\0';
+      len = strlen(linebuf);
+      /* fgets never returns more than CONFIG_FILE_LINE_SIZE-1 characters, so
+         an over-long line shows up as a completely filled buffer that does
+         not end in a newline */
+      if (len > 0 && len == (size_t)(CONFIG_FILE_LINE_SIZE-1) && linebuf[len-1] != '\n')
+        {
+          /* a line that fits exactly is still fine: only its newline (or the
+             end of the file) may follow */
+          int next_char = fgetc (file);
+          if (next_char != EOF && next_char != '\n')
+            {
+              fprintf (stderr, "%s:%s:%d: Line too long in configuration file\n",
+                       CMDLINE_PARSER_PACKAGE, filename, line_num);
+              result = EXIT_FAILURE;
+              break;
+            }
+        }
+
+      /* find first non-whitespace character in the line */
+      next_token = strspn (linebuf, " \t\r\n");
+      str_index  = linebuf + next_token;
+
+      if ( str_index[0] == '\0' || str_index[0] == '#')
+        continue; /* empty line or comment line is skipped */
+
+      fopt = str_index;
+
+      /* truncate fopt at the end of the first non-valid character */
+      next_token = strcspn (fopt, " \t\r\n=");
+
+      if (fopt[next_token] == '\0') /* the line is over */
+        {
+          farg  = NULL;
+          equal = 0;
+          goto noarg;
+        }
+
+      /* remember if equal sign is present */
+      equal = (fopt[next_token] == '=');
+      fopt[next_token++] = '\0';
+
+      /* advance pointers to the next token after the end of fopt */
+      next_token += strspn (fopt + next_token, " \t\r\n");
+
+      /* check for the presence of equal sign, and if so, skip it */
+      if ( !equal )
+        if ((equal = (fopt[next_token] == '=')))
+          {
+            next_token++;
+            next_token += strspn (fopt + next_token, " \t\r\n");
+          }
+      str_index  += next_token;
+
+      /* find argument */
+      farg = str_index;
+      if ( farg[0] == '\"' || farg[0] == '\'' )
+        { /* quoted argument */
+          str_index = strchr (++farg, str_index[0] ); /* skip opening quote */
+          if (! str_index)
+            {
+              fprintf
+                (stderr,
+                 "%s:%s:%d: unterminated string in configuration file\n",
+                 CMDLINE_PARSER_PACKAGE, filename, line_num);
+              result = EXIT_FAILURE;
+              break;
+            }
+        }
+      else
+        { /* read up the remaining part up to a delimiter */
+          next_token = strcspn (farg, " \t\r\n#\'\"");
+          str_index += next_token;
+        }
+
+      /* truncate farg at the delimiter and store it for further check */
+      delimiter = *str_index, *str_index++ = '\0';
+
+      /* everything but comment is illegal at the end of line */
+      if (delimiter != '\0' && delimiter != '#')
+        {
+          str_index += strspn(str_index, " \t\r\n");
+          if (*str_index != '\0' && *str_index != '#')
+            {
+              fprintf
+                (stderr,
+                 "%s:%s:%d: malformed string in configuration file\n",
+                 CMDLINE_PARSER_PACKAGE, filename, line_num);
+              result = EXIT_FAILURE;
+              break;
+            }
+        }
+
+    noarg:
+      if (!strcmp(fopt,"include")) {
+        if (farg && *farg) {
+          result = _cmdline_parser_configfile(farg, my_argc);
+        } else {
+          fprintf(stderr, "%s:%s:%d: include requires a filename argument.\n",
+                  CMDLINE_PARSER_PACKAGE, filename, line_num);
+          result = EXIT_FAILURE;
+        }
+        /* stop here so that a later, successful include cannot overwrite
+           the failure */
+        if (result == EXIT_FAILURE)
+          break;
+        continue;
+      }
+      len = strlen(fopt);
+      strcat (my_argv, len > 1 ? "--" : "-");
+      strcat (my_argv, fopt);
+      if (len > 1 && ((farg && *farg) || equal))
+        strcat (my_argv, "=");
+      if (farg && *farg)
+        strcat (my_argv, farg);
+
+      cmd_line_list_tmp = (struct line_list *) malloc (sizeof (struct line_list));
+      if (!cmd_line_list_tmp)
+        {
+          fprintf (stderr, "%s:%s:%d: out of memory\n",
+                   CMDLINE_PARSER_PACKAGE, filename, line_num);
+          result = EXIT_FAILURE;
+          break;
+        }
+      cmd_line_list_tmp->next = cmd_line_list;
+      cmd_line_list = cmd_line_list_tmp;
+      cmd_line_list->string_arg = gengetopt_strdup(my_argv);
+      if (!cmd_line_list->string_arg)
+        {
+          fprintf (stderr, "%s:%s:%d: out of memory\n",
+                   CMDLINE_PARSER_PACKAGE, filename, line_num);
+          result = EXIT_FAILURE;
+          break;
+        }
+      /* count the argument only once it is completely stored */
+      ++(*my_argc);
+    } /* while */
+
+  if (file)
+    fclose(file);
+  return result;
+}
+
+/* Flag-based config-file entry point: builds a cmdline_parser_params from
+ * the three flags (ambiguity checking off, error printing on) and
+ * delegates to cmdline_parser_config_file(). */
+int
+cmdline_parser_configfile (char * const filename,
+                           struct gengetopt_args_info *args_info,
+                           int override, int initialize, int check_required)
+{
+  struct cmdline_parser_params settings;
+
+  /* fixed for this entry point */
+  settings.print_errors = 1;
+  settings.check_ambiguity = 0;
+  /* chosen by the caller */
+  settings.check_required = check_required;
+  settings.initialize = initialize;
+  settings.override = override;
+
+  return cmdline_parser_config_file (filename, args_info, &settings);
+}
+
+/* Parse the options stored in the configuration file `filename`.
+ *
+ * The program name is queued as the first argument, the file is read by
+ * _cmdline_parser_configfile(), the queued strings are copied (in file
+ * order) into an argv-style array and that array is handed to
+ * cmdline_parser_internal() together with an " in configuration file NAME"
+ * error suffix.  The queue is released with free_cmd_list() on every path.
+ *
+ * Returns the result of cmdline_parser_internal(), or EXIT_FAILURE when the
+ * file could not be read or memory ran out. */
+int
+cmdline_parser_config_file (char * const filename,
+                           struct gengetopt_args_info *args_info,
+                           struct cmdline_parser_params *params)
+{
+  int i;
+  int result = EXIT_FAILURE;
+  int my_argc = 1;
+  char **my_argv_arg = NULL;
+  char *additional_error = NULL;
+
+  /* store the program name */
+  cmd_line_list_tmp = (struct line_list *) malloc (sizeof (struct line_list));
+  if (!cmd_line_list_tmp)
+    goto out_of_memory;
+  cmd_line_list_tmp->next = cmd_line_list;
+  cmd_line_list = cmd_line_list_tmp;
+  cmd_line_list->string_arg = gengetopt_strdup (CMDLINE_PARSER_PACKAGE);
+  if (!cmd_line_list->string_arg)
+    goto out_of_memory;
+
+  result = _cmdline_parser_configfile(filename, &my_argc);
+  if (result == EXIT_FAILURE)
+    goto cleanup;
+
+  my_argv_arg = (char **) malloc((my_argc+1) * sizeof(char *));
+  additional_error = (char *)malloc(strlen(filename) + strlen(ADDITIONAL_ERROR) + 1);
+  if (!my_argv_arg || !additional_error)
+    goto out_of_memory;
+
+  /* the list holds the newest entry first, so fill the array back to front */
+  cmd_line_list_tmp = cmd_line_list;
+  for (i = my_argc - 1; i >= 0; --i) {
+    my_argv_arg[i] = cmd_line_list_tmp->string_arg;
+    cmd_line_list_tmp = cmd_line_list_tmp->next;
+  }
+  my_argv_arg[my_argc] = 0;
+
+  strcpy (additional_error, ADDITIONAL_ERROR);
+  strcat (additional_error, filename);
+  result =
+    cmdline_parser_internal (my_argc, my_argv_arg, args_info,
+                            params,
+                            additional_error);
+  goto cleanup;
+
+out_of_memory:
+  fprintf (stderr, "%s: out of memory while reading configuration file '%s'\n",
+           CMDLINE_PARSER_PACKAGE, filename);
+  result = EXIT_FAILURE;
+
+cleanup:
+  free (additional_error); /* free(NULL) is a no-op */
+  free (my_argv_arg);
+  free_cmd_list();
+  return result;
+}

File tools/DataServer/cmdline.h

+/** @file cmdline.h
+ *  @brief The header file for the command line option parser
+ *  generated by GNU Gengetopt version 2.22
+ *  http://www.gnu.org/software/gengetopt.
+ *  DO NOT modify this file, since it can be overwritten
+ *  @author GNU Gengetopt by Lorenzo Bettini */
+
+#ifndef CMDLINE_H
+#define CMDLINE_H
+
+/* If we use autoconf.  */
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h> /* for FILE */
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+#ifndef CMDLINE_PARSER_PACKAGE
+/** @brief the program name */
+#define CMDLINE_PARSER_PACKAGE "DataServer"
+#endif
+
+#ifndef CMDLINE_PARSER_VERSION
+/** @brief the program version */
+#define CMDLINE_PARSER_VERSION "1.0"
+#endif
+
+/** @brief Where the command line options are stored
+ *
+ *  For every option NAME the structure holds NAME_help (its help
+ *  description) and NAME_given (whether it was given).  Options that take
+ *  a value additionally have NAME_arg (the value) and NAME_orig (the
+ *  original value given at command line). */
+struct gengetopt_args_info
+{
+  const char *help_help; /**< @brief Print help and exit help description.  */
+  const char *version_help; /**< @brief Print version and exit help description.  */
+  char * database_arg;	/**< @brief Database directory.  */
+  char * database_orig;	/**< @brief Database directory original value given at command line.  */
+  const char *database_help; /**< @brief Database directory help description.  */
+  char * datasets_arg;	/**< @brief File of dataset names.  */
+  char * datasets_orig;	/**< @brief File of dataset names original value given at command line.  */
+  const char *datasets_help; /**< @brief File of dataset names help description.  */
+  char * dataname_arg;	/**< @brief Name of input dataset.  */
+  char * dataname_orig;	/**< @brief Name of input dataset original value given at command line.  */
+  const char *dataname_help; /**< @brief Name of input dataset help description.  */
+  char * variances_arg;	/**< @brief PCL File of gene variances.  */
+  char * variances_orig;	/**< @brief PCL File of gene variances original value given at command line.  */
+  const char *variances_help; /**< @brief PCL File of gene variances help description.  */
+  int port_arg;	/**< @brief Server port (default='1234').  */
+  char * port_orig;	/**< @brief Server port original value given at command line.  */
+  const char *port_help; /**< @brief Server port help description.  */
+  int timeout_arg;	/**< @brief Server timeout (default='100').  */
+  char * timeout_orig;	/**< @brief Server timeout original value given at command line.  */
+  const char *timeout_help; /**< @brief Server timeout help description.  */
+  int threads_arg;	/**< @brief Maximum number of threads (default='1').  */
+  char * threads_orig;	/**< @brief Maximum number of threads original value given at command line.  */
+  const char *threads_help; /**< @brief Maximum number of threads help description.  */
+  
+  unsigned int help_given ;	/**< @brief Whether help was given.  */
+  unsigned int version_given ;	/**< @brief Whether version was given.  */
+  unsigned int database_given ;	/**< @brief Whether database was given.  */
+  unsigned int datasets_given ;	/**< @brief Whether datasets was given.  */
+  unsigned int dataname_given ;	/**< @brief Whether dataname was given.  */
+  unsigned int variances_given ;	/**< @brief Whether variances was given.  */
+  unsigned int port_given ;	/**< @brief Whether port was given.  */
+  unsigned int timeout_given ;	/**< @brief Whether timeout was given.  */
+  unsigned int threads_given ;	/**< @brief Whether threads was given.  */
+
+} ;
+
+/** @brief The additional parameters to pass to parser functions
+ *
+ *  All fields are int flags; the default of each one is quoted in its
+ *  field comment. */
+struct cmdline_parser_params
+{
+  int override; /**< @brief whether to override possibly already present options (default 0) */
+  int initialize; /**< @brief whether to initialize the option structure gengetopt_args_info (default 1) */
+  int check_required; /**< @brief whether to check that all required options were provided (default 1) */
+  int check_ambiguity; /**< @brief whether to check for options already specified in the option structure gengetopt_args_info (default 0) */
+  int print_errors; /**< @brief whether getopt_long should print an error message for a bad option (default 1) */
+} ;
+
+/** @brief the purpose string of the program */
+extern const char *gengetopt_args_info_purpose;
+/** @brief the usage string of the program */
+extern const char *gengetopt_args_info_usage;
+/** @brief all the lines making the help output */
+extern const char *gengetopt_args_info_help[];
+
+/**
+ * The command line parser
+ * @param argc the number of command line options
+ * @param argv the command line options
+ * @param args_info the structure where option information will be stored
+ * @return 0 if everything went fine, NON 0 if an error took place
+ */
+int cmdline_parser (int argc, char * const *argv,
+  struct gengetopt_args_info *args_info);
+
+/**
+ * The command line parser (version with additional parameters - deprecated)
+ * @param argc the number of command line options
+ * @param argv the command line options
+ * @param args_info the structure where option information will be stored
+ * @param override whether to override possibly already present options
+ * @param initialize whether to initialize the option structure my_args_info
+ * @param check_required whether to check that all required options were provided
+ * @return 0 if everything went fine, NON 0 if an error took place
+ * @deprecated use cmdline_parser_ext() instead
+ */
+int cmdline_parser2 (int argc, char * const *argv,
+  struct gengetopt_args_info *args_info,
+  int override, int initialize, int check_required);
+
+/**
+ * The command line parser (version with additional parameters)
+ * @param argc the number of command line options
+ * @param argv the command line options
+ * @param args_info the structure where option information will be stored
+ * @param params additional parameters for the parser
+ * @return 0 if everything went fine, NON 0 if an error took place
+ */
+int cmdline_parser_ext (int argc, char * const *argv,
+  struct gengetopt_args_info *args_info,
+  struct cmdline_parser_params *params);
+
+/**
+ * Save the contents of the option struct into an already open FILE stream.
+ * @param outfile the stream where to dump options
+ * @param args_info the option struct to dump
+ * @return 0 if everything went fine, NON 0 if an error took place
+ */
+int cmdline_parser_dump(FILE *outfile,
+  struct gengetopt_args_info *args_info);
+
+/**
+ * Save the contents of the option struct into a (text) file.
+ * This file can be read by the config file parser (if generated by gengetopt)
+ * @param filename the file where to save
+ * @param args_info the option struct to save
+ * @return 0 if everything went fine, NON 0 if an error took place
+ */
+int cmdline_parser_file_save(const char *filename,
+  struct gengetopt_args_info *args_info);
+
+/**
+ * Print the help
+ */
+void cmdline_parser_print_help(void);
+/**
+ * Print the version
+ */
+void cmdline_parser_print_version(void);
+
+/**
+ * Initializes all the fields of a cmdline_parser_params structure
+ * to their default values
+ * @param params the structure to initialize
+ */
+void cmdline_parser_params_init(struct cmdline_parser_params *params);
+
+/**
+ * Allocates dynamically a cmdline_parser_params structure and initializes
+ * all its fields to their default values
+ * @return the created and initialized cmdline_parser_params structure
+ */
+struct cmdline_parser_params *cmdline_parser_params_create(void);
+
+/**
+ * Initializes the passed gengetopt_args_info structure's fields
+ * (also set default values for options that have a default)
+ * @param args_info the structure to initialize
+ */
+void cmdline_parser_init (struct gengetopt_args_info *args_info);
+/**
+ * Deallocates the string fields of the gengetopt_args_info structure
+ * (but does not deallocate the structure itself)
+ * @param args_info the structure to deallocate
+ */
+void cmdline_parser_free (struct gengetopt_args_info *args_info);
+
+/**
+ * The config file parser (deprecated version)
+ * @param filename the name of the config file
+ * @param args_info the structure where option information will be stored
+ * @param override whether to override possibly already present options
+ * @param initialize whether to initialize the option structure my_args_info
+ * @param check_required whether to check that all required options were provided
+ * @return 0 if everything went fine, NON 0 if an error took place
+ * @deprecated use cmdline_parser_config_file() instead
+ */
+int cmdline_parser_configfile (char * const filename,
+  struct gengetopt_args_info *args_info,
+  int override, int initialize, int check_required);
+
+/**
+ * The config file parser
+ * @param filename the name of the config file
+ * @param args_info the structure where option information will be stored
+ * @param params additional parameters for the parser
+ * @return 0 if everything went fine, NON 0 if an error took place
+ */
+int cmdline_parser_config_file (char * const filename,
+  struct gengetopt_args_info *args_info,
+  struct cmdline_parser_params *params);
+
+/**
+ * Checks that all the required options were specified
+ * @param args_info the structure to check
+ * @param prog_name the name of the program that will be used to print
+ *   possible errors
+ * @return 0 if everything went fine, NON 0 if an error took place
+ */
+int cmdline_parser_required (struct gengetopt_args_info *args_info,
+  const char *prog_name);
+
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+#endif /* CMDLINE_H */

File tools/DataServer/stdafx.cpp

+/*****************************************************************************
+* This file is provided under the Creative Commons Attribution 3.0 license.
+*
+* You are free to share, copy, distribute, transmit, or adapt this work
+* PROVIDED THAT you attribute the work to the authors listed below.
+* For more information, please see the following web page:
+* http://creativecommons.org/licenses/by/3.0/
+*
+* This file is a component of the Sleipnir library for functional genomics,
+* authored by:
+* Curtis Huttenhower (chuttenh@princeton.edu)
+* Mark Schroeder
+* Maria D. Chikina
+* Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
+*
+* If you use this library, the included executable tools, or any related
+* code in your work, please cite the following publication:
+* Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
+* Olga G. Troyanskaya.
+* "The Sleipnir library for computational functional genomics"
+*****************************************************************************/
+#include "stdafx.h"
+
+/*!
+ * \page NetworkRanker NetworkRanker
+ * 
+ * NetworkRanker can replace the DAT mode of \ref Combiner.  It is faster, but achieves this by making some
+ * important assumptions.  NetworkRanker uses the mean method of combining and assumes that all gene pairs
+ * are present in all DABs being combined. Therefore, NetworkRanker should only be used if these assumptions
+ * are met.
+ * 
+ * \section sec_usage Usage
+ * 
+ * \subsection ssec_usage_basic Basic Usage
+ * 
+ * Create a new DAB file from a directory of existing DAB files by calculating the mean for each gene pair
+ * across all of the existing DAB files.
+ * 
+ * \code
+ * NetworkRanker -d <directory of dabs> -o <combined.dab>
+ * \endcode
+ * 
+ * \subsection ssec_usage_detailed Detailed Usage
+ * 
+ * \include NetworkRanker/NetworkRanker.ggo
+ * 
+ * <table><tr>
+ *	<th>Flag</th>
+ *	<th>Default</th>
+ *	<th>Type</th>
+ *	<th>Description</th>
+ * </tr><tr>
+ * 	<td>-d</td>
+ * 	<td>None</td>
+ * 	<td>DAT/DAB Directory</td>
+ * 	<td>Input directory (must only contain input files as DAT/DAB).</td>
+ * </tr><tr>
+ *	<td>-m</td>
+ *	<td>on</td>
+ *	<td>Flag</td>
+ *	<td>Map gene index among the network dabs to combine (should be used when the gene indices are not identical among network dabs).</td>
+ * </tr><tr>
+ *	<td>-o</td>
+ *	<td>None</td>
+ *	<td>DAB file</td>
+ *	<td>Output file for combined network.</td>
+ * </tr></table>
+ */

File tools/DataServer/stdafx.h

+/*****************************************************************************
+* This file is provided under the Creative Commons Attribution 3.0 license.
+*
+* You are free to share, copy, distribute, transmit, or adapt this work
+* PROVIDED THAT you attribute the work to the authors listed below.
+* For more information, please see the following web page:
+* http://creativecommons.org/licenses/by/3.0/
+*
+* This file is a component of the Sleipnir library for functional genomics,
+* authored by:
+* Curtis Huttenhower (chuttenh@princeton.edu)
+* Mark Schroeder
+* Maria D. Chikina
+* Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
+*
+* If you use this library, the included executable tools, or any related
+* code in your work, please cite the following publication:
+* Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
+* Olga G. Troyanskaya.
+* "The Sleipnir library for computational functional genomics"
+*****************************************************************************/
+#ifndef STDAFX_H
+#define STDAFX_H
+
+#include <fstream>
+#include <algorithm>
+
+#ifdef _MSC_VER
+#include <io.h>
+#include <winsock2.h>
+#else // _MSC_VER
+#include <arpa/inet.h>
+#include <netinet/in.h>
+
+#define SOCKET		int
+#endif // _MSC_VER
+
+#include <pthread.h>
+
+using namespace std;
+
+#include "dataset.h"
+#include "database.h"
+#include "genome.h"
+#include "meta.h"
+#include "pcl.h"
+#include "statistics.h"
+using namespace Sleipnir;
+
+#include "cmdline.h"
+#include "server.h"
+#include "serverclient.h"
+
+typedef int (*TPFnCombiner)( const gengetopt_args_info& ); // pointer to a function that takes the parsed command-line options and returns an int
+
+#endif // STDAFX_H

File tools/DataServer/test.py

+import socket
+import struct
+
+import logging
+logger = logging.getLogger(__name__)
+
+import operator
+import sys
+import numpy
+
+class DataServer:
+
+    SEARCH, QUERY = range(2)
+
+
+    def __init__(self, gidx, didx, ip = '127.0.0.1', port = 1234):
+        self.ip = ip
+        self.port = port
+        self.gidx = gidx;
+        self.didx = didx;
+
+    def open_socket(self):
+        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+        s.connect((self.ip, self.port))
+        return s
+
+    def close_socket(self,s):
+        s.shutdown(socket.SHUT_WR)
+        s.close()
+
+    def search(self, cut, exp, genes = []):
+        s = self.open_socket()
+
+        size = 1 + 4 + 4 + 4 # opcode + dataset id + cut + exp
+        size += 4*len(genes)
+
+        size = struct.pack('<i', size)
+        s.send(size)
+
+        opcode = struct.pack('<b', self.SEARCH)
+        s.send(opcode)
+
+        did = struct.pack('<i', 537)
+        s.send(did)
+
+        params = struct.pack('<ff', cut, exp)
+        s.send(params)
+
+        gene = struct.pack('<'+'i'*len(genes), *genes)
+        s.send(gene)
+        s.shutdown(socket.SHUT_WR)
+
+        scores = []
+        result = s.recv(4)
+        res_len = struct.unpack('<i', result)[0]
+
+        # Get all bytes until finished
+        result = s.recv(res_len)
+        while len(result) < res_len:
+            result += s.recv(res_len)
+
+        scores = struct.unpack('f'*(res_len/4), result)
+
+        genes = scores[0:len(gidx)]
+        dsets = scores[len(gidx):len(scores)]
+        #print len(genes), len(dsets), genes[0:20], dsets[0:20]
+
+        s.close()
+        return (genes,dsets)
+
+
+if __name__ == '__main__':
+    from optparse import OptionParser
+
+    usage = "usage: %prog [options]"
+    parser = OptionParser(usage, version="%prog dev-unreleased")
+    parser.add_option("-I", "--IP-address",dest="ip", default='127.0.0.1', help="IP address of BNServer instance")
+    parser.add_option("-p", "--port", dest="port", default=1234, help="Port number of BNServer instance", type=int)
+    parser.add_option("-x", "--dataset-id", dest="did", default=0, help="Dataset ID", type=int)
+    parser.add_option("-d", "--datasets", dest="dset", help="File of dataset names", metavar="FILE")
+    parser.add_option("-g", "--gene-file", dest="gene_file", help="File of gene names", metavar="FILE")
+    parser.add_option("-q", "--gene-query-file", dest="query_file", help="File of gene names", metavar="FILE")
+
+
+    (options, args) = parser.parse_args()
+
+    genef = open(options.gene_file)
+    gidx = []
+    gidx_dict = {}
+    for l in genef:
+        (idx, gene) = l.strip().split()
+        gidx.append((int(idx)-1, gene))
+        gidx_dict[gene] = int(idx) - 1
+    genef.close()
+
+    dsf = open(options.dset)
+    didx = []
+    for l in dsf:
+        (idx, ds, pfm) = l.strip().split()
+        didx.append((int(idx)-1, ds))
+    dsf.close()
+
+    qf = open(options.query_file)
+    query = set()
+    query_names = set()
+    for l in qf:
+        if l.strip() in gidx_dict:
+            query.add(gidx_dict[l.strip()])
+            query_names.add(l.strip())
+
+    ds  = DataServer(gidx, didx, options.ip, options.port)
+    genes,dsets = ds.search(.5, 8, list(query))
+
+    for ((idx,name),score) in zip(gidx,genes)[0:10]:
+        print name + '\t' + ('1' if name in query_names else '-1') + '\t' + str(score)

File tools/Makefile.am

 	  Data2Features \
 	  Data2Sql \
 	  DataDumper \
+	  DataServer \
 	  Distancer \
 	  Explainer \
 	  Filterer \
 	  Matcher \
 	  MIer \
 	  MIed \
+	  NetworkRanker \
 	  Normalizer \
 	  Orthologer \
 	  Overlapper \

File tools/NetworkRanker/NetworkRanker.cpp

+/*****************************************************************************
+ * This file is provided under the Creative Commons Attribution 3.0 license.
+ *
+ * You are free to share, copy, distribute, transmit, or adapt this work
+ * PROVIDED THAT you attribute the work to the authors listed below.
+ * For more information, please see the following web page:
+ * http://creativecommons.org/licenses/by/3.0/
+ *
+ * This file is a component of the Sleipnir library for functional genomics,
+ * authored by:
+ * Curtis Huttenhower (chuttenh@princeton.edu)
+ * Mark Schroeder
+ * Maria D. Chikina
+ * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
+ *
+ * If you use this library, the included executable tools, or any related
+ * code in your work, please cite the following publication:
+ * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
+ * Olga G. Troyanskaya.
+ * "The Sleipnir library for computational functional genomics"
+ *****************************************************************************/
+#include "stdafx.h"
+#include <iostream>
+#include <fstream>
+#include <cmath>
+#include <limits>
+#include <algorithm>
+
+
+/**
+ * Comparator over indices: two positions are ordered by the values found at
+ * those positions in the referenced vector, so an index array can be sorted
+ * without moving the data itself.
+ *
+ * Only a reference to the vector is kept, so the vector must outlive the
+ * comparator.
+ */
+template<class tType>
+struct SCompareRank {
+    const vector<tType>&    m_vecData;
+
+    SCompareRank( const vector<tType>& vecSource ) :
+        m_vecData( vecSource ) { }
+
+    bool operator()( size_t iLeft, size_t iRight ) const {
+        const tType&    valLeft = m_vecData[ iLeft ];
+        const tType&    valRight = m_vecData[ iRight ];
+
+        return ( valLeft < valRight ); }
+};
+
+
+double WilcoxonRankSum( const vector<float> vecdValues, const vector<float> vecAnswers ) {
+    std::vector<size_t> veciIndices;
+    std::vector<float> vecdRanks;
+    size_t              iIndex, iCount, iPos, iNeg, i, j; 
+    double  dSum, d;