Commits

Qian Zhu committed 4907210

Update Seek developer docs

  • Participants
  • Parent commits 13eb047

Comments (0)

Files changed (9)

src/seekcentral.h

 namespace Sleipnir {
 
 /*!
- * \brief The main search algorithm that is used by Seek
+ * \brief A suite of search algorithms that are supported by Seek
  *
- * The Seek search algorithm performs the coexpression search of the user's 
+ * The Seek search algorithms perform the coexpression search of the user's
  * query genes in a large compendium of microarray datasets. 
- * The output of the search algorithm is a ranking of genes based on their
+ * The output of the search algorithms is a ranking of genes based on their
  * gene score, where the gene score represents the overall weighted coexpression
  * to the query genes. 
  *
- * One of the first steps in the algorithm is to weight
+ * One of the first steps in a search is to weight
  * the datasets in such a way as to prioritize informative datasets.
  * Then, with the dataset weight generated, the final gene-score is given by:
  * \f[FS(g, Q)=\alpha\sum_{d \in D}{w_d \cdot s_d(g, Q)}\f]
  * \li Order-statistics (CSeekCentral::ORDER_STATISTICS): the algorithm used in MEM.
  * (Adler et al, Genome Biology 2009)
  *
- * CSeekCentral can accept multiple queries as the search input.
+ * CSeekCentral can handle multiple queries at a time, but the search parameters must remain
+ * the same for all queries.
  */
 class CSeekCentral{
 public:
 	 * \param RATE The weighting parameter \a p
 	 *
 	 * \remark The random number generator is used for partitioning the query.
-	 * \remark Assumes that the CSeekCentral::Initialize() has been called to prepare for the search instance.
+	 * \remark Assumes that the CSeekCentral::Initialize() has been called.
 	 */
 	bool CVSearch(gsl_rng*, const CSeekQuery::PartitionMode&, const ushort&, const float&);
 
 	 * standard gene-set.
 	 *
 	 * \remark The random number generator is used for partitioning the query.
-	 * \remark Assumes that the CSeekCentral::Initialize() has been called to prepare for the search instance.
+	 * \remark Assumes that the CSeekCentral::Initialize() has been called.
 	 */
 	bool CVCustomSearch(const vector< vector<string> > &, gsl_rng*,
 		const CSeekQuery::PartitionMode&, const ushort&, const float&);
 
 	/*!
 	 * \brief Run Seek with the equal dataset weighting
-	 * \remark Assumes that the CSeekCentral::Initialize() has been called to prepare for the search instance.
+	 * \remark Assumes that the CSeekCentral::Initialize() has been called.
 	 */
 	bool EqualWeightSearch();
 
 	 * where \a Q is the number of queries, \a D is the number of datasets. \c weights[i][j]
 	 * stores the weight of dataset \a j in query \a i.
 	 *
-	 * \remark Assumes that the CSeekCentral::Initialize() has been called to prepare for the search instance.
+	 * \remark Assumes that the CSeekCentral::Initialize() has been called.
 	 */
 	bool WeightSearch(const vector<vector<float> >&);
 
 	 *
 	 * Same as CSeekCentral::WeightSearch(), except that the user-given weights are the query gene expression variances.
 	 *
-	 * \remark Assumes that the CSeekCentral::Initialize() has been called to prepare for the search instance.
+	 * \remark Assumes that the CSeekCentral::Initialize() has been called.
 	 */
 	bool VarianceWeightSearch();
 
 	/*!
 	 * \brief Run Seek with the order statistics dataset weighting algorithm
 	 *
-	 * \remark Assumes that the CSeekCentral::Initialize() has been called to prepare for the search instance.
+	 * \remark Assumes that the CSeekCentral::Initialize() has been called.
 	 */
 	bool OrderStatistics();
 

src/seekdataset.h

 
 namespace Sleipnir {
 /*!
- * \brief A microarray dataset structure
+ * \brief Representation of a microarray dataset that is used by Seek
  *
  * A \c CSeekDataset encapsulates the following information about the dataset:
  *
 	 * \param quant The discretization function
 	 * \param iRows The number of rows for the \a correlation matrix
 	 * \param iColumns The number of columns for the \a correlation matrix
-	 * \param bSubtractAvg Whether or not to subtract \a correlation by the dataset average
-	 * \param bSubtractPlatformAvg Whether or not to subtract \a correlation by the platform average
-	 * \param logit Whether or not to apply the logit transform on \a correlations
-	 * \param bCorrelation Whether or not to use Pearson
+	 * \param bSubtractAvg If true, subtract the \a correlation by the dataset average
+	 * \param bSubtractPlatformAvg If true, subtract the \a correlation by the platform average
+	 * \param logit If true, apply the logit transform on \a correlations
+	 * \param bCorrelation If true, use Pearson
 	 * \param cutoff Apply a hard cutoff on \a correlations
-	 * \param bRandom Shuffle the \a correlation vector
-	 * \param rand Random generator for the shuffling operation above
+	 * \param bRandom If true, shuffle the \a correlation vector
+	 * \param rand The random generator for the shuffling operation above
 	 * \remarks
 	 * The discretized \a correlation in the matrix \c rD is bounded by 0 to 255 (the limit of
 	 * \c unsigned \c char). The parameter \c quant specifies how a \a correlation is

src/seekevaluate.cpp

 	return true;
 }
 
-float CSeekPerformanceMeasure::RBPRateConvert(const float &RBP, 
-	const ushort &num){
-
-	float x = RBP;
-	if(RBP>=0.989 && RBP<=0.991){
-		if(num>=17600)	x = RBP;
-		else if(num>=15840) x = 0.989;
-		else if(num>=14080) x = 0.988;
-		else if(num>=12320) x = 0.986;
-		else if(num>=10560) x = 0.983;
-		else if(num>=8800) x = 0.98;
-		else if(num>=7040) x = 0.975;
-		else if(num>=5280) x = 0.97;
-		else if(num>=3520) x = 0.955;
-		else if(num>=1760) x = 0.92;
-		else x = 0.90;
-	}
-	return x;
-}			
-
 bool CSeekPerformanceMeasure::AveragePrecision(
 	const vector<unsigned short> &rank, float &ap,
 	const vector<char> &mask, const vector<char> &gold,

src/seekevaluate.h

     }
 };
 
+/*!
+ * \brief Evaluation metrics for a rank-list
+ *
+ * Provide static utility functions for evaluating a ranking of genes with the user-given gold standard
+ * gene-set. The typical use of such functions is in weighting datasets. Generally speaking, each dataset
+ * is weighted by how well the query genes are able to retrieve each other in the dataset.
+ * In a cross-validation, we use a part of the query genes to try to retrieve the other remaining
+ * query genes. These functions provide a measure of precision for each query retrieval, which eventually
+ * forms the basis of the dataset weight.
+ *
+ */
 class CSeekPerformanceMeasure{
 public:
+	/*!
+	 * \brief Sort a gene ranking by the gene score
+	 *
+	 * \param rank The vector of gene-scores to be sorted. Gene scores are inserted to this vector based
+	 * on their gene IDs, which are a value from 0 to the size of the vector.
+	 * \param mapG The gene presence map
+	 * \param a The output, which is a vector of (gene ID, gene score) pairs that are sorted by score
+	 * \param top If \c X, sort only the top \c X elements. If 0, then sort the entire vector.
+	 *
+	 * The struct \c AResult represents a (gene ID, gene score) pair. This function sorts the vector of
+	 * \c AResult in the descending order of the gene score.
+	 */
 	static bool SortRankVector(const vector<unsigned short> &rank,
 		const CSeekIntIntMap &mapG, vector<AResult> &a,
 		const ushort top = 0);
+
+	/*!
+	 * \brief Calculate the rank-biased precision for a gene ranking
+	 *
+	 * \param rate The parameter \a p in the RBP formula
+	 * \param rbp The calculated RBP score, the output
+	 * \param mask The genes in the ranking to be skipped over (typically the query genes)
+	 * \param gold The gold-standard genes
+	 * \param mapG The gene presence map. Genes that are not present in the dataset are skipped over.
+	 * \param sing The sorted vector of (gene ID, gene score) pairs
+	 * \param rank The gene-score vector
+	 * \param top If \c X, sort only the top \c X elements. If 0, then sort the entire vector.
+	 *
+	 * First calls the CSeekPerformanceMeasure::SortRankVector() with the arguments \c rank and \c top,
+	 * in order to sort the gene-scores. Then with the sorted gene-ranking returned to \c sing, it calculates
+	 * the rank-biased precision.
+	 *
+	 * \remarks The RBP formula is given by:
+	 * \f[RBP=\sum_{g \in U}{(1-p)p^{rank(g)}}\f]
+	 * where \f$U\f$ is the gold standard gene-set, \f$p\f$ is the emphasis on ranks,
+	 * and \f$rank(g)\f$ is the position of \f$g\f$ in the ranking.
+	 * \f$p\f$ is typically set to 0.95 - 0.99. The recommended value is 0.99.
+	 * For more information, please read (Moffat et al 2008).
+	 */
 	/* designed specifically for a CSeekDataset */
 	/* mask: the query genes which are not included in RBP calculation */
 	static bool RankBiasedPrecision(const float &rate,
 		/* optional */
 		const ushort top = 0);
 
-	static float RBPRateConvert(const float &RBP, const ushort &num);
-
+	/*!
+	 * \brief Calculate the average precision for a gene ranking
+	 *
+	 * \param rank The gene-score vector
+	 * \param ap The calculated average precision
+	 * \param mask The genes in the ranking to be skipped over (typically the query genes)
+	 * \param gold The gold-standard genes
+	 * \param mapG The gene presence map. Genes that are not present in the dataset are skipped over.
+	 * \param ar The sorted vector of (gene ID, gene score) pairs
+	 */
 	static bool AveragePrecision(
 		const vector<unsigned short> &rank, float &ap,
 		const vector<char> &mask, const vector<char> &gold,

src/seeknetwork.h

 
 #include "seekbasic.h"
 
-//additinal network include files
+//additional network include files
 #include <sys/socket.h>
 #include <sys/types.h>
 #include <netinet/in.h>
 
 namespace Sleipnir {
 
+/*!
+ * \brief Utilities for sending and receiving data over the network
+ *
+ * This class provides static utility functions to facilitate the exchange of messages between the Seek
+ * client and the Seek server. In order to allow this exchange to occur, all messages
+ * must conform to a uniform standard.
+ *
+ * On the sending end, all outgoing messages must first begin with a message header that specifies the
+ * length and the type of the message. Then the body of the message follows.
+ *
+ * The supported outgoing messages are: an array of \c chars (such as a \c string), an array of \c floats.
+ * The outgoing message is structured as follows:
+ * \li Byte #1-4: An \c unsigned \c integer that specifies the size of one element (\a S). (1 for a \c char, 4 for a \c float)
+ * \li Byte #5-8: An \c unsigned \c integer that specifies the total number of elements to be sent (\a N). (1 for a single-value,
+ * otherwise the size of the array)
+ * \li Byte #9 and onward: \a S times \a N bytes specifying the array content
+ *
+ * On the receiving end, CSeekNetwork also supports the receiving of a \c char array (or a \c string) or a \c float array.
+ *
+ * In order to be properly recognized, the incoming message should be structured as follows:
+ *
+ * For a \c char array:
+ * \li Byte #1-4: A \c signed \c integer that specifies the length of the \c char array to receive (\a NC)
+ * \li Byte #5 and onward: \a NC bytes specifying the \c char array.
+ *
+ * For a \c float array:
+ * \li Byte #1-4: A \c signed \c integer that specifies the length of the \c float array to receive (\a NF)
+ * \li Byte #5 and onward: \a NF times 4 bytes specifying the \c float array.
+ *
+ * IMPORTANT:
+ * <b>
+ * Outgoing messages are always encoded using bytes in the Little Endian order.
+ *
+ * For an incoming message to be properly recognized, the message should also be encoded with bytes in the Little Endian order.
+ * </b>
+ */
 class CSeekNetwork{
 public:
+	/*!
+	 * \brief Send a string
+	 *
+	 * Encodes an outgoing message and sends it to the client
+	 *
+	 * \param new_fd The client socket
+	 * \param str The string to be sent to the client
+	 *
+	 * \remarks Assumes that the client connection has been established.
+	 */
 	static int Send(int, const string&);
+
+	/*!
+	 * \brief Send a float array
+	 *
+	 * Encodes an outgoing message and sends it to the client
+	 *
+	 * \param new_fd The client socket
+	 * \param str The array of floats to be sent to the client
+	 *
+	 * \remarks Assumes that the client connection has been established.
+	 */
 	static int Send(int, const vector<float>&);
+
+	/*!
+	 * \brief Low-level send function
+	 *
+	 * \param new_fd The client socket
+	 * \param c The message
+	 * \param size The message length
+	 * \return -1 if an error occurs or \c size if the sending is successful
+	 *
+	 * \remarks Assumes that the client connection has been established.
+	 */
 	static int Send(int, char*, int);
+
+	/*!
+	 * \brief Clear a char array
+	 *
+	 * Clears a char array by zeroing all bytes
+	 *
+	 * \param b The char array
+	 * \param size The size of the char array
+	 */
 	static void Clear(char*, int);
+
+	/*!
+	 * \brief Copy a char array
+	 *
+	 * Copies the entire source array (0...N) to the destination array beginning at the index \c beg
+	 *
+	 * \param d The destination
+	 * \param s The source
+	 * \param beg The position on the destination array where the pasting starts
+	 * \param num The size of the source array
+	 * \return \c beg + \c num
+	 */
 	static int Copy(char*, char*, int, int);
+
+	/*!
+	 * \brief Receive a string
+	 *
+	 * Receive a string from the client
+	 *
+	 * \param new_fd The client socket
+	 * \param s The string that receives the message
+	 *
+	 * \remarks Assumes that the client connection has been established.
+	 */
 	static int Receive(int, string&);
+
+	/*!
+	 * \brief Receive a float array
+	 *
+	 * Receive a float array from the client
+	 *
+	 * \param new_fd The client socket
+	 * \param f The float array that receives the message
+	 *
+	 * \remarks Assumes that the client connection has been established.
+	 */
 	static int Receive(int, vector<float>&);
 };
 

src/seekplatform.h

 
 /*!
  * \brief
- * A microarray platform that is used by Seek
+ * Representation of a microarray platform that is used by Seek
  *
  * Contains the gene \a correlation average and standard deviation for a given platform
  *
     
 /*!
  * \brief
- * A query structure that is used by Seek
+ * Representation of a query gene-set that is used by Seek
  *
  * Includes vectors for storing the query genes, and utilities for partitioning query
  * genes into specified number of groups

src/seekreader.cpp

 	return false;
 }
 
-bool CSeekTools::CreatePresenceVector(const vector<ushort> &srcData,
-	vector<char> &destData, const ushort &iSize){
-	ushort i;
-	destData.clear();
-	destData.resize(iSize);
-	for(i=0; i<iSize; i++) destData[i] = 0;
-	for(i=0; i<srcData.size(); i++) destData[srcData[i]] = 1;
-	return true;
-}
-
 bool CSeekTools::ReadDatabaselets(const CDatabase &DB, 
 	const vector< vector<string> > &vecstrAllQuery,
 	vector<CSeekDataset*> &vc, 
 		return false;
 	}
 
-	fprintf(stderr, "Reading %lu gene cdatabaselets and doing query centric\n",
+	fprintf(stderr, "Reading %lu query genes' correlations\n",
 		allQ.size());
 	system("date +%s%N 1>&2");
 	if(bNetwork && CSeekNetwork::Send(iClient, "Reading " + 
 		CSeekTools::ConvertInt(allQ.size()) + 
-		" gene cdatabaselets and doing query centric")==-1){
+		" query genes' correlations")==-1){
 		fprintf(stderr, "Error sending client message\n");
 		return false;
 	}
 		Qi.clear();
 	}
 
-	fprintf(stderr, "Finished reading databaselets and query centric\n");
+	fprintf(stderr, "Finished reading query genes' correlations\n");
 	system("date +%s%N 1>&2");
 	if(bNetwork && CSeekNetwork::Send(iClient, 
 		"Finished reading databaselets and query centric")==-1){
 	return true;
 }
 	
-bool CSeekTools::ReadQuantFile(const string &strFile, vector<float> &quant){
-	return CSeekTools::ReadQuantFile(strFile.c_str(), quant);
+bool CSeekTools::ReadQuantFile(const string &strFile, vector<float> &quant,
+	const int lineSize){
+	return CSeekTools::ReadQuantFile(strFile.c_str(), quant, lineSize);
 }
 
-bool CSeekTools::ReadQuantFile(const char *file, vector<float> &quant){
+bool CSeekTools::ReadQuantFile(const char *file, vector<float> &quant, const int lineSize){
 	ifstream ifsm;
 	ifsm.open(file);
-	char acBuffer[5000];
-	ushort c_iBuffer = 5000;
+	char acBuffer[lineSize];
+	ushort c_iBuffer = lineSize;
 	vector<string> vecstrLine;
 
 	ifsm.getline(acBuffer, c_iBuffer -1);
 
 bool CSeekTools::ReadPlatforms(const string &strPlatformDirectory,
 		vector<CSeekPlatform> &plat, vector<string> &vecstrPlatforms,
-		map<string, ushort> &mapstriPlatforms){
+		map<string, ushort> &mapstriPlatforms, const int lineSize){
 	return CSeekTools::ReadPlatforms(strPlatformDirectory.c_str(), plat,
-		vecstrPlatforms, mapstriPlatforms);
+		vecstrPlatforms, mapstriPlatforms, lineSize);
 }
 
 bool CSeekTools::ReadPlatforms(const char *plat_dir,
 		vector<CSeekPlatform> &plat, vector<string> &vecstrPlatforms,
-		map<string, ushort> &mapstriPlatforms){
+		map<string, ushort> &mapstriPlatforms, const int lineSize){
 
 	string strPlatformDirectory = plat_dir;
 	string strAvgFile = strPlatformDirectory + "/" +
 	mapstriPlatforms.clear();
 	ifstream ifsm;
 	ifsm.open(strPlatformOrderFile.c_str());
-	char acBuffer[1024];
-	ushort c_iBuffer = 1024;
+	char acBuffer[lineSize];
+	ushort c_iBuffer = lineSize;
 	i = 0;
 	while(!ifsm.eof()){
 		ifsm.getline(acBuffer, c_iBuffer -1);
 }
 
 bool CSeekTools::ReadListTwoColumns(const string &strFile,
-		vector<string> &vecstrList1, vector<string> &vecstrList2){
+		vector<string> &vecstrList1, vector<string> &vecstrList2, const int lineSize){
 	return CSeekTools::ReadListTwoColumns(strFile.c_str(),
-		vecstrList1, vecstrList2);
+		vecstrList1, vecstrList2, lineSize);
 }
 
 bool CSeekTools::ReadListTwoColumns(const char *file,
-		vector<string> &vecstrList1, vector<string> &vecstrList2){
+		vector<string> &vecstrList1, vector<string> &vecstrList2,
+		const int lineSize){
 	ifstream ifsm;
 	ifsm.open(file);
 	if(!ifsm.is_open()){
 		fprintf(stderr, "Error opening file %s\n", file);
 		return false;
 	}
-	char acBuffer[1024];
-	ushort c_iBuffer = 1024;
+	char acBuffer[lineSize];
+	ushort c_iBuffer = lineSize;
 	vecstrList1.clear();
 	vecstrList2.clear();
 
 }
 
 bool CSeekTools::ReadListOneColumn(const string &strFile,
-	vector<string> &vecstrList, CSeekStrIntMap &mapstriList){
+	vector<string> &vecstrList, CSeekStrIntMap &mapstriList, const int lineSize){
 	return CSeekTools::ReadListOneColumn(strFile.c_str(),
-		vecstrList, mapstriList);
+		vecstrList, mapstriList, lineSize);
 }
 
 
 bool CSeekTools::ReadListOneColumn(const char *file,
-	vector<string> &vecstrList, CSeekStrIntMap &mapstriList){
+	vector<string> &vecstrList, CSeekStrIntMap &mapstriList, const int lineSize){
 	ifstream ifsm;
 	ifsm.open(file);
 	if(!ifsm.is_open()){
 		return false;
 	}
 
-	char acBuffer[1024];
-	ushort c_iBuffer = 1024;
+	char acBuffer[lineSize];
+	ushort c_iBuffer = lineSize;
 	vecstrList.clear();
 
 	int i = 0;
 }
 
 bool CSeekTools::ReadMultipleQueries(const string &strFile,
-	vector< vector<string> > &qList){
-	return CSeekTools::ReadMultipleQueries(strFile.c_str(), qList);
+	vector< vector<string> > &qList, const int lineSize){
+	return CSeekTools::ReadMultipleQueries(strFile.c_str(), qList, lineSize);
 }
 
 bool CSeekTools::ReadMultipleQueries(const char *file,
-	vector< vector<string> > &qList){
+	vector< vector<string> > &qList, const int lineSize){
 	qList.clear();
 	FILE *infile;
 	if((infile=fopen(file, "r"))==NULL){
 	}
 
 	char *acBuffer;
-	int MAX_CHAR_PER_LINE = 1024;
+	int MAX_CHAR_PER_LINE = lineSize;
 	int lineLen = MAX_CHAR_PER_LINE;
 	acBuffer = (char*)malloc(lineLen);
 	while(fgets(acBuffer, lineLen, infile)!=NULL){
 	}
 
 	char *acBuffer= (char*)malloc(lineSize);
-	//char acBuffer[1024];
 	int c_iBuffer = lineSize;
 	int i = 0;
-	//string sBuffer;
-	//getline(ifsm, sBuffer);
 
 	ifsm.getline(acBuffer, c_iBuffer -1);
 	acBuffer[c_iBuffer-1] = 0;
 	vector<string> tok;
 	CMeta::Tokenize(acBuffer, tok, " ");
-	//CMeta::Tokenize(sBuffer.c_str(), tok, " ");
 	for(i = 0; i<tok.size(); i++){
 		list.push_back(tok[i]);
 	}
 }
 
 bool CSeekTools::ReadListOneColumn(const string &strFile,
-	vector<string> &vecstrList){
-	return CSeekTools::ReadListOneColumn(strFile.c_str(), vecstrList);
+	vector<string> &vecstrList, const int lineSize){
+	return CSeekTools::ReadListOneColumn(strFile.c_str(), vecstrList, lineSize);
 }
 
 bool CSeekTools::ReadListOneColumn(const char *file,
-	vector<string> &vecstrList){
+	vector<string> &vecstrList, const int lineSize){
 	ifstream ifsm;
 	ifsm.open(file);
 
 		fprintf(stderr, "Error opening file %s\n", file);
 		return false;
 	}
-	char acBuffer[1024];
-	ushort c_iBuffer = 1024;
+	char acBuffer[lineSize];
+	ushort c_iBuffer = lineSize;
 	vecstrList.clear();
 	int i = 0;
 	while(!ifsm.eof()){
 
 namespace Sleipnir {
 
+/*!
+ * \brief A suite of file I/O and general purpose tools that are used by Seek
+ *
+ * These tools are critical for initializing the search parameters, and are highly beneficial to the routine
+ * manipulations of vectors and files.
+ *
+ * Some examples of these tools include:
+ * \li Reading different search setting files, such as the dataset mapping, the gene mapping,
+ * the list of queries, the correlation discretization file, etc.
+ * \li Loading a set of CDatabaselet from the given directory
+ * \li Reading a binary file that contains a vector of standard type elements (\c int, \c char, \c string,
+ * \c float)
+ * \li Reading a text file that contains a table with one or two columns
+ * \li Setting all elements of a given vector to the given value
+ * \li Writing a vector to a file (in binary or in text format)
+ */
 class CSeekTools{
 public:
 	/* binary */
+	/*!
+	 * \brief Read an array from a given binary file
+	 *
+	 * \param fileName The file name
+	 * \param vData The destination array
+	 * \return True if the reading is successful
+	 * \remark This function reads a one-dimensional array. The binary file needs to be
+	 * organized as follows:
+	 * 1) The first field is the size of the array, \a N (\c size_t).
+	 * 2) The second field is a set of \a N elements.
+	 */
 	template<class tType>
 	static bool ReadArray(const char *fileName, vector<tType> &vData){
 		FILE *f = fopen(fileName, "rb");
 	}
 
 	/* binary */
+	/*!
+	 * \brief Write an array in binary format
+	 *
+	 * \param fileName The file name
+	 * \param vData The source array
+	 * \return True if the writing is successful
+	 * \remark This function writes a one-dimensional array. It will write the array in the following
+	 * way:
+	 * 1) The first field is the size of the array, \a N (\c size_t).
+	 * 2) The second field is the \a N elements in the array.
+	 */
 	template<class tType>
 	static bool WriteArray(const char *fileName, const vector<tType> &vData){
 		FILE *f = fopen(fileName, "wb");
 		return true;
 	}
 
+	/*!
+	 * \brief Write an array in text format
+	 *
+	 * \param fileName The file name
+	 * \param vData The source array
+	 * \return True if the writing is successful
+	 * \remark This function writes a one-dimensional array in the text format. The array elements
+	 * are separated by spaces.
+	 */
 	template<class tType>
 	static bool WriteArrayText(const char *fileName,
 		const vector<tType> &vData){
 		return true;
 	}
 
+	/*!
+	 * \brief Write a two-dimensional array in text format
+	 *
+	 * \param fileName The file name
+	 * \param vData The source array
+	 * \return True if the writing is successful
+	 * \remark This function writes a two-dimensional array in the text format.
+	 * The rows are separated by new lines. The elements in a row are separated by spaces.
+	 */
 	template<class tType>
 	static bool Write2DArrayText(const char *fileName,
 		const vector<vector<tType> > &vData){
 		return true;
 	}
 
+	/*!
+	 * \brief Initialize a vector with a given value
+	 *
+	 * \param vData The source vector
+	 * \param iSize The number of elements that the vector should contain
+	 * \param tValue The value
+	 *
+	 * Resizes the source vector to the given size, then sets all elements in the vector
+	 * to the given value.
+	 */
 	template<class tType>
 	static bool InitVector(vector<tType> &vData, const ushort &iSize,
 		const tType &tValue) {
 		return true;
 	}
 
+	/*!
+	 * \brief Initialize a vector
+	 *
+	 * \param vData The source vector
+	 * \param iSize The number of elements that the vector should contain
+	 *
+	 * Resizes the source vector to the given size.
+	 */
 	template<class tType>
 	static bool InitVector(vector<tType> &vData, const ushort &iSize) {
 		vData.clear();
 		return true;
 	}
 
+	/*!
+	 * \brief Initialize a two-dimensional array with the given size and value
+	 *
+	 * \param iSize1 The first dimension size
+	 * \param iSize2 The second dimension size
+	 * \param tValue The value
+	 *
+	 * Creates a two-dimensional array of the given dimension, then populates
+	 * it with the given value.
+	 */
 	template<class tType>
 	static tType** Init2DArray(const size_t &iSize1, const size_t &iSize2,
 		const tType &tValue){
 		return f;
 	}
 
+	/*!
+	 * \brief Free a two-dimensional array
+	 * \param f The two-dimensional array
+	 */
 	template<class tType>
 	static void Free2DArray(tType** f){
 		free(f[0]);
 		free(f);
 	}
 
+	/*!
+	 * \brief Checks if a \c ushort value is invalid
+	 * \param v The value to be checked
+	 * A \c ushort value is invalid if it is maximum (65535).
+	 */
 	static bool IsNaN(const ushort &);
 
+	/*!
+	 * \brief Converts an integer to a string
+	 * \param number The given integer number
+	 * \return The string
+	 */
 	static string ConvertInt(const int &);
 
-	static bool CreatePresenceVector(const vector<ushort> &, vector<char> &,
-		const ushort &);
-
+	/*!
+	 * \brief Read a set of CDatabaselet from a CDatabase instance
+	 *
+	 * \param DB The CDatabase instance
+	 * \param vecstrAllQuery The list of queries
+	 * \param vc A vector of datasets (the output)
+	 * \param iClient If the network mode is enabled, the client's socket
+	 * \param bNetwork If true, the network mode is enabled
+	 *
+	 * \remarks
+	 * A CDatabaselet stores the correlation of a given gene, \a g, to all other genes in all of the
+	 * datasets. In order to perform the coexpression search, the CDatabaselets corresponding
+	 * to the query genes need to be loaded from disk. This function reads the
+	 * CDatabaselet files corresponding to the query genes.
+	 *
+	 * \remarks
+	 * Once the CDatabaselet for a query gene has been read, the next step that this function
+	 * performs is allotting the correlations to their corresponding datasets (CSeekDataset).
+	 *
+	 * \remarks
+	 * The network mode is used to relay status messages between the server and the client.
+	 *
+	 * \remarks
+	 * Assumes that the CSeekTools::LoadDatabase() has been called.
+	 */
 	static bool ReadDatabaselets(const CDatabase &, 
 		const vector< vector<string> > &, vector<CSeekDataset*> &, 
 		//network mode options
 		const int&, const bool&);
 
+	/*!
+	 * \brief Read the search setting files and load the CDatabase
+	 *
+	 * Performs the following search initializing operations:
+	 * \li Reads the gene presence files \c *.gpres, the gene averages \c *.gavg,
+	 * the gene variances \c *.gvar, and each dataset's correlation average and
+	 * standard deviation \c *.sinfo.
+	 * \li Reads the dataset-platform mapping file
+	 * \li Initializes the vector of CSeekDataset with each dataset's gene-presence, gene-averages
+	 *
+	 * \param DB The CDatabase instance
+	 * \param strPrepInputDirectory The prep directory which contains the \c *.gavg and \c *.gpres files
+	 * \param strGvarInputDirectory The directory that contains the gene variance files \c *.gvar
+	 * \param strSinfoInputDirectory The directory that contains the \c *.sinfo files
+	 * \param vecstrDatasets The dataset definition
+	 * \param mapstrstrDatasetPlatform The dataset-platform mapping
+	 * \param mapstriPlatform Platform name-platform ID mapping
+	 * \param vp The vector of CSeekPlatform
+	 * \param vc The vector of CSeekDataset, the output
+	 *
+	 */
 	static bool LoadDatabase(const CDatabase &, const string &,
 		const string &, const string &,
 		const vector<string> &, const map<string, string> &,
 		const map<string, ushort> &, vector<CSeekPlatform> &,
 		vector<CSeekDataset*> &);
 
+	/*!
+	 * \brief Read the search setting files and load the CDatabase
+	 *
+	 * Same as the previous CSeekTools::LoadDatabase() definition, except that this function
+	 * accepts string arguments as \c const \c char \c *.
+	 */
 	static bool LoadDatabase(const CDatabase &, const char *,
 		const char *, const char *,
 		const vector<string> &, const map<string, string> &,
 		const map<string, ushort> &, vector<CSeekPlatform> &,
 		vector<CSeekDataset*> &);
 
+	/*!
+	 * \brief Load a CDatabase by copying from an existing instance
+	 *
+	 * Copies the vector of initialized CSeekDataset to a new vector.
+	 * Copies the vector of initialized CSeekPlatform to a new vector.
+	 *
+	 * \param DB The CDatabase
+	 * \param vc The destination dataset vector
+	 * \param vc_src The source dataset vector
+	 * \param vp The destination platform vector
+	 * \param vp_src The source platform vector
+	 * \param vecstrDatasets The dataset definition
+	 * \param mapstrstrDatasetPlatform The dataset-platform mapping
+	 * \param mapstriPlatform Platform name-platform ID mapping
+	 */
 	static bool LoadDatabase(const CDatabase &, vector<CSeekDataset*>&,
 		const vector<CSeekDataset*>&, vector<CSeekPlatform>&, 
 		const vector<CSeekPlatform>&, const vector<string>&, 
 		const map<string,string>&, const map<string,ushort>&);
 
+	/*!
+	 * \brief Read the platforms
+	 *
+	 * Reading the platforms mainly involves reading the correlation average and
+	 * the correlation standard deviation for each platform in the database.
+	 * The purpose is to correct the platform specific biases on the correlation values.
+	 *
+	 * \param strPlatformDirectory The directory that contains the platform average and standard deviation files
+	 * \param plat The output
+	 * \param vecstrPlatforms The platform names
+	 * \param mapstriPlatform The platform name - platform ID mapping
+	 * \param lineSize The maximum characters per line in the file (default 1024)
+	 */
 	static bool ReadPlatforms(const string &strPlatformDirectory,
 		vector<CSeekPlatform> &plat, vector<string> &vecstrPlatforms,
-		map<string, ushort> &mapstriPlatforms);
+		map<string, ushort> &mapstriPlatforms, const int lineSize = 1024);
+
+	/*!
+	 * \brief Read the platforms
+	 *
+	 * This is the same as the previous CSeekTools::ReadPlatforms() declaration, except that the
+	 * accepted string arguments are of the type \c const \c char \c *.
+	 */
 	static bool ReadPlatforms(const char *plat_dir,
-			vector<CSeekPlatform> &plat, vector<string> &vecstrPlatforms,
-			map<string, ushort> &mapstriPlatforms);
+		vector<CSeekPlatform> &plat, vector<string> &vecstrPlatforms,
+		map<string, ushort> &mapstriPlatforms, const int lineSize = 1024);
 
+	/*!
+	 * \brief Read a table with one column
+	 *
+	 * Outputs the lines in the table as a vector of strings
+	 *
+	 * \param strFile The file name
+	 * \param vecstrList The output
+	 * \param mapstriList Mapping the line to its line number
+	 * \param lineSize The maximum characters per line in the file (default 1024)
+	 */
 	static bool ReadListOneColumn(const string &strFile,
-		vector<string> &vecstrList, CSeekStrIntMap &mapstriList);
+		vector<string> &vecstrList, CSeekStrIntMap &mapstriList, const int lineSize = 1024);
+
+	/*!
+	 * \brief Read a table with one column
+	 *
+	 * This is the same as the previous CSeekTools::ReadListOneColumn() declaration, except that the
+	 * accepted string arguments are of the type const char*.
+	 */
 	static bool ReadListOneColumn(const char *file,
-			vector<string> &vecstrList, CSeekStrIntMap &mapstriList);
+		vector<string> &vecstrList, CSeekStrIntMap &mapstriList, const int lineSize = 1024);
 
+	/*!
+	 * \brief Read a table with one column
+	 *
+	 * Same as the previous CSeekTools::ReadListOneColumn() declaration, except that this does not
+	 * generate the line-to-line-number mapping.
+	 */
+	static bool ReadListOneColumn(const string &strFile,
+		vector<string> &vecstrList, const int lineSize = 1024);
+
+	/*!
+	 * \brief Read a table with one column
+	 *
+	 * Same as the previous CSeekTools::ReadListOneColumn() declaration, except that this does not
+	 * generate the line-to-line-number mapping, and accepts the file name as \c const \c char \c *.
+	 */
+	static bool ReadListOneColumn(const char *file,
+		vector<string> &vecstrList, const int lineSize = 1024);
+
+	/*!
+	 * \brief Read a table with two columns
+	 *
+	 * \param strFile The file name
+	 * \param list1 The column 1 output
+	 * \param list2 The column 2 output
+	 * \param lineSize The maximum number of characters per line in the file (default 1024)
+	 */
 	static bool ReadListTwoColumns(const string &strFile,
-		vector<string> &list1, vector<string> &list2);
+		vector<string> &list1, vector<string> &list2, const int lineSize = 1024);
+
+	/*!
+	 * \brief Read a table with two columns
+	 *
+	 * This is the same as the previous CSeekTools::ReadListTwoColumns() declaration, except that the
+	 * accepted string arguments are of the type const char *.
+	 */
 	static bool ReadListTwoColumns(const char *file,
-		vector<string> &list1, vector<string> &list2);
+		vector<string> &list1, vector<string> &list2, const int lineSize = 1024);
 
+	/*!
+	 * \brief Read a list of queries
+	 *
+	 * A query is specified as a set of gene names delimited by spaces.
+	 * A query occupies one line in the file.
+	 * \param strFile The file name
+	 * \param qList The output
+	 * \param lineSize The maximum number of characters per line in the file (default 1024)
+	 */
 	static bool ReadMultipleQueries(const string &strFile,
-		vector< vector<string> > &qList);
+		vector< vector<string> > &qList, const int lineSize = 1024);
+
+	/*!
+	 * \brief Read a list of queries
+	 *
+	 * Same as the previous CSeekTools::ReadMultipleQueries() declaration, except that this function
+	 * accepts the string argument as a const char *.
+	 */
 	static bool ReadMultipleQueries(const char *file,
-			vector< vector<string> > &qList);
+		vector< vector<string> > &qList, const int lineSize = 1024);
 
+	/*!
+	 * \brief Read just one gene-set line
+	 *
+	 * Reads the first line in the file. The line contains a set of gene names delimited by spaces.
+	 * The output is a vector of strings representing the genes in that line.
+	 *
+	 * \param strFile The file name
+	 * \param list1 The output
+	 * \param lineSize The maximum number of characters per line in the file (default 1024)
+	 */
 	static bool ReadMultiGeneOneLine(const string &strFile,
 		vector<string> &list1, const int lineSize = 1024);
+
+	/*!
+	 * \brief Read just one gene-set line
+	 *
+	 * Same as the previous CSeekTools::ReadMultiGeneOneLine() except that the accepted string argument
+	 * is of the type const char *.
+	 */
 	static bool ReadMultiGeneOneLine(const char *file,
-			vector<string> &list1, const int lineSize = 1024);
+		vector<string> &list1, const int lineSize = 1024);
 
-	static bool ReadListOneColumn(const string &strFile,
-		vector<string> &vecstrList);
-	static bool ReadListOneColumn(const char *file,
-			vector<string> &vecstrList);
+	/*!
+	 * \brief Read the correlation discretization
+	 *
+	 * Specifies how the correlations should be binned. The file contains the bin boundaries separated by spaces.
+	 * \param strFile The file name
+	 * \param quant The output
+	 * \param lineSize The maximum number of characters per line in the file (default 5000)
+	 */
+	static bool ReadQuantFile(const string &strFile, vector<float> &quant, const int lineSize = 5000);
 
-	static bool ReadQuantFile(const string &strFile, vector<float> &quant);
-	static bool ReadQuantFile(const char *file, vector<float> &quant);
+	/*!
+	 * \brief Read the correlation discretization
+	 *
+	 * Same as the previous CSeekTools::ReadQuantFile() except that the accepted string argument is of the type
+	 * const char *.
+	 */
+	static bool ReadQuantFile(const char *file, vector<float> &quant, const int lineSize = 5000);
 
 
 };