// Source: sleipnir / src / seekweight.h (full commit)
/*****************************************************************************
* This file is provided under the Creative Commons Attribution 3.0 license.
*
* You are free to share, copy, distribute, transmit, or adapt this work
* PROVIDED THAT you attribute the work to the authors listed below.
* For more information, please see the following web page:
* http://creativecommons.org/licenses/by/3.0/
*
* This file is a component of the Sleipnir library for functional genomics,
* authored by:
* Curtis Huttenhower (chuttenh@princeton.edu)
* Mark Schroeder
* Maria D. Chikina
* Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
*
* If you use this library, the included executable tools, or any related
* code in your work, please cite the following publication:
* Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
* Olga G. Troyanskaya.
* "The Sleipnir library for computational functional genomics"
*****************************************************************************/
#ifndef SEEKWEIGHT_H
#define SEEKWEIGHT_H

#include "seekbasic.h"
#include "seekreader.h"
#include "seekquery.h"
#include "seekevaluate.h"

namespace Sleipnir {

/*!
 * \brief
 * Provides functions that assign dataset weights using the query genes.
 *
 * One way to weight datasets is CSeekWeighter::CVWeighting. It uses a cross-validation (CV)
 * framework: it partitions the query, performs a search instance on one sub-query, and uses
 * the remainder of the query genes to evaluate that search instance.
 *
 * CSeekWeighter::OrderStatisticsRankAggregation is a rank-based technique described by
 * Adler et al (2009). It combines dataset weighting and dataset gene-ranking aggregation
 * into one step.
 *
 * Every member declared here is static, and the class declares no data members.
 */
class CSeekWeighter{
public:

	/*!
	 * \brief
	 * Calculates, for each gene, the average \a correlation to all of the query genes in a dataset.
	 *
	 * \param rank
	 * A vector that stores the \a correlation of each gene to all of the query genes (output)
	 *
	 * \param cv_query
	 * A vector that stores the query genes. These genes must be present in \c sDataset.
	 *
	 * \param sDataset
	 * A dataset
	 *
	 * \param MIN_REQUIRED
	 * A utype that specifies how many query genes are required to be present in a dataset.
	 * If not enough query genes are present, then the averaging is not performed.
	 *
	 * \param bSquareZ
	 * If true, square the \a correlation values before adding \a correlations.
	 *
	 * \return
	 * A boolean status. NOTE(review): the meaning of true/false is decided by the
	 * implementation, which is not part of this header -- confirm there.
	 *
	 * \remark
	 * The word \a correlations refers to z-scored, standardized Pearson correlations.
	 * The result is returned in the parameter \c rank.
	 *
	 */
	static bool LinearCombine(vector<utype> &rank,
		const vector<utype> &cv_query, CSeekDataset &sDataset,
		const utype &MIN_REQUIRED, const bool &bSquareZ);

	/*!
	 * \brief
	 * Cross-validates query genes in a dataset
	 *
	 * \param sQuery
	 * The query and its partitions
	 *
	 * \param sDataset
	 * A dataset
	 *
	 * \param rate
	 * RBP parameter \a p
	 *
	 * \param percent_required
	 * Percentage of query genes required to be present in the dataset
	 *
	 * \param bSquareZ
	 * Whether or not to square \a correlations
	 *
	 * \param rrank
	 * Temporary vector storing intermediary \a correlations
	 *
	 * \param goldStd
	 * If a gold-standard gene-set is provided, use this to evaluate the retrieval of a cross-validation.
	 * Optional; defaults to \c NULL.
	 *
	 * \return
	 * A boolean status. NOTE(review): the meaning of true/false is decided by the
	 * implementation, which is not part of this header -- confirm there.
	 *
	 * This performs multiple cross-validation runs to validate
	 * the query genes in retrieving themselves in the dataset.
	 * The sum of the evaluation of all the
	 * runs then becomes the dataset weight. For evaluation, we use the following formula for scoring a validation run \f$i\f$:
	 * \f[s(i)=\sum_{g \in U}{(1-p)p^{rank(g)}}\f]
	 * where \f$U\f$ is the \f$N-1\f$ parts of the query used for evaluation, \f$p\f$ is an exponential rate
	 * parameter, \f$rank(g)\f$ is the position of \f$g\f$ in the ranking of genes
	 * generated by the subsearch instance \f$i\f$.
	 *
	 * The above formulation is inspired by <em>rank-biased precision</em>.
	 * The parameter \a p needs to be provided (this declaration gives \c rate no default argument).
	 * The default value is 0.99.
	 *
	 */
	static bool CVWeighting(CSeekQuery &sQuery, CSeekDataset &sDataset,
		const float &rate, const float &percent_required, const bool &bSquareZ,
		vector<utype> *rrank, const CSeekQuery *goldStd = NULL);

	/*!
	 * \brief
	 * Performs OrderStatisticsAggregation, also known as the MEM algorithm
	 *
	 * \param iDatasets
	 * The number of datasets
	 *
	 * \param iGenes
	 * The number of genes
	 *
	 * \param rank_d
	 * Two-dimensional array (\c utype**) storing correlation-ranks to the query genes.
	 * First dimension: datasets. Second dimension: genes.
	 *
	 * \param counts
	 * A vector storing the count of datasets for each gene
	 *
	 * \param master_rank
	 * A vector storing the integrated gene-score (output)
	 *
	 * \param numThreads
	 * The number of threads to be used (in a parallel setup)
	 *
	 * \return
	 * A boolean status. NOTE(review): the meaning of true/false is decided by the
	 * implementation, which is not part of this header -- confirm there.
	 *
	 * \c rank_d needs to be prepared as follows: a <em>correlation rank</em> vector is obtained from sorting Pearson correlations
	 * in a dataset, and then it is normalized by <em>(rank of correlation) / (number of genes)</em>. The result is stored
	 * in \c rank_d.
	 *
	 * Afterward, for each gene \a g, the algorithm compares this gene's \c rank_d distribution across datasets with
	 * that derived from a set of datasets with randomly ordered correlation vectors (i.e. a null distribution).
	 * A significance p-value is calculated for this gene, and \a -log(p) values are stored in \c master_rank.
	 */
	static bool OrderStatisticsRankAggregation(const utype &iDatasets,
		const utype &iGenes, utype **rank_d, const vector<utype> &counts,
		vector<float> &master_rank, const utype &numThreads);

	/*!
	 * \brief
	 * NOTE(review): this function is not documented in this header. Judging by its name,
	 * it presumably precomputes data used by the order-statistics aggregation --
	 * confirm against the implementation.
	 *
	 * \return
	 * A boolean status; semantics are decided by the implementation (not visible here).
	 */
	static bool OrderStatisticsPreCompute();


	/*!
	 * \brief
	 * Simulates a dataset weight for one-gene query
	 *
	 * \param sQuery
	 * The query
	 *
	 * \param sDataset
	 * The dataset
	 *
	 * \param rate
	 * RBP parameter \a p
	 *
	 * \param percent_required
	 * Percentage of query genes required to be present in a dataset (assumed to be 1 in this case)
	 *
	 * \param bSquareZ
	 * Whether or not to square \a correlations
	 *
	 * \param rrank
	 * Final gene-score
	 *
	 * \param goldStd
	 * Gold-standard gene-set for weighting a dataset
	 *
	 * \return
	 * A boolean status. NOTE(review): the meaning of true/false is decided by the
	 * implementation, which is not part of this header -- confirm there.
	 *
	 * This function is mainly used for <em>equal weighting</em>.
	 * Although <em>equal weighting</em> integrates all datasets with weight = 1,
	 * for the purpose of displaying datasets, the datasets need to be ranked according to the distance to the
	 * average gene-ranking.
	 *
	 * This <em>average gene-ranking</em> is produced by summing gene-rankings from all datasets and dividing by the number of datasets.
	 * To score a dataset, we calculate the RBP precision of this dataset in retrieving the top 100 genes of the <em>average ranking</em>.
	 *
	 */
	static bool OneGeneWeighting(CSeekQuery &sQuery,
		CSeekDataset &sDataset, const float &rate,
		const float &percent_required, const bool &bSquareZ,
		vector<utype> *rrank, const CSeekQuery *goldStd);

	/*!
	 * \brief
	 * NOTE(review): this function is not documented in the original header. The parameter
	 * notes below follow the identically named parameters of the sibling functions and
	 * should be confirmed against the implementation.
	 *
	 * \param sQuery
	 * The query
	 *
	 * \param sDataset
	 * The dataset
	 *
	 * \param percent_required
	 * Presumably the percentage of query genes required to be present in the dataset -- confirm
	 *
	 * \param bSquareZ
	 * Presumably whether or not to square \a correlations -- confirm
	 *
	 * \param w
	 * Non-const reference; presumably receives the computed dataset weight -- confirm
	 *
	 * \return
	 * A boolean status; semantics are decided by the implementation (not visible here).
	 */
	static bool AverageWeighting(CSeekQuery &sQuery, CSeekDataset &sDataset,
		const float &percent_required, const bool &bSquareZ, float &w);
};


}
#endif