# sleipnir / src / seekweight.h

 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 /***************************************************************************** * This file is provided under the Creative Commons Attribution 3.0 license. * * You are free to share, copy, distribute, transmit, or adapt this work * PROVIDED THAT you attribute the work to the authors listed below. * For more information, please see the following web page: * http://creativecommons.org/licenses/by/3.0/ * * This file is a component of the Sleipnir library for functional genomics, * authored by: * Curtis Huttenhower (chuttenh@princeton.edu) * Mark Schroeder * Maria D. Chikina * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) * * If you use this library, the included executable tools, or any related * code in your work, please cite the following publication: * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and * Olga G. Troyanskaya. * "The Sleipnir library for computational functional genomics" *****************************************************************************/ #ifndef SEEKWEIGHT_H #define SEEKWEIGHT_H #include "seekbasic.h" #include "seekreader.h" #include "seekquery.h" #include "seekevaluate.h" namespace Sleipnir { /*! * \brief * Provide functions to assign dataset weight using the query gene. * * For dataset weighting, one way is to use CSeekWeighter::CVWeighting. The CSeekWeighter::CVWeighting * uses a cross-validation (CV) framework, where it partitions the query and performs a search * instance on one sub-query, using the remainder of the queries as the evaluation of the search instance. * * The CSeekWeighter::OrderStatisticsRankAggregation is a rank-based technique described by Adler et al (2009). This combines * dataset weighting and dataset gene-ranking aggregation all into one step. * */ class CSeekWeighter{ public: /*! * \brief * Calculates for each gene the average \a correlation to all of the query genes in a dataset. * * \param rank * A vector that stores the \a correlation of each gene to all of the query genes * * \param cv_query * A vector that stores the query genes * * \param sDataset * A dataset * * \param MIN_REQUIRED * A utype that specifies how many query genes are required to be present in a dataset. * If not enough query genes are present, then the averaging is not performed. * * \param bSquareZ * If true, square the \a correlation values before adding \a correlations. * * \remark * The word \a correlations refer to z-scored, standardized Pearson correlations. * The result is returned in the parameter \c rank. * */ /*cv_query must be present in sDataset */ static bool LinearCombine(vector &rank, const vector &cv_query, CSeekDataset &sDataset, const utype &, const bool &); /*! * \brief * Cross-validates query-genes in a dataset * * \param sQuery * The query and its partitions * * \param sDataset * A dataset * * \param rate * RBP parameter \a p * * \param percent_required * Percentage of query genes required to be present in the dataset * * \param bSquareZ * Whether or not to square \a correlations * * \param rrank * Temporary vector storing intermediary \a correlations * * \param goldStd * If a gold-standard gene-set is provided, use this to evaluate the retrieval of a cross-validation * * This performs multiple cross-validation runs to validate * the query genes in retrieving themselves in the dataset. * The sum of the evaluation of all the * runs then becomes the dataset weight. For evaluation, we use the following formula for scoring a validation run \f$i\f$: * \f[s(i)=\sum_{g \in U}{(1-p)p^{rank(g)}}\f] * where \f$U\f$ is the \f$N-1\f$ parts of the query used for evaluation, \f$p\f$ is an exponential rate * parameter, \f$rank(g)\f$ is the position of \f$g\f$ in the ranking of genes * generated by the subsearch instance \f$i\f$. * * The above formulation is inspired by rank-biased precision. * The parameter \a p needs to be provided. The default value is 0.99. * */ static bool CVWeighting(CSeekQuery &sQuery, CSeekDataset &sDataset, const float &rate, const float &percent_required, const bool &bsquareZ, vector *rrank, const CSeekQuery *goldStd = NULL); /*! * \brief * Performs OrderStatisticsAggregation, also known as the MEM algorithm * * \param iDatasets * The number of datasets * * \param iGenes * The number of genes * * \param rank_d * Two-dimensional vectors storing correlation-ranks to the query genes. * First dimension: datasets. Second dimension: genes. * * \param counts * A vector storing the count of datasets for each gene * * \param master_rank * A vector storing the integrated gene-score * * \param numThreads * The number of threads to be used (in a parallel setup) * * \c rank_d needs to be prepared as follows: a correlation rank vector is obtained from sorting Pearson correlations * in a dataset, and then it is normalized by (rank of correlation) / (number of genes). The result is stored * in \c rank_d. * * Afterward, for each gene \a g, the algorithm compares this gene's \c rank_d distribution across datasets with * that derived from a set of datasets with randomly ordered correlation vectors (ie a null distribution). * A significance p-value is calculated for this gene, and \a -log(p) values are stored in master_rank. */ static bool OrderStatisticsRankAggregation(const utype&, const utype&, utype**, const vector &, vector&, const utype&); static bool OrderStatisticsPreCompute(); /*! * \brief * Simulates a dataset weight for one-gene query * * \param sQuery * The query * * \param sDataset * The dataset * * \param rate * RBP parameter \a p * * \param percent_required * Percentage of query genes required to be present in a dataset (assumed to be 1 in this case) * * \param bSquareZ * Whether or not to square \a correlations * * \param rrank * Final gene-score * * \param goldStd * Gold-standard gene-set for weighting a dataset * * This function is mainly used for equal weighting. * Although equal weighting integrates all datasets with weight = 1, * for the purpose of displaying datasets, the datasets need to be ranked according to the distance to the * average gene-ranking. * * This average gene-ranking is produced by summing gene-rankings from all datasets and divided by the number of datasets. * To score a dataset, we calculate the RBP precision of this dataset in retrieving the top 100 genes of the average ranking. * */ static bool OneGeneWeighting(CSeekQuery&, CSeekDataset&, const float&, const float&, const bool&, vector*, const CSeekQuery*); static bool AverageWeighting(CSeekQuery &sQuery, CSeekDataset &sDataset, const float &percent_required, const bool &bSquareZ, float &w); }; } #endif