# sleipnir / src / seekevaluate.h

/*****************************************************************************
* This file is provided under the Creative Commons Attribution 3.0 license.
*
* You are free to share, copy, distribute, transmit, or adapt this work
* PROVIDED THAT you attribute the work to the authors listed below.
* For more information, please see the following web page:
* http://creativecommons.org/licenses/by/3.0/
*
* This file is a component of the Sleipnir library for functional genomics,
* authored by:
* Curtis Huttenhower (chuttenh@princeton.edu)
* Mark Schroeder
* Maria D. Chikina
* Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
*
* If you use this library, the included executable tools, or any related
* code in your work, please cite the following publication:
* Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
* Olga G. Troyanskaya.
* "The Sleipnir library for computational functional genomics" *****************************************************************************/ #ifndef SEEKEVALUATE_H #define SEEKEVALUATE_H #include "seekbasic.h" #include "seekmap.h" namespace Sleipnir { struct AResult{ unsigned short i; unsigned short f; bool operator<(const AResult& val) const{ /*if(f<=val.f){ return false; }else{ return true; }*/ if(f < val.f){ return false; }else if(f > val.f){ return true; }else if(i < val.i){ return false; }else{ return true; } } }; struct Ascending{ bool operator()( const AResult& lx, const AResult& rx ) const { return lx.f < rx.f; } }; struct AResultFloat{ unsigned short i; float f; bool operator<(const AResultFloat& val) const{ /*if(f<=val.f){ return false; }else{ return true; }*/ if(f < val.f){ return false; }else if(f > val.f){ return true; }else if(i < val.i){ return false; }else{ return true; } } }; struct AscendingFloat{ bool operator()( const AResultFloat& lx, const AResultFloat& rx ) const { return lx.f < rx.f; } }; /*! * \brief Evaluation metrics for a rank-list given some judgment gene-set * * Provide static utility functions for evaluating a ranking of genes with the user-given gold standard * gene-set. The typical use of such functions is in weighting datasets. Generally speaking, each dataset * is weighted by how well the query genes are able to retrieve each other in the dataset. * It is important to pick an informative measure to evaluate the retrieval of the query genes. * Seek provides the choice of two evaluation metrics: Rank-Biased Precision (RBP) or Average Precision. * */ class CSeekPerformanceMeasure{ public: /*! * \brief Sort a gene ranking by the gene score * * \param rank The vector of gene-scores to be sorted. Gene scores are inserted to this vector based * on their gene IDs, which are a value from 0 to the size of the vector. 
* \param mapG The gene presence map * \param a The output, which is a vector of (gene ID, gene score) pairs that are sorted by score * \param top If \c X, sort only the top \c X elements. If 0, then sort the entire vector. * * The struct \c AResult represents a (gene ID, gene score) pair. This function sorts the vector of * \c AResult in the descending order of the gene score. */ static bool SortRankVector(const vector &rank, const CSeekIntIntMap &mapG, vector &a, const ushort top = 0); /*! * \brief Calculate the rank-biased precision for a gene ranking * * \param rate The parameter \a p in the RBP formula * \param rbp The calculated RBP score, the output * \param mask The genes in the ranking to be skipped over (typically the query genes) * \param gold The gold-standard genes * \param mapG The gene presence map. Genes that are not present in the dataset are skipped over. * \param sing The sorted vector of (gene ID, gene score) pairs * \param rank The gene-score vector * \param top If \c X, sort only the top \c X elements. If 0, then sort the entire vector. * * First calls the CSeekPerformanceMeasure::SortRankVector() with the arguments \c rank and \c top, * in order to sort the gene-scores. Then with the sorted gene-ranking returned to \c sing, it calculates * the rank-biased precision. * * \remarks The RBP formula is given by: * \f[RBP=\sum_{g \in U}{(1-p)p^{rank(g)}}\f] * where \f$U\f$ is the gold standard gene-set, \f$p\f$ is the emphasis on ranks, * \f$rank(g)\f$ is the position of \f$g\f$ in the ranking * \f$p\f$ is typically set to 0.95 - 0.99. The recommended value is 0.99. * For more information, please read (Moffat et al 2008). */ /* designed specifically for a CSeekDataset */ /* mask: the query genes which are not included in RBP calcualtion */ static bool RankBiasedPrecision(const float &rate, const vector &rank, float &rbp, const vector &mask, const vector &gold, const CSeekIntIntMap &mapG, vector *sing, /* optional */ const ushort top = 0); /*! 
* \brief Calculate the average precision for a gene ranking * * \param rank The gene-score vector * \param ap The calculated average precision * \param mask The genes in the ranking to be skipped over (typically the query genes) * \param gold The gold-standard genes * \param mapG The gene presence map. Genes that are not present in the dataset are skipped over. * \param ar The sorted vector of (gene ID, gene score) pairs */ static bool AveragePrecision( const vector &rank, float &ap, const vector &mask, const vector &gold, const CSeekIntIntMap &mapG, vector *ar); }; } #endif