sleipnir / src / seekcentral.h

  1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 /***************************************************************************** * This file is provided under the Creative Commons Attribution 3.0 license. * * You are free to share, copy, distribute, transmit, or adapt this work * PROVIDED THAT you attribute the work to the authors listed below. * For more information, please see the following web page: * http://creativecommons.org/licenses/by/3.0/ * * This file is a component of the Sleipnir library for functional genomics, * authored by: * Curtis Huttenhower (chuttenh@princeton.edu) * Mark Schroeder * Maria D. Chikina * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) * * If you use this library, the included executable tools, or any related * code in your work, please cite the following publication: * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and * Olga G. Troyanskaya. * "The Sleipnir library for computational functional genomics" *****************************************************************************/ #ifndef SEEKCENTRAL_H #define SEEKCENTRAL_H #include "seekbasic.h" #include "seekdataset.h" #include "seekplatform.h" #include "seekmap.h" #include "seekreader.h" #include "seekquery.h" #include "seekevaluate.h" #include "database.h" #include "datapair.h" #include "seekweight.h" namespace Sleipnir { /*! * \brief A suite of search algorithms that are supported by Seek * * The Seek search algorithms perform the coexpression search of the user's * query genes in a large compendium of microarray datasets. * The output of the search algorithms is a ranking of genes based on their * gene score, which is determined by the overall weighted coexpression * to the query genes. * * One of the first steps in a search is to weight * the datasets in such a way to prioritize informative datasets. * Then, with the weights generated, the final gene-score is given by: * \f[FS(g, Q)=\alpha\sum_{d \in D}{w_d \cdot s_d(g, Q)}\f] * where \f$w_d\f$ is the weight of the dataset, \f$s_d(g, Q)\f$ is the score * of \f$g\f$ to the query in the dataset, \f$\alpha\f$ is the normalization * constant. * * Currently the following dataset weighting algorithms are supported in Seek. * \li The query cross-validated (CV) weighting (CSeekCentral::CV): * This is a weighting based on the query coexpression. The idea is to * measure how well query genes are able to retrieve each other under a * cross-validation setting. * To do so, we first divide the query into \a N * parts, use 1 part to build a small search instance, and use \f$N-1\f$ parts * for evaluating the instance. The score of each instance \f$i\f$ is given by: * \f[s(i)=\sum_{g \in U}{(1-p)p^{rank(g)}}\f] * where \f$U\f$ is the genes in \f$N-1\f$ parts, \f$p\f$ is an exponential rate * parameter, \f$rank(g)\f$ is the position of \f$g\f$ in the ranking of genes * generated by the search instance. * * \li Equal weighting (CSeekCentral::EQUAL): the weight is 1 for all datasets. * * \li User-supplied weight vector (CSeekCentral::USE_WEIGHT). * (ie., Seek does not calculate dataset weights) * * \li User-supplied gene-sets for weighting datasets, and also use cross-validations (CSeekCentral::CV_CUSTOM) * * \li Order-statistics (CSeekCentral::ORDER_STATISTICS): the algorithm used in MEM. * (Adler et al, Genome Biology 2009) * * CSeekCentral can handle multiple queries at a time, but the search parameters must remain * the same for all queries. */ class CSeekCentral{ public: /*! * \enum SearchMode * \brief Search modes (see section Detailed Descriptions) */ enum SearchMode{ CV=0, /**< Cross-validated weighting */ EQUAL=1, /**< Equal weighting */ USE_WEIGHT=2, /**< User-supplied weights */ CV_CUSTOM=3, /**< Cross-validated weighting, but instead of using the query genes to cross-validate, use the user supplied gene-sets to validate each query partition */ ORDER_STATISTICS=4 /**< MEM algorithm */ }; /*! * \brief Constructor */ CSeekCentral(); /*! * \brief Destructor */ ~CSeekCentral(); /*! * \brief Initialize function * * Performs the following operations: * \li Read the search parameters * \li Read the gene mapping \c gene_map.txt * \li Read a list of queries * \li Read the dataset mapping and the search datasets * \li Read the CDatabaselets (ie, the gene-gene \a correlations for the * query genes) * * \param gene The gene mapping file name, \c gene_map.txt * \param quant The quant file name * \param dset The dataset mapping file name, \c dataset_platform.txt * \param search_dset The file which contains the dataset names to be used for the search * \param query The query file name * \param platform The platform directory, which contains the platform * \a correlation averages and standard deviations * \param db The CDatabaselet directory, which contains the * gene-centric compendium-wide \a correlations, \c *.db files * \param prep The Prep directory, which contains the gene \a correlation * average \c *.gavg, and the gene presence \c *.gpres. * \param gvar The gene variance directory, which contains the \c *.gvar files * \param sinfo The sinfo directory, which contains the \c *.sinfo files * \param num_db The total number of CDatabaselet files * \param buffer The number of query genes to store in the memory * \param output_dir The output directory * \param to_output_text If true, output the gene-ranking in textual format * \param bOutputWeightComponent If true, output the dataset weight components (ie the score of cross-validations) * \param bSimulateWeight If true, use simulated weight as dataset weight * \param dist_measure Distance measure, either CORRELATION or Z_SCORE * \param bSubtractAvg If true, subtract the average z-score on a per-gene basis * \param bNormPlatform If true, subtract the platform gene average, divide by platform gene standard deviation * \param bLogit If true, apply the logit transformation on the \a correlations * \param fCutOff Cutoff the \a correlation values * \param fPercentRequired The fraction of the query genes required to be present in a dataset * in order to consider the dataset for integration * \param bSquareZ If true, square the \a correlations * \param bRandom If true, shuffle the \a correlation vector * \param iNumRandom The number of random simulations to perform per query * \param rand The random number generator * \param useNibble Default to false * * \remark The word \a correlation refers to the z-scored, standardized Pearson. * \remark The parameters \c bSubtractAvg, \c bNormPlatform, * \c bLogit, and \c bSquareZ are options to transform the * \a correlation values. * \remark The \c bSimulateWeight option is for equal weighting or order statistics where the final gene ranking * is not derived from a weighted integration of datasets. In this case, if the user still wants to see * the contribution of each dataset, the simulated weight is computed from the distance of a dataset's coexpression ranking to the final gene ranking. * \remark This function is designed to be used by SeekMiner. */ bool Initialize( const vector &vecDBSetting, const char *search_dset, const char *query, const char* output_dir, const ushort buffer = 20, const bool to_output_text = false, const bool bOutputWeightComponent = false, const bool bSimulateWeight = false, const enum CSeekDataset::DistanceMeasure dist_measure = CSeekDataset::Z_SCORE, const bool bSubtractAvg = true, const bool bNormPlatform = false, const bool bLogit = false, const float fCutOff = -9999, const float fPercentRequired = 0, const bool bSquareZ = false, const bool bRandom = false, const int iNumRandom = 10, gsl_rng *rand = NULL, const bool useNibble = false); /*! * \brief Initialize function * * Load everything except the query, the search datasets, and the output directory * * \param gene The gene mapping file name, \c gene_map.txt * \param quant The quant file name * \param dset The dataset mapping file name, \c dataset_platform.txt * \param platform The platform directory, which contains the platform * \a correlation average and standard deviation * \param db The CDatabaselet directory, which contains the * gene-centric compendium-wide \a correlations, \c *.db files * \param prep The Prep directory, which contains the gene \a correlation * average \c *.gavg, and the gene presence \c *.gpres. * Divided by datasets. * \param gvar The gene variance directory, which contains the \c *.gvar files * \param sinfo The sinfo directory, which contains the \c *.sinfo files * \param num_db The total number of CDatabaselet files * \param buffer The number of query genes to store in the memory * \param to_output_text If true, output the gene-ranking in the textual format * \param bOutputWeightComponent If true, output the dataset weight components (ie the score of cross-validations) * \param bSimulateWeight If true, use simulated weight as dataset weight * \param dist_measure Distance measure, either CORRELATION or Z_SCORE * \param bSubtractAvg If true, subtract the average z-score on a per-gene basis * \param bNormPlatform If true, subtract the platform gene average, divide by platform gene standard deviation * \param bLogit If true, apply the logit transformation on the \a correlations * \param fCutOff Cutoff the \a correlations * \param fPercentRequired The fraction of the query genes required to be present in a dataset * \param bSquareZ If true, square the \a correlations * \param bRandom If true, shuffle the \a correlation vector * \param iNumRandom The number of random simulations to perform per query * \param rand The random number generator * \param useNibble Default to false * * \remark The word \a correlation refers to the z-scored, standardized Pearson. * \remark The parameters \c bSubtractAvg, \c bNormPlatform, * \c bLogit, and \c bSquareZ are options to transform the * \a correlation values. * \remark The \c bSimulateWeight option is for equal weighting or order statistics where the final gene ranking * is not derived from a weighted integration of datasets. In this case, if the user still wants to see * the contribution of each dataset, the simulated weight is computed from the distance of a dataset's coexpression ranking to the final gene ranking. * \remark This function is designed to be used by SeekMiner. */ bool Initialize( const vector &vecDBSetting, const ushort buffer = 20, const bool to_output_text = false, const bool bOutputWeightComponent = false, const bool bSimulateWeight = false, const enum CSeekDataset::DistanceMeasure dist_measure = CSeekDataset::Z_SCORE, const bool bSubtractAvg = true, const bool bNormPlatform = false, const bool bLogit = false, const float fCutOff = -9999, const float fPercentRequired = 0, const bool bSquareZ = false, const bool bRandom = false, const int iNumRandom = 10, gsl_rng *rand = NULL, const bool useNibble = false); /*! * \brief Initialize function * * Prepares Seek to be used in a client-server environment * * \param output_dir The output directory * \param query The query file name * \param search_dset The file that contains the name of datasets to be used for the search * \param src The CSeekCentral instance, where some settings will be copied to here * \param iClient The client's socket connection * \param query_min_required The minimum number of query genes required to be present in a dataset * \param dist_measure Distance measure, either CORRELATION or Z_SCORE. * \param bSubtractAvg If true, subtract the average z-score on a per-gene basis * \param bNormPlatform If true, subtract the platform gene average, divide by platform gene standard deviation * * \remark This function is designed to be used by SeekServer. * \remark The parameters \c bSubtractAvg, \c bNormPlatform * are options to transform the \a correlation values. * \remark Assumes that the CDatabaselets have been read, and the \c *.gvar, \c *.sinfo files have been loaded. * \remark Assumes that the dataset and gene mapping files have been read. */ bool Initialize(const string &output_dir, const string &query, const string &search_dset, CSeekCentral* src, const int iClient, const float query_min_required = 0, const enum CSeekDataset::DistanceMeasure = CSeekDataset::Z_SCORE, const bool bSubtractGeneAvg = true, const bool bNormPlatform = false); /*! * \brief Run Seek with the cross-validated dataset weighting * * \param rnd The random number generator * \param PART_M Query partition mode * \param FOLD Number of partitions to generate from the query * \param RATE The weighting parameter \a p * * \remark The random number generator is used for partitioning the query. * \remark Assumes that the CSeekCentral::Initialize() has been called. */ bool CVSearch(gsl_rng*, const CSeekQuery::PartitionMode&, const ushort&, const float&); /*! * \brief Run Seek with the custom dataset weighting * * \param newGoldStd The gold-standard gene-set that is used for weighting datasets * \param rnd The random number generator * \param PART_M Query partition mode * \param FOLD Number of partitions to generate from the query * \param RATE The weighting parameter \a p * * Same as CVSearch, except that the weighting is not based on the coexpression * of the query genes, but based on the similarity of the query genes to some custom gold * standard gene-set. * * \remark The random number generator is used for partitioning the query. * \remark Assumes that the CSeekCentral::Initialize() has been called. */ bool CVCustomSearch(const vector< vector > &, gsl_rng*, const CSeekQuery::PartitionMode&, const ushort&, const float&); /*! * \brief Run Seek with the equal dataset weighting * \remark Assumes that the CSeekCentral::Initialize() has been called. */ bool EqualWeightSearch(); /*! * \brief Run Seek with the user-given dataset weights * \param weights A two-dimensional array that stores the user-given weights * * \remark The two-dimensional array \c weights is \a Q by \a D : * where \a Q is the number of queries, \a D is the number of datasets. \c weights[i][j] * stores the weight of dataset \a j in query \a i. * * \remark Assumes that the CSeekCentral::Initialize() has been called. */ bool WeightSearch(const vector >&); /*! * \brief Run Seek with the variance weighted search * * Same as CSeekCentral::WeightSearch(), except that the user-given weights are the query gene expression variances. * * \remark Assumes that the CSeekCentral::Initialize() has been called. */ bool VarianceWeightSearch(); /*! * \brief Run Seek with the order statistics dataset weighting algorithm * * \remark Assumes that the CSeekCentral::Initialize() has been called. */ bool OrderStatistics(); /*! * \brief Get the final gene-ranking for all the queries * \return A two-dimensional array that stores the gene-rankings */ const vector< vector >& GetAllResult()const; /*! * \brief Get all the queries * \return A vector of queries. */ const vector& GetAllQuery() const; /*! * \brief Get the dataset weight vector for all the queries * \return A two-dimensional \c float array that stores the weights * * \remark The first dimension is the query. The second dimension is the dataset. */ const vector > &GetAllWeight() const; /*! * \brief Get the gene-map ID for a given \a gene-name * \param strGene The \a gene-name as a \c string * \return The gene-map ID */ ushort GetGene(const string &strGene) const; /*! * \brief Get the \a gene-name for a given gene-map ID * \param geneID The gene-map ID * \return The \a gene-name as a \c string */ string GetGene(const ushort &geneID) const; /*! * \brief Destruct this search instance * \return True if successful. */ bool Destruct(); private: //network mode bool EnableNetwork(const int&); bool CheckDatasets(const bool&); /* Central search function */ bool Common(CSeekCentral::SearchMode&, gsl_rng* = NULL, const CSeekQuery::PartitionMode* = NULL, const ushort* = NULL, const float* = NULL, const vector< vector >* = NULL, const vector< vector >* = NULL); bool CheckWeight(const ushort &i); bool CopyTopGenes(CSeekQuery&, const vector&, const ushort); bool SetQueryScoreNull(const CSeekQuery&); bool PrepareQuery(const vector&, CSeekQuery&); bool CalculateRestart(); bool PrepareOneQuery(CSeekQuery &, CSeekIntIntMap &, vector&); bool AggregateThreads(); bool FilterResults(const ushort &); bool Sort(vector &); bool Write(const ushort &); bool Display(CSeekQuery &, vector&); /* Gene, Dataset, and Platform Mapping*/ vector m_vecstrGenes; vector m_vecstrDatasets; vector m_vecstrDP; map m_mapstrstrDatasetPlatform; map m_mapstrintDataset; map m_mapstrintGene; vector > m_vecstrSearchDatasets; vector m_searchdsetMap; /* Datasets */ vector m_vc; /* Output */ bool m_bOutputText; /* If true, output random case (ie shuffle rankings per dataset) iNumRandom: number of repetitions (Oct 26, 2012) */ bool m_bRandom; int m_iNumRandom; gsl_rng *m_randRandom; /* random dataset weight over all repetitions */ //vector > m_vecRandWeight; /* random gene scores over all repetitions */ //vector > m_vecRandScore; /* Gene-gene correlation matrix for all datasets Organized per thread */ ushort ***m_rData; /* Correlation discretization */ vector m_quant; /* Correlation transformation options */ bool m_bSubtractGeneAvg; bool m_bNormPlatform; enum CSeekDataset::DistanceMeasure m_eDistMeasure; bool m_bLogit; bool m_bSquareZ; /* multi-threaded programming */ float **m_master_rank_threads; float **m_sum_weight_threads; ushort **m_counts_threads; vector *m_rank_normal_threads; vector *m_rank_threads; /* Essential search results */ vector m_master_rank; vector m_sum_weight; vector m_counts; /* Holds results for all queries */ vector< vector > m_weight; vector< vector > m_final; /* Query */ vector< vector > m_vecstrAllQuery; vector m_Query; /* Platform */ vector m_vp; map m_mapstriPlatform; vector m_vecstrPlatform; //CDatabase reference vector m_vecDB; vector > m_vecDBDataset; //A list of dsets in each CDatabase size_t m_iDatasets; size_t m_iGenes; ushort m_numThreads; ushort m_maxNumDB; map > > m_mapLoadTime; bool DEBUG; bool m_bOutputWeightComponent; bool m_bSimulateWeight; string m_output_dir; float m_fScoreCutOff; float m_fPercentQueryAfterScoreCutOff; /* for order statistics, a datasets-by-genes matrix */ ushort **m_rank_d; /* for network mode */ int m_iClient; bool m_bEnableNetwork; bool m_bSharedDB; //if m_DB is shared between multiple CSeekCentral instances }; } #endif