1. libsleipnir
  2. sleipnir

Source

sleipnir / tools / SeekPrep / stdafx.cpp

/*****************************************************************************
* This file is provided under the Creative Commons Attribution 3.0 license.
*
* You are free to share, copy, distribute, transmit, or adapt this work
* PROVIDED THAT you attribute the work to the authors listed below.
* For more information, please see the following web page:
* http://creativecommons.org/licenses/by/3.0/
*
* This file is a component of the Sleipnir library for functional genomics,
* authored by:
* Curtis Huttenhower (chuttenh@princeton.edu)
* Mark Schroeder
* Maria D. Chikina
* Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
*
* If you use this library, the included executable tools, or any related
* code in your work, please cite the following publication:
* Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
* Olga G. Troyanskaya.
* "The Sleipnir library for computational functional genomics"
*****************************************************************************/
#include "stdafx.h"

/*!
 * \page SeekPrep SeekPrep
 * 
 * Prepares the prerequisite files that are necessary for the efficient integration
 * of coexpression data in \ref SeekMiner and \ref SeekServer.
 * The file preparation tasks that SeekPrep performs include:
 * preparing the gene-presence file, calculating the gene average correlation, and
 * calculating the gene expression variances for each dataset.
 *
 * 
 * \section sec_usage Usage
 * 
 * \subsection ssec_usage_basic Basic Usage
 * 
 * \subsubsection ssec_usage_avg Prepare Gene Average File (GAVG)
 * \code
 * SeekPrep -i <gene_map> -d -B <dab_file> -a -D <output_dir>
 * \endcode
 * Calculates the average z-score for each gene in a given DAB matrix and stores the results
 * as a vector of floats in the GAVG file. The index of a gene in the vector is determined by \c gene_map.
 * 
 * \subsubsection ssec_usage_pres Prepare Gene Presence File (GPRES)
 * \code
 * SeekPrep -i <gene_map> -d -B <dab_file> -p -D <output_dir>
 * \endcode
 * Stores the gene presence vector for a given DAB matrix, where each value is either
 * 1 if the gene is present, or 0 if the gene is absent in the dataset.
 *
 * \subsubsection ssec_usage_sinfo Prepare Dataset Sinfo file (SINFO)
 * \code
 * SeekPrep -i <gene_map> -e -V <pclbin_file> -s -D <output_dir>
 * \endcode
 * Calculates the average Fisher-transformed correlation between all gene pairs in an input dataset.
 * The input dataset needs to be a binary PCL file with the extension BIN (generated by \ref PCL2Bin).
 *
 * \subsubsection ssec_usage_gexpvar Prepare Dataset Gene Expression Variance file (GEXPVAR)
 * \code
 * SeekPrep -i <gene_map> -e -V <pclbin_file> -v -D <output_dir>
 * \endcode
 * Calculates the gene expression variance for each gene in an input dataset.
 *
 * \subsubsection ssec_usage_plat Prepare Platform average z-scores and their standard deviation (GPLAT)
 * \code
 * SeekPrep -i <gene_map> -f -P -b <db_file_list> -I <prep_dir> -A <dset_platform_map> -Q <quant>
 * \endcode
 * Calculates the platform-wide average of z-scores (\f$z_{p,avg}\f$) using the following algorithm: <br>
 * For each dataset \f$d\f$: <br>
 * &nbsp;&nbsp;&nbsp; For each gene \f$g\f$ in the genome \f$G\f$: <br>
 * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Compute \f$z_{d, avg}(g) = (\sum_{i \in G}{z_{d}(g, i)}) / |G|\f$ <br>
 * &nbsp;&nbsp;&nbsp; For each gene \f$k\f$ in the genome \f$G\f$: <br>
 * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Compute \f$z_{d, avg, corrected}(k) = (\sum_{g \in G}{(z_{d}(k, g) - z_{d, avg}(g))}) / |G|\f$ <br>
 * For each platform \f$p\f$ and its set of datasets \f$D_p\f$: <br>
 * &nbsp;&nbsp;&nbsp; For each gene \f$k\f$ in the genome \f$G\f$: <br>
 * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Compute \f$z_{p, avg}(k) = (\sum_{d \in D_p}{z_{d,avg,corrected}(k)}) / |D_p| \f$ <br>
 *
 * The \c prep_dir contains the GPRES and GAVG files for all datasets defined in \c dset_platform_map. (Users should generate these files with SeekPrep first.)
 *
 * The \c dset_platform_map is a tab-delimited file that looks something like:
 * \code
 * GSE15913.GPL570.pcl  GPL570
 * GSE16122.GPL2005.pcl GPL2005
 * GSE16797.GPL570.pcl  GPL570
 * GSE16836.GPL570.pcl  GPL570
 * GSE17351.GPL570.pcl  GPL570
 * GSE17537.GPL570.pcl  GPL570
 * \endcode
 * where the 1st column is the dataset name and the 2nd column is the corresponding platform.
 * 
 * The \c quant file is a space-delimited file that specifies how the z-scores are binned:
 * \code
 * -5.00 -4.96 -4.92 -4.88 -4.84 -4.80 -4.76 -4.72 -4.68 -4.64 -4.60 -4.56 -4.52 ...
 * \endcode
 *
 * The \c db_file_list file lists the file paths of all DB files in the entire DB collection:
 * \code
 * /x/y/z/00000001.db 
 * /x/y/z/00000002.db 
 * /x/y/z/00000003.db 
 * /x/y/z/00000004.db 
 * /x/y/z/00000005.db
 * ...
 * \endcode 
 *
 * \subsection ssec_usage_detailed Detailed Usage
 * 
 * \include SeekPrep/SeekPrep.ggo
 * 
 */