# sleipnir / tools / SeekMiner / SeekMiner.ggo
#
# Full commit
# Program name, version and one-line purpose.
package "SeekMiner"
version "1.0"
purpose "Performs cross-platform microarray query-guided search"

# Core inputs: dataset lists, gene mapping, queries, and the directories and
# files the search reads from. Entries ending in `yes` are mandatory; the
# others carry an explicit default.
section "Main"

option "dset" x
    "Input a set of datasets"
    string typestr="filename" yes

# Default NA; per the help text, leaving this unset searches all datasets.
option "search_dset" D
    "A set of datasets to search. If not specified, search all datasets."
    string typestr="filename" default="NA"

option "input" i
    "Input gene mapping"
    string typestr="filename" yes

option "query" q
    "A list of queries (each line is a query, and contains the names of the query genes delimited by spaces)"
    string typestr="filename" yes

option "dir_in" d
    "Database directory (containing .db files)"
    string typestr="directory" yes

option "dir_prep_in" p
    "Prep directory (containing .gavg, .gpres files)"
    string typestr="directory" yes

option "dir_platform" P
    "Platform directory (containing .gplatavg, .gplatstdev, .gplatorder files)"
    string typestr="directory" yes

option "dir_sinfo" u
    "Sinfo directory (containing .sinfo files)"
    string typestr="directory" default="NA"

option "dir_gvar" U
    "Gene variance directory (containing .gexpvar files)"
    string typestr="directory" default="NA"

option "quant" Q
    "Quant file (assuming all datasets use the same quantization)"
    string typestr="filename" yes

option "num_db" n
    "Number of databaselets in database"
    int default="1000"

option "num_threads" T
    "Number of threads"
    int default="8"

option "per_g_required" H
    "Fraction (max 1.0) of genome required to be present in a dataset. Datasets not meeting this requirement are skipped."
    float default="0"

option "neg_cor" k
    "Rank genes and datasets by negative correlations"
    flag off

# Selects how datasets are weighted; restricted to the enumerated values
# below, with CV as the default.
section "Dataset weighting"

option "weighting_method" V
    "Weighting method: query cross-validated weighting (CV), equal weighting (EQUAL), order statistics weighting (ORDER_STAT), variance weighting (VAR), user-given weighting (USER), SPELL weighting (AVERAGE_Z), user cross-validated weighting (CV_CUSTOM)"
    values="CV","EQUAL","ORDER_STAT","VAR","USER","AVERAGE_Z","CV_CUSTOM" default="CV"

# Input for the CV_CUSTOM weighting method: one gene-set line per query.
# Declared without `yes` and without a default.
section "Dataset weighting: User Cross-validated Weighting (CV_CUSTOM)"

option "user_gene_list" K
    "List of user's gene-sets to which queries will be judged upon (text, 1 line per query)"
    string typestr="filename"

# Input for user-given weighting: a list of pre-computed .dweight files.
# Declared without `yes` and without a default.
section "Optional - User-given Weighting"

option "user_weight_list" J
    "List of pre-computed dataset weight files (.dweight)"
    string typestr="filename"

# Random-ranking simulation: an on/off switch (off by default) and the number
# of repetitions (10 by default).
section "Optional - Random simulations"

option "random" S
    "Generate random ranking score"
    flag off

option "num_random" t
    "Number of repetitions of generating random rankings"
    int default="10"

# Distance measure selection and the optional transformations applied to the
# gene-gene scores. The three flags below are documented as applying when the
# z_score measure is selected.
section "Optional - Distance matrix transformations"

# Restricted to the two listed values; z_score is the default.
option "dist_measure" z
    "Distance measure"
    values="pearson","z_score" default="z_score"

option "norm_subavg" m
    "If z_score is selected, subtract each gene's average z-score in the dataset."
    flag off

option "norm_subavg_plat" M
    "If z_score is selected, subtract each query gene's average score across platforms and divide by its stdev. Performed after --norm_subavg."
    flag off

# The default of -9999 is described by the help text as meaning no cutoff.
option "score_cutoff" c
    "Cutoff on the gene-gene distance score before adding, default: no cutoff"
    float default="-9999"

# Help text names the companion option by its declared long name,
# --score_cutoff (with an underscore).
option "square_z" e
    "If z_score is selected, square the z-scores. Usually used in conjunction with --score_cutoff=1.0."
    flag off

# Query-coverage threshold. Per the help text its meaning changes depending on
# whether --score_cutoff is specified; the default is 0.
section "Options for Dataset weighting"

option "per_q_required" C
    "Fraction (max 1.0) of query genes required to be present in a dataset, if --score_cutoff is NOT specified. Datasets not meeting this requirement are skipped. However, if --score_cutoff is specified, this is the fraction of query genes required to pass the correlation cutoff, indicated by --score_cutoff (shortened SC) (note: Advanced usage!). In this case, in scoring a gene g, we first calculate the number of query genes (QS) to which g's correlation exceeds SC. Then if the fraction (QS) / (number of query genes) < per_q_required, do not score this gene in dataset's coexpressed gene-list. Use this with caution if --score_cutoff is specified."
    float default="0"

# Settings for cross-validated (CV) weighting: the partitioning scheme, the
# fold count used by X-fold partitioning, and the RBP parameter p.
section "Options for CV-based dataset weighting"

option "CV_partition" I
    "The query partitioning method (for CV weighting): Leave-One-In, Leave-One-Out, X-Fold."
    values="LOI","LOO","XFOLD" default="LOI"

option "CV_fold" X
    "The number of folds (for X-fold partitioning)."
    int default="5"

option "CV_rbp_p" G
    "The parameter p for RBP scoring of each partition for its query gene retrieval (for CV weighting)."
    float default="0.99"

# Miscellaneous switches: database storage type, memory buffer size, output
# location and format, and an optional second database collection.
section "MISC"

option "is_nibble" N
    "The input CDatabase collection is nibble type"
    flag off

option "buffer" b
    "Number of query genes to load in memory (recommended: 50-100)"
    int default="50"

option "output_text" O
    "Output results (gene scores and dataset weights) as text"
    flag off

# The only mandatory option in this section.
option "output_dir" o
    "Output directory"
    string typestr="directory" yes

option "output_w_comp" Y
    "Output dataset weight components (generates .dweight_comp file)"
    flag off

option "simulate_w" E
    "If equal weighting or order-statistics weighting is selected, output simulated dataset weights"
    flag off

# Default NA, as used by the other optional path options in this file.
option "additional_db" B
    "Utilize a second CDatabase collection. Path to the second CDatabase's setting file."
    string default="NA"