Commits

Qian Zhu committed cc4bd28

main search program, SeekMiner

  • Participants
  • Parent commits 4fd46bd
  • Branches search_project

Comments (0)

Files changed (6)

File tools/SeekMiner/SeekMiner.cpp

+/*****************************************************************************
+* This file is provided under the Creative Commons Attribution 3.0 license.
+*
+* You are free to share, copy, distribute, transmit, or adapt this work
+* PROVIDED THAT you attribute the work to the authors listed below.
+* For more information, please see the following web page:
+* http://creativecommons.org/licenses/by/3.0/
+*
+* This file is a component of the Sleipnir library for functional genomics,
+* authored by:
+* Curtis Huttenhower (chuttenh@princeton.edu)
+* Mark Schroeder
+* Maria D. Chikina
+* Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
+*
+* If you use this library, the included executable tools, or any related
+* code in your work, please cite the following publication:
+* Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
+* Olga G. Troyanskaya.
+* "The Sleipnir library for computational functional genomics"
+*****************************************************************************/
+#include "stdafx.h"
+#include "cmdline.h"
+
+
+/**
+ * Reads a text file into vecstrLines, one entry per line.
+ *
+ * Reading stops at the first empty line or at end of file, whichever comes
+ * first. A dedicated stream is used for each call, so no EOF/failure state
+ * can leak from one file into the next.
+ *
+ * \param szFile		Path of the file to read.
+ * \param vecstrLines	Receives the lines read (appended).
+ * \return				False if the file could not be opened.
+ */
+static bool ReadLines( const char* szFile, vector<string>& vecstrLines ) {
+	ifstream	ifsm( szFile );
+	string		strLine;
+
+	if( !ifsm.is_open( ) )
+		return false;
+	while( getline( ifsm, strLine ) ) {
+		if( strLine.empty( ) )
+			break;
+		vecstrLines.push_back( strLine ); }
+	return true; }
+
+/**
+ * SeekMiner entry point.
+ *
+ * Parses the command line, reads the gene mapping (ID <tab> name), the
+ * dataset list and the query gene list, loads the database through
+ * CSeekTools::LoadDatabase, weights every dataset that contains at least one
+ * query gene, combines the per-dataset rankings into a master ranking and
+ * prints the top-ranked genes.
+ *
+ * \return 0 on success, 1 on a command line or input error.
+ */
+int main( int iArgs, char** aszArgs ) {
+	static const size_t	c_iTopResults	= 500;
+#ifdef WIN32
+	pthread_win32_process_attach_np( );
+#endif // WIN32
+	gengetopt_args_info	sArgs;
+	ifstream			ifsm;
+	istream*			pistm;
+	vector<string>		vecstrLine, vecstrGenes, vecstrDatasets, vecstrQuery;
+	string				strLine;
+	size_t				i, j, d;
+
+	if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
+		cmdline_parser_print_help( );
+		return 1; }
+
+	// Checked up front so that we never block on stdin for a run that
+	// cannot proceed anyway.
+	if( !sArgs.db_arg ) {
+		cerr << "Must give a db list." << endl;
+		return 1; }
+
+	if( sArgs.input_arg ) {
+		ifsm.open( sArgs.input_arg );
+		if( !ifsm.is_open( ) ) {
+			cerr << "Could not open: " << sArgs.input_arg << endl;
+			return 1; }
+		pistm = &ifsm; }
+	else
+		pistm = &cin;
+	// Looping on getline's result (rather than on eof( )) terminates on any
+	// stream failure and does not process a phantom empty line after the
+	// last real one; reading into a string removes the line length limit.
+	while( getline( *pistm, strLine ) ) {
+		vecstrLine.clear( );
+		CMeta::Tokenize( strLine.c_str( ), vecstrLine );
+		if( vecstrLine.size( ) < 2 ) {
+			cerr << "Ignoring line: " << strLine << endl;
+			continue; }
+		// Gene IDs are 1-based; parse as a signed value first so that zero,
+		// negative and non-numeric IDs are all rejected before indexing.
+		const int iGene = atoi( vecstrLine[ 0 ].c_str( ) );
+		if( iGene <= 0 ) {
+			cerr << "Illegal gene ID: " << vecstrLine[ 0 ] << " for " << vecstrLine[ 1 ] << endl;
+			return 1; }
+		i = (size_t)iGene - 1;
+		if( vecstrGenes.size( ) <= i )
+			vecstrGenes.resize( i + 1 );
+		vecstrGenes[ i ] = vecstrLine[ 1 ]; }
+	if( sArgs.input_arg )
+		ifsm.close( );
+
+	const bool useNibble = ( sArgs.is_nibble_flag == 1 );
+	CDatabase DB(useNibble);
+
+	if( !ReadLines( sArgs.db_arg, vecstrDatasets ) ) {
+		cerr << "Could not open: " << sArgs.db_arg << endl;
+		return 1; }
+	if( !sArgs.query_arg || !ReadLines( sArgs.query_arg, vecstrQuery ) ) {
+		cerr << "Could not open query list" << endl;
+		return 1; }
+
+	string strInputDirectory = sArgs.dir_in_arg;
+	string strPrepInputDirectory = sArgs.dir_prep_in_arg;
+	vector<CSeekDataset*> vc;
+	vector<char> cQuery;
+	CSeekTools::LoadDatabase(DB, strInputDirectory, strPrepInputDirectory,
+		cQuery, vecstrQuery, vecstrDatasets, vc);
+	size_t iDatasets = DB.GetDatasets();
+	size_t iGenes = DB.GetGenes();
+	// NOTE(review): the datasets in vc are never released here; confirm who
+	// owns them before adding a delete.
+
+	int FOLD = 5;
+	enum PartitionMode PART_M = CUSTOM_PARTITION;
+
+	gsl_rng_env_setup();
+	gsl_rng *rnd = gsl_rng_alloc(gsl_rng_default);
+
+	{
+		CSeekQuery query;
+		query.InitializeQuery(cQuery);
+		query.CreateCVPartitions(rnd, PART_M, FOLD);
+
+		// Per-gene accumulators over all datasets that contribute.
+		vector<float> master_rank;
+		CSeekTools::InitVector(master_rank, iGenes, (float) 0);
+
+		vector<float> sum_weight;
+		CSeekTools::InitVector(sum_weight, iGenes, (float) 0);
+
+		vector<int> counts;
+		CSeekTools::InitVector(counts, iGenes, (int) 0);
+
+		printf("Entering search\n");
+		for(d=0; d<iDatasets; d++){
+			printf("Dataset %lu\n", (unsigned long) d);
+			CSeekIntIntMap *mapQ = vc[d]->GetQueryMap();
+			CSeekIntIntMap *mapG = vc[d]->GetGeneMap();
+
+			if(mapQ->GetNumSet()==0){
+				printf("This dataset is skipped\n");
+				continue;
+			}
+
+			vector<int> this_q;
+			for(j=0; j<mapQ->GetNumSet(); j++){
+				this_q.push_back(mapQ->GetReverse(j));
+			}
+
+			printf("Initializing\n");
+			vc[d]->InitializeFloatMatrix();
+			printf("Weighting dataset\n");
+			CSeekWeighter::CVWeighting(query, *vc[d]);
+			float w = vc[d]->GetDatasetSumWeight();
+			// -1 is treated as a "no usable weight" sentinel.
+			if(w==-1){
+				printf("Bad weight\n"); 
+				vc[d]->FreeFloatMatrix();
+				continue;
+			}
+			vector<float> rank_normal;
+			printf("Doing linear combination\n");
+			CSeekWeighter::LinearCombine(rank_normal, this_q, *vc[d]);
+			vc[d]->FreeFloatMatrix();
+
+			printf("Adding contribution of dataset to master ranking: %.5f\n", w);
+			for(j=0; j<mapG->GetNumSet(); j++){
+				size_t g = mapG->GetReverse(j);
+				master_rank[g] += rank_normal[g] * w;
+				counts[g]++;
+				sum_weight[g] += w;
+			}
+		}
+
+		printf("Aggregating genes\n");
+		for(j=0; j<iGenes; j++){
+			// Genes seen in fewer than half of the datasets, or with no
+			// accumulated weight, are pushed to the bottom of the ranking.
+			if(counts[j]<(int)(0.5*iDatasets)){
+				master_rank[j] = -50.0;
+			}else if(sum_weight[j]==0){
+				master_rank[j] = -50.0;
+			}else{
+				master_rank[j] /= sum_weight[j];
+			}
+			printf("Gene %lu %.5f\n", (unsigned long) j, master_rank[j]);
+		}
+
+		printf("Sorting genes\n");
+		vector<AResult> a;
+		a.resize(iGenes);
+		for(j=0; j<iGenes; j++){
+			a[j].i = j;
+			a[j].f = master_rank[j];
+		}
+		printf("Begin Sorting genes\n");
+		sort(a.begin(), a.end());
+
+		printf("Results:\n");
+		// Bounded by a.size( ) so that databases with fewer than
+		// c_iTopResults genes do not read past the end of the ranking.
+		for(j=0; j<a.size() && j<c_iTopResults; j++){
+			printf("%lu %.5f\n", (unsigned long) a[j].i, a[j].f);
+		}
+	}
+
+	gsl_rng_free(rnd);
+
+#ifdef WIN32
+	pthread_win32_process_detach_np( );
+#endif // WIN32
+	return 0; }

File tools/SeekMiner/SeekMiner.ggo

+# gengetopt option specification for SeekMiner; cmdline.c and cmdline.h are
+# generated from this file (gengetopt -iSeekMiner.ggo --default-optional -u -N -e).
+package	"SeekMiner"
+version	"1.0"
+purpose	"Performs cross-platform microarray query-guided search"
+
+section "Main"
+option	"db"				x	"Input a set of datasets"
+								string typestr="filename"	yes
+option	"input"				i	"Input gene mapping"
+								string	typestr="filename"	yes
+option	"query"				q	"Query gene list"
+								string typestr="filename"	yes
+option	"dir_in"			d	"Database directory"
+								string	typestr="directory"	yes
+option	"dir_prep_in"		p	"Prep directory (containing .gavg, .gpres files)"
+								string	typestr="directory"	yes				
+option	"is_nibble"			N	"Whether the input DB is nibble type"
+								flag	off

File tools/SeekMiner/cmdline.c

+/*
+  File autogenerated by gengetopt version 2.22.5
+  generated with the following command:
+  gengetopt -iSeekMiner.ggo --default-optional -u -N -e 
+
+  The developers of gengetopt consider the fixed text that goes in all
+  gengetopt output files to be in the public domain:
+  we make no copyright claims on it.
+*/
+
+/* If we use autoconf.  */
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifndef FIX_UNUSED
+#define FIX_UNUSED(X) (void) (X) /* avoid warnings for unused params */
+#endif
+
+#include <getopt.h>
+
+#include "cmdline.h"
+
+const char *gengetopt_args_info_purpose = "Performs cross-platform microarray query-guided search";
+
+const char *gengetopt_args_info_usage = "Usage: SeekMiner [OPTIONS]... [FILES]...";
+
+const char *gengetopt_args_info_description = "";
+/* Lines printed by cmdline_parser_print_help, terminated by a 0 entry.
+ * init_args_info indexes into this array by position; entry 2 ("\nMain:")
+ * is a section header and is not tied to any option. */
+const char *gengetopt_args_info_help[] = {
+  "  -h, --help                   Print help and exit",
+  "  -V, --version                Print version and exit",
+  "\nMain:",
+  "  -x, --db=filename            Input a set of datasets",
+  "  -i, --input=filename         Input gene mapping",
+  "  -q, --query=filename         Query gene list",
+  "  -d, --dir_in=directory       Database directory",
+  "  -p, --dir_prep_in=directory  Prep directory (containing .gavg, .gpres files)",
+  "  -N, --is_nibble              Whether the input DB is nibble type  \n                                 (default=off)",
+    0
+};
+/* Kind of value an option carries, as dispatched on by update_arg:
+ * ARG_NO stores nothing, ARG_FLAG toggles an int field, ARG_STRING stores a
+ * duplicated copy of the option argument. */
+typedef enum {ARG_NO
+  , ARG_FLAG
+  , ARG_STRING
+} cmdline_parser_arg_type;
+
+static
+void clear_given (struct gengetopt_args_info *args_info);
+static
+void clear_args (struct gengetopt_args_info *args_info);
+
+static int
+cmdline_parser_internal (int argc, char **argv, struct gengetopt_args_info *args_info,
+                        struct cmdline_parser_params *params, const char *additional_error);
+
+static int
+cmdline_parser_required2 (struct gengetopt_args_info *args_info, const char *prog_name, const char *additional_error);
+
+static char *
+gengetopt_strdup (const char *s);
+
+/* Resets every "<option>_given" occurrence counter of args_info to zero. */
+static
+void clear_given (struct gengetopt_args_info *args_info)
+{
+  /* built-in options */
+  args_info->help_given = 0;
+  args_info->version_given = 0;
+  /* options of section "Main" */
+  args_info->db_given = 0;
+  args_info->input_given = 0;
+  args_info->query_given = 0;
+  args_info->dir_in_given = 0;
+  args_info->dir_prep_in_given = 0;
+  args_info->is_nibble_given = 0;
+}
+
+/* Sets every option value of args_info to its default: string arguments and
+ * their "orig" copies become NULL, the is_nibble flag becomes 0 (off).
+ * Nothing is freed here. */
+static
+void clear_args (struct gengetopt_args_info *args_info)
+{
+  FIX_UNUSED (args_info);
+  args_info->db_orig = NULL;
+  args_info->db_arg = NULL;
+  args_info->input_orig = NULL;
+  args_info->input_arg = NULL;
+  args_info->query_orig = NULL;
+  args_info->query_arg = NULL;
+  args_info->dir_in_orig = NULL;
+  args_info->dir_in_arg = NULL;
+  args_info->dir_prep_in_orig = NULL;
+  args_info->dir_prep_in_arg = NULL;
+  args_info->is_nibble_flag = 0;
+}
+
+/* Points each "<option>_help" field at its line in gengetopt_args_info_help.
+ * Index 2 of that array is the "Main:" section header, hence the jump from
+ * 1 to 3. */
+static
+void init_args_info(struct gengetopt_args_info *args_info)
+{
+  const char **help = gengetopt_args_info_help;
+
+  args_info->help_help = help[0];
+  args_info->version_help = help[1];
+  args_info->db_help = help[3];
+  args_info->input_help = help[4];
+  args_info->query_help = help[5];
+  args_info->dir_in_help = help[6];
+  args_info->dir_prep_in_help = help[7];
+  args_info->is_nibble_help = help[8];
+}
+
+/* Prints "<name> <version>" to stdout, using CMDLINE_PARSER_PACKAGE_NAME
+ * unless it is empty, in which case CMDLINE_PARSER_PACKAGE is used. */
+void
+cmdline_parser_print_version (void)
+{
+  const char *name = CMDLINE_PARSER_PACKAGE_NAME;
+
+  if (strlen (name) == 0)
+    name = CMDLINE_PARSER_PACKAGE;
+
+  printf ("%s %s\n", name, CMDLINE_PARSER_VERSION);
+}
+
+/* Prints the part of the help output that precedes the option list: the
+ * version line, then the purpose, usage and description strings, each only
+ * if it is non-empty. */
+static void print_help_common(void) {
+  cmdline_parser_print_version ();
+
+  if (gengetopt_args_info_purpose[0] != '\0')
+    printf("\n%s\n", gengetopt_args_info_purpose);
+
+  if (gengetopt_args_info_usage[0] != '\0')
+    printf("\n%s\n", gengetopt_args_info_usage);
+
+  printf("\n");
+
+  if (gengetopt_args_info_description[0] != '\0')
+    printf("%s\n\n", gengetopt_args_info_description);
+}
+
+/* Prints the full help text to stdout: the common header followed by every
+ * line of gengetopt_args_info_help up to its terminating 0 entry. */
+void
+cmdline_parser_print_help (void)
+{
+  const char **line;
+
+  print_help_common();
+  for (line = gengetopt_args_info_help; *line; ++line)
+    printf("%s\n", *line);
+}
+
+/* Puts args_info into its initial state: no option given, all values at
+ * their defaults, help pointers set, and no unnamed inputs. Existing
+ * contents are overwritten, not freed. */
+void
+cmdline_parser_init (struct gengetopt_args_info *args_info)
+{
+  clear_given (args_info);
+  clear_args (args_info);
+  init_args_info (args_info);
+
+  args_info->inputs_num = 0;
+  args_info->inputs = 0;
+}
+
+/* Fills params with the default parser settings (no override, initialize,
+ * check required options, no ambiguity check, print errors). A NULL params
+ * is ignored. */
+void
+cmdline_parser_params_init(struct cmdline_parser_params *params)
+{
+  if (!params)
+    return;
+
+  params->override = 0;
+  params->initialize = 1;
+  params->check_required = 1;
+  params->check_ambiguity = 0;
+  params->print_errors = 1;
+}
+
+/* Allocates a cmdline_parser_params with malloc and initializes it to the
+ * defaults. Returns NULL if the allocation fails (the initializer ignores a
+ * NULL pointer). */
+struct cmdline_parser_params *
+cmdline_parser_params_create(void)
+{
+  struct cmdline_parser_params *created;
+
+  created = (struct cmdline_parser_params *)malloc(sizeof(struct cmdline_parser_params));
+  cmdline_parser_params_init(created);
+  return created;
+}
+
+/* Frees the string *s points to, if any, and resets it to 0 so that a
+ * second call on the same field is harmless. */
+static void
+free_string_field (char **s)
+{
+  if (*s == 0)
+    return;
+
+  free (*s);
+  *s = 0;
+}
+
+
+/* Frees everything args_info owns: every option string and its "orig" copy,
+ * and the array of unnamed inputs together with its elements. The structure
+ * is left in a state where releasing it again is safe: string fields are
+ * NULL, the inputs array is cleared and all "given" counters are zero. */
+static void
+cmdline_parser_release (struct gengetopt_args_info *args_info)
+{
+  unsigned int i;
+  free_string_field (&(args_info->db_arg));
+  free_string_field (&(args_info->db_orig));
+  free_string_field (&(args_info->input_arg));
+  free_string_field (&(args_info->input_orig));
+  free_string_field (&(args_info->query_arg));
+  free_string_field (&(args_info->query_orig));
+  free_string_field (&(args_info->dir_in_arg));
+  free_string_field (&(args_info->dir_in_orig));
+  free_string_field (&(args_info->dir_prep_in_arg));
+  free_string_field (&(args_info->dir_prep_in_orig));
+  
+  
+  for (i = 0; i < args_info->inputs_num; ++i)
+    free (args_info->inputs [i]);
+
+  if (args_info->inputs_num)
+    free (args_info->inputs);
+
+  /* Forget the freed array so that a repeated release (for example two
+     calls to cmdline_parser_free) cannot free it twice. */
+  args_info->inputs = 0;
+  args_info->inputs_num = 0;
+
+  clear_given (args_info);
+}
+
+
+/* Writes one option to outfile: opt="arg" when the option has an argument,
+ * otherwise just opt, each followed by a newline. values is unused. */
+static void
+write_into_file(FILE *outfile, const char *opt, const char *arg, const char *values[])
+{
+  FIX_UNUSED (values);
+  if (!arg) {
+    fprintf(outfile, "%s\n", opt);
+    return;
+  }
+  fprintf(outfile, "%s=\"%s\"\n", opt, arg);
+}
+
+
+/* Writes every option that was given to outfile, one per line, using the
+ * original command line text for option arguments. Returns EXIT_FAILURE if
+ * outfile is NULL, EXIT_SUCCESS otherwise. */
+int
+cmdline_parser_dump(FILE *outfile, struct gengetopt_args_info *args_info)
+{
+  if (outfile == NULL)
+    {
+      fprintf (stderr, "%s: cannot dump options to stream\n", CMDLINE_PARSER_PACKAGE);
+      return EXIT_FAILURE;
+    }
+
+  /* options without an argument */
+  if (args_info->help_given)
+    write_into_file(outfile, "help", 0, 0 );
+  if (args_info->version_given)
+    write_into_file(outfile, "version", 0, 0 );
+
+  /* string options, in declaration order */
+  if (args_info->db_given)
+    write_into_file(outfile, "db", args_info->db_orig, 0);
+  if (args_info->input_given)
+    write_into_file(outfile, "input", args_info->input_orig, 0);
+  if (args_info->query_given)
+    write_into_file(outfile, "query", args_info->query_orig, 0);
+  if (args_info->dir_in_given)
+    write_into_file(outfile, "dir_in", args_info->dir_in_orig, 0);
+  if (args_info->dir_prep_in_given)
+    write_into_file(outfile, "dir_prep_in", args_info->dir_prep_in_orig, 0);
+
+  /* flag */
+  if (args_info->is_nibble_given)
+    write_into_file(outfile, "is_nibble", 0, 0 );
+
+  return EXIT_SUCCESS;
+}
+
+/* Opens filename for writing and dumps the given options into it.
+ * Returns EXIT_FAILURE if the file cannot be opened, otherwise the result of
+ * cmdline_parser_dump. */
+int
+cmdline_parser_file_save(const char *filename, struct gengetopt_args_info *args_info)
+{
+  int status;
+  FILE *out = fopen(filename, "w");
+
+  if (out == NULL)
+    {
+      fprintf (stderr, "%s: cannot open file for writing: %s\n", CMDLINE_PARSER_PACKAGE, filename);
+      return EXIT_FAILURE;
+    }
+
+  status = cmdline_parser_dump(out, args_info);
+  fclose (out);
+  return status;
+}
+
+/* Public entry point for releasing the memory held by args_info; simply
+ * forwards to cmdline_parser_release. */
+void
+cmdline_parser_free (struct gengetopt_args_info *args_info)
+{
+  cmdline_parser_release (args_info);
+}
+
+/** @brief replacement of strdup, which is not standard
+ *  @return a malloc'ed copy of s, or 0 if s is 0 or the allocation fails */
+char *
+gengetopt_strdup (const char *s)
+{
+  char *copy;
+
+  if (s == 0)
+    return 0;
+
+  copy = (char*)malloc(strlen(s) + 1);
+  if (copy != (char*)0)
+    strcpy(copy, s);
+  return copy;
+}
+
+/* Parses the command line with the default behaviour: do not override
+ * already-set options, initialize args_info first, and check that all
+ * required options were given. */
+int
+cmdline_parser (int argc, char **argv, struct gengetopt_args_info *args_info)
+{
+  const int override = 0;
+  const int initialize = 1;
+  const int check_required = 1;
+
+  return cmdline_parser2 (argc, argv, args_info, override, initialize, check_required);
+}
+
+/* Parses the command line using the caller-supplied parser parameters.
+ * A NULL params selects the defaults from cmdline_parser_params_init instead
+ * of being dereferenced by the internal parser.
+ * Returns the result of cmdline_parser_internal (0 on success). */
+int
+cmdline_parser_ext (int argc, char **argv, struct gengetopt_args_info *args_info,
+                   struct cmdline_parser_params *params)
+{
+  struct cmdline_parser_params defaults;
+
+  if (!params)
+    {
+      cmdline_parser_params_init (&defaults);
+      params = &defaults;
+    }
+
+  return cmdline_parser_internal (argc, argv, args_info, params, 0);
+}
+
+/* Parses the command line with explicit override / initialize /
+ * check_required settings; ambiguity checking is off and error printing is
+ * on. Returns the result of cmdline_parser_internal. */
+int
+cmdline_parser2 (int argc, char **argv, struct gengetopt_args_info *args_info, int override, int initialize, int check_required)
+{
+  struct cmdline_parser_params settings;
+
+  /* fixed settings */
+  settings.check_ambiguity = 0;
+  settings.print_errors = 1;
+  /* caller-controlled settings */
+  settings.override = override;
+  settings.initialize = initialize;
+  settings.check_required = check_required;
+
+  return cmdline_parser_internal (argc, argv, args_info, &settings, 0);
+}
+
+/* Checks that all required options were given, printing one message per
+ * missing option. Returns EXIT_FAILURE if any is missing, else EXIT_SUCCESS. */
+int
+cmdline_parser_required (struct gengetopt_args_info *args_info, const char *prog_name)
+{
+  if (cmdline_parser_required2(args_info, prog_name, 0) > 0)
+    return EXIT_FAILURE;
+
+  return EXIT_SUCCESS;
+}
+
+/* Reports on stderr every required option (db, input, query, dir_in,
+ * dir_prep_in) that was not given; additional_error, when non-NULL, is
+ * appended to each message. Returns 1 if any option is missing, else 0. */
+int
+cmdline_parser_required2 (struct gengetopt_args_info *args_info, const char *prog_name, const char *additional_error)
+{
+  const char *suffix = (additional_error ? additional_error : "");
+  int missing = 0;
+  FIX_UNUSED (additional_error);
+
+  /* checks for required options */
+  if (! args_info->db_given)
+    {
+      fprintf (stderr, "%s: '--db' ('-x') option required%s\n", prog_name, suffix);
+      missing = 1;
+    }
+
+  if (! args_info->input_given)
+    {
+      fprintf (stderr, "%s: '--input' ('-i') option required%s\n", prog_name, suffix);
+      missing = 1;
+    }
+
+  if (! args_info->query_given)
+    {
+      fprintf (stderr, "%s: '--query' ('-q') option required%s\n", prog_name, suffix);
+      missing = 1;
+    }
+
+  if (! args_info->dir_in_given)
+    {
+      fprintf (stderr, "%s: '--dir_in' ('-d') option required%s\n", prog_name, suffix);
+      missing = 1;
+    }
+
+  if (! args_info->dir_prep_in_given)
+    {
+      fprintf (stderr, "%s: '--dir_prep_in' ('-p') option required%s\n", prog_name, suffix);
+      missing = 1;
+    }
+
+  /* checks for dependences among options */
+
+  return missing;
+}
+
+
+static char *package_name = 0;
+
+/**
+ * @brief updates an option
+ *
+ * Records one occurrence of an option and stores its value. A repeated
+ * non-multiple option is reported on stderr and rejected. If the option is
+ * already marked as given in the target structure and override is off, the
+ * call succeeds without changing anything.
+ *
+ * @param field the generic pointer to the field to update
+ * @param orig_field the pointer to the orig field
+ * @param field_given the pointer to the number of occurrence of this option
+ * @param prev_given the pointer to the number of occurrence already seen
+ * @param value the argument for this option (if null no arg was specified)
+ * @param possible_values the possible values for this option (if specified)
+ * @param default_value the default value (in case the option only accepts fixed values)
+ * @param arg_type the type of this option
+ * @param check_ambiguity @see cmdline_parser_params.check_ambiguity
+ * @param override @see cmdline_parser_params.override
+ * @param no_free whether to free a possible previous value
+ * @param multiple_option whether this is a multiple option
+ * @param long_opt the corresponding long option
+ * @param short_opt the corresponding short option (or '-' if none)
+ * @param additional_error possible further error specification
+ * @return 0 on success, 1 if the option was given more than once
+ */
+static
+int update_arg(void *field, char **orig_field,
+               unsigned int *field_given, unsigned int *prev_given, 
+               char *value, const char *possible_values[],
+               const char *default_value,
+               cmdline_parser_arg_type arg_type,
+               int check_ambiguity, int override,
+               int no_free, int multiple_option,
+               const char *long_opt, char short_opt,
+               const char *additional_error)
+{
+  char *stop_char = 0; /* never read in this function */
+  const char *val = value;
+  int found; /* stays 0: only ever used as the index into possible_values */
+  char **string_field;
+  FIX_UNUSED (field);
+
+  stop_char = 0;
+  found = 0;
+
+  if (!multiple_option && prev_given && (*prev_given || (check_ambiguity && *field_given)))
+    {
+      if (short_opt != '-')
+        fprintf (stderr, "%s: `--%s' (`-%c') option given more than once%s\n", 
+               package_name, long_opt, short_opt,
+               (additional_error ? additional_error : ""));
+      else
+        fprintf (stderr, "%s: `--%s' option given more than once%s\n", 
+               package_name, long_opt,
+               (additional_error ? additional_error : ""));
+      return 1; /* failure */
+    }
+
+  FIX_UNUSED (default_value);
+    
+  if (field_given && *field_given && ! override)
+    return 0; /* already set in the target structure: keep the existing value */
+  if (prev_given)
+    (*prev_given)++;
+  if (field_given)
+    (*field_given)++;
+  if (possible_values)
+    val = possible_values[found];
+
+  switch(arg_type) {
+  case ARG_FLAG:
+    *((int *)field) = !*((int *)field); /* each occurrence toggles the flag */
+    break;
+  case ARG_STRING:
+    if (val) {
+      string_field = (char **)field;
+      if (!no_free && *string_field)
+        free (*string_field); /* free previous string */
+      *string_field = gengetopt_strdup (val);
+    }
+    break;
+  default:
+    break;
+  };
+
+
+  /* store the original value */
+  switch(arg_type) {
+  case ARG_NO:
+  case ARG_FLAG:
+    break;
+  default:
+    if (value && orig_field) {
+      if (no_free) {
+        *orig_field = value; /* stores the caller's pointer itself, no copy */
+      } else {
+        if (*orig_field)
+          free (*orig_field); /* free previous string */
+        *orig_field = gengetopt_strdup (value);
+      }
+    }
+  };
+
+  return 0; /* OK */
+}
+
+/* Core parser behind cmdline_parser, cmdline_parser2 and cmdline_parser_ext.
+ * Walks argv with getopt_long, stores each recognised option in args_info
+ * through update_arg, optionally checks the required options, and copies any
+ * remaining non-option arguments into args_info->inputs.
+ * --help prints the help text and exits the process; --version only records
+ * that it was given and returns 0 at once.
+ * Returns 0 on success and EXIT_FAILURE on an invalid, repeated or missing
+ * required option. params must not be NULL. */
+int
+cmdline_parser_internal (
+  int argc, char **argv, struct gengetopt_args_info *args_info,
+                        struct cmdline_parser_params *params, const char *additional_error)
+{
+  int c;	/* Character of the parsed option.  */
+
+  int error = 0;
+  struct gengetopt_args_info local_args_info;
+  
+  int override;
+  int initialize;
+  int check_required;
+  int check_ambiguity;
+  
+  package_name = argv[0]; /* used by update_arg in its error messages */
+  
+  override = params->override;
+  initialize = params->initialize;
+  check_required = params->check_required;
+  check_ambiguity = params->check_ambiguity;
+
+  if (initialize)
+    cmdline_parser_init (args_info);
+
+  cmdline_parser_init (&local_args_info); /* its "given" counters track repeats within this call */
+
+  optarg = 0;
+  optind = 0;
+  opterr = params->print_errors;
+  optopt = '?';
+
+  while (1)
+    {
+      int option_index = 0;
+
+      static struct option long_options[] = {
+        { "help",	0, NULL, 'h' },
+        { "version",	0, NULL, 'V' },
+        { "db",	1, NULL, 'x' },
+        { "input",	1, NULL, 'i' },
+        { "query",	1, NULL, 'q' },
+        { "dir_in",	1, NULL, 'd' },
+        { "dir_prep_in",	1, NULL, 'p' },
+        { "is_nibble",	0, NULL, 'N' },
+        { 0,  0, 0, 0 }
+      };
+
+      c = getopt_long (argc, argv, "hVx:i:q:d:p:N", long_options, &option_index);
+
+      if (c == -1) break;	/* Exit from `while (1)' loop.  */
+
+      switch (c)
+        {
+        case 'h':	/* Print help and exit.  */
+          cmdline_parser_print_help ();
+          cmdline_parser_free (&local_args_info);
+          exit (EXIT_SUCCESS);
+
+        case 'V':	/* Print version and exit.  */
+        
+        
+          if (update_arg( 0 , 
+               0 , &(args_info->version_given),
+              &(local_args_info.version_given), optarg, 0, 0, ARG_NO,
+              check_ambiguity, override, 0, 0,
+              "version", 'V',
+              additional_error))
+            goto failure;
+          cmdline_parser_free (&local_args_info);
+          return 0; /* nothing is printed here and required options are not checked */
+        
+          break;
+        case 'x':	/* Input a set of datasets.  */
+        
+        
+          if (update_arg( (void *)&(args_info->db_arg), 
+               &(args_info->db_orig), &(args_info->db_given),
+              &(local_args_info.db_given), optarg, 0, 0, ARG_STRING,
+              check_ambiguity, override, 0, 0,
+              "db", 'x',
+              additional_error))
+            goto failure;
+        
+          break;
+        case 'i':	/* Input gene mapping.  */
+        
+        
+          if (update_arg( (void *)&(args_info->input_arg), 
+               &(args_info->input_orig), &(args_info->input_given),
+              &(local_args_info.input_given), optarg, 0, 0, ARG_STRING,
+              check_ambiguity, override, 0, 0,
+              "input", 'i',
+              additional_error))
+            goto failure;
+        
+          break;
+        case 'q':	/* Query gene list.  */
+        
+        
+          if (update_arg( (void *)&(args_info->query_arg), 
+               &(args_info->query_orig), &(args_info->query_given),
+              &(local_args_info.query_given), optarg, 0, 0, ARG_STRING,
+              check_ambiguity, override, 0, 0,
+              "query", 'q',
+              additional_error))
+            goto failure;
+        
+          break;
+        case 'd':	/* Database directory.  */
+        
+        
+          if (update_arg( (void *)&(args_info->dir_in_arg), 
+               &(args_info->dir_in_orig), &(args_info->dir_in_given),
+              &(local_args_info.dir_in_given), optarg, 0, 0, ARG_STRING,
+              check_ambiguity, override, 0, 0,
+              "dir_in", 'd',
+              additional_error))
+            goto failure;
+        
+          break;
+        case 'p':	/* Prep directory (containing .gavg, .gpres files).  */
+        
+        
+          if (update_arg( (void *)&(args_info->dir_prep_in_arg), 
+               &(args_info->dir_prep_in_orig), &(args_info->dir_prep_in_given),
+              &(local_args_info.dir_prep_in_given), optarg, 0, 0, ARG_STRING,
+              check_ambiguity, override, 0, 0,
+              "dir_prep_in", 'p',
+              additional_error))
+            goto failure;
+        
+          break;
+        case 'N':	/* Whether the input DB is nibble type.  */
+        
+        
+          if (update_arg((void *)&(args_info->is_nibble_flag), 0, &(args_info->is_nibble_given),
+              &(local_args_info.is_nibble_given), optarg, 0, 0, ARG_FLAG,
+              check_ambiguity, override, 1, 0, "is_nibble", 'N',
+              additional_error))
+            goto failure;
+        
+          break;
+
+        case 0:	/* Long option with no short option */
+        case '?':	/* Invalid option.  */
+          /* `getopt_long' already printed an error message.  */
+          goto failure;
+
+        default:	/* bug: option not considered.  */
+          fprintf (stderr, "%s: option unknown: %c%s\n", CMDLINE_PARSER_PACKAGE, c, (additional_error ? additional_error : ""));
+          abort ();
+        } /* switch */
+    } /* while */
+
+
+
+  if (check_required)
+    {
+      error += cmdline_parser_required2 (args_info, argv[0], additional_error);
+    }
+
+  cmdline_parser_release (&local_args_info);
+
+  if ( error )
+    return (EXIT_FAILURE);
+
+  if (optind < argc)
+    {
+      int i = 0 ;
+      int found_prog_name = 0;
+      /* whether program name, i.e., argv[0], is in the remaining args
+         (this may happen with some implementations of getopt,
+          but surely not with the one included by gengetopt) */
+
+      i = optind;
+      while (i < argc)
+        if (argv[i++] == argv[0]) {
+          found_prog_name = 1;
+          break;
+        }
+      i = 0;
+
+      args_info->inputs_num = argc - optind - found_prog_name;
+      args_info->inputs =
+        (char **)(malloc ((args_info->inputs_num)*sizeof(char *))) ; /* NOTE(review): result is not checked for NULL */
+      while (optind < argc)
+        if (argv[optind++] != argv[0])
+          args_info->inputs[ i++ ] = gengetopt_strdup (argv[optind-1]) ;
+    }
+
+  return 0;
+
+failure:
+  
+  cmdline_parser_release (&local_args_info);
+  return (EXIT_FAILURE);
+}

File tools/SeekMiner/cmdline.h

+/** @file cmdline.h
+ *  @brief The header file for the command line option parser
+ *  generated by GNU Gengetopt version 2.22.5
+ *  http://www.gnu.org/software/gengetopt.
+ *  DO NOT modify this file, since it can be overwritten
+ *  @author GNU Gengetopt by Lorenzo Bettini */
+
+#ifndef CMDLINE_H
+#define CMDLINE_H
+
+/* If we use autoconf.  */
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h> /* for FILE */
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+#ifndef CMDLINE_PARSER_PACKAGE
+/** @brief the program name (used for printing errors) */
+#define CMDLINE_PARSER_PACKAGE "SeekMiner"
+#endif
+
+#ifndef CMDLINE_PARSER_PACKAGE_NAME
+/** @brief the complete program name (used for help and version) */
+#define CMDLINE_PARSER_PACKAGE_NAME "SeekMiner"
+#endif
+
+#ifndef CMDLINE_PARSER_VERSION
+/** @brief the program version */
+#define CMDLINE_PARSER_VERSION "1.0"
+#endif
+
+/** @brief Where the command line options are stored.
+ *
+ *  For each option there is a help string, a "given" counter and, for
+ *  options that take a value, the parsed value plus the original text. */
+struct gengetopt_args_info
+{
+  const char *help_help; /**< @brief Print help and exit help description.  */
+  const char *version_help; /**< @brief Print version and exit help description.  */
+  char * db_arg;	/**< @brief Input a set of datasets.  */
+  char * db_orig;	/**< @brief Input a set of datasets original value given at command line.  */
+  const char *db_help; /**< @brief Input a set of datasets help description.  */
+  char * input_arg;	/**< @brief Input gene mapping.  */
+  char * input_orig;	/**< @brief Input gene mapping original value given at command line.  */
+  const char *input_help; /**< @brief Input gene mapping help description.  */
+  char * query_arg;	/**< @brief Query gene list.  */
+  char * query_orig;	/**< @brief Query gene list original value given at command line.  */
+  const char *query_help; /**< @brief Query gene list help description.  */
+  char * dir_in_arg;	/**< @brief Database directory.  */
+  char * dir_in_orig;	/**< @brief Database directory original value given at command line.  */
+  const char *dir_in_help; /**< @brief Database directory help description.  */
+  char * dir_prep_in_arg;	/**< @brief Prep directory (containing .gavg, .gpres files).  */
+  char * dir_prep_in_orig;	/**< @brief Prep directory (containing .gavg, .gpres files) original value given at command line.  */
+  const char *dir_prep_in_help; /**< @brief Prep directory (containing .gavg, .gpres files) help description.  */
+  int is_nibble_flag;	/**< @brief Whether the input DB is nibble type (default=off).  */
+  const char *is_nibble_help; /**< @brief Whether the input DB is nibble type help description.  */
+  
+  unsigned int help_given ;	/**< @brief Whether help was given.  */
+  unsigned int version_given ;	/**< @brief Whether version was given.  */
+  unsigned int db_given ;	/**< @brief Whether db was given.  */
+  unsigned int input_given ;	/**< @brief Whether input was given.  */
+  unsigned int query_given ;	/**< @brief Whether query was given.  */
+  unsigned int dir_in_given ;	/**< @brief Whether dir_in was given.  */
+  unsigned int dir_prep_in_given ;	/**< @brief Whether dir_prep_in was given.  */
+  unsigned int is_nibble_given ;	/**< @brief Whether is_nibble was given.  */
+
+  char **inputs ; /**< @brief unnamed options (arguments given without an option name) */
+  unsigned inputs_num ; /**< @brief number of unnamed options */
+} ;
+
+/** @brief Additional parameters to pass to the parser functions (see cmdline_parser_ext()) */
+struct cmdline_parser_params
+{
+  int override; /**< @brief whether to override possibly already present options (default 0) */
+  int initialize; /**< @brief whether to initialize the option structure gengetopt_args_info (default 1) */
+  int check_required; /**< @brief whether to check that all required options were provided (default 1) */
+  int check_ambiguity; /**< @brief whether to check for options already specified in the option structure gengetopt_args_info (default 0) */
+  int print_errors; /**< @brief whether getopt_long should print an error message for a bad option (default 1) */
+} ;
+
+/** @brief the purpose string of the program */
+extern const char *gengetopt_args_info_purpose;
+/** @brief the usage string of the program */
+extern const char *gengetopt_args_info_usage;
+/** @brief all the lines making up the help output */
+extern const char *gengetopt_args_info_help[];
+
+/**
+ * The command line parser: stores the option information it finds in args_info
+ * @param argc the number of command line options
+ * @param argv the command line options
+ * @param args_info the structure where option information will be stored
+ * @return 0 if everything went fine, non-zero if an error took place
+ */
+int cmdline_parser (int argc, char **argv,
+  struct gengetopt_args_info *args_info);
+
+/**
+ * The command line parser (version with additional parameters - deprecated)
+ * @param argc the number of command line options
+ * @param argv the command line options
+ * @param args_info the structure where option information will be stored
+ * @param override whether to override possibly already present options
+ * @param initialize whether to initialize the option structure args_info
+ * @param check_required whether to check that all required options were provided
+ * @return 0 if everything went fine, non-zero if an error took place
+ * @deprecated use cmdline_parser_ext() instead
+ */
+int cmdline_parser2 (int argc, char **argv,
+  struct gengetopt_args_info *args_info,
+  int override, int initialize, int check_required);
+
+/**
+ * The command line parser (version with additional parameters)
+ * @param argc the number of command line options
+ * @param argv the command line options
+ * @param args_info the structure where option information will be stored
+ * @param params additional parameters for the parser (see struct cmdline_parser_params)
+ * @return 0 if everything went fine, non-zero if an error took place
+ */
+int cmdline_parser_ext (int argc, char **argv,
+  struct gengetopt_args_info *args_info,
+  struct cmdline_parser_params *params);
+
+/**
+ * Saves the contents of the option struct into an already open FILE stream.
+ * @param outfile the stream where to dump options
+ * @param args_info the option struct to dump
+ * @return 0 if everything went fine, non-zero if an error took place
+ */
+int cmdline_parser_dump(FILE *outfile,
+  struct gengetopt_args_info *args_info);
+
+/**
+ * Saves the contents of the option struct into a (text) file.
+ * This file can be read by the config file parser (if generated by gengetopt)
+ * @param filename the file where to save
+ * @param args_info the option struct to save
+ * @return 0 if everything went fine, non-zero if an error took place
+ */
+int cmdline_parser_file_save(const char *filename,
+  struct gengetopt_args_info *args_info);
+
+/**
+ * Prints the help
+ */
+void cmdline_parser_print_help(void);
+/**
+ * Prints the version
+ */
+void cmdline_parser_print_version(void);
+
+/**
+ * Initializes all the fields of a cmdline_parser_params structure
+ * to their default values
+ * @param params the structure to initialize
+ */
+void cmdline_parser_params_init(struct cmdline_parser_params *params);
+
+/**
+ * Dynamically allocates a cmdline_parser_params structure and initializes
+ * all its fields to their default values
+ * @return the created and initialized cmdline_parser_params structure
+ */
+struct cmdline_parser_params *cmdline_parser_params_create(void);
+
+/**
+ * Initializes the passed gengetopt_args_info structure's fields
+ * (also sets default values for options that have a default)
+ * @param args_info the structure to initialize
+ */
+void cmdline_parser_init (struct gengetopt_args_info *args_info);
+/**
+ * Deallocates the string fields of the gengetopt_args_info structure
+ * (but does not deallocate the structure itself)
+ * @param args_info the structure whose string fields are deallocated
+ */
+void cmdline_parser_free (struct gengetopt_args_info *args_info);
+
+/**
+ * Checks that all the required options were specified
+ * @param args_info the structure to check
+ * @param prog_name the name of the program that will be used to print
+ *   possible errors
+ * @return NOTE(review): not documented in this header; presumably 0 if all required options are present and non-zero otherwise, like the other parser functions - confirm against cmdline.c
+ */
+int cmdline_parser_required (struct gengetopt_args_info *args_info,
+  const char *prog_name);
+
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+#endif /* CMDLINE_H */

File tools/SeekMiner/stdafx.cpp

+/*****************************************************************************
+* This file is provided under the Creative Commons Attribution 3.0 license.
+*
+* You are free to share, copy, distribute, transmit, or adapt this work
+* PROVIDED THAT you attribute the work to the authors listed below.
+* For more information, please see the following web page:
+* http://creativecommons.org/licenses/by/3.0/
+*
+* This file is a component of the Sleipnir library for functional genomics,
+* authored by:
+* Curtis Huttenhower (chuttenh@princeton.edu)
+* Mark Schroeder
+* Maria D. Chikina
+* Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
+*
+* If you use this library, the included executable tools, or any related
+* code in your work, please cite the following publication:
+* Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
+* Olga G. Troyanskaya.
+* "The Sleipnir library for computational functional genomics"
+*****************************************************************************/
+#include "stdafx.h"
+
+/*!
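+ * \page DBCombiner DBCombiner
+ * 
+ * \note NOTE(review): this file lives in tools/SeekMiner, but the page below (page name, usage
+ *	line, .ggo include, and flag table) documents DBCombiner. It appears to have been copied from
+ *	that tool and still needs to be rewritten for SeekMiner.
+ * 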
+ * 
+ * \section sec_usage Usage
+ * 
+ * \subsection ssec_usage_basic Basic Usage
+ * 
+ * \code
+ * DBCombiner -i <genes.txt> -x <db list> -d <input directory> -D <output_dir>
+ * \endcode
+ * 
+ * 
+ * \subsection ssec_usage_detailed Detailed Usage
+ * 
+ * \include DBCombiner/DBCombiner.ggo
+ * 
+ * <table><tr>
+ *	<th>Flag</th>
+ *	<th>Default</th>
+ *	<th>Type</th>
+ *	<th>Description</th>
+ * </tr><tr>
+ *	<td>-i</td>
+ *	<td>stdin</td>
+ *	<td>Text file</td>
+ *	<td>Tab-delimited text file containing two columns, numerical gene IDs (one-based) and unique gene
+ *		names (matching those in the input DAT/DAB files).</td>
+ * </tr><tr>
+ *	<td>-d</td>
+ *	<td>.</td>
+ *	<td>Directory</td>
+ *	<td>Input directory containing DB files</td>
+ * </tr><tr>
+ *	<td>-D</td>
+ *	<td>.</td>
+ *	<td>Directory</td>
+ *	<td>Output directory in which database files will be stored.</td>
+ * </tr><tr>
+ *	<td>-x</td>
+ *	<td>.</td>
+ *	<td>Text file</td>
+ *	<td>Input file containing list of CDatabaselets to combine</td>
+ * </tr></table>
+ */

File tools/SeekMiner/stdafx.h

+/*****************************************************************************
+* This file is provided under the Creative Commons Attribution 3.0 license.
+*
+* You are free to share, copy, distribute, transmit, or adapt this work
+* PROVIDED THAT you attribute the work to the authors listed below.
+* For more information, please see the following web page:
+* http://creativecommons.org/licenses/by/3.0/
+*
+* This file is a component of the Sleipnir library for functional genomics,
+* authored by:
+* Curtis Huttenhower (chuttenh@princeton.edu)
+* Mark Schroeder
+* Maria D. Chikina
+* Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
+*
+* If you use this library, the included executable tools, or any related
+* code in your work, please cite the following publication:
+* Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
+* Olga G. Troyanskaya.
+* "The Sleipnir library for computational functional genomics"
+*****************************************************************************/
+#ifndef STDAFX_H
+#define STDAFX_H
+
+#define __STDC_LIMIT_MACROS
+
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <string>
+#include <omp.h>
+using namespace std;
+
+#include <pthread.h>
+
+#include "bayesnet.h"
+#include "database.h"
+#include "seekmap.h"
+#include "seekweight.h"
+#include "seekdataset.h"
+#include "seekevaluate.h"
+#include "seekreader.h"
+#include "seekwriter.h"
+#include "seekquery.h"
+#include "meta.h"
+using namespace Sleipnir;
+
+#endif // STDAFX_H