Commits

James Rudd  committed 955a38f

Added the python scripts for EGSS, GGSS, and random gene set selection as EGS.py, GGS.py, and RGS_mp.py respectively.

  • Participants

Comments (0)

Files changed (3)

+#!/usr/bin/env python
+"""
+Usage: 
+	EGS.py <CorrelationMatrix> <CorThresh> [--fold=<FOLD>] [--seed=<SEED>] [--nGenes=<NG>] [--candidates=<CG>] [--mut=<MUT>] [--cross=<CROSS>] [--pop=<POP>] [--gen=<GEN>] [--summary=<SUM>] [--log=<FILE>] [--cpus=<NPROCESSES>]
+	EGS.py -h | --help
+	EGS.py --version
+
+Options:
+	-h --help 			  Show this screen.
+	--version 			  Show the version. 
+	--fold=<FOLD>			  The minimum number of genes in the solution that must correlate with a specific gene in order for it to be considered covered. 
+	--nGenes=<NG> 			  Number of genes in solution excluding candidate genes 
+	--candidates=<CG>		  Candidate Gene Set
+	--mut=<MUT> 			  Rate at which chromosomes are mutated   
+	--cross=<CROSS>			  Rate at which crossover occurres between chromosomes 
+	--pop=<POP>			  Solution population size 
+	--gen=<GEN>			  Maximum number of generations 
+	--cpus=<NPROCESSES>		  Number of independent processes to use for fitness evaluation
+	--summary=<SUM>			  Number of generations after which summarize the state of solution
+	--log=<FILE>		  File to which EGS will output evolution statistics
+	--seed=<SEED>		  Random seed to ensure reproducibility of results
+
+"""
+
+
+#../EGS.py ../../Data/firehose.tcga.unified.expression.corrmat.binary.0.7.txt 0.7 --fold=3 --seed=10 --nGenes=400 --pop=100 --gen=1000 --mut=0.5 --cross=0.5 --log=whatever.txt
+
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#GLOBAL VARIABLES
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+VERSION = '0.0.1-alpha'
+SEED = 123
+VERBOSE = True
+
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#LOAD PACKAGES
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+import math 						#This provides the absoulte value function
+import sys 							#This library allows the output stream to be flushed
+from docopt import docopt			#This library provides the command line interface
+import random as rand 				#This library provides the random integer functions for mutation and crossover
+import numpy as np					#This Library provides the binary array 
+from itertools import izip as zip, count	#This library provides the izip which is used to dynamically build the sets used to calculate coverage
+from time import time				#Used to calculate the runtime of the evolution vs runtime of data load
+from collections import Counter		#Used to count the frequency of a gene in the coverage list. If freq > k, gene is covered.
+from pyevolve import G1DList
+from pyevolve import GSimpleGA
+from pyevolve import Util
+from pyevolve import DBAdapters
+
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#GLOBAL VARIABLES
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+data = None				#Holds the correlation matrix
+candidates = None		#Holds the indeces for the candidate genes supplied
+arguments = None		#Holds the command line arguments.
+FOLD = None				#Hold the --fold command line argument
+CANDIDATE_COVERAGE = None	#Saves the genes covered by the candidates so that this doesn't have to 
+							#be recalculated for each individual fitness evaluation
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#FUNCTIONS
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+####################
+##     1D Set     ##
+####################
+#Initialization Function Specific for a 1D Set
+def G1DSetInitializatorInteger(genome, **args):
+   range_min = genome.getParam("rangemin", 0)
+   range_max = genome.getParam("rangemax", 100)
+   if(candidates == None):
+   	genome.genomeList = rand.sample(xrange(range_min,range_max+1),genome.getListSize())
+   else:
+   	universe = set(xrange(range_min,range_max+1))
+   	t = set(candidates)
+   	universe = universe - t
+   	universe = list(universe)
+   	genome.genomeList = rand.sample(universe,genome.getListSize())
+
+#Mutation Function Specific for a 1D set
+#Need to change this to sample from a set instead of using the indeterminate while loop.
+def G1DSetMutatorRealRange(genome, **args):
+   """ Simple real range mutator for G1DList
+
+   Accepts the *rangemin* and *rangemax* genome parameters, both optional.
+
+   """
+
+   #pmut is the probability of mutation; can't be less than 0
+   if args["pmut"] <= 0.0: return 0
+
+   #calculate the total number of mutations for this genome
+   listSize = len(genome)
+   mutations = args["pmut"] * (listSize)
+
+   #all possible gene candidates are the numbers 0 through 99 by default or the provided rangement and rangemax+1
+   candidatesAll = set(range(genome.getParam("rangemin", 0), genome.getParam("rangemax", 98) + 1))
+
+   #If candidate genes have been provided, make sure we don't mutate to one of those genes.
+   if candidates is not None:
+   		candidatesAll = candidatesAll - set(candidates)
+
+
+   #If there are less than 1 mutation for this genome (very small mutation rate)
+   #This is the slow way to do mutations but the only way for small rates
+   if mutations < 1.0 :
+      mutations = 0
+      for it in xrange(listSize):
+         if Util.randomFlipCoin(args["pmut"]):
+            
+            thisCandidates = candidatesAll - set(genome[:])
+            tmp = rand.sample(thisCandidates,1)
+            genome[it] = tmp[0]
+            #tmp = rand.uniform(genome.getParam("rangemin", Consts.CDefRangeMin), genome.getParam("rangemax", Consts.CDefRangeMax))
+	    #while tmp in genome:
+	    #  tmp = rand.uniform(genome.getParam("rangemin", Consts.CDefRangeMin), genome.getParam("rangemax", Consts.CDefRangeMax))
+
+            #genome[it] = tmp
+            mutations += 1
+   
+   #This is a faster way to do mutations but can only be done when the mutation rate * the genome length >= 1
+   else: 
+      for it in xrange(int(round(mutations))):
+         which_gene = rand.randint(0, listSize-1)
+         thisCandidates = candidatesAll - set(genome[:])
+         tmp = rand.sample(thisCandidates,1)
+
+
+         genome[which_gene] = tmp[0]
+
+   return int(mutations)
+
+
+
+#Crossover Function Specific to a 1DSet
+def G1DSetCrossoverMultiPoint(genome, **args):
+   
+   sister = None
+   brother = None
+   gMom = args["mom"]
+   gDad = args["dad"]
+   
+   mom = list(gMom)
+   dad = list(gDad)
+   pool = list(set(mom) | set(dad))
+
+   
+   
+
+   if args["count"] >= 1:
+      sister = gMom.clone()
+
+      sister[:] = rand.sample(pool,len(gMom))
+
+   if args["count"] == 2:
+      brother = gDad.clone()
+      brother[:] = rand.sample(pool,len(gMom))
+
+
+   #brother = None
+   return (sister, brother)
+
+#Evaluate only the genes in the current chromosome
+def eval_func(chromosome):
+	score = 0.0
+
+	geneIdx = []
+	for val in chromosome[:]:
+		t = coverage(int(val))
+		#geneIdx.append(t)
+		geneIdx.extend(t)
+
+	#geneIdx = [item for sublist in geneIdx for item in sublist]
+	count = 0.0
+
+	if FOLD == 1:
+		count = len(set(geneIdx))
+				
+	else:
+		cnt = Counter(geneIdx)
+		tmp = []
+		tmp = set([j for j in cnt if cnt[j] >= FOLD]) | set(chromosome[:])
+		count = len(tmp)
+
+
+	score = count #/ float(len(data.data))
+	return score
+
+
+#Evaluate the genes in the current chromosome together with a candiate gene list
+def eval_func2(chromosome):
+	score = 0.0
+	geneIdx = CANDIDATE_COVERAGE[:]
+
+	#for val in candidates:
+		#t = coverage(val)
+		#geneIdx.extend(t)
+
+	
+	#geneIdx = list(set(geneIdx))
+
+	for val in chromosome[:]:
+		t = coverage(int(val))
+		geneIdx.extend(t)
+
+	count = 0.0
+	if FOLD == 1:
+		count = len(set(geneIdx))
+
+	else:
+		cnt = Counter(geneIdx)
+		others = set(candidates) | set(chromosome[:])
+		tmp = set([j for j in cnt if cnt[j] >= FOLD]) | others
+		count = len(tmp)
+
+	score = count #/ float(len(data.data))
+	return score
+
+
+def initialize_candidates(candidates):
+	result = []
+	for val in candidates:
+		t = coverage(val)
+		result.extend(t)
+
+	#result = list(set(result))
+	return(result)
+
+def coverage(geneIdx):
+	corVals = data.data[geneIdx]
+	covered = []
+	
+	#for i in range(len(corVals)):
+		#if math.fabs(float(corVals[i])) > float(arguments['<CorThresh>']):
+		#if math.fabs(int(corVals[i])) == 1:
+			#covered.append(i)
+	
+	#covered = [i for i, j in zip(count(), corVals) if j == 1]
+	covered = np.where(corVals == 1)
+	covered = list(covered[0])
+	return(covered)
+
+
+class corMat:
+	def __init__(self, fName):
+		self.data = []
+		self.header = None
+		nGenes = None
+		self.load_cor_matrix(fName)
+
+
+	def load_cor_matrix(self,fName):
+
+		count = 0
+		try:
+			with open(fName) as input:
+				for line in input:
+					if(count == 0):
+						self.header = line.replace('"', '').strip().split('\t')
+						#print(self.header)
+						count = count + 1
+						
+
+					else:
+						tmp = np.array(line.strip().split('\t')[1:], dtype=np.int8)
+						self.data.append(tmp)
+				
+
+		except IOError:
+			print("Error: can\'t find the input gene correlation file or read the data")
+
+
+def runtimeParams():
+	global arguments
+	print("Binary Correlation Matrix: " + str(arguments['<CorrelationMatrix>']))
+	print("Log File: " + str(arguments['--log']))
+	print("Fold Coverage: " + str(arguments['--fold']))
+	print("Seed: " + str(arguments['--seed']))
+	print("CPUs: " + str(arguments['--cpus']))
+	print("Population Size: " + str(arguments['--pop']))
+	print("Generations: " + str(arguments['--gen']))
+	print("Mutation Rate: " + str(arguments['--mut']))
+	print("Crossover Rate: " + str(arguments['--cross']))
+	print("Number of Genes: " + str(arguments['--nGenes']))
+	print("Candidate Genes: " + str(arguments['--candidates']))
+	print("-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- ")
+
+def main_run():
+	global arguments
+	arguments = docopt(__doc__, version=VERSION)
+	if arguments['--fold'] is None:
+		arguments['--fold'] = 1
+
+	global FOLD
+	FOLD = int(arguments['--fold'])
+	#Output command line options
+	#print(arguments)
+	runtimeParams()
+
+
+	if arguments['--seed'] != None:
+		global SEED
+		SEED = int(arguments['--seed'])
+	else:
+		SEED = 123
+
+
+
+	t0 = time()
+	global data
+	data = corMat(arguments['<CorrelationMatrix>'])
+	if VERBOSE:
+		print("Loaded Binary Correlation Matrix")
+		print(str(time() - t0) + " seconds")
+		sys.stdout.flush()
+
+	
+	#Create the template for the chromosomes
+	genome = G1DList.G1DList(int(arguments['--nGenes']))
+
+	#Limit the range of integer values for the chromosomes
+	genome.setParams(rangemin=0, rangemax=len(data.data)-1)
+
+	#Specify the initialization function
+	genome.initializator.set(G1DSetInitializatorInteger)
+
+	#Spcify the mutation function
+	genome.mutator.set(G1DSetMutatorRealRange)
+
+	#Specify the crossover function
+	genome.crossover.set(G1DSetCrossoverMultiPoint)
+
+
+	#Assign the fitness function
+	if arguments['--candidates'] == None:
+		genome.evaluator.set(eval_func)
+
+	else:
+		global candidates
+		tmp = arguments['--candidates'].strip().split(',')
+		candidates = [data.header.index(x) for x in tmp] 
+		global CANDIDATE_COVERAGE
+		CANDIDATE_COVERAGE = initialize_candidates(candidates)
+		genome.evaluator.set(eval_func2)
+		#print(candidates)
+
+	
+
+
+	#Attach the genome to the GA and provide a random seed for reproducibility
+	ga = GSimpleGA.GSimpleGA(genome, seed=SEED)
+
+	#Attach a log collector to save the statistics of the evolution
+	if arguments['--log'] != None:
+		csvAdapter = DBAdapters.DBFileCSV(filename=arguments['--log'])
+		ga.setDBAdapter(csvAdapter)
+		#https://groups.google.com/forum/#!topic/pyevolve/LrEfX8TRHXo
+		#Generation
+		#Minimum Raw Score
+		#Fitness Average 
+		#Fitness Minimum
+		#Raw score variance
+		#Std Dev of Raw scores
+		#Raw Score Average
+		#Maximum Fitness
+		#Raw score maximum
+
+
+	#Set the number of generations
+	if arguments['--gen'] != None:
+		ga.setGenerations(int(arguments['--gen']))
+
+
+	#Set the population size
+	if arguments['--pop'] != None:
+		ga.setPopulationSize(int(arguments['--pop']))
+
+
+	#Set the mutation rate
+	if arguments['--mut'] != None:
+		ga.setMutationRate(float(arguments['--mut']))
+
+
+	#Set the crossover rate
+	if arguments['--cross'] != None:
+		ga.setCrossoverRate(float(arguments['--cross']))
+
+
+	#Enable multiprocessing
+	if(arguments['--cpus'] != None):
+		ga.setMultiProcessing(True, cpus=arguments['--cpus'])
+
+	#Begin the evolution
+	#ga.evolve(freq_stats=10)
+	t0 = time()
+	if VERBOSE:
+		print("Starting Evolution...")
+		sys.stdout.flush()
+		ga.evolve(freq_stats=min(100, ga.nGenerations))
+	else:
+		ga.evolve()
+
+
+	if VERBOSE:
+		print("Evolution Complete")
+		print(str(time() - t0) + " seconds")
+		print("-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- ")
+
+	t = ga.bestIndividual()
+	#print(t)
+	result = "Final Gene Set: " 
+	for i in t.genomeList:
+		result = result + str(data.header[i]) + " " 
+	print(result)
+	print("-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- ")
+	print("Score: " + str(t.score))
+	#scr = eval_func(t,True)
+	#print("Score (dbl check): " + str(scr))
+	print("Coverage (%): " + str(t.score / float(len(data.data)) * 100))
+	return 0
+
+
+
+if __name__ == "__main__":				#This helps to ensure Windows compatability
+   main_run()
+
+
+#!/usr/bin/env python
+"""
+Usage: 
+	GGS.py <CorrelationMatrix> [--fold=<FOLD>] [--nGenes=<NG>] [--candidates=<CG>] [--cpus=<NPROCESSES>]
+	GGS.py -h | --help
+	GGS.py --version
+
+Options:
+	-h --help 			  Show this screen.
+	--version 			  Show the version. 
+	--fold=<FOLD>			  The minimum number of genes in the solution that must correlate with a specific gene in order for it to be considered covered. 
+	--nGenes=<NG> 			  Number of genes in solution excluding candidate genes 
+	--candidates=<CG>		  Candidate Gene Set
    1. Jie Tan

      These are genes that must always be included in the solution; i.e. previously identified genes of interest based on experiemental results or expert knowledge etc. I'll update comments to make this more clear.

+	--cpus=<NPROCESSES>		  Number of independent processes to use for fitness evaluation
+	--log=<FILE>		  File to which EGS will output evolution statistics
+
+"""
+
+
+
+
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#GLOBAL VARIABLES
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+VERSION = '0.0.1-alpha'
+VERBOSE = True
+
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#LOAD PACKAGES
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+import math 						#This provides the absoulte value function
+import sys 							#This library allows the output stream to be flushed
+from docopt import docopt			#This library provides the command line interface
+import random as rand 				#This library provides the random integer functions for mutation and crossover
+import numpy as np					#This Library provides the binary array 
+from itertools import izip as zip, count	#This library provides the izip which is used to dynamically build the sets used to calculate coverage
+from time import time				#Used to calculate the runtime of the evolution vs runtime of data load
+from collections import Counter		#Used to count the frequency of a gene in the coverage list. If freq > k, gene is covered.
+import copy
+
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#GLOBAL VARIABLES
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+data = None				#Holds the correlation matrix
+data_const = None		#An unchanged version of data
+candidates = None		#Holds the indeces for the candidate genes supplied
+CANDIDATE_COVERAGE = None
+arguments = None		#Holds the command line arguments.
+
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#FUNCTIONS
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+
+
+#Evaluate only the genes in the current chromosome
+def eval_func(chromosome):
+	score = 0.0
+
+	global data
+	data = data_const
+
+	geneIdx = []
+	for val in chromosome[:]:
+		t = coverage(int(val))
+		#geneIdx.append(t)
+		geneIdx.extend(t)
+
+	#geneIdx = [item for sublist in geneIdx for item in sublist]
+	count = 0.0
+
+	if FOLD == 1:
+		count = len(set(geneIdx))
+				
+	else:
+		cnt = Counter(geneIdx)
+		tmp = set([j for j in cnt if cnt[j] >= FOLD]) | set(chromosome[:])
+		count = len(tmp)
+
+
+	score = count #/ float(len(data.data))
+	return score
+
+
+#Evaluate the genes in the current chromosome together with a candiate gene list
+def eval_func2(chromosome):
+	score = 0.0
+
+	global data
+	data = data_const
+
+	geneIdx = CANDIDATE_COVERAGE[:]
+
+	#for val in candidates:
+		#t = coverage(val)
+		#geneIdx.extend(t)
+
+	
+	#geneIdx = list(set(geneIdx))
+
+	for val in chromosome[:]:
+		t = coverage(int(val))
+		geneIdx.extend(t)
+
+	count = 0.0
+	if FOLD == 1:
+		count = len(set(geneIdx))
+
+	else:
+		cnt = Counter(geneIdx)
+		others = set(candidates) | set(chromosome[:])
+		tmp = set([j for j in cnt if cnt[j] >= FOLD]) | others
+		count = len(tmp)
+
+	score = count #/ float(len(data.data))
+	return score
+
+
+
+
+def coverage(geneIdx):
+	corVals = data.data[geneIdx]
+	covered = []
+	
+	#for i in range(len(corVals)):
+		#if math.fabs(float(corVals[i])) > float(arguments['<CorThresh>']):
+		#if math.fabs(int(corVals[i])) == 1:
+			#covered.append(i)
+	
+	#covered = [i for i, j in zip(count(), corVals) if j == 1]
+	covered = np.where(corVals == 1)
  1. Jie Tan

    What does the correlation matrix look like? Is it either 0 or 1 for not correlated and correlated, or it stores the real correlation value? Guessing from here, I feel it is a 0/1 matrix. Is it right?

+	covered = list(covered[0])
+	return(covered)
+
+
+class corMat:
+	def __init__(self, fName):
+		self.data = []
+		self.header = None
+		nGenes = None
+		self.load_cor_matrix(fName)
+
+
+	def load_cor_matrix(self,fName):
+
+		count = 0
+		try:
+			with open(fName) as input:
+				for line in input:
+					if(count == 0):
+						self.header = line.replace('"', '').strip().split('\t')
+						#print(self.header)
+						count = count + 1
+						
+
+					else:
+						tmp = np.array(line.strip().split('\t')[1:], dtype=np.int8)
+						self.data.append(tmp)
+				
+
+		except IOError:
+			print("Error: can\'t find the input gene correlation file or read the data")
+
+
+def runtimeParams():
+	global arguments
+	print("Binary Correlation Matrix: " + str(arguments['<CorrelationMatrix>']))
+	print("Fold Coverage: " + str(arguments['--fold']))
+	print("CPUs: " + str(arguments['--cpus']))
+	print("Number of Genes: " + str(arguments['--nGenes']))
+	print("Candidate Genes: " + str(arguments['--candidates']))
+	print("-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- ")
+
+
+def removeEdges(id):
+	global data
+
+	for i in xrange(len(data.data)):
+		data.data[i][id] = 0
+
+	data.data[id] = np.zeros(len(data.data), dtype=np.int8)
+
+	return 0
+
+def simpleGreedy(indeces, ng):
+	result = []
+	for i in xrange(ng):
+		tmp = [len(set(coverage(t))) for t in indeces]
+		best = tmp.index(max(tmp))
+		result.append(best)
+		removeEdges(best)
+		print("Number of Genes Selected: " + str(i+1) + "                          \r"),
+		sys.stdout.flush()
+	print("Number of Genes Selected: " + str(ng))
+	sys.stdout.flush()
+	return result
+
+
+def initialize_candidates(candidates):
+	result = []
+	for val in candidates:
+		t = coverage(val)
+		result.extend(t)
+
+	#result = list(set(result))
+	for val in candidates:
+		removeEdges(val)
+
+	return(result)
+
+def main_run():
+	global arguments
+	arguments = docopt(__doc__, version=VERSION)
+	if arguments['--fold'] is None:
+		arguments['--fold'] = 1
+
+	global FOLD
+	FOLD = int(arguments['--fold'])
+	#Output command line options
+	#print(arguments)
+	runtimeParams()
+
+
+
+	t0 = time()
+	global data
+	data = corMat(arguments['<CorrelationMatrix>'])
+	global data_const
+	data_const = copy.deepcopy(data)
+
+	if VERBOSE:
+		print("Loaded Binary Correlation Matrix")
+		print(str(time() - t0) + " seconds")
+		sys.stdout.flush()
+
+
+	result = []
+	score = 0.0
+	if arguments['--candidates'] is None:
+		result = simpleGreedy(range(len(data.data)), int(arguments['--nGenes']))
+		score = eval_func(result)
+
+	else:
+		global CANDIDATE_COVERAGE
  1. Jie Tan

    Is it necessary to declare CANDIDATE_COVERAGE a global variable? Why not just pass it into the eval_func2()? I feel it will be clearer that way. I don't use global variables very often since I heard that it's better not to use it if unnecessary. But I am not an expert in programming obviously...

    1. Jie Tan

      This is a remnant from the EGS.py code in which I have little control on when eval_func2() is called but I need to make sure that everytime it's called, the candidate genes are included. For GGS.py, you're right; the global status isn't necessary.

+		tmp = arguments['--candidates'].strip().split(',')
+		global candidates
+		candidates = [data.header.index(x) for x in tmp]
+		CANDIDATE_COVERAGE = initialize_candidates(candidates)
+		result = simpleGreedy(range(len(data.data)), int(arguments['--nGenes']))
+		score = eval_func2(result)
+	print("-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- ")
+	if VERBOSE:
+		print("Completed Greedy Gene Selection")
+		print(str(time() - t0) + " seconds")
+		sys.stdout.flush()
+	message = "Final Genes: "
+	for g in result:
+		message += str(data.header[g]) + " "
+	print(message)
+	print("Score: " + str(score))
+
+
+	return(0)
+
+
+
+if __name__ == "__main__":				#This helps to ensure Windows compatability
+   main_run()
+
+
+#!/usr/bin/env python
+"""
+Usage: 
+	RGS.py <CorrelationMatrix> [--fold=<FOLD>] [--seed=<SEED>] [--nGenes=<NG>] [--candidates=<CG>] [--pop=<POP>] [--cpus=<NPROCESSES>] [--rep=<REPLICATES>]
+	RGS.py -h | --help
+	RGS.py --version
+
+Options:
+	-h --help 			  Show this screen.
+	--version 			  Show the version. 
+	--fold=<FOLD>			  The minimum number of genes in the solution that must correlate with a specific gene in order for it to be considered covered. 
+	--nGenes=<NG> 			  Number of genes in solution excluding candidate genes 
+	--candidates=<CG>		  Candidate Gene Set   
+	--pop=<POP>			  Solution population size 
+	--cpus=<NPROCESSES>		  Number of independent processes to use for fitness evaluation
+	--seed=<SEED>		  Random seed to ensure reproducibility of results
+	--rep=<REPLICATES>    Number of replicates to perform
+
+"""
+
+
+
+
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#GLOBAL VARIABLES
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+VERSION = '0.0.1-alpha'
+SEED = -1
+VERBOSE = True
+
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#LOAD PACKAGES
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+import math 						#This provides the absoulte value function
+import sys 							#This library allows the output stream to be flushed
+from docopt import docopt			#This library provides the command line interface
+import random as rand 				#This library provides the random integer functions for mutation and crossover
+import numpy as np					#This Library provides the binary array 
+from itertools import izip as zip, count	#This library provides the izip which is used to dynamically build the sets used to calculate coverage
+from time import time				#Used to calculate the runtime of the evolution vs runtime of data load
+from collections import Counter		#Used to count the frequency of a gene in the coverage list. If freq > k, gene is covered.
+import ctypes						#Provides the shared array for scores
+import multiprocessing as mp		#Provides the pool object to run multiple processes
+
+
+
+
+
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#GLOBAL VARIABLES
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+data = None				#Holds the correlation matrix
+candidates = None		#Holds the indeces for the candidate genes supplied
+arguments = None		#Holds the command line arguments.
+FOLD = None				#Hold the --fold command line argument
+CANDIDATE_COVERAGE = None	#Saves the genes covered by the candidates so that this doesn't have to 
+							#be recalculated for each individual fitness evaluation
+universe = None			#All possible genes
+nGenes = None			#Number of genes to sample
+
+
+#This is the mutex lock to make this thread safe
+lock = mp.Lock()
+
+manager = mp.Manager()
+
+#This is the shared memory score array
+scoreArray_shared = manager.list([])
+
+#This is the shared memory best gene list
+#Initialized later once we know how many genes this should be
+bestGenes_shared = None; 		#manager.list([])
+
+#This is the current max score
+maxScore_shared = manager.Value('i',0)
+
+
+
+
+
+
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#FUNCTIONS
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+####################
+##     1D Set     ##
+####################
+#Initialization Function Specific for a 1D Set
+def G1DSetInitializatorInteger(genome, **args):
+   range_min = genome.getParam("rangemin", 0)
+   range_max = genome.getParam("rangemax", 100)
+
+   if(candidates == None):
+   	genome.genomeList = rand.sample(xrange(range_min,range_max+1),genome.getListSize())
+   else:
+   	universe = set(xrange(range_min,range_max+1))
+   	t = set(candidates)
+   	universe = universe - t
+   	universe = list(universe)
+   	genome.genomeList = rand.sample(universe,genome.getListSize())
+
+#Mutation Function Specific for a 1D set
+#Need to change this to sample from a set instead of using the indeterminate while loop.
+def G1DSetMutatorRealRange(genome, **args):
+   """ Simple real range mutator for G1DList
+
+   Accepts the *rangemin* and *rangemax* genome parameters, both optional.
+
+   """
+
+   #pmut is the probability of mutation; can't be less than 0
+   if args["pmut"] <= 0.0: return 0
+
+   #calculate the total number of mutations for this genome
+   listSize = len(genome)
+   mutations = args["pmut"] * (listSize)
+
+   #all possible gene candidates are the numbers 0 through 99 by default or the provided rangement and rangemax+1
+   candidatesAll = set(range(genome.getParam("rangemin", 0), genome.getParam("rangemax", 98) + 1))
+
+   #If candidate genes have been provided, make sure we don't mutate to one of those genes.
+   if candidates is not None:
+   		candidatesAll = candidatesAll - set(candidates)
+
+
+   #If there are less than 1 mutation for this genome (very small mutation rate)
+   #This is the slow way to do mutations but the only way for small rates
+   if mutations < 1.0 :
+      mutations = 0
+      for it in xrange(listSize):
+         if Util.randomFlipCoin(args["pmut"]):
+            
+            thisCandidates = candidatesAll - set(genome[:])
+            tmp = rand.sample(thisCandidates,1)
+            genome[it] = tmp[0]
+            #tmp = rand.uniform(genome.getParam("rangemin", Consts.CDefRangeMin), genome.getParam("rangemax", Consts.CDefRangeMax))
+	    #while tmp in genome:
+	    #  tmp = rand.uniform(genome.getParam("rangemin", Consts.CDefRangeMin), genome.getParam("rangemax", Consts.CDefRangeMax))
+
+            #genome[it] = tmp
+            mutations += 1
+   
+   #This is a faster way to do mutations but can only be done when the mutation rate * the genome length >= 1
+   else: 
+      for it in xrange(int(round(mutations))):
+         which_gene = rand.randint(0, listSize-1)
+         thisCandidates = candidatesAll - set(genome[:])
+         tmp = rand.sample(thisCandidates,1)
+
+
+         genome[which_gene] = tmp[0]
+
+   return int(mutations)
+
+
+
+#Crossover Function Specific to a 1DSet
+def G1DSetCrossoverMultiPoint(genome, **args):
+   
+   sister = None
+   brother = None
+   gMom = args["mom"]
+   gDad = args["dad"]
+   
+   mom = list(gMom)
+   dad = list(gDad)
+   pool = list(set(mom) | set(dad))
+
+   
+   
+
+   if args["count"] >= 1:
+      sister = gMom.clone()
+
+      sister[:] = rand.sample(pool,len(gMom))
+
+   if args["count"] == 2:
+      brother = gDad.clone()
+      brother[:] = rand.sample(pool,len(gMom))
+
+
+   #brother = None
+   return (sister, brother)
+
+#Evaluate only the genes in the current chromosome
+def eval_func(chromosome):
+	score = 0.0
+
+	geneIdx = []
+	for val in chromosome[:]:
+		t = coverage(int(val))
+		#geneIdx.append(t)
+		geneIdx.extend(t)
+
+	#geneIdx = [item for sublist in geneIdx for item in sublist]
+	count = 0.0
+
+	if FOLD == 1:
+		count = len(set(geneIdx))
+				
+	else:
+		cnt = Counter(geneIdx)
+		tmp = []
+		tmp = set([j for j in cnt if cnt[j] >= FOLD]) | set(chromosome[:])
+		count = len(tmp)
+
+
+	score = count #/ float(len(data.data))
+	return score
+
+
+#Evaluate the genes in the current chromosome together with a candiate gene list
+def eval_func2(chromosome):
+	score = 0.0
+	geneIdx = CANDIDATE_COVERAGE[:]
+
+	#for val in candidates:
+		#t = coverage(val)
+		#geneIdx.extend(t)
+
+	
+	#geneIdx = list(set(geneIdx))
+
+	for val in chromosome[:]:
+		t = coverage(int(val))
+		geneIdx.extend(t)
+
+	count = 0.0
+	if FOLD == 1:
+		count = len(set(geneIdx))
+
+	else:
+		cnt = Counter(geneIdx)
+		others = set(candidates) | set(chromosome[:])
+		tmp = set([j for j in cnt if cnt[j] >= FOLD]) | others
+		count = len(tmp)
+
+	score = count #/ float(len(data.data))
+	return score
+
+
+def initialize_candidates(candidates):
+	result = []
+	for val in candidates:
+		t = coverage(val)
+		result.extend(t)
+
+	#result = list(set(result))
+	return(result)
+
+def coverage(geneIdx):
+	corVals = data.data[geneIdx]
+	covered = []
+	
+	#for i in range(len(corVals)):
+		#if math.fabs(float(corVals[i])) > float(arguments['<CorThresh>']):
+		#if math.fabs(int(corVals[i])) == 1:
+			#covered.append(i)
+	
+	#covered = [i for i, j in zip(count(), corVals) if j == 1]
+	covered = np.where(corVals == 1)
+	covered = list(covered[0])
+	return(covered)
+
+
+class corMat:
+	def __init__(self, fName):
+		self.data = []
+		self.header = None
+		nGenes = None
+		self.load_cor_matrix(fName)
+
+
+	def load_cor_matrix(self,fName):
+
+		count = 0
+		try:
+			with open(fName) as input:
+				for line in input:
+					if(count == 0):
+						self.header = line.replace('"', '').strip().split('\t')
+						#print(self.header)
+						count = count + 1
+						
+
+					else:
+						tmp = np.array(line.strip().split('\t')[1:], dtype=np.int8)
+						self.data.append(tmp)
+				
+			
+		except IOError:
+			print("Error: can\'t find the input gene correlation file or read the data")
+
+
+def runtimeParams():
+	global arguments
+	print("Binary Correlation Matrix: " + str(arguments['<CorrelationMatrix>']))
+	print("Fold Coverage: " + str(arguments['--fold']))
+	print("Seed: " + str(arguments['--seed']))
+	print("CPUs: " + str(arguments['--cpus']))
+	print("Replicates: " + str(arguments['--rep']))
+	print("Population Size: " + str(arguments['--pop']))
+	print("Number of Genes: " + str(arguments['--nGenes']))
+	print("Candidate Genes: " + str(arguments['--candidates']))
+	print("-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- ")
+
+
+#-----------------------------------------------------
+#Each process will execute the following:
+#-----------------------------------------------------
+def processWork(indexID):
+	global lock
+	global scoreArray_shared
+	global maxScore_shared
+	global bestGenes_shared
+	global arguments
+	global universe
+	global nGenes
+	global SEED
+
+	if SEED >=0:
+		rand.seed(SEED+indexID)
+
+	gSet = rand.sample(universe,nGenes)
+	gScore = 0
+	if arguments['--candidates'] == None:
+		gScore = eval_func(gSet)
+	else:
+		gScore = eval_func2(gSet)
+
+	
+
+	with lock:
+		scoreArray_shared.append(gScore)
+		if(gScore > maxScore_shared.value):
+			maxScore_shared.value = gScore
+			bestGenes_shared[:] = gSet[:]
+			#print(gSet)
+
+
+
+def main_run():
+	global arguments
+	arguments = docopt(__doc__, version=VERSION)
+	if arguments['--fold'] is None:
+		arguments['--fold'] = 1
+
+	global FOLD
+	FOLD = int(arguments['--fold'])
+	#Output command line options
+	#print(arguments)
+	runtimeParams()
+
+
+	if arguments['--seed'] != None:
+		global SEED
+		SEED = int(arguments['--seed'])
+
+	#rand.seed(SEED)
+
+	t0 = time()
+	global data
+	data = corMat(arguments['<CorrelationMatrix>'])
+	if VERBOSE:
+		print("Loaded Binary Correlation Matrix")
+		print(str(time() - t0) + " seconds")
+		sys.stdout.flush()
+
+
+
+	
+	
+	global universe
+	universe = None
+
+	#Assign the fitness function
+	if arguments['--candidates'] == None:
+		print("No candidates supplied")
+		universe = set(range(len(data.header)))
+		#genome.evaluator.set(eval_func)
+
+	else:
+		print("Candidates supplied")
+		
+		global candidates
+		tmp = arguments['--candidates'].strip().split(',')
+		candidates = [data.header.index(x) for x in tmp] 
+		universe = set(range(len(data.header))) - set(candidates)
+		global CANDIDATE_COVERAGE
+		CANDIDATE_COVERAGE = initialize_candidates(candidates)
+		#genome.evaluator.set(eval_func2)
+		#print(candidates)
+
+	#-----------------------------------------------------
+	#Randomly select gene sets and score them
+	#-----------------------------------------------------
+	global nGenes
+	nGenes = int(arguments['--nGenes'])
+	maxScore = 0.0
+	bestSet = None
+	numRandSets = 100
+	if arguments['--pop']:
+		numRandSets = int(arguments['--pop'])
+	
+
+	
+	#Initialize the bestGenes_shared now that we know how many genes to include
+	global manager
+	global bestGenes_shared
+	bestGenes_shared = manager.list(range(nGenes))	
+
+	#Get the number of replicates
+	replicates = 1
+	if arguments['--rep'] != None:
+		replicates = int(arguments['--rep'])
+
+
+
+
+	for repl in range(replicates):
+
+
+		#Reset the bestGenes_shared lsit
+		bestGenes_shared[:] = range(nGenes)
+
+		#Reset the value
+		maxScore_shared.value = 0
+
+		#Reset the shared score array
+		scoreArray_shared[:] = []
+
+		#Update the SEED
+		if SEED >= 0:
+			SEED += numRandSets
+
+		#Create the pool of processes which will perform the work
+		processPool = mp.Pool(processes=int(arguments['--cpus']))
+
+		#Start the processes
+		indeces = range(numRandSets)
+		processPool.map(processWork,indeces)
+
+		#Make sure the processes are done
+		processPool.close()
+		processPool.join()
+
+		#-----------------------------------------------------
+		#Print the results
+		#-----------------------------------------------------
+
+		print("\nReplicate " + str(repl) + " Done")
+		result = "Final Gene Set: " 
+		for i in np.array(bestGenes_shared, dtype=np.int):
+			result = result + str(data.header[i]) + " " 
+		print(result)
+		#print("Score: " + str(maxScore_shared.value))
+		#print("Average Score: " + str(np.mean(scoreArray_shared)))
+		#print("Minimum Score: " + str(np.amin(scoreArray_shared)))
+		#print("SD: " + str(np.std(scoreArray_shared)))
+		
+		#-----------------------------------------------------
+		#Save the raw scoreArray for later analysis
+		#-----------------------------------------------------
+		if arguments['--candidates'] == None:
+			np.savetxt('RGS.raw.scoreArray.mp.' + str(repl) + '.out', np.array(scoreArray_shared,dtype=np.int), delimiter=',', fmt='%d')
+		else:
+			np.savetxt('RGS.candidates.raw.scoreArray.mp.' + str(repl) + '.out', np.array(scoreArray_shared,dtype=np.int), delimiter=',', fmt='%d')
+
+		#-----------------------------------------------------
+		#Write the Log file
+		#-----------------------------------------------------
+		f = None
+		if arguments['--candidates'] == None:
+			f = open("RGS.mp." + str(repl) + ".out",'w')
+		else: 
+			f = open("RGS.candidates.mp." + str(repl) + ".out",'w')
+		f.write(result)
+		f.write("\n")
+		f.write("-------------------------------------------------------\n")
+		if arguments['--candidates'] != None:
+			f.write("Candidates: " + arguments['--candidates'] + "\n")
+			f.write("-------------------------------------------------------\n")
+			
+		f.write("Seed: " + str(SEED) + "\n")
+		f.write("Population Size: " + str(numRandSets) + "\n")
+		f.write("Score: " + str(maxScore_shared.value) + "\n")
+		#f.write("Average Score: " + str(np.mean(scoreArray_shared)) + "\n")
+		#f.write("Minimum Score: " + str(np.amin(scoreArray_shared)) + "\n")
+		#f.write("SD: " + str(np.std(scoreArray_shared)) + "\n")
+
+
+	return 0
+
+
+
+if __name__ == "__main__":				#This helps to ensure Windows compatability
+   main_run()
+
+