Commits

opticall  committed f770f2f

fixed bug with initial sampling issues on block step

  • Participants
  • Parent commits f654d31

Comments (0)

Files changed (2)

File opticall/opticall.cpp

     
 	
 	//declare the variables for KMeans++
-	int attempts=50;
+	int attempts=150;
 	double * points  = new double[data_count*dim];
 	double * centres = new double[(numclasses-numcustomclasses)*dim];
 	int * assignments = new int[data_count];
 	//char * output_prob_ext = ".probs";
 	//char * output_call_ext = ".calls";
 
+	string version = "0.1.2";
+	cout << "opticall version " << version << endl;
+	cout << "thank you for choosing opticall for your genotyping needs" << endl;
+
+
 	string fname;
 	string fname_block = "";
 	string output_fstem;
 	vector < vector< string > > snpinfo;
 	vector < vector< string > > snpinfo2;
 
-
-
-	//cout << "number of individuals is" << individuals.size() << endl;
-
-
-	ifstream intensityfile (fname_block.c_str());
-
-	//cout << output_fstem+output_prob_ext << endl;
 	ofstream out_probsfile ((output_fstem+output_prob_ext).c_str());
 	ofstream out_callsfile ((output_fstem+output_call_ext).c_str());
 
 
-	if (intensityfile.is_open()  && intensityfile.good())
-	{
-		string header;
-		getline(intensityfile,header);
-	}
-	
-	
-	/**
-	 * STEP 1: Create one block mixture model - by sampling the file once
-	 */
-	//collect data
-	MatrixXd sample(min<int>(sample_block_size,snpcount*individuals.size()),2);
-	vector<int> sampling_indices;
-
-	//cout << "getting ready to smaple" << endl;
-
-	//first check if there's actually enough points to take a big sample
-	if (sample_block_size >= snpcount*individuals.size())
-	{
-		//cout << " not enough points for sample " << endl;
-		for (int i = 0; i < snpcount*individuals.size(); i++) {
-			sampling_indices.push_back(i);
-		}
-	}
-	else {
-		//cout << " enough points for random sample " << endl;
-		//select a bunch of random numbers
-		srand(time(0));
-		while (sampling_indices.size() < sample_block_size) {
-			float randnum = (float)rand()/(float)RAND_MAX;
-			int val = floor(randnum*snpcount*individuals.size());
-			if (val == snpcount*individuals.size()) {
-				//slight fudge if the random number chosen is exactly 1.0
-				val = val -1 ;
-			}
-			//our random value is not in the list - add it
-			if (find (sampling_indices.begin(), sampling_indices.end(), val)  ==   sampling_indices.end() ) {
-				sampling_indices.push_back(val);
-			}
-		}
-		//ok so now we have the required number of random values - sort them
-		sort (sampling_indices.begin(), sampling_indices.end(), lessthanfunction);
-	}
-
-	//cout << sampling_indices[0] << " " << sampling_indices[1] << endl;
-
-	//and now finally we fetch the sampled file
-	int counter = 0;
-	for (int snpt = 0; snpt < snpcount; snpt++)
-	{
-		//cout << "getting into loop" << endl;
-		MatrixXd block = readVblockFromIntFile(intensityfile,1,individuals.size(),snpinfo);	//read one line from the file
-		//now put the block into a sample
-
-		//cout << snpinfo[snpt][0] << endl;
-
-		for (int ind = 0; ind < individuals.size(); ind++)
-		{
-			if (snpt*individuals.size() + ind == sampling_indices[0])
-			{
-				sampling_indices.erase(sampling_indices.begin());
-				sample(counter,0) = block(0,ind*2);
-				sample(counter,1) = block(0,ind*2+1);
-				counter = counter + 1;
-			}
-		}
-	}
-
-	//cout << sample << endl;
-
-	//now close the intensity file - and reopen later
-	intensityfile.close();
-
-	//only transform the sample during the rescue stage
-	//sample = transform_data(sample);
-
-
-
-	vector<int> sample_nan_inds;
-	cout << "the sample is" << endl;
-	cout << sample << endl;
-	sample = strip_nans(sample, sample_nan_inds);
-	cout << "NaNs removed: " << sample_nan_inds.size() << endl;
-	cout << "---------" << endl;
-
-
 
 	//load data into mixture model
 	MatrixXd blockmus;
 	custommus.push_back(Matrix<double, 1, 2>::Zero());
 	customcovars.push_back(Matrix<double, 2, 2>::Identity() * 100	);
 
-	//cout << sample << endl;
-
-	MatrixXd sd_sample = sd_of_data(sample);
-	cout << "sd sample " << sd_sample << endl;
-	genotype_mm_block_caller(sample, vs, custommus, customcovars, blockmus, blockcovars, blockjs, blockvs);
-
-	//MatrixXi blockmap_arranging = MatrixXi::Zero(1, sample.rows());
-	arrange_mixture_components(sample , blockmus,  blockcovars,  blockjs,  blockvs);
-
-
-	cout << blockmus << endl;
-	cout << blockjs << endl;
-	cout << blockvs << endl;
+	MatrixXd sd_sample;
+
+	/*******SAMPLING & Block step*******/
+	/********************************/
+	int blockattempts = 0;
+	do {
+
+		ifstream intensityfile (fname_block.c_str());
+		if (intensityfile.is_open()  && intensityfile.good())
+		{
+			string header;
+			getline(intensityfile,header);
+		}
+
+
+		/**
+		 * STEP 1: Create one block mixture model - by sampling the file once
+		 */
+		//collect data
+		MatrixXd sample(min<int>(sample_block_size,snpcount*individuals.size()),2);
+		vector<int> sampling_indices;
+	
+		//cout << "getting ready to smaple" << endl;
+	
+		//first check if there's actually enough points to take a big sample
+		if (sample_block_size >= snpcount*individuals.size())
+		{
+			//cout << " not enough points for sample " << endl;
+			for (int i = 0; i < snpcount*individuals.size(); i++) {
+				sampling_indices.push_back(i);
+			}
+		}
+		else {
+			//cout << " enough points for random sample " << endl;
+			//select a bunch of random numbers
+			srand(time(0));
+			while (sampling_indices.size() < sample_block_size) {
+				float randnum = (float)rand()/(float)RAND_MAX;
+				int val = floor(randnum*snpcount*individuals.size());
+				if (val == snpcount*individuals.size()) {
+					//slight fudge if the random number chosen is exactly 1.0
+					val = val -1 ;
+				}
+				//our random value is not in the list - add it
+				if (find (sampling_indices.begin(), sampling_indices.end(), val)  ==   sampling_indices.end() ) {
+					sampling_indices.push_back(val);
+				}
+			}
+			//ok so now we have the required number of random values - sort them
+			sort (sampling_indices.begin(), sampling_indices.end(), lessthanfunction);
+		}
+
+		//cout << sampling_indices[0] << " " << sampling_indices[1] << endl;
+
+		//and now finally we fetch the sampled file
+		int counter = 0;
+		for (int snpt = 0; snpt < snpcount; snpt++)
+		{
+			//cout << "getting into loop" << endl;
+			MatrixXd block = readVblockFromIntFile(intensityfile,1,individuals.size(),snpinfo);	//read one line from the file
+			//now put the block into a sample
+
+			//cout << snpinfo[snpt][0] << endl;
+
+			for (int ind = 0; ind < individuals.size(); ind++)
+			{
+				if (snpt*individuals.size() + ind == sampling_indices[0])
+				{
+					sampling_indices.erase(sampling_indices.begin());
+					sample(counter,0) = block(0,ind*2);
+					sample(counter,1) = block(0,ind*2+1);
+					counter = counter + 1;
+				}
+			}
+		}
+
+		//cout << sample << endl;
+		//now close the intensity file - and reopen later
+		intensityfile.close();
+		//only transform the sample during the rescue stage
+		//sample = transform_data(sample);
+		vector<int> sample_nan_inds;
+		cout << "the sample is" << endl;
+		cout << sample << endl;
+		sample = strip_nans(sample, sample_nan_inds);
+		cout << "NaNs removed: " << sample_nan_inds.size() << endl;
+		cout << "---------" << endl;
+		//cout << sample << endl;
+		sd_sample = sd_of_data(sample);
+		cout << "sd sample " << sd_sample << endl;
+		genotype_mm_block_caller(sample, vs, custommus, customcovars, blockmus, blockcovars, blockjs, blockvs);
+		//MatrixXi blockmap_arranging = MatrixXi::Zero(1, sample.rows());
+		arrange_mixture_components(sample , blockmus,  blockcovars,  blockjs,  blockvs);
+
+		blockattempts++;
+
+		cout << "blocking attempts: " << blockattempts << endl;
+		cout << blockmus << endl;
+		cout << blockjs << endl;
+		cout << blockvs << endl;
+
+		if (   ! (isnan(blockmus.sum()) || isinf(blockmus.sum()) || blockjs.row(0).sum() == 0 || blockjs.row(1).sum() == 0 || blockjs.row(2).sum() == 0 )  )
+		{
+			cout << "prior information created successfully" << endl;
+			break;
+		}
+
+	} while (blockattempts <= 5);
+
+	if (  isnan(blockmus.sum()) || isinf(blockmus.sum()) || blockjs.row(0).sum() == 0 || blockjs.row(1).sum() == 0 || blockjs.row(2).sum() == 0   )
+	{
+		cout << "could not create priors from data following multiple attempts. Please check dataset & maybe try again. Exiting..." << endl;
+		exit(0);
+	}
+
+	/*******END OF SAMPLING & Block step*******/
+	/********************************/
+
+
+
 
 
 	/**

File opticall/opticallconverter.cpp

-/*
- * opticallconverter.cpp
- *
- *  Created on: 15 Jul 2011
- *      Author: ts8
- */
-
-
-#include <stdlib.h>
-#include <math.h>
-#include <iostream>
-#include <fstream>
-#include <vector>
-#include <ctime>
-#include <string>
-#include <limits>
-
-#include <time.h>
-
-#include <boost/tokenizer.hpp>
-#include <boost/algorithm/string.hpp>
-
-using namespace std;
-using namespace boost;
-
-
-vector<string> dataFromLine(string line) {
-
-
-	vector<string> linedata;
-	linedata.reserve(6000);    // make room for 10 elements
-	trim(line);
-	split(linedata, line, boost::is_any_of("\t "), token_compress_on);
-
-	//cout << "bits of line data " << linedata[5] << " " << linedata[linedata.size()-1] << " " << linedata.size()  << endl;
-	//cout << linedata.size() << "\n";
-	//vector<string> individs(&linedata[0]+3,&linedata[0]+linedata.size());
-	//vector<string> snpdesc(&linedata[0],&linedata[0]+3);
-	//snpinfo.push_back(snpdesc);
-	return linedata;
-
-}
-
-
-
-
-void readSamplesFromFinalReportFile(const char * fname) {
-	string line;
-	//vector<vector<string>> flat_intensities
-
-	vector<string> data;
-
-	vector<string> samples;
-
-	ifstream myfile (fname);
-
-	if (myfile.is_open())
-	{
-
-		int skipcounter = 0;
-		int samplecounter = 0;
-
-		do {
-			getline (myfile,line);
-			trim(line);
-		} while(line != "[Data]");
-
-		getline (myfile,line);
-
-		while ( myfile.good())
-		{
-			skipcounter++;
-			getline (myfile,line);
-			//cout << line << endl;
-			data = dataFromLine(line);
-			//now set the part of the matrix with the intensities
-			bool insamples = false;
-			for (int i = 0; i < samples.size(); i++)
-			{
-				if (samples[i] == data[1]) {
-					insamples = true;
-				}
-			}
-			if (!insamples)
-			{
-				samplecounter++;
-				samples.push_back(data[1]);
-				cout << "pushed back sample " << samplecounter  << " - "  << data[1] <<  " - " << skipcounter << endl;
-				skipcounter=0;
-			}
-
-
-		}
-		//myfile.close();
-	}
-	//return vblock;
-}
-
-
-
-
-
-
-/********************* Main function ****************************/
-
-int main(int argc, char* argv[]){
-
-	/*
-	if (argc < 5) { // Check the value of argc. If not enough parameters have been passed, inform user and exit.
-		cout << "Usage is -in <infile> -out <outdir>\n"; // Inform the user of how to use the program
-		//cin.get();
-		exit(0);
-	}
-	*/
-
-	string fname;
-
-
-	for (int i = 1; i < argc; i++) { /* We will iterate over argv[] to get the parameters stored inside.
-	 * Note that we're starting on 1 because we don't need to know the
-	 * path of the program, which is stored in argv[0] */
-		cout << i << endl;
-		if (string(argv[i]) == "-in") {
-			// We know the next argument *should* be the filename:
-			fname = string(argv[i + 1]);
-			i++;
-		} else if (string(argv[i]) == "-out") {
-			//output_fstem = string(argv[i + 1]);
-			i++;
-		} else {
-			cout << "Not enough or invalid arguments, please try again.\n";
-			cout << i << endl;
-			//cin.get();
-			exit(0);
-		}
-	}
-
-
-	readSamplesFromFinalReportFile(fname.c_str());
-
-}
-
-
-