Commits

opticall  committed 8cc257f

opticall update, increased precision for larger sample sizes and increased numbers of batches. fixed bug in inblock and minor bug in hwe-p value calculations

  • Participants
  • Parent commits acd2d68

Comments (0)

Files changed (2)

File opticall/filtering.cpp

-/*
- *  filtering.cpp
- *  
- *
- *  Created by Tejas Shah on 15/11/2011.
- *  Copyright 2011 WTSI. 
- *
- */
-
-
-#include <algorithm>
-#include "inclusions.h"
-
-
-using namespace std;
-using namespace boost;
-using namespace Eigen;
-
-/*********Counting the lines in a file********/
-
-
-unsigned int fileRead( istream & is, vector <char> & buff ) {
-	is.read( &buff[0], buff.size() );
-	return is.gcount();
-}
-
-unsigned int countLines( const vector <char> & buff, int sz ) {
-	int newlines = 0;
-	const char * p = &buff[0];
-	for ( int i = 0; i < sz; i++ ) {
-		if ( p[i] == '\n' ) {
-			newlines++;
-		}
-	}
-	return newlines;
-}
-
-unsigned int snpCount(const char * filename, bool header) {
-    time_t now = time(0);
-	
-	//cout << "buffer\n";
-	const int SZ = 1024 * 1024;
-	vector <char> buff( SZ );
-	ifstream ifs( filename );
-	int n = 0;
-	while( int cc = fileRead( ifs, buff ) ) {
-		n += countLines( buff, cc );
-	}
-	//cout << n << endl;
-    //cout << time(0) - now << endl;
-	
-	if (header) {
-		return n - 1;
-	}
-	return n;
-	
-}
-
-
-
-/***********************Parse a line to get the intensities***************/
-
-
-
-
-
-vector<string> intensitiesFromLine(string line, vector < vector <string>  > &snpinfo){
-	
-	
-	vector<string> linedata;
-	linedata.reserve(6000);    // make room for 10 elements
-	
-	trim(line);
-	
-	split(linedata, line, boost::is_any_of("\t "), token_compress_on);
-	
-	//cout << "bits of line data " << linedata[5] << " " << linedata[linedata.size()-1] << " " << linedata.size()  << endl;
-	
-	//cout << linedata.size() << "\n";
-	vector<string> individs(&linedata[0]+3,&linedata[0]+linedata.size());
-	vector<string> snpdesc(&linedata[0],&linedata[0]+3);
-	snpinfo.push_back(snpdesc);
-	return individs;
-	
-}
-
-
-
-
-/*****************Get Header Info**************/
-
-vector<string> readHeaderInfoFromIntFile(const char * fname) {
-	
-	//declare a vector to hold the data
-	vector<string> individualsx2;
-	vector< vector<string> > junkheaderdetail;
-	
-	string header;
-	ifstream myfile (fname);
-	if (myfile.is_open())
-	{
-		if ( myfile.good() )
-		{
-			getline (myfile,header);
-			//cout << line << endl;
-			individualsx2 = intensitiesFromLine(header,junkheaderdetail);
-		}
-		myfile.close();
-	}
-	else cout << "Unable to open file";
-	
-	vector<string> individuals;
-	
-	for(vector< string >::iterator ind_iter = individualsx2.begin();
-		ind_iter != individualsx2.end(); ++++ind_iter){
-		individuals.push_back((*ind_iter).substr(0,(*ind_iter).length()-1));
-	}
-	
-	
-	/*
-	 for(vector< string >::iterator ind_iter = individuals.begin();
-	 ind_iter != individuals.end(); ++ind_iter){
-	 cout << *ind_iter << "\n";
-	 }	
-	 
-	 cout << individuals.size() << "\n";
-	 */
-	return individuals;
-	
-	//provide the header info
-}
-
-
-
-
-
-/*********************Read a vblock from file and return *****************/
-
-MatrixXd readVblockFromIntFile(ifstream & myfile, int vertical_block_size, int numsamples, vector < vector <string>  > &snpinfo, int offset) {
-	string line;
-	//vector<vector<string>> flat_intensities
-	MatrixXd vblock(vertical_block_size,(numsamples)*2); 
-	vector<string> intensities;
-	
-	//ifstream myfile (fname);
-	if (myfile.is_open())
-	{
-		int snpcounter = 0;
-		while ( myfile.good() && snpcounter < vertical_block_size)
-		{
-			getline (myfile,line);
-			//cout << line << endl;
-			intensities = intensitiesFromLine(line,snpinfo);
-			//now set the part of the matrix with the intensities
-			for (int j = 0+2*offset; j < (offset+numsamples)*2; j = j+2 ) {
-				
-				//cout << "starting next step of loop for offset " << offset << "with counter " << snpcounter << "and j " << j << endl;
-				//cout << intensities.size() << endl;
-				
-				//cout << vblock(snpcounter,j - 2*offset) << endl;
-				
-				//cout << "didn't crap out" << endl;
-				
-				if (intensities[j] == "NaN")
-				{
-					//cout << "have an NaN in the dataset" << endl;
-					vblock(snpcounter,j - 2*offset) = numeric_limits<double>::quiet_NaN();
-					vblock(snpcounter,j+1 - 2*offset) = numeric_limits<double>::quiet_NaN();
-				}
-				
-				vblock(snpcounter,j - 2*offset) = atof( intensities[j].c_str() );
-				vblock(snpcounter,j+1 -2*offset) = atof( intensities[j+1].c_str() );
-				
-				//cout << "ccounter" << snpcounter << " vblock is " << vblock(snpcounter,j - 2*offset) << " " << vblock(snpcounter,j+1 -2*offset) << endl;
-				
-			}
-			snpcounter++;
-		}
-		//myfile.close();
-	}
-	
-	//cout << "\nblock:\n" << vblock.row(2) << endl;
-	return vblock;
-	//MatrixXi Vblock(vertical_block_size,numsamples); 	
-	
-	
-	//else cout << "Unable to open file";
-}
-
-
-/********************* Read gender file ******************************/
-
-vector<int> readInfoFile(string fname, vector<string> allsamples, MatrixXi &samplegenders, MatrixXi &samplebatches) {
-
-	string line;
-	
-	vector<string> infosampleids;
-	vector<int> infosamplegender;
-	vector<int> infosampleexclude;
-	vector<int> infosamplebatch;
-	
-	ifstream myfile (fname.c_str());
-	
-	if (myfile.is_open())
-	{
-		int samplecounter = 0;
-		while ( myfile.is_open()  && myfile.good() )
-		{
-			getline (myfile,line);
-			//split the file up and save it
-
-			
-			vector<string> linedata;
-			linedata.reserve(3000);    // make room for 5 elements
-			trim(line);
-			split(linedata, line, boost::is_any_of("\t "), token_compress_on);
-			
-			//cout << "gotten line " << linedata.size() << endl;
-			
-			if (linedata.size() < 3) 
-			{
-				continue;
-			}
-			
-			//linedata[0] is the sampleid, linedata[1] is the sex, linedata[2] is the exclusion flag, linedata[3] is the batch
-			infosampleids.push_back(linedata[0]);
-			infosamplegender.push_back(atoi(linedata[1].c_str()));
-			infosampleexclude.push_back(atoi(linedata[2].c_str()));
-            infosamplebatch.push_back(atoi(linedata[3].c_str()));
-			
-			
-		}
-	}
-	
-	myfile.close();
-	
-	//cout << "gotten lines" << endl;
-	
-	vector<int> exclusion_indices;
-			
-	//then compare with the sample list
-	for (int i = 0; i < allsamples.size(); i++)
-	{
-		samplegenders(i,0) = 0;
-        samplebatches(i,0) = 0;
-		for (int j = 0; j < infosampleids.size(); j++ )
-		{
-			if ( allsamples[i] == infosampleids[j] && infosampleexclude[j] == 1 &&  find (exclusion_indices.begin(), exclusion_indices.end(), i)  ==   exclusion_indices.end()  ) {
-				exclusion_indices.push_back(i);
-			}
-			if ( allsamples[i] == infosampleids[j]  ) {
-				samplegenders(i,0) = infosamplegender[j];
-                samplebatches(i,0) = infosamplebatch[j];
-			}
-		}
-	}
-	
-	//so exclusion list has been populated
-	return exclusion_indices;
-	
-}
-
-
-/********************* Combine two lists of indices to be excluded into one *************/
-
-vector<int> combineExclusionLists(vector<int> list1, vector<int> list2)
-{	
-	vector<int> outputlist = list1;
-	for (int i = 0; i < list2.size() ; i++)
-	{
-		if (outputlist.size() == 0  ||  find (outputlist.begin(), outputlist.end(), list2[i])  ==   outputlist.end() )
-		{
-			outputlist.push_back(list2[i]);
-		}
-	}
-	return outputlist;
-}
-
-
-/************************ returns a list of individualstobecalled i.e. individuals who have not been excluded  ***********************************/
-
-vector<string> individualsForCalling(vector<string> individuals, vector<int> excluded)
-{
-	vector<string> newindlist;
-	for (int i = 0; i < individuals.size() ; i++)
-	{
-		if (excluded.size() == 0 ||  find (excluded.begin(), excluded.end(), i)  ==   excluded.end()  )
-		{
-			newindlist.push_back(individuals[i]);
-		}
-	}
-	return newindlist;
-	
-}
-
-
-
-
-/********************* Sorting function for random numbers*******/
-
-bool lessthanfunction (int i,int j) { return (i<j); }
-
-
-
-
-/*************Find samples with mean intensities more than numsds away from the mean ******/
-
-vector<int> findIntensityOutliers(string fname, int snpcount, vector<string> individuals,double numsds, vector<int> exclude_list) {
-	
-	
-	
-	MatrixXd sumdistances = MatrixXd::Zero(individuals.size(),1);
-	MatrixXd countdistances = MatrixXd::Zero(individuals.size(),1);
-
-	//figure out the number of individuals to get on a run through the file assuming 500MB of RAM
-	int indsperrun = floor( 1000000 / (2*snpcount)  );
-	int indsdone = 0;
-	
-	cout << "removing outliers based on mean intensity" << endl;
-	
-
-	
-	while (indsdone < individuals.size())
-	{
-	
-		ifstream intensityfile (fname.c_str());
-		if (intensityfile.is_open()  && intensityfile.good())
-		{
-			string header;
-			getline(intensityfile,header);
-		}
-		
-
-		
-		int numsamples = min( (int) indsperrun, (int) individuals.size() - indsdone );  //minimum of indsperrun and (individuals.size() - indsdone)
-		int offset = indsdone;
-		
-		vector < vector< string > > snpinfo;
-		MatrixXd block = readVblockFromIntFile(intensityfile,snpcount,numsamples,snpinfo, offset   );	//get intensity data from the file
-		
-		//cout << block << endl;
-		//cout << block.rows() << " " << block.cols() << endl;
-		
-		intensityfile.close();
-
-		
-		for (int snpt = 0; snpt < snpcount; snpt++)
-		{
-			for(int ind = 0; ind < numsamples; ind++)
-			{
-				double distance = sqrt( block(snpt, 2*ind)*block(snpt, 2*ind) + block(snpt, 2*ind+1)*block(snpt, 2*ind+1)  );
-				if (!isnan(distance) && !isinf(distance)  )
-				{
-					sumdistances(ind+offset,0) = sumdistances(ind+offset,0) + distance;
-					countdistances(ind+offset,0)++;
-				}
-			}
-		}
-		
-
-		
-		indsdone = indsdone + numsamples;
-		
-		
-	}
-	
-	
-
-	MatrixXd meandistances = sumdistances.array() / countdistances.array();
-	//Next need to account for NANs - before calculating Stdev
-	double sumofmeandists = 0;
-	double sumofmeandistssq = 0;
-	int count = 0;
-	
-	for (int i = 0; i < meandistances.rows(); i++)
-	{
-		double mdist = meandistances(i,0);
-		if (!isnan(mdist) && !isinf(mdist)  && (exclude_list.size() == 0 || find (exclude_list.begin(), exclude_list.end(), i)  ==   exclude_list.end() )  )
-		{
-			count++;
-			sumofmeandists = sumofmeandists + mdist;
-			sumofmeandistssq = sumofmeandistssq + mdist*mdist;
-		}
-	}
-	
-	double meandist = sumofmeandists/count;
-	double sddist = sqrt( sumofmeandistssq/count - meandist*meandist );
-	
-	double lower_bound = meandist  - numsds*sddist;
-	double upper_bound = meandist + numsds*sddist;
-	
-	vector<int> outlying_samples;
-	
-	
-	cout << meandist << " " << sddist << endl;
-	cout << "end mean distances" << endl;
-	
-	for (int i = 0; i < meandistances.rows(); i++)
-	{
-		double mdist = meandistances(i,0);
-		if (  ( isnan(mdist) || isinf(mdist) || mdist < lower_bound || mdist > upper_bound )  && (exclude_list.size() == 0 || find (exclude_list.begin(), exclude_list.end(), i)  ==   exclude_list.end() )   )
-		{
-			outlying_samples.push_back(i);
-			cout << "excluding mean intensity outlier " << i << " with sampleid " << individuals[i]  << " and mdist " << mdist << endl;
-		}
-	}
-	
-	
-	
-	return outlying_samples;
-}
-
-
-
-
-
-
-/***********Create a random sample from the intensity data**********/
-
-MatrixXd fetchRandomIntensities(string fname, int sample_block_size, int snpcount, vector<string> individuals, vector<int> initial_outliers) {
-	
-	vector < vector< string > > snpinfo;
-	
-	ifstream intensityfile (fname.c_str());
-	if (intensityfile.is_open()  && intensityfile.good())
-	{
-		string header;
-		getline(intensityfile,header);
-	}
-
-
-	/**
-	 * STEP 1: Create one block mixture model - by sampling the file once
-	 */
-	//collect data
-	MatrixXd sample(min<int>(sample_block_size,snpcount*individuals.size()),2);
-	vector<int> sampling_indices;
-
-	//cout << "getting ready to smaple" << endl;
-
-	//first check if there's actually enough points to take a big sample
-	if (sample_block_size >= snpcount*individuals.size())
-	{
-		//cout << " not enough points for sample " << endl;
-		for (int i = 0; i < snpcount*individuals.size(); i++) {
-			//TODO: Need to consider the excluded sample indices
-			bool excluded = false;
-			for (int outlind = 0; outlind < initial_outliers.size(); outlind++ )
-			{
-				if ((i - initial_outliers[outlind])%individuals.size() == 0 ) 
-				{
-					excluded = true;
-				}
-			}
-			
-			if (!excluded)
-			{
-				sampling_indices.push_back(i);
-			}
-			
-		}
-		
-		
-	}
-	else {
-		//cout << " enough points for random sample " << endl;
-		//select a bunch of random numbers
-		srand(time(0));
-		while (sampling_indices.size() < sample_block_size) {
-			float randnum = (float)rand()/(float)RAND_MAX;
-			int val = floor(randnum*snpcount*individuals.size());
-			if (val == snpcount*individuals.size()) {
-				//slight fudge if the random number chosen is exactly 1.0
-				val = val -1 ;
-			}
-			
-			//TODO: Need to consider the excluded sample indices
-			bool excluded = false;
-			for (int outlind = 0; outlind < initial_outliers.size(); outlind++ )
-			{
-				if ((val - initial_outliers[outlind])%individuals.size() == 0 ) 
-				{
-					excluded = true;
-				}
-			}
-			
-			//our random value is not in the list - add it
-			if (!excluded && find (sampling_indices.begin(), sampling_indices.end(), val)  ==   sampling_indices.end() ) {
-				sampling_indices.push_back(val);
-			}
-		}
-		//ok so now we have the required number of random values - sort them
-		sort (sampling_indices.begin(), sampling_indices.end(), lessthanfunction);
-	}
-
-	//cout << sampling_indices[0] << " " << sampling_indices[1] << endl;
-
-	//and now finally we fetch the sampled file
-	int counter = 0;
-	for (int snpt = 0; snpt < snpcount; snpt++)
-	{
-		//cout << "getting into loop" << endl;
-		MatrixXd block = readVblockFromIntFile(intensityfile,1,individuals.size(),snpinfo,0);	//read one line from the file
-		//now put the block into a sample
-	
-		//cout << snpinfo[snpt][0] << endl;
-	
-		for (int ind = 0; ind < individuals.size(); ind++)
-		{
-			if (snpt*individuals.size() + ind == sampling_indices[0])
-			{
-				sampling_indices.erase(sampling_indices.begin());
-				sample(counter,0) = block(0,ind*2);
-				sample(counter,1) = block(0,ind*2+1);
-				counter = counter + 1;
-			}
-		}
-	}
-
-
-	intensityfile.close();
-	return sample;
-
-}
-
-
-
-
+/*
+ *  filtering.cpp
+ *  
+ *
+ *  Created by Tejas Shah on 15/11/2011.
+ *  Copyright 2011 WTSI. 
+ *
+ */
+
+
+#include <algorithm>
+#include "inclusions.h"
+
+
+using namespace std;
+using namespace boost;
+using namespace Eigen;
+
+/*********Counting the lines in a file********/
+
+
+unsigned int fileRead( istream & is, vector <char> & buff ) {
+	is.read( &buff[0], buff.size() );
+	return is.gcount();
+}
+
+unsigned int countLines( const vector <char> & buff, int sz ) {
+	int newlines = 0;
+	const char * p = &buff[0];
+	for ( int i = 0; i < sz; i++ ) {
+		if ( p[i] == '\n' ) {
+			newlines++;
+		}
+	}
+	return newlines;
+}
+
+unsigned int snpCount(const char * filename, bool header) {
+    time_t now = time(0);
+	
+	//cout << "buffer\n";
+	const int SZ = 1024 * 1024;
+	vector <char> buff( SZ );
+	ifstream ifs( filename );
+	int n = 0;
+	while( int cc = fileRead( ifs, buff ) ) {
+		n += countLines( buff, cc );
+	}
+	//cout << n << endl;
+    //cout << time(0) - now << endl;
+	
+	if (header) {
+		return n - 1;
+	}
+	return n;
+	
+}
+
+
+
+/***********************Parse a line to get the intensities***************/
+
+
+
+
+
+vector<string> intensitiesFromLine(string line, vector < vector <string>  > &snpinfo){
+	
+	
+	vector<string> linedata;
+	linedata.reserve(6000);    // make room for 10 elements
+	
+	trim(line);
+	
+	split(linedata, line, boost::is_any_of("\t "), token_compress_on);
+	
+	//cout << "bits of line data " << linedata[5] << " " << linedata[linedata.size()-1] << " " << linedata.size()  << endl;
+	
+	//cout << linedata.size() << "\n";
+	vector<string> individs(&linedata[0]+3,&linedata[0]+linedata.size());
+	vector<string> snpdesc(&linedata[0],&linedata[0]+3);
+	snpinfo.push_back(snpdesc);
+	return individs;
+	
+}
+
+
+
+
+/*****************Get Header Info**************/
+
+vector<string> readHeaderInfoFromIntFile(const char * fname) {
+	
+	//declare a vector to hold the data
+	vector<string> individualsx2;
+	vector< vector<string> > junkheaderdetail;
+	
+	string header;
+	ifstream myfile (fname);
+	if (myfile.is_open())
+	{
+		if ( myfile.good() )
+		{
+			getline (myfile,header);
+			//cout << line << endl;
+			individualsx2 = intensitiesFromLine(header,junkheaderdetail);
+		}
+		myfile.close();
+	}
+	else cout << "Unable to open file";
+	
+	vector<string> individuals;
+	
+	for(vector< string >::iterator ind_iter = individualsx2.begin();
+		ind_iter != individualsx2.end(); ++++ind_iter){
+		individuals.push_back((*ind_iter).substr(0,(*ind_iter).length()-1));
+	}
+	
+	
+	/*
+	 for(vector< string >::iterator ind_iter = individuals.begin();
+	 ind_iter != individuals.end(); ++ind_iter){
+	 cout << *ind_iter << "\n";
+	 }	
+	 
+	 cout << individuals.size() << "\n";
+	 */
+	return individuals;
+	
+	//provide the header info
+}
+
+
+
+
+
+/*********************Read a vblock from file and return *****************/
+
+MatrixXd readVblockFromIntFile(ifstream & myfile, int vertical_block_size, int numsamples, vector < vector <string>  > &snpinfo, int offset) {
+	string line;
+	//vector<vector<string>> flat_intensities
+	MatrixXd vblock(vertical_block_size,(numsamples)*2); 
+	vector<string> intensities;
+	
+	//ifstream myfile (fname);
+	if (myfile.is_open())
+	{
+		int snpcounter = 0;
+		while ( myfile.good() && snpcounter < vertical_block_size)
+		{
+			getline (myfile,line);
+			//cout << line << endl;
+			intensities = intensitiesFromLine(line,snpinfo);
+			//now set the part of the matrix with the intensities
+			for (int j = 0+2*offset; j < (offset+numsamples)*2; j = j+2 ) {
+				
+				//cout << "starting next step of loop for offset " << offset << "with counter " << snpcounter << "and j " << j << endl;
+				//cout << intensities.size() << endl;
+				
+				//cout << vblock(snpcounter,j - 2*offset) << endl;
+				
+				//cout << "didn't crap out" << endl;
+				
+				if (intensities[j] == "NaN")
+				{
+					//cout << "have an NaN in the dataset" << endl;
+					vblock(snpcounter,j - 2*offset) = numeric_limits<double>::quiet_NaN();
+					vblock(snpcounter,j+1 - 2*offset) = numeric_limits<double>::quiet_NaN();
+				}
+				
+				vblock(snpcounter,j - 2*offset) = atof( intensities[j].c_str() );
+				vblock(snpcounter,j+1 -2*offset) = atof( intensities[j+1].c_str() );
+				
+				//cout << "ccounter" << snpcounter << " vblock is " << vblock(snpcounter,j - 2*offset) << " " << vblock(snpcounter,j+1 -2*offset) << endl;
+				
+			}
+			snpcounter++;
+		}
+		//myfile.close();
+	}
+	
+	//cout << "\nblock:\n" << vblock.row(2) << endl;
+	return vblock;
+	//MatrixXi Vblock(vertical_block_size,numsamples); 	
+	
+	
+	//else cout << "Unable to open file";
+}
+
+
+/********************* Read gender file ******************************/
+
+vector<int> readInfoFile(string fname, vector<string> allsamples, MatrixXi &samplegenders, MatrixXi &samplebatches) {
+
+	string line;
+	
+	vector<string> infosampleids;
+	vector<int> infosamplegender;
+	vector<int> infosampleexclude;
+	vector<int> infosamplebatch;
+	
+	ifstream myfile (fname.c_str());
+	
+	if (myfile.is_open())
+	{
+		int samplecounter = 0;
+		while ( myfile.is_open()  && myfile.good() )
+		{
+			getline (myfile,line);
+			//split the file up and save it
+
+			
+			vector<string> linedata;
+			linedata.reserve(3000);    // make room for 5 elements
+			trim(line);
+			split(linedata, line, boost::is_any_of("\t "), token_compress_on);
+			
+			//cout << "gotten line " << linedata.size() << endl;
+			
+			if (linedata.size() < 3) 
+			{
+				continue;
+			}
+			
+			//linedata[0] is the sampleid, linedata[1] is the sex, linedata[2] is the exclusion flag, linedata[3] is the batch
+			infosampleids.push_back(linedata[0]);
+			infosamplegender.push_back(atoi(linedata[1].c_str()));
+			infosampleexclude.push_back(atoi(linedata[2].c_str()));
+            infosamplebatch.push_back(atoi(linedata[3].c_str()));
+			
+			
+		}
+	}
+	
+	myfile.close();
+	
+	//cout << "gotten lines" << endl;
+	
+	vector<int> exclusion_indices;
+			
+	//then compare with the sample list
+	for (int i = 0; i < allsamples.size(); i++)
+	{
+		samplegenders(i,0) = 0;
+        samplebatches(i,0) = 0;
+		for (int j = 0; j < infosampleids.size(); j++ )
+		{
+			if ( allsamples[i] == infosampleids[j] && infosampleexclude[j] == 1 &&  find (exclusion_indices.begin(), exclusion_indices.end(), i)  ==   exclusion_indices.end()  ) {
+				exclusion_indices.push_back(i);
+			}
+			if ( allsamples[i] == infosampleids[j]  ) {
+				samplegenders(i,0) = infosamplegender[j];
+                samplebatches(i,0) = infosamplebatch[j];
+			}
+		}
+	}
+	
+	//so exclusion list has been populated
+	return exclusion_indices;
+	
+}
+
+
+/********************* Combine two lists of indices to be excluded into one *************/
+
+vector<int> combineExclusionLists(vector<int> list1, vector<int> list2)
+{	
+	vector<int> outputlist = list1;
+	for (int i = 0; i < list2.size() ; i++)
+	{
+		if (outputlist.size() == 0  ||  find (outputlist.begin(), outputlist.end(), list2[i])  ==   outputlist.end() )
+		{
+			outputlist.push_back(list2[i]);
+		}
+	}
+	return outputlist;
+}
+
+
+/************************ returns a list of individualstobecalled i.e. individuals who have not been excluded  ***********************************/
+
+vector<string> individualsForCalling(vector<string> individuals, vector<int> excluded)
+{
+	vector<string> newindlist;
+	for (int i = 0; i < individuals.size() ; i++)
+	{
+		if (excluded.size() == 0 ||  find (excluded.begin(), excluded.end(), i)  ==   excluded.end()  )
+		{
+			newindlist.push_back(individuals[i]);
+		}
+	}
+	return newindlist;
+	
+}
+
+
+
+
+/********************* Sorting function for random numbers*******/
+
+bool lessthanfunction (int i, int j) { return (i<j); }
+
+bool longlessthanfunction (long int i, long int j) { return (i<j); }
+
+
+
+/*************Find samples with mean intensities more than numsds away from the mean ******/
+
+vector<int> findIntensityOutliers(string fname, int snpcount, vector<string> individuals,double numsds, vector<int> exclude_list) {
+	
+	
+	
+	MatrixXd sumdistances = MatrixXd::Zero(individuals.size(),1);
+	MatrixXd countdistances = MatrixXd::Zero(individuals.size(),1);
+
+	//figure out the number of individuals to get on a run through the file assuming 500MB of RAM
+	int indsperrun = floor( 1000000 / (2*snpcount)  );
+	int indsdone = 0;
+	
+	cout << "removing outliers based on mean intensity" << endl;
+	
+
+	
+	while (indsdone < individuals.size())
+	{
+	
+		ifstream intensityfile (fname.c_str());
+		if (intensityfile.is_open()  && intensityfile.good())
+		{
+			string header;
+			getline(intensityfile,header);
+		}
+		
+
+		
+		int numsamples = min( (int) indsperrun, (int) individuals.size() - indsdone );  //minimum of indsperrun and (individuals.size() - indsdone)
+		int offset = indsdone;
+		
+		vector < vector< string > > snpinfo;
+		MatrixXd block = readVblockFromIntFile(intensityfile,snpcount,numsamples,snpinfo, offset   );	//get intensity data from the file
+		
+		//cout << block << endl;
+		//cout << block.rows() << " " << block.cols() << endl;
+		
+		intensityfile.close();
+
+		
+		for (int snpt = 0; snpt < snpcount; snpt++)
+		{
+			for(int ind = 0; ind < numsamples; ind++)
+			{
+				double distance = sqrt( block(snpt, 2*ind)*block(snpt, 2*ind) + block(snpt, 2*ind+1)*block(snpt, 2*ind+1)  );
+				if (!isnan(distance) && !isinf(distance)  )
+				{
+					sumdistances(ind+offset,0) = sumdistances(ind+offset,0) + distance;
+					countdistances(ind+offset,0)++;
+				}
+			}
+		}
+		
+
+		
+		indsdone = indsdone + numsamples;
+		
+		
+	}
+	
+	
+
+	MatrixXd meandistances = sumdistances.array() / countdistances.array();
+	//Next need to account for NANs - before calculating Stdev
+	double sumofmeandists = 0;
+	double sumofmeandistssq = 0;
+	int count = 0;
+	
+	for (int i = 0; i < meandistances.rows(); i++)
+	{
+		double mdist = meandistances(i,0);
+		if (!isnan(mdist) && !isinf(mdist)  && (exclude_list.size() == 0 || find (exclude_list.begin(), exclude_list.end(), i)  ==   exclude_list.end() )  )
+		{
+			count++;
+			sumofmeandists = sumofmeandists + mdist;
+			sumofmeandistssq = sumofmeandistssq + mdist*mdist;
+		}
+	}
+	
+	double meandist = sumofmeandists/count;
+	double sddist = sqrt( sumofmeandistssq/count - meandist*meandist );
+	
+	double lower_bound = meandist  - numsds*sddist;
+	double upper_bound = meandist + numsds*sddist;
+	
+	vector<int> outlying_samples;
+	
+	
+	cout << meandist << " " << sddist << endl;
+	cout << "end mean distances" << endl;
+	
+	for (int i = 0; i < meandistances.rows(); i++)
+	{
+		double mdist = meandistances(i,0);
+		if (  ( isnan(mdist) || isinf(mdist) || mdist < lower_bound || mdist > upper_bound )  && (exclude_list.size() == 0 || find (exclude_list.begin(), exclude_list.end(), i)  ==   exclude_list.end() )   )
+		{
+			outlying_samples.push_back(i);
+			cout << "excluding mean intensity outlier " << i << " with sampleid " << individuals[i]  << " and mdist " << mdist << endl;
+		}
+	}
+	
+	
+	
+	return outlying_samples;
+}
+
+
+
+
+
+
+/***********Create a random sample from the intensity data**********/
+
+MatrixXd fetchRandomIntensities(string fname, int sample_block_size, int snpcount, vector<string> individuals, vector<int> initial_outliers) {
+        cout << "fetching random intensities" << endl;
+	
+	vector < vector< string > > snpinfo;
+	
+	ifstream intensityfile (fname.c_str());
+	if (intensityfile.is_open()  && intensityfile.good())
+	{
+		string header;
+		getline(intensityfile,header);
+	}
+        //cout << "num SNPs from block is " << snpcount  << endl;
+        //cout << "num inds from block is " << individuals.size()  << endl;
+        //cout << "product is " << snpcount*individuals.size()  << endl;
+        //cout << "min is " << min<long int>(sample_block_size,snpcount*individuals.size())  << endl;
+	/**
+	 * STEP 1: Create one block mixture model - by sampling the file once
+	 */
+	//collect data
+	MatrixXd sample(min<long int>(sample_block_size,snpcount*individuals.size()),2);
+	vector<long int> sampling_indices;
+
+	//cout << "getting ready to smaple" << endl;
+	//printf("%s\n", sample_block_size >= snpcount*individuals.size()?"true":"false");
+	//first check if there's actually enough points to take a big sample
+	if (sample_block_size >= snpcount*individuals.size())
+	{
+		//cout << " not enough points for sample " << endl;
+		for (long int i = 0; i < snpcount*individuals.size(); i++) {
+			//TODO: Need to consider the excluded sample indices
+			bool excluded = false;
+			for (int outlind = 0; outlind < initial_outliers.size(); outlind++ )
+			{
+				if ((i - initial_outliers[outlind])%individuals.size() == 0 ) 
+				{
+					excluded = true;
+				}
+			}
+			
+			if (!excluded)
+			{
+				sampling_indices.push_back(i);
+			}
+			
+		}
+		
+		
+	}
+	else {
+		//cout << " enough points for random sample " << endl;
+		//select a bunch of random numbers
+		srand(time(0));
+		while (sampling_indices.size() < sample_block_size) {
+			float randnum = (float)rand()/(float)RAND_MAX;
+			//long int val = floor(randnum*snpcount*individuals.size());
+			long int val = lrint( randnum*snpcount*individuals.size() );
+			//cout << "val is " << val << endl;
+			
+			if (val == snpcount*individuals.size()) {
+				//slight fudge if the random number chosen is exactly 1.0
+				val = val -1 ;
+			}
+			
+			//TODO: Need to consider the excluded sample indices
+			bool excluded = false;
+			for (int outlind = 0; outlind < initial_outliers.size(); outlind++ )
+			{
+				if ((val - initial_outliers[outlind])%individuals.size() == 0 ) 
+				{
+					excluded = true;
+					//cout << "exclusion " << val << "belongs to " << initial_outliers[outlind] << endl;
+				}
+			}
+			
+			//our random value is not in the list - add it
+			if (!excluded && find (sampling_indices.begin(), sampling_indices.end(), val)  ==   sampling_indices.end() ) {
+				//cout << "pushing back val " <<  val << endl;
+				sampling_indices.push_back(val);
+			}
+		}
+		//ok so now we have the required number of random values - sort them
+		sort (sampling_indices.begin(), sampling_indices.end(), longlessthanfunction);
+	}
+
+	//cout << sampling_indices[0] << " " << sampling_indices[1] << endl;
+	//for (int i=0; i<sampling_indices.size();i++){cout << sampling_indices[i] << endl;}
+	//cout << sampling_indices.size() << endl;
+	//cout << "done printing" << endl;
+	//and now finally we fetch the sampled file
+	int counter = 0;
+	for (int snpt = 0; snpt < snpcount; snpt++)
+	{
+		//cout << "getting into loop" << endl;
+		MatrixXd block = readVblockFromIntFile(intensityfile,1,individuals.size(),snpinfo,0);	//read one line from the file
+		//now put the block into a sample
+	
+		//cout << snpinfo[snpt][0] << endl;
+	
+		for (int ind = 0; ind < individuals.size(); ind++)
+		{
+
+			long int curr_pt_index = snpt*individuals.size() + ind;
+			if (curr_pt_index == sampling_indices[0])
+			{
+				sampling_indices.erase(sampling_indices.begin());
+				sample(counter,0) = block(0,ind*2);
+				sample(counter,1) = block(0,ind*2+1);
+				//cout << "got point " << sample(counter,0) << " " << sample(counter,1) << endl;
+				counter = counter + 1;
+			}
+		}
+	}
+
+
+	intensityfile.close();
+	return sample;
+
+}
+
+
+
+

File opticall/opticall.cpp

-// simple_example_2.cpp
-
-
-#include "inclusions.h"
-#include "filtering.h"
-
-#include "kmpp/KMeans.h"
-
-
-
-using namespace std;
-using namespace boost;
-using namespace Eigen;
-
-
-double G_CALL_THRESHOLD = 0.7;
-
-
-/*********Doesn't C++ have these?******/
-
-int roundtoint(double r) {
-    return (r > 0.0) ? floor(r + 0.5) : ceil(r - 0.5);
-}
-
-
-
-
-
-
-
-/*********Functions the Eigen library should have*****/
-
-
-
-vector<int>	argsort(MatrixXd matrix,int axis, int index, bool descending) {
-	//The argsort function that takes in a matrix, and an axis - 0 (row), 1(column), with an index
-	//then returns a list of indices based on the sorted values
-
-	//we just need to consider the relevant block of the matrix
-
-
-	MatrixXd sortingsegment;
-	if (axis == 0) {
-		sortingsegment = matrix.row(index).transpose();
-	}
-	else {
-		sortingsegment = matrix.col(index);
-	}
-
-	//cout << "sorting segment " << endl;
-	//cout << sortingsegment << endl;
-
-	vector<int>argsorts;
-	for (int i = 0; i < sortingsegment.rows(); i++)
-	{
-		argsorts.push_back(i);
-	}
-
-	//cout << argsorts.size() << endl;
-
-	//got the bit to be sorted - now I need to sort it
-	//there's waaay better algorithms than bubblesort - but I'll sort this later...no pun intended
-	bool swapped = false;
-	do
-	{
-		swapped = false;
-		for (int j = 0; j < sortingsegment.rows()-1; j++)
-		{
-			if (descending && sortingsegment(argsorts[j],0) < sortingsegment(argsorts[j+1],0)     ) {
-				int tmpargind = argsorts[j+1];
-				argsorts[j+1]=argsorts[j];
-				argsorts[j]=tmpargind;
-				swapped = true;
-			}
-			else if (!descending && sortingsegment(argsorts[j],0) > sortingsegment(argsorts[j+1],0) ) {
-				int tmpargind = argsorts[j+1];
-				argsorts[j+1]=argsorts[j];
-				argsorts[j]=tmpargind;
-				swapped = true;
-			}
-		}
-	} while (swapped);
-
-	return argsorts;
-
-}
-
-
-MatrixXd sortedmatrix(MatrixXd matrix, int axis, vector<int>argsorts) {
-
-	MatrixXd sortedoutput = MatrixXd::Zero(matrix.rows(), matrix.cols());
-	int loopnumber = matrix.cols();
-	if (axis == 1){
-		loopnumber = matrix.rows();
-	}
-	for (int i = 0; i < loopnumber; i++)
-	{
-		if (axis == 0) {
-			sortedoutput.col(i) = matrix.col(argsorts[i]);
-		}
-		else {
-			sortedoutput.row(i) =	matrix.row(argsorts[i]);
-		}
-	}
-
-	return sortedoutput;
-}
-
-
-/********************* Approx quantile fn. for low p-vals**************/
-
-
-double approx_quantile(double p)
-{
-    
-    //Approximation to normal zscore 
-    //from http://www.johndcook.com/normal_cdf_inverse.html
-    //originally from Abramowitz and Stegun
-    //Handbook of Mathematical Functions: with Formulas, Graphs, and Mathematical Tables
-    
-    double t = sqrt(log(1/(p*p)));
-    
-    double c0 = 2.515517;
-    double c1 = 0.802853;
-    double c2 = 0.010328;
-    double d1 = 1.432788;
-    double d2 = 0.189269;
-    double d3 = 0.001308;
-    
-    double numerator = c0 + c1*t + c2*t*t;
-    double denomenator = 1 + d1*t + d2*t*t + d3*t*t*t;
-    
-    return t-numerator/denomenator;
-    
-}
-
-
-
-double approx_right_tail_cdf(double z)
-{
-    
-    //Approximation to right normal tail integral 
-    //from "A UNIFORM APPROXIMATION TO THE RIGHT NORMAL TAIL INTEGRAL"
-    //W. BRYC
-    //formula (11) By Hart
-    //A Close Approximation Related to the Error Function
-    //http://www.ams.org/journals/mcom/1966-20-096/S0025-5718-1966-0203907-1/S0025-5718-1966-0203907-1.pdf
-    
-
-    
-    double p0 = sqrt(M_PI/2);
-    double a = (1+sqrt(1-2*M_PI*M_PI+6*M_PI)) / (2*M_PI);
-    double b = 2*M_PI*a*a;
-    
-    double multiplier = exp(-1*z*z/2)/(sqrt(2*M_PI)*z);
-    double numerator = sqrt(1+b*z*z )/(1+a*z*z );
-    
-    double denomenator =  p0*z + sqrt( p0*p0*z*z + exp(-1*z*z/2)*numerator   );
-    
-    
-
-    
-    return multiplier*(1-numerator/denomenator);
-    
-}
-
-
-
-
-
-/********************* Get Variances from the data *******************/
-
-double logcoeff(double coeff) {
-	return log(coeff);
-
-}
-
-double logcoeffpl1(double coeff) {
-	return log(coeff+1);
-}
-
-double expcoeff(double coeff) {
-	return exp(coeff);
-}
-
-double cleannan(double coeff) {
-	if (isinf(coeff) ||isnan(coeff)) {
-		return 0;
-	}
-	else {
-		return coeff;
-	}
-}
-
-MatrixXd sd_of_data(const MatrixXd &line_data) {
-	MatrixXd mu_data = line_data.colwise().sum()/line_data.rows();
-	MatrixXd mu_datasq = (line_data.array()*line_data.array()).colwise().sum()/line_data.rows();
-	MatrixXd var_data = mu_datasq.array()-(mu_data.array()*mu_data.array());
-	MatrixXd sd_data = var_data.array().sqrt();
-	return sd_data;
-}
-
-MatrixXd transform_data(const MatrixXd  &line_data) {
-	MatrixXd transf_data(line_data.rows(), line_data.cols());
-	transf_data.col(0) = (line_data.col(0) - line_data.col(1)).array()/(line_data.col(0) + line_data.col(1)).array();
-	transf_data.col(0) = transf_data.col(0).unaryExpr(ptr_fun(cleannan));
-
-	//cout << ((line_data.col(0) + line_data.col(1)).array() + 1) << endl;
-
-	transf_data.col(1) =  ((line_data.col(0) + line_data.col(1)).array() + 1);
-	transf_data.col(1) = transf_data.col(1).unaryExpr(ptr_fun(logcoeff));
-
-	//cout << transf_data.col(0) << endl;
-
-	return transf_data;
-}
-
-
-
-MatrixXd untransform_data(const MatrixXd &line_data) {
-	MatrixXd untransf_data(line_data.rows(), line_data.cols());
-	untransf_data.col(0) = 0.5*(line_data.col(0).array() + 1).array()*( line_data.col(1).unaryExpr(ptr_fun(expcoeff)).array() -1 ).array();
-	untransf_data.col(1) = 0.5*(1-line_data.col(0).array()).array()*( line_data.col(1).unaryExpr(ptr_fun(expcoeff)).array() -1 ).array();
-	return untransf_data;
-}
-
-
-/************************Nan cleaning functions*****************************/
-
-
-
-MatrixXd strip_outliers(const MatrixXd &inputdata, vector<int> &exc_indices, double &outlier_range, bool set_outlier_range) {
-	
-	vector<int> num_indices;
-	
-	MatrixXd distances = inputdata.rowwise().norm();
-	
-	double mean_dist = distances.mean();
-	double sd_dist = sd_of_data(distances)(0,0);
-	
-	
-	
-	if (set_outlier_range)
-	{
-		outlier_range = 4.0*sd_dist;
-	}
-	//else
-	//{
-	//	mean_dist = 0; //because in this case the outlier range includes the mean dist 
-	//}
-
-	
-	
-	for (int i = 0; i < distances.rows(); i++)
-	{
-		if ( distances(i,0) > mean_dist +  outlier_range )
-		{
-			exc_indices.push_back(i);
-			//cout << "excluding " << inputdata.row(i) << " with distance: " << inputdata.row(i).norm() << endl;
-		}
-		else
-		{
-			num_indices.push_back(i);
-		}
-	}
-	
-	MatrixXd num_data(num_indices.size(),inputdata.cols());
-	for (int i =0; i < num_indices.size(); i++)
-	{
-		num_data.row(i) = inputdata.row(num_indices[i]);
-	}
-	
-	return num_data;
-	
-	
-	
-}
-
-
-
-MatrixXd strip_nans(const MatrixXd &inputdata, vector<int> &nan_indices, bool nonorm, double &outlier_range, bool set_outlier_range, vector<int> &exclude_list, vector<int> &excluded_samples, MatrixXi &genders, MatrixXi &batches) {
-
-	vector<int> num_indices;
-	
-	vector<int> infofile_excluded_indices;
-
-	for (int i = 0; i < inputdata.rows(); i++)
-	{
-		if (excluded_samples.size() > 0  && find (excluded_samples.begin(), excluded_samples.end(), i)  !=   excluded_samples.end()) 
-		{
-			//these samples have been excluded from the info file
-			//don't add it to the numeric indices, nor the nan indices - so the output isn't reinflated to include it
-			infofile_excluded_indices.push_back(i);
-		}
-		else if ( isnan( inputdata.row(i).sum() )  )
-		{
-			//cout << "stripping NAN at " << i << endl;
-			nan_indices.push_back(i);
-		}
-		else if (exclude_list.size() > 0  && find (exclude_list.begin(), exclude_list.end(), i)  !=   exclude_list.end())
-		{
-			//sample is excluded
-			//cout << "mean intensity exclude " << i << endl;
-			nan_indices.push_back(i);
-		}
-		else
-		{
-			num_indices.push_back(i);
-		}
-	}
-
-	//cout << "num indices size " << num_indices.size() << endl;
-	
-    
-	if (num_indices.size() == 0) {
-        cout << "EMPTY DATA - EMPTY!!!!" << endl;
-        //really shouldn't return this, probably better to throw an exception
-		return inputdata;
-	}
-
-
-	MatrixXd num_data(num_indices.size(),inputdata.cols());
-	for (int i =0; i < num_indices.size(); i++)
-	{
-		num_data.row(i) = inputdata.row(num_indices[i]);
-	}
-
-	//cout << num_data << endl;
-
-	//cout << "numeric data" << endl;
-	
-
-	
-	//I could now look for outliers - and then use num_indices[i] as the exclusion
-	
-	vector<int> outlier_indices;
-    MatrixXd clean_data = num_data;
-	
-	if (!nonorm)
-    {
-        clean_data = strip_outliers(num_data, outlier_indices, outlier_range, set_outlier_range);
-	}
-
-	for (int i =0; i < outlier_indices.size(); i++)
-	{
-		nan_indices.push_back(num_indices[outlier_indices[i]]); 
-		//cout << "stripping SNPwise outlier at " << inputdata.row(num_indices[outlier_indices[i]]) << endl;
-	}
-	
-	
-
-	sort (nan_indices.begin(), nan_indices.end(), lessthanfunction);
-
-	
-	cout << "EXCLUSION CHECK" << endl;
-	for (int i =0; i < nan_indices.size(); i++)
-	{
-		cout << "i is: " << nan_indices[i] << " and data point is: " << inputdata.row(nan_indices[i]) << endl;
-	}
-
-
-
-	//Handling creating the correct gender data taking into account removals etc.
-	vector<int> combinedexcluded_indices(nan_indices);
-	combinedexcluded_indices.insert(combinedexcluded_indices.end(), infofile_excluded_indices.begin(), infofile_excluded_indices.end());
-	sort (combinedexcluded_indices.begin(), combinedexcluded_indices.end(), lessthanfunction);
-	
-    //cout << combinedexcluded_indices.size() << " combined excluded" << endl;
-	
-	MatrixXi new_gender_data(clean_data.rows(),1);
-	MatrixXi new_batch_data(clean_data.rows(),1);
-    //cout << genders.rows() << " " << new_gender_data.rows();
-	int counter = 0;
-	for (int i = 0; i < inputdata.rows() ; i++)
-	{
-		if (!(combinedexcluded_indices.size() > 0  && find (combinedexcluded_indices.begin(), combinedexcluded_indices.end(), i)  !=   combinedexcluded_indices.end())) 
-		{
-			new_gender_data(counter,0) = genders(i,0);
-            new_batch_data(counter,0) = batches(i,0);
-			counter++;
-		}
-
-	}
-	genders = new_gender_data;
-    batches = new_batch_data;
-	
-	
-	
-	return clean_data;
-
-
-
-}
-
-
-
-
-
-
-
-
-MatrixXd probs_with_nans(const MatrixXd &post_probs,const vector<int> &nan_indices, vector<int> &excluded_samples, int original_size ) {
-
-
-	//cout << "reinflating" << endl;
-	//cout << "original size " << original_size << endl;
-	//cout << post_probs.rows() << endl;
-
-	//cout << "nan_indices size: " << nan_indices.size() << endl;
-	
-	
-	
-	MatrixXd post_probs_w_nans(original_size,post_probs.cols());
-
-	vector<int> reinflatingindices = combineExclusionLists(nan_indices, excluded_samples);
-	
-	//cout << "reinflating indices size" << reinflatingindices.size() << endl;
-	
-	sort (reinflatingindices.begin(), reinflatingindices.end(), lessthanfunction);
-	
-
-	int num_probs_counter = 0;
-	int index_counter = 0;
-	for (int i = 0; i < original_size; i++)
-	{
-		if (index_counter < reinflatingindices.size() &&  i == reinflatingindices[index_counter]) {
-			
-			post_probs_w_nans.row(i) = MatrixXd::Zero(1, post_probs.cols());
-			
-			post_probs_w_nans(i,post_probs.cols()-1)=1;
-			//move to the next nan index
-			index_counter++;
-		}
-		else
-		{
-			post_probs_w_nans.row(i) = post_probs.row(num_probs_counter);
-			num_probs_counter++;
-
-		}
-	}
-
-	//cout << "successfully reinflated" << endl;
-	
-	//finally remove the excluded indices and return
-	MatrixXd outputmatrix(post_probs_w_nans.rows()-excluded_samples.size(), post_probs_w_nans.cols());
-	sort (excluded_samples.begin(), excluded_samples.end(), lessthanfunction);
-	int excluded_sample_counter = 0;
-	int outputcounter = 0;
-	for (int i = 0; i < post_probs_w_nans.rows(); i++)
-	{
-		if ( excluded_sample_counter < excluded_samples.size() &&  i == excluded_samples[excluded_sample_counter]  )
-		{
-			//don't add
-			excluded_sample_counter++;
-		}
-		else
-		{
-			//add
-			outputmatrix.row(outputcounter) = post_probs_w_nans.row(i);
-			outputcounter++;
-		}
-		
-	}
-	
-	
-	return outputmatrix;
-	//return post_probs_w_nans;
-
-}
-
-
-
-
-//run in the case that all the values are NaN
-MatrixXd probs_with_nans(int numclasses, vector<int> excluded_samples, int original_size ) {
-
-	//cout << "original size - excluded " << original_size - excluded_samples.size() << endl;
-	//cout << post_probs.rows() << endl;
-
-	MatrixXd post_probs_w_nans(original_size - excluded_samples.size(),numclasses);
-
-	//cout << post_probs_w_nans.rows() << " " << post_probs_w_nans.cols() << endl;
-
-	for (int i = 0; i < original_size - excluded_samples.size(); i++)
-	{
-		post_probs_w_nans.row(i) = MatrixXd::Zero(1, numclasses);
-		post_probs_w_nans(i,numclasses-1)=1;
-	}
-
-    //cout << "probs with NANs calculated" << endl;
-    
-
-	return post_probs_w_nans;
-}
-
-
-
-
-
-/********************* Return the pdf of a mv student's t distribution at x *****************/
-
-//auxiliary functions to reduce matrix inverse calcs speed up some code
-
-
-MatrixXd inverted_matrix(const MatrixXd &covar) {
-	//speedup - for 2x2 matrices
-	MatrixXd covar_inv_mat(2,2);
-	covar_inv_mat(0,0)=covar(1,1);
-	covar_inv_mat(1,1)=covar(0,0);
-	covar_inv_mat(0,1)=-1*covar(0,1);
-	covar_inv_mat(1,0)=-1*covar(1,0);
-	covar_inv_mat = 1/( covar(0,0)*covar(1,1) - covar(1,0)*covar(0,1)  )*covar_inv_mat;
-	
-	return covar_inv_mat;
-}
-
-double det_of_mat(const MatrixXd &covar) {
-	double covar_determ = covar(0,0)*covar(1,1)-covar(0,1)*covar(1,0);
-	
-	//if (covar.determinant() <= 0 )
-	if (covar_determ <= 0 || isnan(covar_determ) || isinf(covar_determ) )
-	{
-		return 0;
-	}
-	return covar_determ;
-}
-
-double mvt_pdf_multiplier(const MatrixXd &covar, const double v, int dim) {
-	
-	double covar_determ = det_of_mat(covar); 
-	double num = tgamma(dim*0.5*v)*sqrt(1.0/covar_determ);
-	double den = sqrt(  pow(v*M_PI,dim) )*tgamma(v/2.0);
-	return num/den;
-}
-
-
-
-class MvtStaticParts {
-
-	
-	public:
-		MatrixXd inverse_covar;
-		double numerator;
-		MvtStaticParts(const MatrixXd &covar, const double v, int dim)
-		{
-			inverse_covar = inverted_matrix(covar);
-			numerator = mvt_pdf_multiplier(covar, v, dim);
-		}
-	
-};
-
-
-
-
-
-//------------
-
-double mahala_sq_dist(const MatrixXd &x_i, const MatrixXd &mu, const MvtStaticParts &staticcovar) {
-
-	//speedup - for 2x2 matrices
-	//MatrixXd covar_inv_mat(2,2);
-	//covar_inv_mat(0,0)=covar(1,1);
-	//covar_inv_mat(1,1)=covar(0,0);
-	//covar_inv_mat(0,1)=-1*covar(0,1);
-	//covar_inv_mat(1,0)=-1*covar(1,0);
-
-	//covar_inv_mat = 1/( covar(0,0)*covar(1,1) - covar(1,0)*covar(0,1)  )*covar_inv_mat;
-
-	//should be even faster!
-	MatrixXd covar_inv_mat = staticcovar.inverse_covar;
-	
-	//MatrixXd test = ((x_i - mu)*covar.inverse()*(x_i - mu).transpose());
-	MatrixXd test = ((x_i - mu)*covar_inv_mat*(x_i - mu).transpose());
-
-	return test(0,0);
-}
-
-
-double mvt_pdf(const MatrixXd &x_i, const MatrixXd &mu, const MvtStaticParts &staticcovar, const double &v) {
-    
-	//double covar_determ = covar(0,0)*covar(1,1)-covar(0,1)*covar(1,0);
-
-	//if (covar.determinant() <= 0 )
-	//if (covar_determ <= 0 || isnan(covar_determ) || isinf(covar_determ) )
-	//{
-		//return 0;
-	//}
-	
-	//dim = max(np.shape(x_i))
-    //num = special.gamma(dim*0.5*v)*np.sqrt(1.0/np.linalg.det(covar))
-    //den = np.sqrt(np.power(v*np.pi,dim))*special.gamma(v/2.0)*np.power(1+mahala_sq_dist(x_i,mu,covar),0.5*(v+dim))
-    //pdf = num/den
-
-	int dim = x_i.cols();
-
-	//double num =  tgamma(dim*0.5*v)*sqrt(1.0/covar_determ);
-	//double den = sqrt(  pow(v*M_PI,dim) )*tgamma(v/2.0)*pow(1.0+mahala_sq_dist(x_i,mu,covar),0.5*(v+dim))  ;
-
-	double num = staticcovar.numerator;
-	double den = pow(1.0+mahala_sq_dist(x_i,mu,staticcovar),0.5*(v+dim))  ;
-
-	
-	//cout << num << "  " << den << "  "  << endl;
-	
-	double output = num/den;
-
-	if (isnan(output) || isinf(output) )
-	{
-		return 0;
-	}
-
-	return output;
-
-
-}
-
-
-/******************** Functions for making calls from posterior probabilities, and checking calls are equal************/
-
-bool calls_equal(const vector<string> &call1, const vector<string> &call2) {
-	
-	if (call1.size() == 0 && call2.size() != 0) {
-		return false;
-	}
-	
-	if (call1.size() != 0 && call2.size() == 0) {
-		return false;
-	}
-	
-	bool equality = true;
-	for (int i=0; i < call1.size(); i++)
-	{
-		equality = equality && (call1[i] == call2[i]);
-	}
-	return equality;
-}
-
-vector<string> calls_from_posterior_probs(MatrixXd post_probs) {
-	vector <string> outputcalls;
-	for (int i = 0; i < post_probs.rows(); i++) {
-		
-		
-		double max_prob = post_probs.row(i).maxCoeff();
-		
-		if (max_prob < G_CALL_THRESHOLD ) {
-			outputcalls.push_back("4");
-		}
-		else if (  isinf(post_probs.row(i).sum()) ||isnan(post_probs.row(i).sum())  )    
-		{
-			outputcalls.push_back("4");
-		}
-		else
-		{
-			if (post_probs(i,0) == max_prob) { outputcalls.push_back("1"); }
-			if (post_probs(i,1) == max_prob) { outputcalls.push_back("3"); }
-			if (post_probs(i,2) == max_prob) { outputcalls.push_back("2"); }
-			if (post_probs(i,3) == max_prob) { outputcalls.push_back("4"); }
-		}
-	}
-	return outputcalls;
-}
-
-
-
-
-/************************ Functions to remove samples before inferring mixture models, and reinsert them right after *******************/
-
-MatrixXd strip_from_line_inference(const MatrixXd &inputdata, vector<int> &removed_indices, double outlier_range, MatrixXi &genders) {
-
-
-    
-    vector<int> num_indices;
-	MatrixXd distances = inputdata.rowwise().norm();
-	
-	double mean_dist = distances.mean();
-	double sd_dist = sd_of_data(distances)(0,0);
-	
-	
-	for (int i = 0; i < distances.rows(); i++)
-	{
-		if ( distances(i,0) > mean_dist +  outlier_range )
-		{
-			removed_indices.push_back(i); 
-			cout << "excluding point " << i << " with values:  " <<  inputdata.row(i) << " and distance: " << inputdata.row(i).norm() << " from inference." << endl;
-		}
-		else
-		{
-			num_indices.push_back(i);
-		}
-	}
-	
-	MatrixXd num_data(num_indices.size(),inputdata.cols());
-    MatrixXi new_gender_data(num_indices.size(),genders.cols());
-	for (int i =0; i < num_indices.size(); i++)
-	{
-		num_data.row(i) = inputdata.row(num_indices[i]);
-        new_gender_data.row(i) = genders.row(num_indices[i]);
-	}
-    
-    genders = new_gender_data;
-	
-	return num_data;
-    
-    
-    
-
-}
-
-
-
-
-
-
-/******************** Run a mixture model of Student's t distributions and return the models for the block data*****/
-
-MatrixXd Uijs(const MatrixXd &data, const MatrixXd &vs, const MatrixXd &mus, const vector<MatrixXd> &covars  ) {
-	int dim = data.cols();
-	MatrixXd mat_uijs(data.rows(), vs.rows());
-
-	//speed improvement
-	vector<MvtStaticParts> staticcovars;
-	for (int c = 0; c < covars.size(); c++)
-	{
-		staticcovars.push_back(MvtStaticParts(covars[c],vs(c,0),dim));
-	}
-	
-
-	for (int i = 0; i < data.rows(); i++) {
-		for (int j = 0; j < vs.rows(); j++) {
-			mat_uijs(i,j) = (vs(j,0)+dim)*1.0/(vs(j,0)+mahala_sq_dist(data.block(i,0,1,dim), mus.block(j,0,1,dim) , staticcovars[j]));
-		}
-	}
-
-    return mat_uijs;
-
-}
-
-
-MatrixXd Tijs(const MatrixXd &data , const MatrixXd &js, const MatrixXd &vs, const MatrixXd &mus, const vector<MatrixXd> &covars ) {
-	int dim = data.cols();
-	MatrixXd mat_tijs(data.rows(), js.rows());
-
-	//speed improvement
-	vector<MvtStaticParts> staticcovars;
-	for (int c = 0; c < covars.size(); c++)
-	{
-		staticcovars.push_back(MvtStaticParts(covars[c],vs(c,0),dim));
-	}
-	
-	
-
-	for (int i = 0; i < data.rows(); i++) {
-		for (int j = 0; j < js.rows(); j++) {
-			mat_tijs(i,j) = mvt_pdf(data.block(i,0,1,dim), mus.block(j,0,1,dim), staticcovars[j], vs(j,0))*js(j,0);
-		}
-		MatrixXd unnorm_block = mat_tijs.block(i,0,1,js.rows());
-		mat_tijs.block(i,0,1,js.rows()) = unnorm_block*(1.0/unnorm_block.sum());
-	}
-
-	//cout << mat_tijs.rowwise().sum() << endl;
-	return mat_tijs;
-}
-
-
-
-
-void mus_covs_for_Y_MT(MatrixXd &block_data, MatrixXd &starting_mus, vector<MatrixXd> &starting_covars, MatrixXd &starting_js, int dim) {
-    
-
-    //get the lowest x% of the data
-        //next figure out the 
-            //y% with the lowest y values
-            //y% with the lowest x values 
-    //use these to set up the 3 genotype classes
-    
-    MatrixXd distances = block_data.rowwise().norm();
-    
-    
-    //cout << distances << endl;
-    
-    vector<int> sorteddistinds = argsort(distances,1, 0, false);
-    
-    /*
-    for (int i = 0; i < sorteddistinds.size() ; i++)
-    {
-        cout << sorteddistinds[i] << endl;
-    }
-    cout << sorteddistinds.size() << endl;
-    */
-    
-    MatrixXd sortedbydist  = sortedmatrix(block_data, 1, sorteddistinds);
-    
-    //cout << sortedbydist << endl;
-    
-    //split this matrix into bottom x% of rows - and rest
-    int bottom_set = (int) (5*sortedbydist.rows()/100);
-    
-    MatrixXd bottom_data = sortedbydist.block(0,0,bottom_set,dim);
-    MatrixXd rest_of_data = sortedbydist.block(bottom_set,0,sortedbydist.rows()-bottom_set,dim);
-    
-    
-    //next - need to sort rest of data based on x and y values
-    vector<int> sorted_low_y_inds = argsort(rest_of_data,1, 1, false);
-    vector<int> sorted_low_x_inds = argsort(rest_of_data,1, 0, false);    
-    
-    int bottom_set_x = (int) (25*rest_of_data.rows()/100);
-    int bottom_set_y = (int) (25*rest_of_data.rows()/100);
-    
-    MatrixXd x_homs = sortedmatrix(rest_of_data, 1, sorted_low_y_inds).block(0,0,bottom_set_x,dim);
-    MatrixXd y_homs = sortedmatrix(rest_of_data, 1, sorted_low_x_inds).block(0,0,bottom_set_y,dim);    
-    
-    
-    //then get means and covars - and done!
-    
-    MatrixXd mu_hom_x = x_homs.colwise().sum()/x_homs.rows();
-    MatrixXd mu_hom_y = y_homs.colwise().sum()/y_homs.rows();
-    MatrixXd mu_zeros = bottom_data.colwise().sum()/bottom_data.rows();
-    
-    cout << "Y chrom mus are" << endl;
-    cout << mu_hom_x << " x_homs " << endl;
-    cout << mu_hom_y << " y_homs " << endl;    
-    cout << mu_zeros << " zeros " << endl;      
-    
-    
-    //then get covariance matrices
-    MatrixXd cov_hom_x =   (x_homs - mu_hom_x.replicate(x_homs.rows(), 1)).transpose() * (x_homs - mu_hom_x.replicate(x_homs.rows(), 1)) / x_homs.rows();
-    MatrixXd cov_hom_y =   (y_homs - mu_hom_y.replicate(y_homs.rows(), 1)).transpose() * (y_homs - mu_hom_y.replicate(y_homs.rows(), 1)) / y_homs.rows();
-    MatrixXd cov_zeros =   (bottom_data - mu_zeros.replicate(bottom_data.rows(), 1)).transpose() * (bottom_data - mu_zeros.replicate(bottom_data.rows(), 1)) / bottom_data.rows();
-    
-    cout << "Y chrom covars are" << endl;
-    cout << cov_hom_x << " x_homs " << endl;
-    cout << cov_hom_y << " y_homs " << endl;    
-    cout << cov_zeros << " zeros " << endl;     
-    
-    
-    starting_mus.row(0) = mu_hom_x;
-    starting_mus.row(1) = mu_hom_y;
-    starting_mus.row(2) << 0.0,0.0 ;
-    
-    starting_covars.push_back(cov_hom_x);
-    starting_covars.push_back(cov_hom_y);
-    starting_covars.push_back(cov_zeros);
-    
-    starting_js << 0.25, 0.25, 0.25, 0.25;
-    
-}
-
-
-
-void mus_covs_for_autosomes_X(MatrixXd &int_data, MatrixXd &starting_mus, vector<MatrixXd> &starting_covars, MatrixXd &starting_js, int dim, int numclasses, int numcustomclasses) {
-
-    int data_count = int_data.rows()*int_data.cols()/2;
-
-	//declare the variables for KMeans++
-	int attempts=150;
-	double * points  = new double[data_count*dim];
-	double * centres = new double[(numclasses-numcustomclasses)*dim];
-	int * assignments = new int[data_count];
-	double cost;
-    
-	for (int i = 0; i < int_data.rows(); i++)
-	{
-		for (int j = 0; j < int_data.cols(); j++)
-		{
-			//cout << "j is " << j << ' ' << dim << endl;
-			points[i*dim+j] = int_data(i,j);
-		}
-	}
-	cost = RunKMeansPlusPlus(data_count, numclasses-numcustomclasses, dim,points, attempts, centres, assignments);
-
-	//setup the mus, covars, etc
-	for (int i = 0; i < numclasses-numcustomclasses; i++ ) {
-        
-		MatrixXd cov;
-		cov.setIdentity(dim, dim);
-		starting_covars.push_back(cov*cost*2.0/data_count);
-        
-		for (int j = 0; j < dim; j++ ) {
-			starting_mus(i,j) = centres[dim*i+j] ;
-		}
-		starting_js(i,0)=1.0/numclasses;
-	}
-    
-	//TODO: delete the points used in the clustering - Done
-	delete[] points;
-	delete[] centres;
-	delete[] assignments;
-    
-    
-}
-
-
-
-
-void genotype_mm_block_caller(const MatrixXd &block_data, const MatrixXd &vs, const vector<MatrixXd> &custommus, const vector<MatrixXd> &customcovars,
-		MatrixXd &blockmus, vector<MatrixXd> &blockcovars, MatrixXd &blockjs, MatrixXd &blockvs, string chrom_type,  int numsteps = 30  ) {
-
-	int dim = 2;
-	int numclasses = 4;
-	int numcustomclasses = 1;
-
-	//This whole block data stuff is unneccesary - since our sampled block is the right size
-	/*
-	int data_count = block_data.rows()*block_data.cols()/2;
-	//fix the data from a subblock
-	MatrixXd int_data_T(dim,data_count);
-	MatrixXd int_data;
-	MatrixXd block_data_T(block_data.cols(),block_data.rows());
-	block_data_T = block_data.transpose();
-	int_data_T = Map<MatrixXd>(block_data_T.data(),dim,data_count);
-	int_data = int_data_T.transpose();
-	 */
-	int data_count = block_data.rows()*block_data.cols()/2;
-	MatrixXd int_data;
-	int_data = block_data;
-
-	//cout << int_data << endl;
-	//cout << "int data" << endl;
-	
-	MatrixXd mus(numclasses,dim); 
-	vector<MatrixXd> covars;
-    MatrixXd js(numclasses,1); 
-    
-    
-    
-    if (chrom_type == "A" || chrom_type == "X")
-    {
-        mus_covs_for_autosomes_X(int_data, mus, covars, js,  dim,  numclasses,  numcustomclasses);
-	}
-    else
-    {
-        mus_covs_for_Y_MT(int_data, mus, covars, js, dim);
-    }
-    
-    /**
-	//declare the variables for KMeans++
-	int attempts=150;
-	double * points  = new double[data_count*dim];
-	double * centres = new double[(numclasses-numcustomclasses)*dim];
-	int * assignments = new int[data_count];
-	double cost;
-
-	for (int i = 0; i < int_data.rows(); i++)
-	{
-		for (int j = 0; j < int_data.cols(); j++)
-		{
-			//cout << "j is " << j << ' ' << dim << endl;
-			points[i*dim+j] = int_data(i,j);
-		}
-	}
-	cost = RunKMeansPlusPlus(data_count, numclasses-numcustomclasses, dim,points, attempts, centres, assignments);
-    **/
-    
-    
-    
-    
-    //COMMENTED BEFORE 28 MAY 2012
-	//calculate the variance of the assigned pts
-	//use as starting variance
-	/*
-	cout << cost*2/data_count << endl;
-	for (int i = 0; i < numclasses-numcustomclasses; i++ ) {
-		 cout << centres[dim*i] << ' ' << centres[dim*i+1] << endl;
-	}
-	*/
-
-
-
-
-    /*
-	//setup the mus, covars, etc
-	for (int i = 0; i < numclasses-numcustomclasses; i++ ) {
-
-		MatrixXd cov;
-		cov.setIdentity(dim, dim);
-		covars.push_back(cov*cost*2.0/data_count);
-
-		for (int j = 0; j < dim; j++ ) {
-			mus(i,j) = centres[dim*i+j] ;
-		}
-		js(i,0)=1.0/numclasses;
-	}
-    */
-    
-    
-    
-    
-    
-    
-	//setup the custom mus and covars
-	for (int i = 0; i < numcustomclasses; i++) {
-		covars.push_back(customcovars[i]);
-		mus.block(numclasses-numcustomclasses+i,0,1,dim) = custommus[i];
-		js(numclasses-numcustomclasses+i,0)=1.0/numclasses;
-	}
-
-	//TODO: delete the points used in the clustering - Done
-    /*
-	delete[] points;
-	delete[] centres;
-	delete[] assignments;
-    */
-     
-	//cout << mus << endl;
-	//cout << js << endl;
-	
-	//HERE I should fix the means to what I think is best for X,Y chrom
-
-
-	//then just run the algorithm & updates
-	vector<string> current_calls;
-	vector<string> old_calls;
-	int globalsteps = 0;
-	bool calls_matched = false;
-	int stepcounter = 0;
-	do {
-		globalsteps++;
-		if (calls_matched){
-			stepcounter++;
-		}
-		
-	
-	//for (int step = 0; step < numsteps; step++) {
-		//cout << "block step " << step << endl;
-		MatrixXd mat_tijs = Tijs(int_data , js, vs, mus, covars );
-		MatrixXd mat_uijs = Uijs(int_data , vs, mus, covars);
-
-		//cout << "Tijs " << endl << mat_tijs << endl;
-		//cout << "Uijs" << endl << mat_uijs << endl;
-
-
-		js = (mat_tijs.colwise().sum().transpose())/(1.0*mat_tijs.rows());
-
-		MatrixXd mat_tij_uij_product = mat_tijs.array() * mat_uijs.array();
-		MatrixXd mus_sums = mat_tij_uij_product.transpose()*int_data;
-		MatrixXd mus_normalizer(numclasses,dim);
-		MatrixXd norm_factors = mat_tij_uij_product.colwise().sum().transpose().array().inverse();
-		for (int i = 0; i < dim; i++) {
-			mus_normalizer.block(0,i,numclasses,1) = norm_factors;
-		}
-		mus = mus_sums.array() * mus_normalizer.array();
-        
-        //set for the y, mt chroms
-        if (!(chrom_type == "A" || chrom_type == "X"))
-        {
-            mus.row(2) << 0.0,0.0 ;
-        }
-
-
-
-		for (int i = 0; i < numclasses - numcustomclasses; i++) {
-            
-
-            
-            
-			MatrixXd data_minus_mu_j(data_count,dim);
-
-			for (int p = 0; p < data_minus_mu_j.rows(); p++) {
-				data_minus_mu_j.row(p)=int_data.row(p) - mus.row(i);
-			}
-
-			MatrixXd data_minus_mu_j_Tij = data_minus_mu_j;
-			for (int p = 0; p < data_minus_mu_j_Tij.cols(); p++) {
-				data_minus_mu_j_Tij.col(p)= data_minus_mu_j_Tij.col(p).array() * mat_tij_uij_product.col(i).array();
-			}
-
-			MatrixXd unnorm_covar_j = data_minus_mu_j_Tij.transpose() * data_minus_mu_j;
-			double norm_fact = mat_tijs.colwise().sum()(0,i);
-			
-            
-            //DO THE Y/MT correction here - make sure the Y chrom het -> unknown class doesn't go degenerate!
-            if (!(chrom_type == "A" || chrom_type == "X")  && i == 2 &&   (isinf( (unnorm_covar_j/(1.0*norm_fact)).sum() )    || isnan(  (unnorm_covar_j/(1.0*norm_fact)).sum()  )    )  )
-            {
-                
-            }
-            else{
-                covars[i]=unnorm_covar_j/(1.0*norm_fact);                
-            }
-            
-
-			//cout << "Covar " << i << endl << covars[i] << endl;
-            
-            
-            
-            
-		}
-
-		//reset the means and covars of the custom classes
-		for (int i = 0; i < numcustomclasses; i++) {
-			mus.block(numclasses-numcustomclasses+i,0,1,dim) = custommus[i];
-			covars[numclasses-numcustomclasses+i] = customcovars[i];
-		}
-
-
-        
-        
-        
-		//cout << "Covar " << 3 << endl << covars[3] << endl;
-		//cout << mus.block(numclasses-numcustomclasses,0,1,dim) << endl;
-
-	//}
-		
-		old_calls = current_calls;
-		current_calls = calls_from_posterior_probs(mat_tijs);     //probs_j_line(int_data,mus, covars,js, vs, mat_probs_j)) ; //mat_tijs);