Commits

Stephen Smith committed 5c4cb7e

more likelihood edits

Comments (0)

Files changed (1)

trees/tree_likelihood_calculator.py

 import numpy as np
 from numpy import *
 from scipy.linalg import *
+from scipy.optimize import fmin_powell
 
 #move to datatype
 position = {"A":[0],"C":[1],"G":[2],"T":[3],"-":[0,1,2,3]}
+LARGE = 10000000
+
 
 def match_tips_and_seqs(tree,seqs):
 	lvs = tree.leaves()
 
 
 """
-here are a bunch of functions for use in optimizing params
+this scales so that the sixth rate (G->T) is 1 
+the matrix will be
+  A C G T
+A   1 2 3
+C     4 5
+G 
+T 
+
+the three base composition rates will be parameters
+6, 7, 8. The last base composition (T) will be 1- sum(6,7,8)
+and the total will be scaled to 1
 """
+def set_gtr_rmatrix_basefreq(params):
+	rmatrix = ones((4,4))
+	rmatrix[0][1] = params[0]
+	rmatrix[1][0] = params[0]
+	rmatrix[0][2] = params[1]
+	rmatrix[2][0] = params[1]
+	rmatrix[0][3] = params[2]
+	rmatrix[3][0] = params[2]
+	rmatrix[1][2] = params[3]
+	rmatrix[2][1] = params[3]
+	rmatrix[1][3] = params[4]
+	rmatrix[3][1] = params[4]
+	rmatrix[2][3] = 1.
+	rmatrix[3][2] = 1.
+	fill_diagonal(rmatrix,0)
+	basecomp = ones(4)	
+	basecomp[0] = params[5]
+	basecomp[1] = params[6]
+	basecomp[2] = params[7]
+	basecomp[3] = 1-sum(params[5:8])
+	sc = sum(basecomp)
+	basecomp /= sc
+	return rmatrix,basecomp
 
-LARGE = 10000000
-def calc_params_rateparams(params,tree,sites):
+
+"""
+params contain the rmatrix and basefreq params in the form
+  A C G T
+A   1 2 3
+C     4 5
+G 
+T 
+the three base composition rates will be parameters
+6, 7, 8. The last base composition (T) will be 1- sum(6,7,8)
+and the total will be scaled to 1
+"""
+def calc_params_rateparams(params,tree,sites,alpha=None,cats=None):
 	for i in params: 
 		if i < 0:
 			return LARGE
 	if sum(params[5:8]) > 1:
 		return LARGE
 	rmatrix,basefreq = set_gtr_rmatrix_basefreq(params)
-	like = calc_nuc_tree_likelihood(tree,rmatrix,basefreq,sites)
+	like = -1
+	if alpha != None:
+		like = calc_nuc_tree_likelihood_gamma(tree,rmatrix,basefreq,sites,alpha,cats)
+	else:
+		like = calc_nuc_tree_likelihood(tree,rmatrix,basefreq,sites)
 	print like
 	if like < 0 or isnan(like):
 		return LARGE
 	return like
 
 """
-optimize everything but the tree topology and gamma
+params contains parameter values for everything but the tree topology and gamma
 """
-def calc_params_allparams(params,tree,sites):
+def calc_params_allparams(params,tree,sites,alpha,cats):
 	for i in params: 
 		if i < 0:
 			return LARGE
 			i.length = params[count]
 			count += 1
 	rmatrix,basefreq = set_gtr_rmatrix_basefreq(rateparams)
-	like = calc_nuc_tree_likelihood(tree,rmatrix,basefreq,sites)
+	like = -1
+	if alpha != None:
+		like = calc_nuc_tree_likelihood_gamma(tree,rmatrix,basefreq,sites,alpha,cats)
+	else:
+		like = calc_nuc_tree_likelihood(tree,rmatrix,basefreq,sites)
 	print like
 	if like < 0 or isnan(like):
 		return LARGE
 	return like
 
-def calc_params_rateparams_gamma(params,tree,sites,alpha,cats):
-	for i in params: 
-		if i < 0:
-			return LARGE
-	if sum(params[5:8]) > 1:
-		return LARGE
-	rmatrix,basefreq = set_gtr_rmatrix_basefreq(params)
-	like = calc_nuc_tree_likelihood_gamma(tree,rmatrix,basefreq,sites,alpha,cats)
-	if like < 0 or isnan(like): 
-		return LARGE
-	return like
-
+"""
+params contains just the alpha shape parameter
+"""
 def calc_params_gamma(params,tree,sites,rmatrix,basefreq,cats):
 	alpha = params[0]
 	if alpha < 0:
 	like = calc_nuc_tree_likelihood_gamma(tree,rmatrix,basefreq,sites,alpha,cats)
 	if like < 0 or isnan(like):
 		return LARGE
+	print like
 	return like
 
-def calc_params_treebl(params,tree,sites,rmatrix,basefreq):
-	count = 0
-	for i in tree.iternodes():
-		if i != tree:
-			if params[count] < 0:
-				return LARGE
-			i.length = params[count]
-			count += 1
-	like = calc_nuc_tree_likelihood(tree,rmatrix,basefreq,sites)
-	print tree.get_newick_repr(True)
-	if like < 0 or isnan(like):
-		return LARGE
-	return like
-
-def calc_params_treebl_gamma(params,tree,sites,rmatrix,basefreq,alpha,cats):
+"""
+params contains the branch lengths
+"""
+def calc_params_treebl(params,tree,sites,rmatrix,basefreq,alpha=None,cats=None):
 	count = 0
 	for i in tree.iternodes():
 		if i != tree:
 				return LARGE
 			i.length = params[count]
 			count += 1
-	like = calc_nuc_tree_likelihood_gamma(tree,rmatrix,basefreq,sites,alpha,cats)
-	print like
-	print tree.get_newick_repr(True)
+	like = -1
+	if alpha != None:
+		like = calc_nuc_tree_likelihood_gamma(tree,rmatrix,basefreq,sites,alpha,cats)
+	else:
+		like = calc_nuc_tree_likelihood(tree,rmatrix,basefreq,sites)
+	#print tree.get_newick_repr(True)
+	#print like
 	if like < 0 or isnan(like):
 		return LARGE
 	return like
 
 
-
 """
-this scales so that the sixth rate (G->T) is 1 
-the matrix will be
-  A C G T
-A   1 2 3
-C     4 5
-G 
-T 
-
-the three base composition rates will be parameters
-6, 7, 8. The last base composition (T) will be 1- sum(6,7,8)
-and the total will be scaled to 1
+optimizing params
 """
-def set_gtr_rmatrix_basefreq(params):
-	rmatrix = ones((4,4))
-	rmatrix[0][1] = params[0]
-	rmatrix[1][0] = params[0]
-	rmatrix[0][2] = params[1]
-	rmatrix[2][0] = params[1]
-	rmatrix[0][3] = params[2]
-	rmatrix[3][0] = params[2]
-	rmatrix[1][2] = params[3]
-	rmatrix[2][1] = params[3]
-	rmatrix[1][3] = params[4]
-	rmatrix[3][1] = params[4]
-	rmatrix[2][3] = 1.
-	rmatrix[3][2] = 1.
-	fill_diagonal(rmatrix,0)
-	basecomp = ones(4)	
-	basecomp[0] = params[5]
-	basecomp[1] = params[6]
-	basecomp[2] = params[7]
-	basecomp[3] = 1-sum(params[5:8])
-	sc = sum(basecomp)
-	basecomp /= sc
-	return rmatrix,basecomp
-
 
 """
-optimizing params
+this should really be done with derivatives, but 
+for demonstration purposes we will just use powell 
+derivative free things
 """
+def optimize_brlen(tree,sites,rmatrix,basefreq,alpha=None,cats=None):
+	blstart = []
+	for i in tree.iternodes():
+		if i != tree:
+			blstart.append(i.length)
+	res = fmin_powell(calc_params_treebl,blstart,args=(tree,sites,rmatrix,basefreq,alpha,cats),full_output=True)
+	return res
+
+def optimize_rateparams(a,tree,sites,alpha=None,cats=None):
+	assert len(a) == 8
+	res =  fmin_powell(calc_params_rateparams,a,args=(tree,sites,alpha,cats),full_output=True)
+	return res
+
 """
 #maximize rate params
 res = fmin_powell(optimize_rateparams,a,args=(tree,sites),full_output=True)
 print rmatrix
 print basefreq
 print calc_nuc_q_matrix(rmatrix,basefreq)
+"""
 
-#res = fmin_powell(optimize_treebl_gamma,blstart	,args=(tree,sites,rmatrix,basefreq,alpha,cats),maxiter=5,ftol=full_output=True)
-#res = fmin_powell(optimize_treebl,blstart,args=(tree,sites,rmatrix,basefreq),maxiter=5,full_output=True)
 
-alp = a + blstart
-print alp
-res = fmin_powell(optimize_allparams,alp,args=(tree,sites),full_output=True)
-print res[0]
-rmatrix,basefreq = set_gtr_rmatrix_basefreq(res[0][0:8])
-print rmatrix
-print basefreq
-print calc_nuc_q_matrix(rmatrix,basefreq)
-print tree.get_newick_repr(True)
-"""
+def optimize_allparams(a,tree,sites,alpha=None,cats=None):
+	assert len(a) == 8
+	blstart = []
+	for i in tree.iternodes():
+		if i != tree:
+			blstart.append(i.length)
+	a = a + blstart
+	res =  fmin_powell(calc_params_allparams,a,args=(tree,sites,alpha,cats),full_output=True)
+	return res
+
+def optimize_gammaparam(alphastart,tree,sites,rmatrix,basefreq,cats):
+	res =  fmin_powell(calc_params_gamma,[alphastart],args=(tree,sites,rmatrix,basefreq,cats),full_output=True)
+	return res