Commits

Stephen Smith committed 47b6c99

updated likelihood and gamma calc

Comments (0)

Files changed (2)

models/model_calc.py

 import numpy as np
 from numpy import *
 from scipy.linalg import *
+from scipy.stats import *
+from scipy.special import *
 
 """
 this takes seqstrings for two ALIGNED sequences
 		p += math.log(mat[position[seq1[i]]][position[seq2[i]]]) 
 		p += math.log(0.25)
 	return p
+
+def point_gamma(p,a, b):
+	return chi2.ppf(p,2*(a))/(2.0*(b))
+
+def get_gamma_cats(alpha,cats,median):
+	K=cats
+	a,b = alpha,alpha
+	factor=a/b*K
+	rK = [0]*K
+	if median:
+		gap05=1.0/(2.0*K)
+		for i in range(K):
+			rK[i] = point_gamma((i*2.0+1)*gap05,a,b)
+		t=0
+		for i in range(K):
+			t+=rK[i]
+		for i in range(K):
+			rK[i] *= factor/t
+		return rK
+	else:
+		freqK = [0]*K
+		for i in range(K-1):
+			freqK[i]=point_gamma((i+1.0)/K, a, b);
+		for i in range(K-1):
+			freqK[i]=gammainc(a+1,freqK[i]*b)
+		rK[0] = freqK[0]*factor;
+		rK[K-1] = (1-freqK[K-2])*factor;
+		for i in range(1,K-1):
+			rK[i] = (freqK[i]-freqK[i-1])*factor;
+		return rK

trees/tree_likelihood_calculator.py

 #move to datatype
 position = {"A":[0],"C":[1],"G":[2],"T":[3],"-":[0,1,2,3]}
 
-def calc_nuc_tree_likelihood(tree,rmatrix,basefreq,sites):
+def match_tips_and_seqs(tree,seqs):
+	lvs = tree.leaves()
+	for i in lvs:
+		test = False
+		for j in seqs:
+			if j.name == i.label:
+				i.data['seq'] = j.seq.upper()
+				test = True
+				break
+		if test == False:
+			print "can't find "+i.label+" in seqs"
+			return False
+
+def calc_nuc_tree_likelihood(tree,rmatrix,basefreq,sites,verbose=False):
 	q = calc_nuc_q_matrix(rmatrix,basefreq)
+	ps = {}#key is length, value is pmatrix
 	loglike = 0
 	for s in range(sites):
 		tempretlike = 0
 		for c in tree.iternodes(order = "postorder"):
-			tempretlike = 0
 			if len(c.children) > 0: #this is an internal node
 				c.data['probs'] = ones(4)
 				for j in range(4):
 					for i in c.children:
-						p = calc_p_matrix(q,i.length)
+						if i.length not in ps:
+							p = calc_p_matrix(q,i.length)
+							ps[i.length] = p
+						p = ps[i.length]
 						templike = 0
 						for k in range(4):
-							templike += p[j][k] * i.data['probs'][k]
+							templike += (p[j][k] * i.data['probs'][k])
 						c.data['probs'][j] *= templike
-					tempretlike  += c.data['probs'][j]*basefreq[j]
 			else: #this is a tip
 				c.data['probs'] = zeros(4)
-				#could add here for ambiguity
 				positions = position[c.data['seq'][s]]
 				for j in positions:
 					c.data['probs'][j] = 1
-		#at this point the tempretlike will be the sum(root prob * basefreq) for all bases
-		print "site",s,"log(L):",log(tempretlike)
+		#the tempretlike will be the sum(root prob * basefreq) for all bases
+		tempretlike = 0
+		for i in range(4):
+			tempretlike  += tree.data['probs'][i]*basefreq[i]
+		if verbose:
+			print "site",s,"log(L):",log(tempretlike),"like:",tempretlike
 		loglike -= log(tempretlike)
-		#print loglike
 	return loglike
 
+def calc_nuc_tree_likelihood_gamma(tree,rmatrix,basefreq,sites,alpha,cats,verbose=False):
+	q = calc_nuc_q_matrix(rmatrix,basefreq)
+	ps = {}#key is length, value is pmatrix
+	cs = get_gamma_cats(alpha,cats,False)
+	loglike = 0
+	for s in range(sites):
+		tempretlikes = 0
+		for x in cs:
+			tempretlike = 0
+			for c in tree.iternodes(order = "postorder"):
+				if len(c.children) > 0: #this is an internal node
+					c.data['probs'] = ones(4)
+					for j in range(4):
+						for i in c.children:
+							if i.length not in ps:
+								p = calc_p_matrix(q,i.length*x)
+								ps[i.length*x] = p
+							p = ps[i.length*x]
+							templike = 0
+							for k in range(4):
+								templike += p[j][k] * i.data['probs'][k]
+							c.data['probs'][j] *= templike
+				else: #this is a tip
+					c.data['probs'] = zeros(4)
+					#could add here for ambiguity
+					positions = position[c.data['seq'][s]]
+					for j in positions:
+						c.data['probs'][j] = 1
+			#at this point the tempretlike will be the sum(root prob * basefreq) for all bases
+			tempretlike = 0
+			for i in range(4):
+				tempretlike  += tree.data['probs'][i]*basefreq[i]
+			tempretlikes += (tempretlike*(1/float(len(cs))))
+			if verbose: 
+				print "site",s,"log(L):",log(tempretlike),log(tempretlikes)
+		loglike -= log(tempretlikes)
+		if verbose:
+			print log(tempretlikes)
+	return loglike
+
+
+"""
+here are a bunch of functions for use in optimizing params
+"""
+
+LARGE = 10000000
+def calc_params_rateparams(params,tree,sites):
+	for i in params: 
+		if i < 0:
+			return LARGE
+	if sum(params[5:8]) > 1:
+		return LARGE
+	rmatrix,basefreq = set_gtr_rmatrix_basefreq(params)
+	like = calc_nuc_tree_likelihood(tree,rmatrix,basefreq,sites)
+	print like
+	if like < 0 or isnan(like):
+		return LARGE
+	return like
+
+"""
+optimize everything but the tree topology and gamma
+"""
+def calc_params_allparams(params,tree,sites):
+	for i in params: 
+		if i < 0:
+			return LARGE
+	if sum(params[5:8]) > 1:
+		return LARGE
+	rateparams = params[0:8]
+	count = 8
+	for i in tree.iternodes():
+		if i != tree:
+			if params[count] < 0:
+				return LARGE
+			i.length = params[count]
+			count += 1
+	rmatrix,basefreq = set_gtr_rmatrix_basefreq(rateparams)
+	like = calc_nuc_tree_likelihood(tree,rmatrix,basefreq,sites)
+	print like
+	if like < 0 or isnan(like):
+		return LARGE
+	return like
+
+def calc_params_rateparams_gamma(params,tree,sites,alpha,cats):
+	for i in params: 
+		if i < 0:
+			return LARGE
+	if sum(params[5:8]) > 1:
+		return LARGE
+	rmatrix,basefreq = set_gtr_rmatrix_basefreq(params)
+	like = calc_nuc_tree_likelihood_gamma(tree,rmatrix,basefreq,sites,alpha,cats)
+	if like < 0 or isnan(like): 
+		return LARGE
+	return like
+
+def calc_params_gamma(params,tree,sites,rmatrix,basefreq,cats):
+	alpha = params[0]
+	if alpha < 0:
+		return LARGE
+	like = calc_nuc_tree_likelihood_gamma(tree,rmatrix,basefreq,sites,alpha,cats)
+	if like < 0 or isnan(like):
+		return LARGE
+	return like
+
+def calc_params_treebl(params,tree,sites,rmatrix,basefreq):
+	count = 0
+	for i in tree.iternodes():
+		if i != tree:
+			if params[count] < 0:
+				return LARGE
+			i.length = params[count]
+			count += 1
+	like = calc_nuc_tree_likelihood(tree,rmatrix,basefreq,sites)
+	print tree.get_newick_repr(True)
+	if like < 0 or isnan(like):
+		return LARGE
+	return like
+
+def calc_params_treebl_gamma(params,tree,sites,rmatrix,basefreq,alpha,cats):
+	count = 0
+	for i in tree.iternodes():
+		if i != tree:
+			if params[count] < 0:
+				return LARGE
+			i.length = params[count]
+			count += 1
+	like = calc_nuc_tree_likelihood_gamma(tree,rmatrix,basefreq,sites,alpha,cats)
+	print like
+	print tree.get_newick_repr(True)
+	if like < 0 or isnan(like):
+		return LARGE
+	return like
+
+
+
 """
 this scales so that the sixth rate (G->T) is 1 
 the matrix will be
 	basecomp /= sc
 	return rmatrix,basecomp
 
+
+"""
+optimizing params
+"""
+"""
+#maximize rate params
+res = fmin_powell(optimize_rateparams,a,args=(tree,sites),full_output=True)
+print res[0][0:6]
+print res[0][5:]
+print sum(res[0][5:])
+rmatrix,basefreq = set_gtr_rmatrix_basefreq(res[0])
+print rmatrix
+print basefreq
+print calc_nuc_q_matrix(rmatrix,basefreq)
+
+#res = fmin_powell(optimize_treebl_gamma,blstart	,args=(tree,sites,rmatrix,basefreq,alpha,cats),maxiter=5,ftol=full_output=True)
+#res = fmin_powell(optimize_treebl,blstart,args=(tree,sites,rmatrix,basefreq),maxiter=5,full_output=True)
+
+alp = a + blstart
+print alp
+res = fmin_powell(optimize_allparams,alp,args=(tree,sites),full_output=True)
+print res[0]
+rmatrix,basefreq = set_gtr_rmatrix_basefreq(res[0][0:8])
+print rmatrix
+print basefreq
+print calc_nuc_q_matrix(rmatrix,basefreq)
+print tree.get_newick_repr(True)
+"""