Commits

beroe  committed 49dd6b5

Added 5 scripts related to kog processing

  • Participants
  • Parent commits b94c8b2

Comments (0)

Files changed (4)

File molecular/KOGexamplesizes.zip

Binary file added.

File molecular/PlotKogHeatmap.R

+#plot a "heatmap" showing the size of a series of gene files
+# uses RColorBrewer, but not required if you want to change UseColor = rainbow(16)
+
+Path = "~/MBARI/Manuscripts/kogs/merged/"
+PROTEIN = TRUE
+# Expects a series of files called size_prot_something..., containing just a column of integers on their own lines, all in the same order (that order matching PTaxa below). 
+# Derives the gene name/number through a regexp search for ...KOG1234... and grabs just the numbers from that.
+
+# List of taxa in dataset, in the same order that the genes are in the individual files
+PTaxa = c("AmphimedCEGpred","Amphimedon","Arabidopsis","Capitella","Celegans","Ciona","Daphnia","Drosophila","Human","Micromonas","Mleidyi","Mnemiopsis","Monosiga","OscarCEGpred","Oscarella","Salpingoeca","Scervisiae","Spombe","Urchin","abylopsis","aeginasp","apolemia","atolla","bathocyroe","bathyctena","belongata","benthiccteno","beroeforskali","calanoides","charistephane","chelophyes","chiroteuthis","cmacrocephala","cmultidentata","conchoecissa","cordagalma","craseoa","ctenoceros","desmophyes","diphyes","dosidicus","efowleri","erenna","eucalanus","euplokamis","flota","forskalia","frillagalma","gaussia","harmothoe","hippopodius","hormiphora","lampea","lilyopsis","lionsmane","lychnagalma","marrus","nannocalanus","nanomia","nepheloctenawhite","octopoteuthis","ocyropsis","phylliroe","pleuromamma","pterygioteuthis","rhizophysa","rosacea","sergestes","stephalia","tcamp1","tglobosa","thalassocalyce","tomopteris","vampyroteuthis","vermillionlobate","weirdcteno")
+
+# paral
+cats=c("POR","POR","PLA","POL","WOR","CHO","ART","ART","CHO","PRO","CTE","CTE","PRO","POR","POR","PRO","YEA","YEA","ECH","SIP","CNI","SIP","CNI","CTE","CTE","SIP","CTE","CTE","ART","CTE","SIP","MOL","WOR","SIP","ART","SIP","SIP","CTE","SIP","SIP","MOL","WOR","SIP","ART","CTE","POL","SIP","SIP","ART","POL","SIP","CTE","CTE","SIP","SIP","SIP","SIP","ART","SIP","CTE","MOL","CTE","MOL","ART","MOL","SIP","SIP","ART","SIP","PRO","PRO","CTE","POL","MOL","CTE","CTE")
+
+
+if (PROTEIN) {
+	MyList =  dir(Path,pattern = "^size_prot_.*")
+	ThresholdSize = 10	
+	Taxa = PTaxa
+	UseTitle = "Protein lengths"
+	}	else 	{
+	MyList =  dir(Path,pattern = "^size_.*")
+	ThresholdSize = 36
+	Taxa = NTaxa
+	UseTitle = "Nucleotide lengths"
+}
+
+print(paste("Files found:",length(MyList) ))
+
+
+p = c()
+allx = c()
+allD = c()
+
+for (FileName in MyList){
+	KOGnum =sub(".*KOG([0-9]+).*","\\1",FileName)
+	koglen = read.table(file=paste(Path,FileName,sep=""), header=FALSE, sep="\t")
+	
+	xvals = koglen$V1
+	allx = rbind(allx,xvals)
+#	HistMax = 1000
+#	xvals[xvals>HistMax] = HistMax
+	
+	x = xvals
+
+	# Blank out reads less than 10, so they are white squares in image...
+	
+	x[x<ThresholdSize]=NA
+	
+	# used to plot the difference from median instead of overall size...
+	mx = median(x,na.rm=TRUE)
+	LenDiff = x - mx
+	allD = rbind(allD,LenDiff)
+}
+
+# could use NA here?
+allx[allx<ThresholdSize] = 1
+
+# convert false to zero or 1
+p = (allx > ThresholdSize) * 1
+   
+# sort them in order by taxa and then by genes, from most to least
+
+#Rows are taxa
+RSum = rowSums(p)
+logx = log(allx)
+roworderp = p[rev(order(RSum)),] 
+rowx = allx[rev(order(RSum)),]
+rowd = allD[rev(order(RSum)),]
+
+# columns are KOGs, 
+CSum=colSums(roworderp)
+colorderp = roworderp[,order(CSum)]
+
+# These are the matrices of plotted values (raw and diff from median), in order
+colx = rowx[,order(CSum)]
+cold = rowd[,order(CSum)]
+
+
+Get KOG names from file names
+kognames = sub("\\..+$","",sub(".+_KOG","",MyList))
+
+xd = data.frame(data=allx,row.names=kognames,check.names=FALSE)
+colnames(xd)=Taxa
+
+# library(gplots) # needed for rich.colors()
+
+# not implemented UseColor = colorRampPalette(c("red", "orange", "blue"), space = "Lab")
+# another option - UseColor = c("#FFFFFFFF",hcl(seq(0, 360, length = 20)))
+
+# Added red at the end so a few outliers don't swamp color palette. 
+# Could take log of colx instead
+# UseColor = c(rainbow(16),"red","red","red","red","red","red","red")
+# UseColor = c(heat.colors(16),"blue","blue","blue","blue","blue","blue")
+# UseColor = c(rich.colors(16),"red","red","red","red","red","red","red","red")
+
+# library(colorRamps)
+# UseColor = c(blue2red(16),"red","red","red","red","red","red","red","red")
+# UseColor = c(blue2green2red(32),"red","red","red","red","red","red","red","red")
+
+library(RColorBrewer)
+
+# display.brewer.all() # to see all palettes
+
+# UseColor = c(rev(brewer.pal(32,"Purples")),"red","red","red","red","red","red","red","red")
+# UseColor = c(brewer.pal(32,"Pastel1"),"red","red","red","red","red","red","red","red")
+# UseColor = c(brewer.pal(32,"Pastel2"),"red","red","red","red","red","red","red","red")
+# UseColor = c(brewer.pal(32,"Spectral"),"red","red","red","red","red","red","red","red")
+# UseColor = c(brewer.pal(32,"Dark2"),"red","red","red","red","red","red","red","red")
+UseColor = c(rev(brewer.pal(32,"Spectral"))
+# UseColor = c(rev(brewer.pal(32,"Spectral")),"red","red","red","red","red","red","red","red")
+# RdYlBu same as above
+# UseColor = c(rev(brewer.pal(11,"RdYlBu")),"red","red","red","red","red","red","red","red")
+
+# To plot in listed order instead of sorted order...
+empty = '
+ image((1:length(MyList)),(1:length(xvals)), axes=FALSE, xlab="", ylab = "", allx,mar=c(2,1,1,1),col=UseColor)
+ title(paste(UseTitle," - unsorted",sep=""))
+ axis(2,at=1:length(xvals),labels=Taxa,las=2, adj=1,cex.axis=0.6)
+ axis(1,at=1:length(MyList),labels=kognames,las=2, adj=1,cex.axis=0.6)
+'
+colx[colx<6]=NA
+
+### PLOT THE HEAT MAP 
+image((1:length(MyList)),(1:length(xvals)), axes=FALSE, xlab="", ylab = "", colx,mar=c(2,1,1,1),col=UseColor)
+
+#TO PLOT THE DIFF FROM MEDIAN INSTEAD, comment out the image() above and use this one...
+# for median, Spectral is best...
+# image((1:length(MyList)),(1:length(xvals)), axes=FALSE, xlab="", ylab = "", cold,mar=c(2,1,1,1),col=UseColor)
+
+
+
+title(UseTitle)
+
+# TODO: color by cats, might have to use text() instead of axis()
+axis(2,at=1:length(xvals),labels=Taxa[order(CSum)],las=2, adj=1,cex.axis=0.6)
+axis(4,at=1:length(xvals),labels=CSum[order(CSum)],las=2, adj=1,cex.axis=0.6)
+
+axis(1,at=1:length(MyList),labels=kognames[rev(order(rowSums(p)))],las=2, adj=1,cex.axis=0.6)
+axis(3,at=1:length(MyList),labels=RSum[rev(order(RSum))],las=2, adj=1,cex.axis=0.6)

File molecular/kogextract.py

+#! /usr/bin/env python
+"""Extract KOGs to files for alignment...
+
+THe master file was generated by catting all kogs into all_kog_nucleotides_26Feb13.fna
+
+Names were checked and one-offs edited out. (like BC1_ and Tr50_BC1)
+
+Sequences have names like:
+>chiroteuthis_h10_24383_Locus_6835_Transcript_6/7_Confidence_0.619_Length_1846___KOG0077
+We want the first (species) and last (KOG) fields.
+
+Some entities, like _h01 and _h10 have duplicates...
+
+After running this, can run generate sizes files and run the R script PlotKogs.R to plot
+
+Usage:
+     kogextract AllKog.fasta [1] [1]
+	 
+Last two parameters are to Write out the extracted files and
+to chop out (not include) the taxa from the overall list that are 
+missing gene sequences. (Default is to save the names with ---- as the seq.)
+
+If writing out the sequences, it expects there to be a subdirectory called 'merged/'.
+"""
+
+# requires mybio.py from http://www.mbari.org/staff/haddock/scripts
+import mybio as mb
+import sys
+
+# Looking at the beginning of the sequence name, make these full-name replacements
+
+
+# need to combine erenna and erenna-t into the same name
+# 7XXX = Drosophila, Y = S.cervisiae
+def cleannames(FullName):
+	""" This function will search for regular expression patterns in the first part of the 
+taxon name and replace them with standardized names. The underscore is used later to split off
+the short name, so if the original aame string does not have an appropriate underscore, you 
+can add that here as well."""
+	import re
+
+	RepList = [
+			[r'^7\d\d'		, 'Drosophila_'],
+			[r'^At\w'		, 'Arabidopsis_'],
+			[r'^CE\d'		, 'Celegans_'],
+			[r'^Hs\w'		, 'Human_'],
+			[r'^SP[A-Z]'	, 'Spombe_'],
+			[r'^Y[A-Z][A-Z]', 'Scervisiae_'],
+			[r'^ML\d'		, 'Mnemiopsis_'],
+			[r'^Mlei'		, 'Mleidyi_Joe'],
+			[r'^DP1'		, 'diphyes_'],
+			[r'^HT1'		, 'halitrephes_'],
+			[r'^glt'		, 'gaussia_'],
+			[r'^gnl'		, 'gaussia_'],
+			[r'^BC1'		, 'bathocyroe_'],
+			[r'^erenna-t'	, 'erenna_'],
+			[r'^jgi\|Capca'	, "Capitella_"],
+			[r'^jgi\|Dappu1'	, "Daphnia_"],
+			[r'^gi\|\d+.*'		, "Urchin_"],
+			[r'^slo' 		, "sergestes"],
+			[r'^calanoidescarinatus' 		, "calanoides"],
+			[r'^charistephanefugiens'		, "charistephane"],
+			[r'^eucalanushyalinus'			, "eucalanus"],
+			[r'^thalassocalyceinconstans'	, "thalassocalyce"],
+			[r'^ocyropsismaculata'			, "ocyropsis" ],
+			[r'^phylliroebucephalum'		, "phylliroe" ],
+			[r'^monosigabrevicollis'		, "Monosiga"],
+			[r'^micromonaspusilla'			, "Micromonas"],
+			[r'^salpingoeca'				, "Salpingoeca"],
+			[r'^amphimedon'					, "Amphimedon"],
+			[r'^AmphimedCEGpred'			, "AmphimedonCEG"],
+			[r'^OscarCEGpred'				, "OscarCEG"],
+			[r'^oscarellacarmela'			, "Oscarella"]
+			
+
+	]
+
+	FullName = FullName.replace("Tr50_","")
+	for Find,Replace in RepList:
+		FullName = re.sub(Find,Replace,FullName)
+	return FullName
+	
+def main():
+	WRITEFILES = False
+	CHOPFILES = False
+	if len(sys.argv)<2:
+		sys.stderr.write(__doc__)
+	else: 
+		if len(sys.argv) > 2:
+			WRITEFILES = bool(int(sys.argv[2]))
+			sys.stderr.write("WRITE to FILE is SET\n")
+		if len(sys.argv) > 3:
+			CHOPFILES = bool(int(sys.argv[3]))
+			sys.stderr.write("CHOP from FILE is SET\n")	
+		FileName = sys.argv[1]
+		Ordernames,Sequences = mb.readfasta_list(FileName)
+		print >> sys.stderr, len(Sequences), len(Ordernames)
+		KOGDict = {}
+		KOGNames = {}
+		NameFullList = []
+		KOGFullList = []
+		
+		# this tries to split the FASTA title into species name and gene names,
+		# based on the presence of three underscores separating them
+		# You would have to modify for other name formats. 
+		# The first part of the name can be modified by changing the search patterns
+		# inside the cleannames() function above
+		
+		# Expected pattern: 
+		# >SpeciesName_Ignore_this_stuff.Contig2203___GeneName3459
+		# This would yield values for SpeciesName and GeneName3459
+		
+		for Sequence, Name in zip(Sequences,Ordernames):
+			# some lines are badly formed, so the initial split by spaces should get just the first part: 
+			# >Tr50_DP1B.Contig2203___KOG3459 Tr50_DP1B.Contig2203 Tr50_DP1B.Contig2203___KOG3459 Tr50_DP1B.Contig2203
+			# try:
+				# There are only two underscores in the original KOG file for Ciona!
+				# This should give everything before the KOG name, but careful about Ciona.
+			try: 
+				# FullSpecies,KOG = Name.split(" ")[0].split("___")
+				FullSpecies,KOG = Name.split("___")
+			except ValueError:
+				# print >> sys.stderr, "Error splitting Name on __", Name
+				try:
+					FullSpecies,Garbage,KOG = Name.split("___")
+				except ValueError:
+					try:
+						FullSpecies,KOG = Name.split("__")
+					except:
+						print >> sys.stderr, "Error on name:", Name
+						raise
+				
+			KOG = KOG.strip("_")
+			FullSpecies = cleannames(FullSpecies)
+			# print >> sys.stderr, FullSpecies, " | ",KOG
+			PartialSpecies = FullSpecies.split("_")[0]
+			NameFullList.append(PartialSpecies)
+			KOGFullList.append(KOG)
+			# If there are duplicates with the same name, this should just overwrite...
+			# want to have it check and spit out an error?
+			if (KOG,PartialSpecies) in KOGDict:
+				# print out the two (or more) copies of a sequence so we can compare
+				print >> sys.stderr, (KOG,PartialSpecies), "duplicated ###########################"
+				print ">%s__%s" % (FullSpecies,KOG)
+				print mb.stringblock(Sequence,60)
+				print ">%s__%s" % (PartialSpecies,KOG)
+				print KOGDict[(KOG,PartialSpecies)]
+			
+			KOGDict[(KOG,PartialSpecies)] = mb.stringblock(Sequence,60)
+				# this is a dictionary with KOG as key and list of clean species names as values
+			if PartialSpecies not in KOGNames.get(KOG,[]):
+				KOGNames[KOG] = KOGNames.get(KOG,[]) + [PartialSpecies]
+			else:
+				print >> sys.stderr, "Duplicate entry?", PartialSpecies, KOG, FullSpecies
+		# add blank place holders for each missing name:
+			# except ValueError:
+			# 	print >> sys.stderr, "Dictionary Error:", Name 
+	
+		NameList = list(set(NameFullList))
+		NameList.sort()
+		print >> sys.stderr, NameList
+		KOGList = list(set(KOGFullList))
+		K = [K for K in KOGList if len(KOGNames[K])> 1]
+		K.sort()
+		KOGList= K[:]
+		OutFolder = "merged/"
+	
+		if ("pro" in FileName.lower()):
+			PROTEIN = True
+			TaxonFileName = "Taxa_pro.txt"
+			OutNameRoot = "prot_%s.faa"
+			if CHOPFILES:
+				OutNameRoot = "chopPro_%s.faa"
+		else:
+			PROTEIN = False
+			TaxonFileName = "Taxa_nuc.txt"
+			OutNameRoot = "merge_%s.faa"		
+			if CHOPFILES:
+				OutNameRoot = "chopNuc_%s.faa"
+		KDKeys = KOGDict.keys()
+		for KOG in KOGList:
+		# for KOG in ["KOG3954"]:
+			OutName = OutFolder + OutNameRoot % KOG
+
+			if WRITEFILES:
+				Kout = open(OutName,'w')
+			# Kout = sys.stdout
+			print >> sys.stderr, ["NOT",""][WRITEFILES], "Writing %d seqs to %s" %(len(KOGNames[KOG]), OutName)
+			
+			for Name in NameList:
+				if WRITEFILES:
+					if not(CHOPFILES) or ((KOG,Name) in KDKeys):
+						Kout.write(">%s\n" % Name)
+						Kout.write("%s"% (KOGDict.get((KOG,Name),"-----\n")))
+				# print >> sys.stderr, "Missing from %s: %s" % (KOG, Name)
+			if WRITEFILES:
+				Kout.close()
+		sys.stderr.write("Creating taxon list at %s\n" % TaxonFileName)
+		OutStr = ["N","P"][PROTEIN] + 'Taxa = c("'  # Create NTaxa or PTaxa statements for R
+		OutStr += '","'.join(NameList)
+		OutStr += '")'
+		print >> sys.stderr, OutStr
+	
+		Tout = open(TaxonFileName,"w")
+		OutStr = "## %s\n" % (FileName)
+		OutStr += "\n".join(NameList)
+		Tout.write(OutStr + "\n")
+		Tout.close()	
+		
+		# print out the category list from R...
+			# This part is optional to generate taxon categories for the R script
+		# if the library is not found, it will just skip this step
+		try:
+			from kogNumberCats import getvals, getRstring
+			print(getRstring(getvals(NameList)))
+		
+		print >> sys.stderr, "NameList Length: %d" %(len(NameList))
+		print >> sys.stderr, "KogList Length : %d" %(len(KOGList))
+
+if __name__ == '__main__':
+	main()
+#	print KOGNames

File molecular/sizeonly.py

+#! /usr/bin/env python
+
+# version 1.1 - cleaned up a bit, used an ordered list
+# version 1.0 - first version 
+
+
+"""
+SIZEONLY.PY - version 1.1
+  Print out the sizes of the sequences in a file
+  
+  Requires the mybio.py library from 
+  http://www.mbari.org/~haddock/scripts/ 
+
+Usage: 
+	sizeonly.py Tr100_filename.fasta > lengths.txt
+	
+"""
+import sys
+import mybio as mb
+
+if len(sys.argv)<2:
+	print __doc__	
+
+else:
+	
+	inputfilename = sys.argv[1]
+	names,sequences = mb.readfasta_list(inputfilename) 
+	# don't append sequences, nor remove dupes, not a quality file but do return an ordered list
+	
+	sys.stderr.write("Found %d sequences in %s\n" % (len(names), inputfilename))
+	for sval in sequences:
+		print len(sval)