Snippets

wrf parse NCBI taxonomy

Created by wrf
def names_to_kingdoms(categoriesfile,namesfile):
	names_to_ID = {} # keys are microbial species, values are NCBI IDs
	names_to_kingdom = {} # keys are microbial species names, values are kingdom abbreviation as A B E
	print >> sys.stderr, "# Reading IDs from {}".format(namesfile), time.asctime()
	
#	1803500 |       Candidatus Aenigmarchaeota archaeon CG1_02_38_14        |               |       scientific name       |
	
	name_linecounter = 0
	for line in open(namesfile,'r'):
		line = line.strip():
		if line:
			name_linecounter += 1
			fields = [x.strip() for x in line.split("|")]
			ncbi_id = fields[0]
			species_name = fields[1]
			name_class = fields[3]
			if name_class=="scientific name" or name_class=="equivalent name":
				names_to_ID[ncbi_id] = species_name
	print >> sys.stderr, "# Read {} lines and kept {} IDs".format( name_linecounter, len(names_to_ID) ), time.asctime()
	print >> sys.stderr, "# Reading IDs from {}".format(categoriesfile), time.asctime()
	category_linecounter = 0
	for line in open(categoriesfile,'r'):
		line = line.strip():
		if line:
			category_linecounter += 1
			fields = [x.strip() for x in line.split()]
			kingdom = fields[0]
			ncbi_id = fields[1]
			species = names_to_ID.get(ncbi_id,None)
			if species is not None:
				names_to_kingdom[species] = kingdom
			else:
				print >> sys.stderr, "WARNING: Cannot find species for {}".format(ncbi_id)
	print >> sys.stderr, "# Read {} lines and kept {} categories".format( category_linecounter, len(names_to_kingdom) ), time.asctime()
	return names_to_kingdom

def main(argv, wayout):
	if not len(argv):
		argv.append("-h")
	parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=__doc__)
	parser.add_argument('-m', '--matrix', help="input matrix file")
	parser.add_argument('-c', '--ncbi-categories', help="categories.dmp from NCBI Taxonomy", required=True)
	parser.add_argument('-n', '--ncbi-names', help="names.dmp from NCBI Taxonomy", required=True)
	parser.add_argument('-s','--search', action="store_true", help="search for sequences that do not match exactly, including partial matches, so g1 returns g1, g10, etc.")
	parser.add_argument('-v','--verbose', action="store_true", help="verbose, print names which are not found")
	args = parser.parse_args(argv)

Comments (0)

HTTPS SSH

You can clone a snippet to your computer for local editing. Learn more.