Commits

Casey Dunn committed 47aa3b8

wrote script to parse subtrees of itis taxonomy into json format

Comments (0)

Files changed (1)

itis_sql_to_json.py

+#!/usr/bin/env python	
+import sys
+import re
+import json
+
# Help text printed when the script is started with fewer than three arguments.
Usage = """
Usage:

itis_sql_to_json.py taxonomic_units strippedauthor root



taxonomic_units and strippedauthor are table files from the itis dataset, which is available at:
 http://www.itis.gov/downloads/itisMySQLTables.tar.gz

root is the id of the root of the subtree that you would like to export


"""
+
+
+"""
+Create Table taxonomic_units 
+ ( tsn INTEGER NOT NULL,
+   unit_ind1 CHAR(1),
+   unit_name1 CHAR(35) NOT NULL,
+   unit_ind2 CHAR(1),
+   unit_name2 VARCHAR(35),
+   unit_ind3 VARCHAR(7),
+   unit_name3 VARCHAR(35),
+   unit_ind4 VARCHAR(7),
+   unit_name4 VARCHAR(35),
+   unnamed_taxon_ind CHAR(1),
+   name_usage VARCHAR(12) NOT NULL,
+   unaccept_reason VARCHAR(50),
+   credibility_rtng VARCHAR(40) NOT NULL,
+   completeness_rtng CHAR(10),
+   currency_rating CHAR(7),
+   phylo_sort_seq SMALLINT,
+   initial_time_stamp DATETIME NOT NULL,
+   parent_tsn INTEGER,
+   taxon_author_id INTEGER,
+   hybrid_author_id INTEGER,
+   kingdom_id SMALLINT NOT NULL,
+   rank_id  SMALLINT NOT NULL,
+   update_date DATE NOT NULL,
+   uncertain_prnt_ind CHAR(3),
+   INDEX taxon_unit_index1 (tsn,parent_tsn),
+   INDEX taxon_unit_index2 (tsn,unit_name1,name_usage),
+   INDEX taxon_unit_index3 (kingdom_id,rank_id),
+   INDEX taxon_unit_index4 (tsn,taxon_author_id),
+   PRIMARY KEY (tsn))
+   ENGINE=MyISAM CHARSET=latin1;
+
+Create Table strippedauthor
+ ( taxon_author_id INTEGER NOT NULL,
+   shortauthor VARCHAR(100) NOT NULL,
+   INDEX (taxon_author_id,shortauthor),
+   PRIMARY KEY (taxon_author_id))
+   ENGINE=MyISAM CHARSET=latin1;
+
+
+"""
+
+# The earliest year that is accepted as a date. Earlier years are discarded, because such
+# data would otherwise come to dominate at deep nodes. The cutoff is the publication date
+# of Systema Naturae, http://en.wikipedia.org/wiki/Systema_Naturae
+min_year = 1735
+
+
class Taxon(object):
    """One record of the pipe-delimited ``taxonomic_units`` table.

    Attributes set by the constructor:
        id          integer parsed from field 0 (``tsn`` in the table definition).
        name        field 2, followed by a space and field 4 when field 4 is
                    not empty (``unit_name1`` / ``unit_name2``).
        name_usage  field 10, kept as text.
        parent      integer parsed from field 17 (``parent_tsn``), or -1 when
                    that field is not an integer.
        author_id   integer parsed from field 18 (``taxon_author_id``), or -1
                    when that field is missing or not an integer.
        children    list of child ids; starts empty, filled by add_child().
        author      starts as ''; meant to be filled in by the caller.
        year        starts as None; meant to be filled in by the caller.
    """

    def __init__(self, taxonomic_unit_sting=''):
        """Parse one line of the table.

        Raises:
            ValueError: if the line has fewer than 18 '|'-separated fields, or
                if field 0 is not an integer.
        """
        record = taxonomic_unit_sting.rstrip()
        fields = record.split('|')
        expected_fields = 18
        if len(fields) < expected_fields:
            raise ValueError('The following taxonomic_unit_sting does not have {0} fields:\n  {1}\n'.format(expected_fields, record))

        self.id = int(fields[0])
        self.name_usage = fields[10]

        self.name = fields[2]
        if fields[4]:
            self.name = self.name + ' ' + fields[4]

        # Both ids are optional in the data: a bad or absent value is reported
        # on stderr and replaced by -1 instead of aborting the whole run.
        self.parent = self._int_field(fields, 17, 'parent id', record)
        self.author_id = self._int_field(fields, 18, 'taxon_author_id', record)

        self.children = []
        self.author = ''
        self.year = None

    @staticmethod
    def _int_field(fields, index, label, record):
        """Return fields[index] as an int, or warn on stderr and return -1."""
        try:
            return int(fields[index])
        except (ValueError, IndexError):
            # ValueError: empty or non-numeric text.
            # IndexError: the record is shorter than this index (only field 18
            # can be absent, because the constructor demands 18 fields).
            sys.stderr.write('WARNING: Invalid {0} in the string: {1}\n'.format(label, record))
            return -1

    def is_valid(self):
        """True if name_usage is 'valid' or 'accepted'."""
        return self.name_usage in ('valid', 'accepted')

    def is_tip(self):
        """True if no child has been added to this taxon."""
        return not self.children

    def add_child(self, child):
        """Append a child (the script passes the child's id) to children."""
        self.children.append(child)

    def pretty_print(self):
        """Print the fields to stdout, one per line, followed by a blank line."""
        print('  id: {0}'.format(self.id))
        print('  name: {0}'.format(self.name))
        print('  parent: {0}'.format(self.parent))
        print('  name_usage: {0}'.format(self.name_usage))
        print('  is_valid: {0}'.format(self.is_valid()))
        print('  children: {0}'.format(self.children))

        print('  author: {0}'.format(self.author))
        print('  year: {0}'.format(self.year))

        print('')

    def csv(self):
        """Return 'id,parent,name,year' as one comma-separated string."""
        return '{0},{1},{2},{3}'.format(self.id, self.parent, self.name, self.year)

    def dict(self):
        """Return id, parent, name and year as a dictionary."""
        return {'id': self.id, 'parent': self.parent, 'name': self.name, 'year': self.year}
+		
# A token made of exactly four digits is treated as a year.
YEAR_PATTERN = re.compile(r'^\d\d\d\d$')


def open_table(path):
    """Open one of the table files for reading as text.

    io.open behaves the same on Python 2 and 3 and translates every newline
    style, which is what the former "rU" mode was used for.  latin-1 can decode
    any byte, so reading never fails.
    NOTE(review): the table definitions quoted in this file say CHARSET=latin1;
    this assumes the exported files use that encoding too -- confirm.
    """
    import io  # local import so this block does not depend on the file header
    return io.open(path, 'r', encoding='latin-1')


def parse_author_record(line, earliest_year):
    """Split one author line into (author_id, author, year).

    The line is 'id|text'.  When the last space-separated word of the text is a
    four-digit number it is taken as the year and removed from the author name;
    otherwise the year is None and the whole text stays in the name.  A year
    before earliest_year is replaced by None.

    Raises:
        ValueError: if the id is not an integer.
        IndexError: if the line contains no '|'.
    """
    fields = line.strip().split('|')
    author_id = int(fields[0])
    words = fields[1].split(' ')
    author = ' '.join(words[:-1])
    last_word = words[-1]

    year = None
    if YEAR_PATTERN.match(last_word):
        year = int(last_word)
    else:
        author = author + ' ' + last_word

    # Explicit None test: ordering None against an int is an error on Python 3.
    if year is not None and year < earliest_year:
        year = None
    return author_id, author, year


def read_authors(lines, earliest_year):
    """Build {author_id: (author, year)} from the lines of the author file."""
    authors = {}
    for line in lines:
        if not line.strip():
            continue  # tolerate blank lines, e.g. at the end of the file
        author_id, author, year = parse_author_record(line, earliest_year)
        authors[author_id] = (author, year)
    return authors


def read_taxa(lines, authors):
    """Build {id: Taxon} from the lines of the taxonomy file.

    Only taxa for which is_valid() is true are kept.  A kept taxon whose
    author_id is a key of authors receives that entry's author and year.
    """
    taxa = {}
    for line in lines:
        line = line.strip()
        if not line:
            continue  # tolerate blank lines, e.g. at the end of the file
        taxon = Taxon(line)
        if not taxon.is_valid():
            continue
        if taxon.author_id in authors:
            taxon.author, taxon.year = authors[taxon.author_id]
        taxa[taxon.id] = taxon
    return taxa


def link_children(taxa):
    """Register every taxon's id in the children list of its parent."""
    for taxon_id, taxon in taxa.items():
        parent = taxon.parent
        # 0 was already skipped by the former truthiness test; -1 is the value
        # Taxon stores for an unreadable parent id, which was reported when the
        # line was parsed.  Neither names a real node.
        if parent in (0, -1):
            continue
        if parent in taxa:
            taxa[parent].add_child(taxon_id)
        else:
            sys.stderr.write('WARNING: Node {0}, listed as the parent of {1}, does not exist\n'.format(parent, taxon_id))


def propagate_years(taxa):
    """Give every internal node the earliest year found among its tips.

    Years on internal nodes are cleared first.  Then each tip that has a year
    pushes it up its chain of ancestors for as long as the ancestor has no year
    or a later one.
    """
    for taxon in taxa.values():
        if not taxon.is_tip():
            taxon.year = None

    for taxon in taxa.values():
        if not (taxon.is_tip() and taxon.year):
            continue
        current = taxon
        while current.parent in taxa:
            parent = taxa[current.parent]
            if parent.year is not None and parent.year <= current.year:
                # This ancestor, and therefore everything above it, already
                # carries a year at least as early: this lineage is done.
                break
            parent.year = current.year
            current = parent


def collect_subtree(taxa, root):
    """Return the dict() of every dated node reachable from root.

    A node without a year is left out together with everything below it.

    Raises:
        KeyError: if root is not a key of taxa.
    """
    tree = []
    pending = [root]
    while pending:
        taxon = taxa[pending.pop()]
        # Only include lineages for which there is a date
        if taxon.year:
            tree.append(taxon.dict())
            pending.extend(taxon.children)
    return tree


def main(argv):
    """Run the export; argv is [script, taxonomic_units, strippedauthor, root].

    Prints the usage text when arguments are missing, otherwise prints the
    subtree below root as JSON.  Returns the process exit status.
    """
    if len(argv) < 4:
        print(Usage)
        return 0

    taxon_file_name = argv[1]
    author_file_name = argv[2]
    root = int(argv[3])

    with open_table(author_file_name) as author_handle:
        authors = read_authors(author_handle, min_year)
    with open_table(taxon_file_name) as taxon_handle:
        taxa = read_taxa(taxon_handle, authors)

    link_children(taxa)
    propagate_years(taxa)

    if root not in taxa:
        sys.stderr.write('ERROR: root {0} is not a valid taxon in {1}\n'.format(root, taxon_file_name))
        return 1

    print(json.dumps(collect_subtree(taxa, root), indent=3))
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
+
+	
+	
+