Source

TopDowner / top_downer.rb

class TopDowner
	attr_reader :token

	def initialize grammar, token_types
		@grammar = grammar
		@token_types = token_types
		@r_cache = Hash.new
		@first = Hash.new
	end

	def parse string
		@tree = Hash.new
		@enumerator = tokenize(string).each
		# Prime the current token so match has something to compare against.
		@token = @enumerator.next
		recognize(:Program)
	end

	private
	## A Token Type Detector ##
	def detect token
		@token_types.each do |type, matcher|
			# Match against the token's text, since the token itself is a symbol.
			if token.to_s =~ matcher
				return { :token => token, :type => type }
			end
		end
		raise ArgumentError.new "Unrecognizable token: #{token}"
	end
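	# As a hypothetical example (the :number matcher is illustrative, not part
	# of the original token types): with :number => /\A\d+\z/ in @token_types,
	# detect(:'42') returns { :token => :'42', :type => :number }.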

	## String Tokenizer ##
	# Breaks up a string into an array of symbols.
	def tokenize string
		tokens = string.gsub(/[\n\t ]+/,' ').split(' ').map {|t| t.to_sym}
		# Replace each bare symbol with its detected {:token, :type} hash.
		tokens = tokens.map do |token|
			detect(token)
		end
		tokens << {:token => :'$', :type => :eof}
	end
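	# A hypothetical illustration (these token types are assumptions, not part
	# of the original source): with @token_types = { :number => /\A\d+\z/,
	# :identifier => /\A[a-z]+\z/ }, tokenize("x 1") returns
	#   [{ :token => :x,   :type => :identifier },
	#    { :token => :'1', :type => :number },
	#    { :token => :'$', :type => :eof }]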

	## Strips meta-grammar from a symbol ##
	def strip_meta symbol
		symbol.to_s.gsub(/[\|\*]/,'').to_sym
	end
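	# e.g. with hypothetical rule symbols, strip_meta(:'*Statement') => :Statement
	# and strip_meta(:'|Call') => :Call.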

	# FIRST algorithm:
	# returns the set of all token types that can validly begin a rule.
	# Memoizes return values for better performance.
	def first rule
		@first[rule] ||=
			if @grammar.has_key? rule
				case @grammar[rule][0][0]
				when '*'
					[first(strip_meta(@grammar[rule][0])),
						first(@grammar[rule][1])].flatten
				when '|'
					@grammar[rule].find_all {|s| s[0] == '|'}.map do |s|
						first(strip_meta(s))
					end.flatten
				else
					[first(@grammar[rule][0])].flatten
				end
			else
				[rule]
			end
	end
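	# A worked example on a hypothetical grammar (the rule names below are
	# assumptions, not part of the original source):
	#   { :Program    => [:'*Statement', :eof],
	#     :Statement  => [:'|Assignment', :'|Call'],
	#     :Assignment => [:identifier, :equals, :number],
	#     :Call       => [:lparen, :identifier, :rparen] }
	# first(:Program) => [:identifier, :lparen, :eof] -- the FIRSTs of both
	# :Statement alternatives (because :Statement is starred and may appear
	# zero times) plus the FIRST of the :eof symbol that follows it.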

	def match t
		if @token[:type] == t
			tkn, @token = @token, @enumerator.next
			return tkn[:token]
		else
			raise ArgumentError.new "SYNTAX ERROR expected #{t}, got #{@token[:token]}"
		end
	end
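	# For instance, if @token is { :token => :x, :type => :identifier } (a
	# hypothetical token), match(:identifier) returns :x and advances @token
	# to the next token from the enumerator; any other argument raises.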

	# After we have a tokenized string, the logic of the top-down process
	# dictates that we:
	# - expand the rules
	#   - matching input tokens
	#     - as predicted by those rules
	# The rules are described by our @grammar and our token types are defined
	# by @token_types. We may want to detect functions as distinct from
	# regular identifiers, but only if it turns out to be necessary.
	# With the enumerator in hand, we need to build some functions which
	# recognize the first two symbols for each rule so that we can accurately
	# consume our language. Now that we have a plan, let's get to it.
	def recognize rule
	end
end
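
As a rough sketch of how the class might be driven, here is a hypothetical grammar and token-type table; the rule names, matchers, and input string are illustrative assumptions, not part of the original source. Since recognize is still a stub, parse only tokenizes the input and primes the token enumerator at this stage.

require_relative 'top_downer'

# Hypothetical token types: each maps a type name to a Regexp matcher.
token_types = {
	:number     => /\A\d+\z/,
	:equals     => /\A=\z/,
	:identifier => /\A[a-z]+\z/,
	:eof        => /\A\$\z/
}

# Hypothetical grammar: a '*' prefix marks a repeatable rule, '|' marks alternatives.
grammar = {
	:Program    => [:'*Statement', :eof],
	:Statement  => [:'|Assignment', :'|Call'],
	:Assignment => [:identifier, :equals, :number],
	:Call       => [:identifier, :identifier]
}

parser = TopDowner.new(grammar, token_types)
parser.parse("x = 1")   # tokenizes the input; full recognition awaits recognize

When recognize is filled in, it would presumably use first(rule) to decide which alternative or repetition to expand and match to consume the predicted terminals, per the plan described in the comment above it.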