Commits

Matt Bone committed e03ba85

the tokenizer is working for simple stuff

  • Participants
  • Parent commits 5d5b56e

Comments (0)

Files changed (1)

File in_py/parse_sexp.py

     close_paren_re: close_paren_token,
     open_paren_re: open_paren_token,
     symbol_start_re: symbol_token,
-    symbol_internal_re:symbol_token,
+    #symbol_internal_re:symbol_token,
     number_re:number_token,
     whitespace_re: whitespace_token,
     }
+
def char_to_token(char):
    """Convert a single character into a token instance.

    Every regex in RE_TO_TOKEN_MAP is tried against *char*; exactly one
    must match, and the corresponding token class is instantiated with
    the character as its value.

    Raises Exception when no regex matches, or when more than one does
    (the character is ambiguous).
    """
    matches = []

    # Collect a token instance for every pattern that accepts this char.
    # (iteritems kept: the file is written against Python 2.)
    for regex, klass in RE_TO_TOKEN_MAP.iteritems():
        if regex.match(char):
            matches.append(klass(char))

    if not matches:
        #TODO specify exception and test
        raise Exception("Could not match symbol %s" % char)

    if len(matches) > 1:
        #TODO specify exception and test
        raise Exception("Ambiguous symbol %s" % char)

    return matches[0]
+
def _tokenize(some_string):
    """Lazily yield tokens for *some_string*.

    Each character becomes a single-char token via char_to_token; adjacent
    tokens are merged with + until a merge is refused, at which point the
    accumulated token is yielded and a new run starts from the refusing
    character's token.
    """
    current_token = None

    for char in some_string:
        token = char_to_token(char)

        if current_token is None:
            current_token = token
        else:
            try:
                current_token = current_token + token
            # NOTE(review): next_token_exception is presumably the exception
            # token.__add__ raises for an incompatible merge — defined
            # elsewhere in this file; confirm the name.
            except next_token_exception:
                yield current_token
                # BUG FIX: start the new run from the token that refused the
                # merge; the original assigned the undefined name next_token.
                current_token = token

    # BUG FIX: only yield a final token when at least one character was
    # consumed; the original yielded None for the empty string.
    if current_token is not None:
        yield current_token
  
 
 def tokenize(some_string):
 
 
 import unittest
class TestTokenizer(unittest.TestCase):
    """Tests for char_to_token and the _tokenize generator."""

    def _assert_token(self, token, klass, value):
        # Shared check: token is an instance of klass holding value.
        self.assertTrue(isinstance(token, klass))
        self.assertEqual(token.value, value)

    def _assert_single_token(self, string, klass, value):
        # _tokenize must collapse the whole string into exactly one token.
        tokens = list(_tokenize(string))
        self.assertEqual(len(tokens), 1)
        self._assert_token(tokens[0], klass, value)

    def test_char_to_token(self):
        # One representative character per token class.
        # (The original bound a local named `open`, shadowing the builtin;
        # the helper removes both the shadowing and the copy-paste.)
        self._assert_token(char_to_token("a"), symbol_token, "a")
        self._assert_token(char_to_token("9"), number_token, "9")
        self._assert_token(char_to_token("("), open_paren_token, "(")
        self._assert_token(char_to_token(")"), close_paren_token, ")")
        self._assert_token(char_to_token(" "), whitespace_token, " ")

    def test_simple_under_tokenize(self):
        # Runs of compatible characters merge into a single token.
        self._assert_single_token("a", symbol_token, "a")
        self._assert_single_token("abcd", symbol_token, "abcd")
        self._assert_single_token("ab9+1-", symbol_token, "ab9+1-")
        self._assert_single_token("9", number_token, "9")
        self._assert_single_token("9876", number_token, "9876")
+
 class TestTokens(unittest.TestCase):
     """Test the token classes which are responsible for determining
     what characters can be pushed together (i.e. they contain