Commits

Lenard Lindstrom committed 1894b15

Add token classification

Each imgblit::Scanner::classify function in a list checks a token to determine
whether it belongs to a particular token class.

To simplify the reuse of an imgblit::Scanner instance, the <<= operator is
overloaded to set the source code and classify functions of the scanner. Also,
the << operator is overloaded to permit multiple string and classify assignments
before token processing begins.

Comments (0)

Files changed (3)

include/imgblit/scanner.h

     class Scanner {
     
     public:
-        Scanner(const char *source);
+        typedef const char *(*classify)(const string &source,
+                                        string::size_type lexeme_start,
+                                        string::size_type lexeme_size);
+        typedef vector<classify> class_tests;
+        Scanner();
         virtual ~Scanner();
+        virtual Scanner &operator <<=(const char *source);
+        virtual Scanner &operator <<(const char *source);
         virtual void reset();
-        Token get_token();
-        void push_back_token(const Token t);
+        virtual Scanner &operator <<=(classify item);
+        virtual Scanner &operator <<(classify item);
+        virtual Token get_token();
+        virtual Scanner &operator <<(const Token t);
     protected:
-        Token read_token();
+        virtual Token read_token();
         string scr_source;
         vector<Token> scr_next;
-        string::size_type scr_read_start;
+        string::size_type scr_read_next;
+        class_tests identifier_list;
     private:
-        Scanner();
         Scanner(const Scanner &);
-        Scanner & operator=(const Scanner &);
+        Scanner &operator =(const Scanner &);
     };
 
-    class Parser {
-
-    public:
-        Parser(Scanner &source);
-        virtual ~Parser();
-        Reference parse();
-    protected:
-        Scanner &psr_source;
-    private:
-        Parser();
-        Parser(const Parser &);
-        Parser & operator=(const Parser &);
-    };
+    Scanner &operator +=(Scanner &ltype, Scanner::classify rtype);
+    Scanner &operator ,(Scanner &lhs, Scanner::classify rhs);
 };
 
 #endif
 #include "imgblit/scanner.h"
+#include "imgblit/exceptions.h"
 #include "imgblit/internal.h"
 #include <ostream>
+#include <sstream>
+
+#include <iostream>
 
 using std::ostream;
+using std::stringstream;
 
 namespace imgblit {
     struct token_t : public datum_t {
     return not_null_guard(dynamic_cast<token_t *>(get_datum_p()));
 }
 
-Scanner::Scanner(const char *source) :
-    scr_source(source), scr_read_start(0)
+Scanner::Scanner() : scr_source(""), scr_read_next(0)
 {
-    push_back_token(read_token());
 }
 
 Scanner::~Scanner()
 {
 }
 
+Scanner &
+Scanner::operator <<=(classify item)
+{
+    identifier_list.clear();
+    return *this << item;
+}
+
+Scanner &
+Scanner::operator <<(classify item)
+{
+    identifier_list.push_back(item);
+    return *this;
+}
+
+Scanner &
+Scanner::operator <<(const char *source)
+{
+    scr_source += source;
+    return *this;
+}
+
+Scanner &
+Scanner::operator <<=(const char *source)
+{
+    scr_source = source;
+    reset();
+    return *this;
+}
+
 void
 Scanner::reset()
 {
-    scr_read_start = 0;
+    scr_read_next = 0;
     scr_next.clear();
-    scr_next.push_back(read_token());
 }
 
 Token
 Scanner::get_token()
 {
-    Token t = scr_next.back();
-    scr_next.pop_back();
-    if (scr_next.empty()) {
-        scr_next.push_back(read_token());
+    if (!scr_next.empty()) {
+        Token t = scr_next.back();
+        scr_next.pop_back();
+        return t;
     }
-    return t;
+    return read_token();
 }
 
-void
-Scanner::push_back_token(const Token t)
+Scanner &
+Scanner::operator <<(const Token t)
 {
     scr_next.push_back(t);
+    return *this;
 }
 
 Token
 Scanner::read_token()
 {
-    const char * const token_class = "token";
+    const char *token_class = 0;
     string::size_type end = scr_source.size();
-    string::size_type i = scr_read_start;
+    string::size_type start = scr_read_next;
 
-    while (i != end && isspace(scr_source[i])) {
-        ++i;
+    while (start != end && isspace(scr_source[start])) {
+        ++start;
     }
-    scr_read_start = i;
-    if (i == end) {
+    scr_read_next = start;
+    if (scr_read_next == end) {
         return Token();
     }
-    while (scr_read_start != end &&
-           !isspace(scr_source[scr_read_start])) {
-        ++scr_read_start;
+    do {
+        ++scr_read_next;
     }
-    return Token(scr_source, i, scr_read_start - i, token_class);
+    while (scr_read_next != end && !isspace(scr_source[scr_read_next]));
+    int size = scr_read_next - start;
+
+    for (int i = 0, n = identifier_list.size(); i != n && !token_class; ++i) {
+        token_class = identifier_list[i](scr_source, start, size);
+    }
+    if (!token_class) {
+        stringstream msg;
+        msg << "Invalid token "
+            << '"' << scr_source.substr(start, size).c_str() << '"'
+            << " at position " << start << ".";
+        throw ValueError(msg.str());
+    }
+    return Token(scr_source, start, size, token_class);
 }
-
-Parser::Parser(Scanner &source) : psr_source(source)
-{
-}
-
-Parser::~Parser()
-{
-}
-
-Reference
-Parser::parse()
-{
-    return Reference();
-}

test/testscanner.cpp

 #include "imgblit/scanner.h"
+#include "imgblit/exceptions.h"
+#include <string>
 #include <sstream>
 #include <cassert>
+#include <cctype>
+#include <cstring>
+
 
 using namespace imgblit;
+using std::string;
 using std::stringstream;
+using std::isdigit;
+using std::isalnum;
+using std::strcmp;
+
+
+void test_tokenize();
+void test_reset();
+void test_classify();
+void test_unclassified();
+void test_source_concatenation();
+void test_replace();
 
 int main()
 {
-    Scanner scan("one two three");
+    test_tokenize();
+    test_reset();
+    test_classify();
+    test_unclassified();
+    test_source_concatenation();
+    test_replace();
+
+    return 0;
+}
+
+// Notes:
+// ------
+//
+// In an expression such as::
+// 
+//    std::cout << fn1() << fn2() << fn3();
+//
+// the order in which the functions are called differs from the order in which
+// the << operators are evaluated. So when building a stringstream from tokens
+// for an assertion, the tokens are appended individually, rather than
+// in one single expression.
+//
+
+const char *
+its_a_token(const string &s, string::size_type pos, string::size_type sz)
+{
+    const char * const class_name = "token";
+    return class_name;
+}
+
+void
+test_tokenize() {
+    Scanner scan;
+    scan << its_a_token << "one two three";
     Token t_one = scan.get_token();
     Token t_two = scan.get_token();
     Token t_three = scan.get_token();
     assert(capture.str() == ("Token(\"one\", \"token\")"
 			     "Token(\"two\", \"token\")"
 			     "Token(\"three\", \"token\")"));
+}
+
+void
+test_reset()
+{
+    stringstream capture;
+    Scanner scan;
+    scan << its_a_token;
+
+    // Nothing to parse
+    assert(!static_cast<bool>(scan.get_token()));
+
+    // Give it a source string.
+    scan << "one two three";
+    capture << scan.get_token();  // "one"
+    capture << scan.get_token();  // "two"
+    capture << scan.get_token();  // "three"
+    capture << scan.get_token();  // non-reference
+    assert(capture.str() == ("Token(\"one\", \"token\")"
+			     "Token(\"two\", \"token\")"
+			     "Token(\"three\", \"token\")"
+			     "Reference()"));
 
     // Go back to the beginning of the source string.
     capture.str("");
     scan.reset();
-    Token t_reset_one = scan.get_token();
-    Token t_reset_two = scan.get_token();
-    Token t_reset_three = scan.get_token();
-    Token t_reset_none = scan.get_token();
-    capture << t_reset_one << t_reset_two << t_reset_three << t_reset_none;
+    capture << scan.get_token();  // "one"
+    capture << scan.get_token();  // "two"
+    capture << scan.get_token();  // "three"
+    capture << scan.get_token();  // non-reference
     assert(capture.str() == ("Token(\"one\", \"token\")"
 			     "Token(\"two\", \"token\")"
 			     "Token(\"three\", \"token\")"			
 			     "Reference()"));
+}
 
-    return 0;
+const char *
+its_digits(const string &s, string::size_type pos, string::size_type sz)
+{
+    const char * const class_name = "digits";
+    int i = pos;
+    int finish = pos + sz;
+
+    while (i != finish && isdigit(s[i])) {
+        ++i;
+    }
+    return i == finish ? class_name : 0;
 }
+
+const char *
+its_alphanumeric(const string &s, string::size_type pos, string::size_type sz)
+{
+    const char * const class_name = "alphanumeric";
+    int i = pos;
+    int finish = pos + sz;
+
+    while (i != finish && isalnum(s[i])) {
+        ++i;
+    }
+    return i == finish ? class_name : 0;
+}
+
+void
+test_classify()
+{
+    stringstream capture;
+
+    // The token class order differs slightly from the classify
+    // function call order: its_digits, its_alphanumeric, its_a_token.
+    Scanner scan;
+    scan << its_digits << its_alphanumeric << its_a_token
+         << "z121 121 _z121";
+
+    // All classify functions are used. The non-reference Reference()
+    // is not checked.
+    capture << scan.get_token();  // "z121"
+    capture << scan.get_token();  // "121"
+    capture << scan.get_token();  // "_z121"
+    capture << scan.get_token();  // non-reference
+    assert(capture.str() == ("Token(\"z121\", \"alphanumeric\")"
+                             "Token(\"121\", \"digits\")"
+			     "Token(\"_z121\", \"token\")"
+			     "Reference()"));
+}
+
+void
+test_unclassified()
+{
+    Scanner scan;
+    scan << its_digits << its_alphanumeric << "z121 121 _z121 45";
+    stringstream capture;
+
+    // The first two tokens have recognized token classes.
+    scan.get_token();
+    scan.get_token();
+
+    // The third token does not.
+    try {
+        scan.get_token();
+    }
+    catch (ValueError e) {
+        assert(strcmp(e.what(), "Invalid token \"_z121\" at position 9.") == 0);
+    }
+
+    // The scanner recovers and returns the fourth token as class digits.
+    capture << scan.get_token();
+    assert(capture.str() == "Token(\"45\", \"digits\")");
+}
+
+void
+test_source_concatenation()
+{
+    Scanner scan;
+    stringstream capture;
+
+    // First token.
+    scan << "100" << its_digits;
+    capture << scan.get_token();
+    assert(capture.str() == "Token(\"100\", \"digits\")");
+    assert(!static_cast<bool>(scan.get_token()));
+
+    // Second token.
+    scan << " 200";
+    capture.str("");
+    capture << scan.get_token();
+    assert(capture.str() == "Token(\"200\", \"digits\")");
+    assert(!static_cast<bool>(scan.get_token()));
+
+    // After resetting, both tokens will still be there.
+    scan.reset();
+    capture.str("");
+    capture << scan.get_token();  // "100"
+    capture << scan.get_token();  // "200"
+    capture << scan.get_token();  // non-reference
+    assert(capture.str() == ("Token(\"100\", \"digits\")"
+                             "Token(\"200\", \"digits\")"
+                             "Reference()"));
+}
+
+void
+test_replace()
+{
+    Scanner scan;
+    stringstream capture;
+
+    // First source string.
+    scan << its_digits << its_alphanumeric << "abc ";
+    capture << scan.get_token();  // "abc"
+    capture << scan.get_token();  // non-reference
+    assert(capture.str() == ("Token(\"abc\", \"alphanumeric\")"
+                             "Reference()"));
+
+    // Second source string.
+    capture.str("");
+    scan <<= "123 ";
+    capture << scan.get_token();  // "123"
+    capture << scan.get_token();  // non-reference
+    assert(capture.str() == ("Token(\"123\", \"digits\")"
+                             "Reference()"));
+
+    // Different classify function.
+    capture.str("");
+    scan <<= its_a_token;
+    scan.reset();
+    capture << scan.get_token();  // "123"
+    capture << scan.get_token();  // non-reference
+    assert(capture.str() == ("Token(\"123\", \"token\")"
+                             "Reference()"));
+}