Commits

Lenard Lindstrom committed bf3a3e0

Save token class identification so far

  • Participants
  • Parent commits 9adddcc

Comments (0)

Files changed (2)

+# HG changeset patch
+# Parent 9184449a7eeb77303b49b1390a08575a655114f3
+Add token class identification; needs more unit tests.
+
+diff -r 9184449a7eeb -r 0125b1876a65 include/imgblit/scanner.h
+--- a/include/imgblit/scanner.h	Thu Apr 10 10:43:58 2014 -0700
++++ b/include/imgblit/scanner.h	Fri Apr 11 19:18:02 2014 -0700
+@@ -29,35 +29,32 @@
+     class Scanner {
+     
+     public:
+-        Scanner(const char *source);
++        typedef const char *(*classify)(const string &source,
++                                        string::size_type lexeme_start,
++                                        string::size_type lexeme_size);
++        typedef vector<classify> class_tests;
++        Scanner();
+         virtual ~Scanner();
++        virtual Scanner &operator <<=(const char *source);
++        virtual Scanner &operator <<(const char *source);
+         virtual void reset();
+-        Token get_token();
+-        void push_back_token(const Token t);
++        virtual Scanner &operator <<=(classify item);
++        virtual Scanner &operator <<(classify item);
++        virtual Token get_token();
++        virtual Scanner &operator <<(const Token t);
+     protected:
+-        Token read_token();
++        virtual Token read_token();
+         string scr_source;
+         vector<Token> scr_next;
+-        string::size_type scr_read_start;
++        string::size_type scr_read_next;
++        class_tests identifier_list;
+     private:
+-        Scanner();
+         Scanner(const Scanner &);
+-        Scanner & operator=(const Scanner &);
++        Scanner &operator =(const Scanner &);
+     };
+ 
+-    class Parser {
+-
+-    public:
+-        Parser(Scanner &source);
+-        virtual ~Parser();
+-        Reference parse();
+-    protected:
+-        Scanner &psr_source;
+-    private:
+-        Parser();
+-        Parser(const Parser &);
+-        Parser & operator=(const Parser &);
+-    };
++    Scanner &operator +=(Scanner &ltype, Scanner::classify rtype);
++    Scanner &operator ,(Scanner &lhs, Scanner::classify rhs);
+ };
+ 
+ #endif
+diff -r 9184449a7eeb -r 0125b1876a65 src/scanner.cpp
+--- a/src/scanner.cpp	Thu Apr 10 10:43:58 2014 -0700
++++ b/src/scanner.cpp	Fri Apr 11 19:18:02 2014 -0700
+@@ -1,8 +1,13 @@
+ #include "imgblit/scanner.h"
++#include "imgblit/exceptions.h"
+ #include "imgblit/internal.h"
+ #include <ostream>
++#include <sstream>
++
++#include <iostream>
+ 
+ using std::ostream;
++using std::stringstream;
+ 
+ namespace imgblit {
+     struct token_t : public datum_t {
+@@ -47,72 +52,97 @@
+     return not_null_guard(dynamic_cast<token_t *>(get_datum_p()));
+ }
+ 
+-Scanner::Scanner(const char *source) :
+-    scr_source(source), scr_read_start(0)
++Scanner::Scanner() : scr_source(""), scr_read_next(0)
+ {
+-    push_back_token(read_token());
+ }
+ 
+ Scanner::~Scanner()
+ {
+ }
+ 
++Scanner &
++Scanner::operator <<=(classify item)
++{
++    identifier_list.clear();
++    return *this << item;
++}
++
++Scanner &
++Scanner::operator <<(classify item)
++{
++    identifier_list.push_back(item);
++    return *this;
++}
++
++Scanner &
++Scanner::operator <<(const char *source)
++{
++    scr_source += source;
++    return *this;
++}
++
++Scanner &
++Scanner::operator <<=(const char *source)
++{
++    scr_source = source;
++    reset();
++    return *this;
++}
++
+ void
+ Scanner::reset()
+ {
+-    scr_read_start = 0;
++    scr_read_next = 0;
+     scr_next.clear();
+-    scr_next.push_back(read_token());
+ }
+ 
+ Token
+ Scanner::get_token()
+ {
+-    Token t = scr_next.back();
+-    scr_next.pop_back();
+-    if (scr_next.empty()) {
+-        scr_next.push_back(read_token());
++    if (!scr_next.empty()) {
++        Token t = scr_next.back();
++        scr_next.pop_back();
++        return t;
+     }
+-    return t;
++    return read_token();
+ }
+ 
+-void
+-Scanner::push_back_token(const Token t)
++Scanner &
++Scanner::operator <<(const Token t)
+ {
+     scr_next.push_back(t);
++    return *this;
+ }
+ 
+ Token
+ Scanner::read_token()
+ {
+-    const char * const token_class = "token";
++    const char *token_class = 0;
+     string::size_type end = scr_source.size();
+-    string::size_type i = scr_read_start;
++    string::size_type start = scr_read_next;
+ 
+-    while (i != end && isspace(scr_source[i])) {
+-        ++i;
++    while (start != end && isspace(scr_source[start])) {
++        ++start;
+     }
+-    scr_read_start = i;
+-    if (i == end) {
++    scr_read_next = start;
++    if (scr_read_next == end) {
+         return Token();
+     }
+-    while (scr_read_start != end &&
+-           !isspace(scr_source[scr_read_start])) {
+-        ++scr_read_start;
++    do {
++        ++scr_read_next;
+     }
+-    return Token(scr_source, i, scr_read_start - i, token_class);
++    while (scr_read_next != end && !isspace(scr_source[scr_read_next]));
++    int size = scr_read_next - start;
++
++    for (int i = 0, n = identifier_list.size(); i != n && !token_class; ++i) {
++        token_class = identifier_list[i](scr_source, start, size);
++    }
++    if (!token_class) {
++        stringstream msg;
++        msg << "Invalid token "
++            << '"' << scr_source.substr(start, size).c_str() << '"'
++            << " at position " << start << ".";
++        throw ValueError(msg.str());
++    }
++    return Token(scr_source, start, size, token_class);
+ }
+-
+-Parser::Parser(Scanner &source) : psr_source(source)
+-{
+-}
+-
+-Parser::~Parser()
+-{
+-}
+-
+-Reference
+-Parser::parse()
+-{
+-    return Reference();
+-}
+diff -r 9184449a7eeb -r 0125b1876a65 test/testscanner.cpp
+--- a/test/testscanner.cpp	Thu Apr 10 10:43:58 2014 -0700
++++ b/test/testscanner.cpp	Fri Apr 11 19:18:02 2014 -0700
+@@ -1,13 +1,30 @@
+ #include "imgblit/scanner.h"
++#include "imgblit/exceptions.h"
++#include <string>
+ #include <sstream>
+ #include <cassert>
++#include <cctype>
++#include <cstring>
+ 
+ using namespace imgblit;
++using std::string;
+ using std::stringstream;
++using std::isdigit;
++using std::isalnum;
++using std::strcmp;
+ 
+-int main()
++
++const char *
++its_a_token(const string &s, string::size_type pos, string::size_type sz)
+ {
+-    Scanner scan("one two three");
++    const char * const class_name = "token";
++    return class_name;
++}
++
++void
++test_tokenize() {
++    Scanner scan;
++    scan << its_a_token << "one two three";
+     Token t_one = scan.get_token();
+     Token t_two = scan.get_token();
+     Token t_three = scan.get_token();
+@@ -26,6 +43,30 @@
+ 			     "Token(\"two\", \"token\")"
+ 			     "Token(\"three\", \"token\")"));
+ 
++}
++
++void
++test_reset()
++{
++    stringstream capture;
++    Scanner scan;
++    scan << its_a_token;
++
++    // Nothing to parse
++    assert(!static_cast<bool>(scan.get_token()));
++
++    // Give it a source string.
++    scan << "one two three";
++    Token t_one = scan.get_token();
++    Token t_two = scan.get_token();
++    Token t_three = scan.get_token();
++    Token t_none = scan.get_token();
++    capture << t_one << t_two << t_three << t_none;
++    assert(capture.str() == ("Token(\"one\", \"token\")"
++			     "Token(\"two\", \"token\")"
++			     "Token(\"three\", \"token\")"
++			     "Reference()"));
++
+     // Go back to the beginning of the source string.
+     capture.str("");
+     scan.reset();
+@@ -38,6 +79,88 @@
+ 			     "Token(\"two\", \"token\")"
+ 			     "Token(\"three\", \"token\")"			
+ 			     "Reference()"));
++}
++
++const char *
++its_all_digits(const string &s, string::size_type pos, string::size_type sz)
++{
++    const char * const class_name = "digits";
++    int i = pos;
++    int finish = pos + sz;
++
++    while (i != finish && isdigit(s[i])) {
++        ++i;
++    }
++    return i == finish ? class_name : 0;
++}
++
++const char *
++its_alphanumeric(const string &s, string::size_type pos, string::size_type sz)
++{
++    const char * const class_name = "alphanumeric";
++    int i = pos;
++    int finish = pos + sz;
++
++    while (i != finish && isalnum(s[i])) {
++        ++i;
++    }
++    return i == finish ? class_name : 0;
++}
++
++void
++test_classify()
++{
++    stringstream capture;
++
++    // The source tokens are ordered differently from the classify function
++    // call order, which is its_all_digits, its_alphanumeric, its_a_token.
++    Scanner scan;
++    scan << its_all_digits << its_alphanumeric << its_a_token
++         << "z121 121 _z121";
++
++    // All classify functions are used. The end-of-input null token, which
++    // prints as Reference(), is never passed to a classify function.
++    Token t_alphanumeric = scan.get_token();
++    Token t_digits = scan.get_token();
++    Token t_token = scan.get_token();
++    Token t_none = scan.get_token();
++    capture << t_digits << t_alphanumeric << t_token << t_none;
++    assert(capture.str() == ("Token(\"121\", \"digits\")"
++			     "Token(\"z121\", \"alphanumeric\")"
++			     "Token(\"_z121\", \"token\")"
++			     "Reference()"));
++}
++
++void
++test_unclassified()
++{
++    Scanner scan;
++    scan << its_all_digits << its_alphanumeric << "z121 121 _z121 45";
++    stringstream capture;
++
++    // The first two tokens have recognized token classes.
++    scan.get_token();
++    scan.get_token();
++
++    // The third token does not.
++    try {
++        scan.get_token();
++    }
++    catch (ValueError e) {
++        assert(strcmp(e.what(), "Invalid token \"_z121\" at position 9.") == 0);
++    }
++
++    // The scanner recovers and returns the fourth token as class digits.
++    capture << scan.get_token();
++    assert(capture.str() == "Token(\"45\", \"digits\")");
++}
++
++int main()
++{
++    test_tokenize();
++    test_reset();
++    test_classify();
++    test_unclassified();
+ 
+     return 0;
+ }
+classify
 # Placed by Bitbucket