Socrates/src/socrates/parser/prdparser/scanner.py

#!/usr/bin/env python
# -*- coding:utf-8 -*-

from spark import GenericScanner
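# SPARK's GenericScanner assembles one master regular expression from the
# docstrings of the t_* methods defined below and dispatches each match to
# the corresponding method; per SPARK's convention the combined pattern is
# compiled in verbose mode, so the leading space in each docstring is
# ignored.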

class Token(object):
    """A scanned token: a token-type name plus the matched text."""

    def __init__(self, type, attr=None):
        self.type = type
        self.attr = attr

    def __cmp__(self, o):
        # SPARK grammar rules refer to tokens by their type names (plain
        # strings), so a Token is compared against a string rather than
        # another Token.  Note this is Python 2 only: cmp/__cmp__ were
        # removed in Python 3, where __eq__ would be needed instead.
        return cmp(self.type, o)

    def __str__(self):
        return "%s:%s" % (self.type, self.attr)

    def __repr__(self):
        return "%s:%s" % (self.type, self.attr)

class PrdExprScanner(GenericScanner):
    """
>>> scanner = PrdExprScanner()
>>> scanner.tokenize("ssoid")
[chip:ssoid]
>>> scanner.tokenize('"ssoid"')
[":", chip:ssoid, ":"]
>>> scanner.tokenize('"I\\'m a ssoid"')
[":", chip:I, ':', chip:m a ssoid, ":"]
>>> scanner.tokenize('name')
[chip:name]
>>> scanner.tokenize('name:string')
[chip:name, :::, chip:string]
>>> scanner.tokenize("visit page")
[chip:visit page]
>>> scanner.tokenize("visit page:string")
[chip:visit page, :::, chip:string]
>>> scanner.tokenize('"test":"subject"')
[":", chip:test, ":", :::, ":", chip:subject, ":"]
>>> scanner.tokenize('"test subject":"subject"')
[":", chip:test subject, ":", :::, ":", chip:subject, ":"]
>>> scanner.tokenize('"test subject:page":"subject"')
[":", chip:test subject, :::, chip:page, ":", :::, ":", chip:subject, ":"]
>>> scanner.tokenize('"test subject":"string type"')
[":", chip:test subject, ":", :::, ":", chip:string type, ":"]
>>> scanner.tokenize('"test subject:page":"subject array"')
[":", chip:test subject, :::, chip:page, ":", :::, ":", chip:subject array, ":"]
>>> scanner.tokenize('"test subject:page":"subject array:integer"')
[":", chip:test subject, :::, chip:page, ":", :::, ":", chip:subject array, :::, chip:integer, ":"]
>>> scanner.tokenize('"test subject:page":"subject array:integer"')
[":", chip:test subject, :::, chip:page, ":", :::, ":", chip:subject array, :::, chip:integer, ":"]
>>> scanner.tokenize('\\\\\\'test subject:page\\\\\\':\\\\\\'subject array:integer\\\\\\\'')
[\\:\\, ':', chip:test subject, :::, chip:page, \\:\\, ':', :::, \\:\\, ':', chip:subject array, :::, chip:integer, \\:\\, ':']
    """
    def __init__(self):
        GenericScanner.__init__(self)

    def tokenize(self, input):
        # Reset per-call state, then let GenericScanner drive the t_*
        # rules below.  (self.status and self.quote are reset here but
        # never consulted by the current rules.)
        self.rv = []
        self.status = None
        self.quote = None
        GenericScanner.tokenize(self, input)
        return self.rv

    # Whitespace is not scanned as a separate token; since t_chip's
    # character class below does not exclude spaces, whitespace stays
    # inside the surrounding chip (e.g. chip:visit page).
    # def t_whitespace(self, s):
    #     r' \s+'
    #     self.rv.append(s)
    def t_quote(self, s):
        r' "|\''
        self.rv.append(Token(type=s, attr=s))

    def t_escape(self, s):
        r' \\'
        self.rv.append(Token(type=s, attr=s))

    def t_colon(self, s):
        r' :'
        self.rv.append(Token(type=s, attr=s))

    def t_chip(self, s):
        r' [^\'\"\\\:]+'
        # Any run of characters other than quotes, backslashes, and
        # colons -- spaces included -- becomes a single chip token.
        self.rv.append(Token(type='chip', attr=s))

if __name__ == "__main__":
    import doctest
    doctest.testmod()
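
# Example usage (a minimal sketch; assumes SPARK's spark module is
# importable and Python 2 is in use, since Token.__cmp__ relies on cmp):
#
#     scanner = PrdExprScanner()
#     scanner.tokenize('name:string')  # -> [chip:name, :::, chip:string]
#     scanner.tokenize('"ssoid"')      # -> [":", chip:ssoid, ":"]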