Source

whoosh / src / whoosh / automata / nfa.py

Full commit
# Copyright 2012 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

from whoosh.automata.fst import Arc


class Instruction(object):
    def __repr__(self):
        return "%s()" % (self.__class__.__name__, )


class Char(Instruction):
    """
    Matches a literal character.
    """

    def __init__(self, c):
        self.c = c

    def __repr__(self):
        return "Char(%r)" % self.c

class Lit(Instruction):
    """
    Matches a literal string.
    """

    def __init__(self, c):
        self.c = c

    def __repr__(self):
        return "Lit(%r)" % self.c


class Any(Instruction):
    """
    Matches any character.
    """


class Match(Instruction):
    """
    Stop this thread: the string matched.
    """

    def __repr__(self):
        return "Match()"


class Jmp(Instruction):
    """
    Jump to a specified instruction.
    """

    def __init__(self, x):
        self.x = x

    def __repr__(self):
        return "Jmp(%s)" % self.x


class Split(Instruction):
    """
    Split execution: continue at two separate specified instructions.
    """

    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __repr__(self):
        return "Split(%s, %s)" % (self.x, self.y)


class Label(Instruction):
    """
    Placeholder to act as a target for JMP instructions
    """

    def __hash__(self):
        return id(self)

    def __repr__(self):
        return "L(%s)" % hex(id(self))


def concat(e1, e2):
    return e1 + e2


def alt(e1, e2):
    L1, L2, L3 = Label(), Label(), Label()
    return [L1] + e1 + [Jmp(L3), L2] + e2 + [L3]


def zero_or_one(e):
    L1, L2 = Label(), Label()
    return [Split(L1, L2), L1] + e + [L2]


def zero_or_more(e):
    L1, L2, L3 = Label(), Label(), Label()
    return [L1, Split(L2, L3), L2] + e + [Jmp(L1), L3]


def one_or_more(e):
    L1, L2 = Label(), Label()
    return [L1] + e + [Split(L1, L2), L2]


def fixup(program):
    refs = {}
    i = 0
    while i < len(program):
        op = program[i]
        if isinstance(op, Label):
            refs[op] = i
            program.pop(i)
        else:
            i += 1

    if refs:
        for op in program:
            if isinstance(op, (Jmp, Split)):
                op.x = refs[op.x]
            if isinstance(op, Split):
                op.y = refs[op.y]

    return program + [Match]


class ThreadList(object):
    def __init__(self, program, max=1000):
        self.program = program
        self.max = max
        self.threads = []

    def __nonzero__(self):
        return bool(self.threads)

    def current(self):
        return self.threads.pop()

    def add(self, thread):
        op = self.program[thread.pc]
        optype = type(op)
        if optype is Jmp:
            self.add(thread.at(op.x))
        elif optype is Split:
            self.add(thread.copy_at(op.x))
            self.add(thread.at(op.y))
        else:
            self.threads.append(thread)


class Thread(object):
    def __init__(self, pc, address, sofar='', accept=False):
        self.pc = pc
        self.address = address
        self.sofar = sofar
        self.accept = accept

    def at(self, pc):
        self.pc = pc
        return self

    def copy_at(self, pc):
        return Thread(pc, self.address, self.sofar, self.accept)

    def __repr__(self):
        d = self.__dict__
        return "Thread(%s)" % ",".join("%s=%r" % (k, v) for k, v in d.items())


def advance(thread, arc, c):
    thread.pc += 1
    thread.address = arc.target
    thread.sofar += c
    thread.accept = arc.accept


def run(graph, program, address):
    threads = ThreadList(program)
    threads.add(Thread(0, address))
    arc = Arc()
    while threads:
        thread = threads.current()
        address = thread.address
        op = program[thread.pc]
        optype = type(op)

        if optype is Char:
            if address:
                arc = graph.find_arc(address, op.c, arc)
                if arc:
                    advance(thread, arc)
                    threads.add(thread)
        elif optype is Lit:
            if address:
                c = op.c
                arc = graph.find_path(c, arc, address)
                if arc:
                    advance(thread, arc, c)
                    threads.add(thread)
        elif optype is Any:
            if address:
                sofar = thread.sofar
                pc = thread.pc + 1
                for arc in graph.iter_arcs(address, arc):
                    t = Thread(pc, arc.target, sofar + arc.label, arc.accept)
                    threads.add(t)
        elif op is Match:
            if thread.accept:
                yield thread.sofar
        else:
            raise Exception("Don't know what to do with %r" % op)


LO = 0
HI = 1


def regex_limit(graph, mode, program, address):
    low = mode == LO
    output = []
    threads = ThreadList(program)
    threads.add(Thread(0, address))
    arc = Arc()
    while threads:
        thread = threads.current()
        address = thread.address
        op = program[thread.pc]
        optype = type(op)

        if optype is Char:
            if address:
                arc = graph.find_arc(address, op.c, arc)
                if arc:
                    if low and arc.accept:
                        return thread.sofar + thread.label
                    advance(thread, arc)
                    threads.add(thread)
        elif optype is Lit:
            if address:
                labels = op.c
                for label in labels:
                    arc = graph.find_arc(address, label)
                    if arc is None:
                        return thread.sofar
            elif thread.accept:
                return thread.sofar
        elif optype is Any:
            if address:
                if low:
                    arc = graph.arc_at(address, arc)
                else:
                    for arc in graph.iter_arcs(address):
                        pass
                advance(thread, arc, arc.label)
                threads.add(thread)
            elif thread.accept:
                return thread.sofar
        elif op is Match:
            return thread.sofar
        else:
            raise Exception("Don't know what to do with %r" % op)


if __name__ == "__main__":
    from whoosh import index, query
    from whoosh.filedb.filestore import RamStorage
    from whoosh.automata import fst
    from whoosh.util.testing import timing

    st = RamStorage()
    gw = fst.GraphWriter(st.create_file("test"))
    gw.start_field("test")
    for key in ["aaaa", "aaab", "aabb", "abbb", "babb", "bbab", "bbba"]:
        gw.insert(key)
    gw.close()
    gr = fst.GraphReader(st.open_file("test"))

    program = one_or_more([Lit("a")])
    print program
    program = fixup(program)
    print program
    print list(run(gr, program, gr.root("test")))

    ix = index.open_dir("e:/dev/src/houdini/help/index")
    r = ix.reader()
    gr = r._get_graph()

#    program = fixup([Any(), Any(), Any(), Any(), Any()])
#    program = fixup(concat(zero_or_more([Any()]), [Char("/")]))
#    with timing():
#        x = list(run(gr, program, gr.root("path")))
#    print len(x)

    q = query.Regex("path", "^.[abc].*/$")
    with timing():
        y = list(q._btexts(r))
    print len(y)
    print y[0], y[-1]

    pr = [Any()] + alt([Lit("c")], alt([Lit("b")], [Lit("a")])) + zero_or_more([Any()]) + [Lit("/")]
    program = fixup(pr)
#    with timing():
#        x = list(run(gr, program, gr.root("path")))
#    print len(x), x

    with timing():
        print "lo=", regex_limit(gr, LO, program, gr.root("path"))
        print "hi=", regex_limit(gr, HI, program, gr.root("path"))



#int
#backtrackingvm(Inst *prog, char *input)
#{
#    enum { MAXTHREAD = 1000 };
#    Thread ready[MAXTHREAD];
#    int nready;
#    Inst *pc;
#    char *sp;
#
#    /* queue initial thread */
#    ready[0] = thread(prog, input);
#    nready = 1;
#
#    /* run threads in stack order */
#    while(nready > 0){
#        --nready;  /* pop state for next thread to run */
#        pc = ready[nready].pc;
#        sp = ready[nready].sp;
#        for(;;){
#            switch(pc->opcode){
#            case Char:
#                if(*sp != pc->c)
#                    goto Dead;
#                pc++;
#                sp++;
#                continue;
#            case Match:
#                return 1;
#            case Jmp:
#                pc = pc->x;
#                continue;
#            case Split:
#                if(nready >= MAXTHREAD){
#                    fprintf(stderr, "regexp overflow");
#                    return -1;
#                }
#                /* queue new thread */
#                ready[nready++] = thread(pc->y, sp);
#                pc = pc->x;  /* continue current thread */
#                continue;
#            }
#        }
#    Dead:;
#    }
#    return 0;
#}