1. Lars Yencken
  2. parse-vis


parse-vis / parse_vis.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#  vis_graph.py
#  vis-graph
#  Created by Lars Yencken on 2010-09-29.
#  Copyright 2010 NICTA. All rights reserved.

"""Visualizes GR parse graphs using graphviz."""

import os
import sys
import optparse
import subprocess
import itertools
import gzip
import tempfile

import shell_color

def parse_vis(filename, fmt='pdf', skip=0):
    """Render every parse graph found in ``filename`` with graphviz.

    Each parse is dumped to a temporary ``.dot`` file, which is then
    rendered to ``fmt`` by ``_render_dot``.  A summary line with the number
    of processed parses is written to stdout at the end.

    Args:
        filename: Path of the parse file, handed to ``_iter_parses``.
        fmt: Output format passed on to ``_render_dot`` (default ``'pdf'``).
        skip: Number of leading parses to ignore before rendering.
    """
    parses = _iter_parses(filename)

    if skip > 0:
        # skip the first n parses
        parses = itertools.islice(parses, skip, None)

    n = 0
    for graph, tokens in parses:
        # Text mode so that plain strings can be written on any Python.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.dot') as dot_file:
            _dump_dot(graph, dot_file)
            # dot opens the file by name, so anything still sitting in the
            # write buffer must reach the disk before it runs.
            dot_file.flush()
            # NOTE(review): the path returned by _render_dot and `tokens`
            # are not used here; _view_file/_display_tokens look like they
            # were meant to be called in this loop -- confirm.
            _render_dot(dot_file.name, fmt)

        n += 1

    sys.stdout.write('Displayed %d parse graphs\n' % n)

def _iter_parses(filename):
    """Return an iterator over all valid parses in the file.

    Yields ``(graph, tokens)`` pairs as produced by ``_find_parse``, in file
    order, and stops at the first empty graph.  A file whose name ends in
    ``.gz`` is decompressed on the fly.
    """
    is_gzipped = filename.endswith('.gz')
    # Compressed data is binary: reading it through a text-mode handle
    # corrupts it on platforms that translate newlines.
    with open(filename, 'rb' if is_gzipped else 'r') as raw:
        if is_gzipped:
            istream = gzip.GzipFile(fileobj=raw, mode='rb')
        else:
            istream = raw

        graph, tokens = _find_parse(istream)
        while graph:
            yield graph, tokens
            graph, tokens = _find_parse(istream)

def _view_file(filename):
    """Open the given file in evince, and block until evince closes.

    Failures are reported on stderr instead of being raised.
    """
    # NOTE(review): the original failure handler was lost from this block;
    # reporting on stderr is a conservative reconstruction -- confirm.
    try:
        # An argument list (no shell) keeps file names containing spaces or
        # shell metacharacters intact.
        retval = subprocess.call(['evince', filename])
    except OSError as err:
        sys.stderr.write("Could not run evince: %s\n" % err)
        return

    if retval != 0:
        sys.stderr.write("evince exited with status %d while viewing %s\n"
                         % (retval, filename))

def _render_dot(filename, fmt):
    """Run dot to render the given file and return the output file name.

    The output is written next to the input, with the input's extension
    replaced by ``fmt``.  If dot exits with a non-zero status, a message is
    written to stderr and the pdb debugger is entered.

    Args:
        filename: Path of the ``.dot`` file to render.
        fmt: Output format, passed to dot as ``-T<fmt>``.
    """
    output_name = os.path.splitext(filename)[0] + '.' + fmt
    # The input path is passed as an argument, so no extra file handle has
    # to be opened (and leaked) just to feed dot's stdin.
    retcode = subprocess.call(
        ['dot', '-T%s' % fmt, '-o', output_name, filename])

    if retcode != 0:
        sys.stderr.write("Dot encountered an error: entering debugger\n")
        import pdb
        pdb.set_trace()

    return output_name

def _dump_dot(graph, ostream):
    """Dump a graph in dot format.

    Args:
        graph: Mapping ``node -> {child: edge_label}``.
        ostream: Writable text stream with a ``name`` attribute pointing at
            an existing file (checked by the assertion at the end).
    """
    write = ostream.write
    write("digraph G {\n")
    for node, edges in graph.items():
        # order edges by their order in the sentence, i.e. by the integer
        # after the last '_' of the child name
        ordered = sorted(edges.items(),
                         key=lambda e: int(e[0].split('_')[-1]))

        for child, label in ordered:
            write("  %s -> %s [label=\"%s\"]\n" % (
                _normalize_token(node), _normalize_token(child), label))
    write("}\n")
    # Push the text out of the buffer so readers opening the file by name
    # see the complete graph.
    ostream.flush()
    assert os.path.exists(ostream.name)

def _normalize_token(name):
    """Rewrite any characters in ``name`` which would make dot complain.

    Each of ``-``, ``,``, ``(``, ``)`` and ``.`` is replaced by an
    upper-case word; every other character is left untouched.
    """
    replacements = [
        ('-',   'DASH'),
        (',',   'COMMA'),
        ('(',   'LBRACKET'),
        (')',   'RBRACKET'),
        ('.',   'DOT'),
    ]
    for s, subs in replacements:
        name = name.replace(s, subs)
    return name

def _find_parse(istream):
    """Find and return the next parse in the stream.

    A parse is a run of consecutive relation lines, i.e. lines of the form
    ``(label arg ...)``.  Lines before the first relation line are skipped.

    Args:
        istream: Iterator over text lines.

    Returns:
        ``(graph, tokens)``.  ``graph`` maps ``node -> {node: edge_label}``
        and is empty when the stream holds no further relation line.
        ``tokens`` is the whitespace-split remainder of the line that ended
        the run if that line starts with ``<c>``, otherwise ``None``.

    Raises:
        ValueError: If a relation line cannot be reduced to exactly two
            nodes.
    """
    # scan to the next graph
    for line in istream:
        line = line.rstrip()
        if line.startswith('(') and line.endswith(')'):
            break
    else:
        # the stream ran out before any relation line showed up
        return {}, None

    # push the line we just read back onto the stream
    istream = itertools.chain([line], istream)

    # build a graph dictionary: node -> (node -> edge_label)
    graph = {}
    for line in istream:
        line = line.rstrip()
        if not (line.startswith('(') and line.endswith(')')):
            # first non-relation line: the parse is complete
            break

        line = line[1:-1]
        parts = line.split()
        if not parts:
            # "()" carries no label at all
            raise ValueError(line)
        label = parts.pop(0)

        if len(parts) == 3:
            # Three arguments: fold one of them away so two nodes remain.
            if parts[0] == '_':
                del parts[0]
            elif parts[-1] == '_':
                del parts[-1]
            elif label in ('csubj', 'xsubj', 'ncsubj') \
                    and parts[-1] in ('obj', 'inv'):
                label = '%s[%s]' % (label, parts.pop())
            elif label.endswith('mod') or label.endswith('comp'):
                label = "%s['%s']" % (label, parts.pop(0))
            else:
                sys.stderr.write("unsure how to display this relation:\n")
                sys.stderr.write('\t' + line + '\n')
                sys.stderr.write("entering debugger...\n")
                import pdb
                pdb.set_trace()

        if len(parts) == 2:
            node_a, node_b = parts
            edges = graph.setdefault(node_a, {})
            assert node_b not in edges
            edges[node_b] = label
        else:
            raise ValueError(line)

    if line.startswith('<c>'):
        # drop the "<c>" marker and the separator character after it
        tokens = line[4:].split()
    else:
        tokens = None

    return graph, tokens

def _display_tokens(tokens):
    """Print numbered tokens in order to stdout.

    Each entry of ``tokens`` is cut at its first ``|``; the leading part is
    shown as ``<token>_<index>`` with the index coloured blue by
    ``shell_color.color``.  The entries are separated by single spaces and
    followed by ``-----`` on the same line.
    """
    # NOTE(review): the "-----" marker ends up on the same line as the
    # tokens, exactly as the print statements here produced -- confirm
    # that this is the intended layout.
    pieces = []
    for i, token_data in enumerate(tokens):
        token = token_data.split('|')[0]
        pieces.append('%s_%s' % (token, shell_color.color(str(i), 'blue')))
    pieces.append('-----')
    # A single write that works the same on Python 2 and Python 3.
    sys.stdout.write(' '.join(pieces) + '\n')

def _can_run_executable(name):
    """Check whether the required executable exists.

    Runs ``which <name>`` with its stdout discarded and returns ``True``
    when it exits with status 0, ``False`` otherwise.
    """
    try:
        # The context manager closes the null-device handle again instead
        # of leaking it; os.devnull avoids hard-coding the path.
        with open(os.devnull, 'w') as devnull:
            return subprocess.call(['which', name], stdout=devnull) == 0
    except OSError:
        # `which` itself could not be started, so nothing can be confirmed
        # as runnable.
        return False


def _create_option_parser():
    """Build the command-line parser: one positional parse file plus an
    integer ``-s``/``--skip`` option that defaults to 0."""
    usage_text = (
        "%prog [options] parse_file\n"
        "\n"
        "Visualises GR dependency parses found in the file, one by one."
    )
    option_parser = optparse.OptionParser(usage_text)
    option_parser.add_option(
        '-s', '--skip',
        dest='skip',
        action='store',
        type='int',
        default=0,
        help='Skip the first n parses in the stream.',
    )
    return option_parser

def main(argv):
    """Command-line entry point.

    Args:
        argv: Command-line arguments without the program name.

    Exits with status 1 when the ``dot`` executable cannot be found or when
    the number of positional arguments is not exactly one.
    """
    parser = _create_option_parser()
    (options, args) = parser.parse_args(argv)

    # NOTE(review): the bodies of both guards below were incomplete in this
    # block; exiting with status 1 (after showing the help text for a usage
    # error) is a reconstruction -- confirm.
    if not _can_run_executable('dot'):
        sys.stderr.write("Can't find 'dot' executable, please install "
                         "graphviz or check your PATH.\n")
        sys.exit(1)

    if len(args) != 1:
        parser.print_help()
        sys.exit(1)

    parse_vis(args[0], skip=options.skip)


if __name__ == '__main__':