# cffi / demo / fastcsv.py

import csv
import cffi

# IN-PROGRESS.  See the demo at the end of the file


# Cache of the (ffi, lib) pairs built by _make_ffi_from_dialect(), keyed by
# dialect object, so that fastcsv_reader() builds each dialect's parser once.
dialect2ffi = {}

def _make_ffi_from_dialect(dialect):
    """Build a C line parser specialised for ``dialect``.

    The dialect's settings are substituted as integer constants into the C
    source below, which is then compiled with ``ffi.verify()``.

    The compiled ``lib.parse_line(rawline, inputlength)`` parses the first
    ``inputlength`` bytes of ``rawline`` and writes the parsed fields back
    into ``rawline`` itself, each one followed by a NUL byte.  It returns
    the number of bytes written minus the final NUL, or -1 when the parser
    reports an error.  Note that a line producing no field at all (for
    example one made only of newline characters) also yields -1, because
    nothing is written and the result is ``0 - 1``.

    ``dialect`` is a csv dialect object exposing ``quotechar``, ``quoting``,
    ``skipinitialspace``, ``delimiter``, ``doublequote``, ``strict`` and
    ``escapechar``.

    Returns the ``(ffi, lib)`` pair.
    """
    ffi = cffi.FFI()

    ffi.cdef("""
        long parse_line(char *rawline, long inputlength);
    """)

    # A dialect may carry quotechar=None when quoting is QUOTE_NONE.  Every
    # comparison against the quote character in the C code is paired with a
    # ``quoting != QUOTE_NONE`` test, so the placeholder value 0 is never
    # acted upon in that case.
    quotechar = dialect.quotechar
    quotechar_code = ord(quotechar) if quotechar is not None else 0

    # ``quoting`` is compared against the C QuoteStyle enum below, so this
    # relies on the csv.QUOTE_* constants having the same integer values.
    d = {'quotechar': quotechar_code,
         'quoting': int(dialect.quoting),
         'skipinitialspace': int(dialect.skipinitialspace),
         'delimiter': ord(dialect.delimiter),
         'doublequote': int(dialect.doublequote),
         'strict': int(dialect.strict),
         }
    # 'is_escape_char' is spliced in right after ``c`` in the C conditions:
    # either "c == <code>" or the always-false "c && 0".
    if dialect.escapechar is not None:
        d['is_escape_char'] = '== %d' % ord(dialect.escapechar)
    else:
        d['is_escape_char'] = '&& 0'

    lib = ffi.verify(r'''

    typedef enum {
        START_RECORD, START_FIELD, ESCAPED_CHAR, IN_FIELD,
        IN_QUOTED_FIELD, ESCAPE_IN_QUOTED_FIELD, QUOTE_IN_QUOTED_FIELD,
        EAT_CRNL
    } ParserState;

    typedef enum {
        QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE
    } QuoteStyle;

    typedef struct {
        ParserState state;          /* current CSV parse state */
        char *field;                /* build current field in here */
        int field_size;             /* size of allocated buffer */
        int field_len;              /* length of current field */
        int numeric_field;          /* treat field as numeric */
    } ReaderObj;

    static void
    parse_add_char(ReaderObj *self, char c)
    {
        *self->field++ = c;
    }

    static void
    parse_save_field(ReaderObj *self)
    {
        *self->field++ = 0;
    }

    static int
    parse_process_char(ReaderObj *self, char c)
    {
        switch (self->state) {
        case START_RECORD:
            /* start of record */
            if (c == '\0')
                /* empty line - return [] */
                break;
            else if (c == '\n' || c == '\r') {
                self->state = EAT_CRNL;
                break;
            }
            /* normal character - handle as START_FIELD */
            self->state = START_FIELD;
            /* fallthru */
        case START_FIELD:
            /* expecting field */
            if (c == '\n' || c == '\r' || c == '\0') {
                /* save empty field - return [fields] */
                parse_save_field(self);
                self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
            }
            else if (c == %(quotechar)d &&
                     %(quoting)d != QUOTE_NONE) {
                /* start quoted field */
                self->state = IN_QUOTED_FIELD;
            }
            else if (c %(is_escape_char)s) {
                /* possible escaped character */
                self->state = ESCAPED_CHAR;
            }
            else if (c == ' ' && %(skipinitialspace)d)
                /* ignore space at start of field */
                ;
            else if (c == %(delimiter)d) {
                /* save empty field */
                parse_save_field(self);
            }
            else {
                /* begin new unquoted field */
                if (%(quoting)d == QUOTE_NONNUMERIC)
                    self->numeric_field = 1;
                parse_add_char(self, c);
                self->state = IN_FIELD;
            }
            break;

        case ESCAPED_CHAR:
            if (c == '\0')
                c = '\n';
            parse_add_char(self, c);
            self->state = IN_FIELD;
            break;

        case IN_FIELD:
            /* in unquoted field */
            if (c == '\n' || c == '\r' || c == '\0') {
                /* end of line - return [fields] */
                parse_save_field(self);
                self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
            }
            else if (c %(is_escape_char)s) {
                /* possible escaped character */
                self->state = ESCAPED_CHAR;
            }
            else if (c == %(delimiter)d) {
                /* save field - wait for new field */
                parse_save_field(self);
                self->state = START_FIELD;
            }
            else {
                /* normal character - save in field */
                parse_add_char(self, c);
            }
            break;

        case IN_QUOTED_FIELD:
            /* in quoted field */
            if (c == '\0')
                ;
            else if (c %(is_escape_char)s) {
                /* Possible escape character */
                self->state = ESCAPE_IN_QUOTED_FIELD;
            }
            else if (c == %(quotechar)d &&
                     %(quoting)d != QUOTE_NONE) {
                if (%(doublequote)d) {
                    /* doublequote; " represented by "" */
                    self->state = QUOTE_IN_QUOTED_FIELD;
                }
                else {
                    /* end of quote part of field */
                    self->state = IN_FIELD;
                }
            }
            else {
                /* normal character - save in field */
                parse_add_char(self, c);
            }
            break;

        case ESCAPE_IN_QUOTED_FIELD:
            if (c == '\0')
                c = '\n';
            parse_add_char(self, c);
            self->state = IN_QUOTED_FIELD;
            break;

        case QUOTE_IN_QUOTED_FIELD:
            /* doublequote - seen a quote in an quoted field */
            if (%(quoting)d != QUOTE_NONE &&
                c == %(quotechar)d) {
                /* save "" as " */
                parse_add_char(self, c);
                self->state = IN_QUOTED_FIELD;
            }
            else if (c == %(delimiter)d) {
                /* save field - wait for new field */
                parse_save_field(self);
                self->state = START_FIELD;
            }
            else if (c == '\n' || c == '\r' || c == '\0') {
                /* end of line - return [fields] */
                parse_save_field(self);
                self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
            }
            else if (!%(strict)d) {
                parse_add_char(self, c);
                self->state = IN_FIELD;
            }
            else {
                /* illegal */
                /*PyErr_Format(error_obj, "'%%c' expected after '%%c'",
                                dialect->delimiter,
                                dialect->quotechar);*/
                return -1;
            }
            break;

        case EAT_CRNL:
            if (c == '\n' || c == '\r')
                ;
            else if (c == '\0')
                self->state = START_RECORD;
            else {
                /*PyErr_Format(error_obj, "new-line character seen in unquoted field - do you need to open the file in universal-newline mode?");*/
                return -1;
            }
            break;

        }
        return 0;
    }

    static void
    parse_reset(ReaderObj *self, char *rawline)
    {
        self->field = rawline;
        self->state = START_RECORD;
        self->numeric_field = 0;
    }

    long parse_line(char *rawline, long inputlength)
    {
        char *p;
        ReaderObj reader;
        parse_reset(&reader, rawline);

        for (p=rawline; inputlength > 0; inputlength--, p++) {
            if (parse_process_char(&reader, *p) < 0)
                return -1;
        }
        if (parse_process_char(&reader, 0) < 0)
            return -1;
        return reader.field - rawline - 1;
    }
    ''' % d)

    return ffi, lib


def fastcsv_reader(f, dialect):
    """Yield the rows of ``f``, one list of fields per line.

    ``f`` is an iterable of raw lines (the demo passes a file opened in
    binary mode).  ``dialect`` is handed to ``csv.get_dialect()``; the
    compiled parser for it is built on first use and then reused through
    the module-level ``dialect2ffi`` cache.

    Each line is copied into a C buffer, parsed in place by
    ``lib.parse_line()``, and the NUL-separated result is split into a list.
    A line made only of newline characters yields ``[]``.

    Raises ``csv.Error`` when the C parser rejects a line.
    """
    dialect = csv.get_dialect(dialect)
    try:
        ffi, lib = dialect2ffi[dialect]
    except KeyError:
        ffi, lib = dialect2ffi[dialect] = _make_ffi_from_dialect(dialect)
    #
    linelen = -1
    for line in f:
        # Grow the buffer only when the current one cannot hold the line
        # plus the trailing NUL that the parser appends.
        if linelen <= len(line):
            linelen = 2 * len(line)
            rawline = ffi.new("char[]", linelen)
        ffi.buffer(rawline, len(line))[:] = line
        n = lib.parse_line(rawline, len(line))
        if n < 0:
            # parse_line() returns -1 both for a parse error and for a
            # record with no field at all.  The latter only happens when
            # the line holds nothing but CR/LF characters, so tell the two
            # apart here instead of failing on every blank line.
            if not line.strip(b'\r\n'):
                yield []
                continue
            raise csv.Error("fastcsv: could not parse line %r" % (line,))
        yield ffi.buffer(rawline, n)[:].split(b'\x00')


if __name__ == '__main__':
    # Demo: parse /etc/passwd as colon-separated records with quoting
    # disabled, and print each row as a list of fields.
    csv.register_dialect('unixpwd', delimiter=':', quoting=csv.QUOTE_NONE)
    with open('/etc/passwd', 'rb') as f:
        reader = fastcsv_reader(f, 'unixpwd')
        for row in reader:
            # Parenthesised form prints the same on Python 2 and keeps the
            # module importable on Python 3.
            print(row)
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.