Source

RPTParser / rptparser.py

import re

column_pattern = re.compile('([A-Za-z]+)')


class RPTParser(object):
    def __init__(self, filename):
        self.filename = filename
        self.parse()

    def parse(self):
        """Parses the .rps file."""

        columns = []
        column_lengths = []
        rows = []
        with open(self.filename) as f:
            # Find columns:
            raw_columns = f.readline().split(' ')
            column_count = 0
            for column in raw_columns:
                column = column.strip()
                if len(column):
                    column = column_pattern.findall(column)[0]
                    columns.append(column)
                    column_count += 1

            # Get column lengths based on '---- ---':
            lengths = f.readline().split(' ')
            for i in range(column_count):
                length = len(lengths[i]) + 1
                column_lengths.append(length)

            # Loop over rows:
            while True:
                line = f.readline()
                if not line:
                    break

                if len(line.strip()) == 0:
                    break

                # Loop over columns and get slice of row equal to column value's length:
                char = 0
                row = []
                for length in column_lengths:
                    old_char = char
                    char = char + length

                    column_data = line[old_char:char]
                    value = column_data.strip()

                    row.append(value)

                # Don't add empty rows:
                if len(row):
                    rows.append(row)

        self.parsed = {
            'columns': columns,
            'rows': rows
        }

    def to_sql(self, table, map_dict=None):
        """The map_dict should be a dictionary with keys of the same names as the
        columns in the .rpt file. Missing columns will be ignored in the output.

        Example:
        {
            'Id': 'id',
            'VehicleId': 'car_id',
        }

        If not provided, columns will have the same names as before.
        """

        columns = self.parsed['columns']

        # Replace column names
        if map_dict:
            new_columns = []
            for column in columns:
                if column in map_dict:
                    new_columns.append(map_dict[column])
                else:
                    new_columns.append(None)
            columns = new_columns

        # Insert query:
        query = ['INSERT INTO %s (' % table]

        # Comma separated column names:
        query_columns = []
        for column in columns:
            if column:
                query_columns.append(column)
        query.append(', '.join(query_columns))

        # Rows:
        query.append(') VALUES ')
        rows = self.parsed['rows']

        query_rows = []
        for row in rows:
            query_row = []
            i = 0
            for value in row:
                # Skip values for Null columns as specified in map_dict:
                i += 1
                if not columns[i - 1]:
                    continue

                # Add non-numbers with quotes:
                try:
                    int(value)
                    query_row.append(str(value))
                except ValueError:
                    query_row.append('"%s"' % value)

            # Add semicolon on last row:
            new_row = ', '.join(query_row)
            if row != rows[-1]:
                row = '(%s),' % new_row
            else:
                row = '(%s);' % new_row
            query_rows.append(row)

        query.append(' '.join(query_rows))
        return ''.join(query)