Source

RPTParser / rptparser.py

Full commit
import re

column_pattern = re.compile('([A-Za-z]+)')


class RPTParser(object):
    def __init__(self, filename):
        self.filename = filename
        self.parse()

    def parse(self):
        """Parses the .rps file."""

        columns = []
        column_lengths = []
        rows = []
        with open(self.filename) as f:
            # Find columns:
            raw_columns = f.readline().split(' ')
            column_count = 0
            for column in raw_columns:
                column = column.strip()
                if len(column):
                    column = column_pattern.findall(column)[0]
                    columns.append(column)
                    column_count += 1

            # Get column lengths based on '---- ---':
            lengths = f.readline().split(' ')
            for i in range(column_count):
                length = len(lengths[i]) + 1
                column_lengths.append(length)

            # Loop over rows:
            while True:
                line = f.readline()
                if not line:
                    break

                if len(line.strip()) == 0:
                    break

                # Loop over columns and get slice of row equal to column value's length:
                char = 0
                row = []
                for length in column_lengths:
                    old_char = char
                    char = char + length

                    column_data = line[old_char:char]
                    value = column_data.strip()

                    row.append(value)

                # Don't add empty rows:
                if len(row):
                    rows.append(row)

        self.parsed = {
            'columns': columns,
            'rows': rows
        }

    def to_sql(self, table, map_dict=None):
        """The map_dict should be a dictionary with keys of the same names as the
        columns in the .rpt file. Missing columns will be ignored in the output.

        Example:
        {
            'Id': 'id',
            'VehicleId': 'car_id',
        }

        If not provided, columns will have the same names as before.
        """

        columns = self.parsed['columns']

        # Replace column names
        if map_dict:
            new_columns = []
            for column in columns:
                if column in map_dict:
                    new_columns.append(map_dict[column])
                else:
                    new_columns.append(None)
            columns = new_columns

        queries = []
        row_offset = 0
        rows_per_query = 50
        rows = self.parsed['rows']
        while row_offset <= len(rows):
            # Insert query:
            query = ['INSERT INTO %s (' % table]

            # Comma separated column names:
            query_columns = []
            for column in columns:
                if column:
                    query_columns.append(column)
            query.append(', '.join(query_columns))

            # Rows:
            query.append(') VALUES\n')
            rows = self.parsed['rows']

            query_rows = []
            for i in range(rows_per_query):
                row = rows[row_offset + i]
                is_last_row = row == rows[-1]

                query_row = []
                j = 0
                for value in row:
                    # Skip values for Null columns as specified in map_dict:
                    j += 1
                    if not columns[j - 1]:
                        continue

                    # Add non-numbers with quotes:
                    try:
                        int(value)
                    except ValueError:
                        # Don't put NULL in quotes:
                        if value != 'NULL':
                            value = '"%s"' % value
                    query_row.append(str(value))

                # Add semicolon on last row in query:
                new_row = ', '.join(query_row)
                if i == 49 or is_last_row:
                    query_rows.append('(%s);\n' % new_row)
                else:
                    query_rows.append('(%s),' % new_row)

                if is_last_row:
                    break

            query.append('\n'.join(query_rows))
            queries.append(''.join(query))
            row_offset += rows_per_query

        return ' '.join(queries)