Source

rules-engine / rules_setup.py

Full commit
def create_training_data(inputfile, inputfields):
    '''Split an orders CSV into a training set and two held-out test sets.

    The first line of ``inputfile`` is treated as the heading. Every later
    row is routed to exactly one data set:

    * "future" test set -- the date at the start of column 5 (``d/m/y``,
      optionally followed by a space and a time) is on or after 17-Feb-2010.
    * "random" test set -- earlier rows where the two characters just before
      the last character of column 0 (the ID) parse to an integer below 17.
    * training set -- all remaining rows.

    Files written to the current working directory, each starting with a
    heading row:

    * ``train.txt``          -- full training rows
    * ``test-future.output`` -- full rows of the "future" test set
    * ``test-random.output`` -- full rows of the "random" test set
    * ``test-future.input``  -- "future" rows restricted to ``inputfields``
    * ``test-random.input``  -- "random" rows restricted to ``inputfields``

    Per the original notes, fraud.txt has 5.8 million orders from 1-Dec-2008
    to 27-May-2010, giving about 0.9 million "future", 0.8 million "random"
    and 4.1 million training rows.

    inputfile   -- path of the CSV file to split
    inputfields -- column indexes copied into the ``*.input`` files

    Raises StopIteration if ``inputfile`` has no heading line, and
    IndexError / ValueError if a row lacks the expected ID or date columns.
    '''
    import csv
    import datetime

    CUTOFF_DATE = datetime.date(2010, 2, 17)
    CUTOFF_ID = 17

    handles = []

    def open_tracked(path, mode='r'):
        # Track every handle so the finally clause can close all of them,
        # even when a malformed row aborts the split half-way through.
        handle = open(path, mode)
        handles.append(handle)
        return handle

    def new_writer(path):
        return csv.writer(open_tracked(path, 'w'), lineterminator='\n')

    try:
        reader = csv.reader(open_tracked(inputfile))
        # The next() builtin works on Python 2.6+ and 3; reader.next() is
        # Python 2 only. The heading is read before any output file is
        # created so a missing or empty input leaves existing outputs alone.
        heading = next(reader)

        train = new_writer('train.txt')                   # Training data
        future_full = new_writer('test-future.output')    # Full 'future' test data
        random_full = new_writer('test-random.output')    # Full 'random' test data
        future_input = new_writer('test-future.input')    # Input only for 'future' test data
        random_input = new_writer('test-random.input')    # Input only for 'random' test data

        testheading = [heading[i] for i in inputfields]
        for writer in (train, future_full, random_full):
            writer.writerow(heading)
        for writer in (future_input, random_input):
            writer.writerow(testheading)

        for row in reader:
            # Column 5 starts with the d/m/y date; any time after the first
            # space is ignored.
            day, month, year = row[5].split(' ')[0].split('/')
            if datetime.date(int(year), int(month), int(day)) >= CUTOFF_DATE:
                full, subset = future_full, future_input
            # NOTE(review): [-3:-1] reads the two characters *before* the
            # last one, not the last two -- confirm this matches the ID format.
            elif int(row[0][-3:-1]) < CUTOFF_ID:
                full, subset = random_full, random_input
            else:
                full, subset = train, None

            full.writerow(row)
            if subset is not None:
                subset.writerow([row[i] for i in inputfields])
    finally:
        for handle in handles:
            handle.close()

if __name__ == '__main__':
    import sys

    # Columns exposed as rules-engine input. Per the original note these are
    # all fields up to RISK_RATING, plus BANK_OUTCOME and BIN.
    inputfields = list(range(29)) + [30, 31, 32] + [47, 49]

    # An optional first command-line argument names the orders file; a
    # missing or empty argument falls back to fraud.txt.
    if len(sys.argv) > 1 and sys.argv[1]:
        source_path = sys.argv[1]
    else:
        source_path = 'fraud.txt'

    create_training_data(source_path, inputfields)