Source

ml-class / ex8.py

Full commit
#!/usr/bin/env python

import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat
from sklearn.covariance import EmpiricalCovariance, MinCovDet
from sklearn.metrics import fbeta_score


def p1(x, var, mue):
    """Return the univariate Gaussian density N(x; mue, var).

    Note the parameter order: the variance comes *before* the mean here,
    unlike in ``p``.

    x   -- point (scalar or array) at which to evaluate the density
    var -- variance of the distribution
    mue -- mean of the distribution
    """
    # The exponent is -(x - mue)^2 / (2 * var); the parentheses around
    # ``2 * var`` matter, otherwise the variance ends up multiplied in.
    exponent = -((x - mue) ** 2) / (2 * var)
    return np.exp(exponent) / np.sqrt(2 * np.pi * var)


def p(x, mue, var):
    """Return the product of the per-feature densities ``p1`` of ``x``.

    x, mue and var are iterated in lockstep (one entry per feature); the
    result is 1 when they are empty.
    """
    density = 1
    for feature, feature_mean, feature_var in zip(x, mue, var):
        # p1 takes the variance before the mean.
        density *= p1(feature, feature_var, feature_mean)
    return density


def calc_contour(data, fn):
    """Evaluate ``fn`` on a 100x100 grid spanning the first two columns of data.

    Returns ``(xs, ys, z)`` where ``xs``/``ys`` are the grid coordinates
    along column 0 / column 1 of ``data`` and ``z[i, j]`` is
    ``fn(np.array([xs[i], ys[j]]))``.
    """
    x_axis = np.linspace(data[:, 0].min(), data[:, 0].max(), 100)
    y_axis = np.linspace(data[:, 1].min(), data[:, 1].max(), 100)

    grid = np.zeros(shape=(len(x_axis), len(y_axis)))
    # Row-major walk over every (x index, y index) pair of the grid.
    for i, j in np.ndindex(*grid.shape):
        grid[i, j] = fn(np.array([x_axis[i], y_axis[j]]))

    return x_axis, y_axis, grid


def anplot(data, fn, use_exps=True):
    """Scatter-plot ``data`` and overlay contour lines of ``fn``.

    data     -- 2-D array; columns 0 and 1 are used as x and y coordinates
    fn       -- callable taking a length-2 array ``[x, y]`` and returning
                the value to contour
    use_exps -- if true, draw contours at the fixed levels
                10**-20, 10**-17, ..., 10**-2; otherwise let matplotlib
                choose the levels
    """
    xs, ys, z = calc_contour(data, fn)

    plt.scatter(data[:, 0], data[:, 1], marker='x')

    # calc_contour fills z as z[x index, y index], whereas contour() wants
    # rows to follow ys and columns to follow xs, so transpose first.
    heights = z.T
    if use_exps:
        levels = 10.0 ** np.arange(-20, -1, 3)
        plt.contour(xs, ys, heights, levels=levels)
    else:
        plt.contour(xs, ys, heights)

    plt.grid()
    plt.show()


def anomaly():
    """Plot the per-feature Gaussian density fitted to ex8data1's ``X``."""
    train = loadmat('ex8/ex8data1.mat')['X']

    # Column-wise mean and variance of the training matrix.
    mue, var = train.mean(0), train.var(0)

    def density(x):
        return p(x, mue, var)

    anplot(train, density)


def anomaly_skl():
    """Plot contours of an EmpiricalCovariance fit to ex8data1's ``X``."""
    data = loadmat('ex8/ex8data1.mat')
    X = data['X']

    #cov = MinCovDet().fit(X)
    cov = EmpiricalCovariance().fit(X)

    # anplot evaluates the function on single 1-D points, but score()
    # expects a 2-D (n_samples, n_features) array, so present each point
    # as a one-row matrix.
    anplot(X, lambda x: cov.score(np.reshape(x, (1, -1))), False)


def find_threshold(fn):
    """Pick the score threshold with the best F1 on ex8data2's validation set.

    fn -- callable mapping one row of ``Xval`` to a scalar score; rows
          whose score is >= the threshold are predicted as class 1.

    Returns ``(best_t, best_f)``: the best threshold among 100 evenly
    spaced candidates between the smallest and largest score, and its F1.
    Both are 0 if no candidate achieves a positive F1.
    """
    raw = loadmat('ex8/ex8data2.mat')
    X = raw['Xval']
    y = raw['yval'].ravel()

    # Score every validation row once; the scores do not depend on the
    # threshold, so they are reused for every candidate below.
    dist = np.fromiter((fn(x) for x in X), float)

    best_f = 0
    best_t = 0
    for t in np.linspace(dist.min(), dist.max(), 100):
        # Do not rebind ``fn`` here: a closure over the rebound name
        # would call itself instead of the scoring function.
        pred = np.where(dist < t, 0, 1)
        f = fbeta_score(y, pred, beta=1.)
        if f > best_f:
            best_f = f
            best_t = t

    return best_t, best_f



if __name__ == '__main__':
    #anomaly()
    #raw_input()

    # find_threshold() validates on ex8data2's 'Xval', so the model must be
    # fit on the training matrix of that same dataset for the feature
    # dimensions to agree.
    # NOTE(review): assumes ex8data2.mat stores its training matrix under
    # 'X' like ex8data1.mat does -- confirm.
    data = loadmat('ex8/ex8data2.mat')
    X = data['X']

    #cov = MinCovDet().fit(X)
    cov = EmpiricalCovariance().fit(X)

    # score() expects a 2-D (n_samples, n_features) array, while
    # find_threshold() passes single rows. The log-likelihood is negated
    # (rather than abs()'d) so that a larger value always means "less
    # likely", even when the log-likelihood is positive.
    fn = lambda x: -cov.score(np.reshape(x, (1, -1)))
    t, f = find_threshold(fn)
    print(t, f)