ml-class / ex8.py

#!/usr/bin/env python

import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat
from sklearn.covariance import EmpiricalCovariance, MinCovDet
from sklearn.metrics import fbeta_score
from sklearn.decomposition import PCA


def p1(x, var, mue):
    return (1/np.sqrt(2*np.pi*var))*(np.e**(-(((x-mue)**2)/2*var)))


def p(x, mue, var):
    total = 1
    for xi, vari, muei in zip(x, var, mue):
        total *= p1(xi, vari, muei)
    return total


def calc_contour(data, fn):
    xs = np.linspace(data[:,0].min(), data[:,0].max(), 100)
    ys = np.linspace(data[:,1].min(), data[:,1].max(), 100)
    z = np.zeros(shape=(len(xs), len(ys)))
    for x, xv in enumerate(xs):
        for y, yv in enumerate(ys):
            z[x, y] = fn(np.array([xv, yv]))

    return xs, ys, z


def anplot(data, fn, use_exps=True):
    xs, ys, z = calc_contour(data, lambda x: fn(x))

    plt.scatter(data[:,0], data[:,1], marker='x')
    if use_exps:
        exps = np.arange(-20, -1, 3)
        fn = np.vectorize(lambda n: 10**n)
        plt.contour(xs, ys, z, fn(exps))
    else:
        plt.contour(xs, ys, z)

    plt.grid()
    plt.show()


def anomaly():
    data = loadmat('ex8/ex8data1.mat')
    train = data['X']

    mue = train.mean(0)
    var = train.var(0)
    fn = lambda x: p(x, mue, var)
    anplot(train, fn)


def anomaly_skl():
    data = loadmat('ex8/ex8data1.mat')
    X = data['X']

    #cov = MinCovDet().fit(X)
    cov = EmpiricalCovariance().fit(X)
    anplot(X, cov.score, False)


def find_threshold(fn, filename='ex8/ex8data1.mat'):
    raw = loadmat(filename)
    X = raw['Xval']
    y = raw['yval'].ravel()

    dists = np.fromiter((fn(x) for x in X), float)

    best_f = 0
    best_t = 0
    for t in np.linspace(dists.min(), dists.max(), 100):
        preds = (dists > t).astype(int)
        f = fbeta_score(y, preds, 1.)
        if f > best_f:
            best_f = f
            best_t = t

    return best_t, best_f



def show_threshold(filename='ex8/ex8data1.mat'):
    data = loadmat(filename)
    X = data['X']

    cov = MinCovDet().fit(X)
    #cov = EmpiricalCovariance().fit(X)
    def dist(x):
        return abs(cov.score(x))
    t, f = find_threshold(dist, filename)
    print('threshold: {}\nfscore: {}'.format(t, f))

    fn = cov.score
    xs, ys, z = calc_contour(X, fn)

    plt.scatter(X[:,0], X[:,1], marker='x')
    plt.contour(xs, ys, z)
    oxs, oys = [], []
    for x in X:
        if abs(cov.score(x)) > t:
            oxs.append(x[0])
            oys.append(x[1])
    plt.scatter(oxs, oys, marker='o', color='red')

    plt.grid()
    plt.show()


def show_threshold2(filename='ex8/ex8data2.mat'):
    data = loadmat(filename)
    X = data['X']

    cov = MinCovDet().fit(X)
    #cov = EmpiricalCovariance().fit(X)
    def dist(x):
        return abs(cov.score(x))
    t, f = find_threshold(dist, filename)
    print('threshold: {}\nfscore: {}'.format(t, f))

    pca = PCA(2)
    reduced = pca.fit_transform(X)

    plt.scatter(reduced[:,0], reduced[:,1], marker='x')
    outliers = []
    for x in X:
        if abs(cov.score(x)) > t:
            outliers.append(x)
    ored = pca.transform(outliers)
    plt.scatter(ored[:,0], ored[:,1], marker='o', color='red')

    plt.grid()
    plt.show()

if __name__ == '__main__':
    #anomaly()
    show_threshold2()
    raw_input()
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.