Source

dmfs-code / src / compute.py

Full commit
#!/usr/bin/env python
"""Compute the AUC from the scores.
"""

import numpy as np


class RunSVM(object):

    def __init__(self,fileName, Gamma=None, C=1):
        """
        Compute AUC using given file name.
        
        It is assumed that the file is generated originally for R
        and therefore has a header line, which needs to be deleted.
        """
        from PyML import VectorDataSet,SVM, ker
        data = VectorDataSet(fileName, labelsColumn=-1, idColumn=0, headerRow=True)
        data.normalize(2)                   
        # set the default value of Gamma=1/(# variables). This follows package e1071 of R.
        Gamma = 1./data.numFeatures if Gamma == None else Gamma
        
        svm = SVM(ker.Gaussian(Gamma), C=C)                     # C=?
        self.cv = svm.cv(data, numFolds=10)   # 10-fold cross-validation
        self.cv.save(fileName + ".SVM")       # save the results
    @property
    def acc(self):
        return self.cv.getSuccessRate()
    @property
    def auc(self):
        return self.cv.getROC()
class RunRF(object):

    def __init__(self, flnm, ndsz=1, mtry=20, ntree=500):
        """
        Return the random forest based on data from a file.

        Parameters:
        ---------------
        flnm : file name of the score file
        """
        from rpy2 import robjects as robj
        from rpy2.robjects import r as R
        self.R = R
        R("rm(list = ls(all = TRUE))")  # clear workspace
        R.library("randomForest")
        R("score <- read.table('%s')"%flnm)
        R("""y <- score$class
        nm <- names(score)
        x=score[,nm != 'class']
        if (class(x) != 'data.frame'){
        x <- data.frame(x)
        }""")
        if mtry is None:
            R("rf <- randomForest(x=x, y=y, ntree=%i, nodesize=%i)" % (ntree, ndsz))
        else:
            R("rf <- randomForest(x=x, y=y, mtry=%i, ntree=%i, nodesize=%i)" % (int(mtry),ntree, ndsz))
        R("save(rf, file='%s')"% (flnm+'.RF'))

        self.R("""# making sure that orders match between labels and votes
        tmp <- rf$votes
        order <- match(rownames(score), rownames(tmp))
        votes <- tmp[order,]
        # take positive vote counts as the score for ROC
        pos.votes <- votes[,2]
        labels <- ordered(score$class)
        suppressPackageStartupMessages(library(ROCR))
        pred <- prediction(pos.votes, labels)""")
        
        
        self.acc = 1-self.R("rf$err.rate[500,1]")[0]
        self.R(" auc <- performance(pred, 'auc') ")
        self.auc =  self.R("attr(auc,'y.values')[[1]]")[0]

if '__main__' == __name__:
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option('-c', '--classifier', dest='classifier', help='specify the classifier; svm or rf.')
    parser.add_option('-G', dest='Gamma', default=None, help='Gamma value for SVM.')
    parser.add_option('-C', dest='C', default=1., help='C value for SVM')
    parser.add_option('-n', dest='ntree', default=500, help='number of trees in random forest')
    parser.add_option('-m', dest='mtry', default=20, help='mtry')
    parser.add_option('-s', dest='ndsz', default=1, help='node size')
    parser.add_option('-f', dest='fl', help='score file name')
    opts, args = parser.parse_args()
    if opts.classifier == 'svm':
        res = RunSVM(opts.fl, opts.Gamma, opts.C)
    elif opts.classifier == 'rf':
        res = RunRF(opts.fl, opts.ndsz, opts.mtry, opts.ntree)
    else:
        print('Error: unknown classifier.')
    print(res.auc)