Commits

Anonymous committed 0f85f8c

Added two missing scripts and began updating the documentation.

Comments (0)

Files changed (3)

+#!/usr/bin/env python
+"""Compute the AUC from the scores.
+"""
+
+import numpy as np
+
+
+class RunSVM(object):
+
+    def __init__(self,fileName, Gamma=None, C=1):
+        """
+        Compute AUC using given file name.
+        
+        It is assumed that the file is generated originally for R
+        and therefore has a header line, which needs to be deleted.
+        """
+        from PyML import VectorDataSet,SVM, ker
+        data = VectorDataSet(fileName, labelsColumn=-1, idColumn=0, headerRow=True)
+        data.normalize(2)                   
+        # set the default value of Gamma=1/(# variables). This follows package e1071 of R.
+        Gamma = 1./data.numFeatures if Gamma == None else Gamma
+        
+        svm = SVM(ker.Gaussian(Gamma), C=C)                     # C=?
+        self.cv = svm.cv(data, numFolds=10)   # 10-fold cross-validation
+        self.cv.save(fileName + ".SVM")       # save the results
+    @property
+    def acc(self):
+        return self.cv.getSuccessRate()
+    @property
+    def auc(self):
+        return self.cv.getROC()
+class RunRF(object):
+
+    def __init__(self, flnm, ndsz=1, mtry=20, ntree=500):
+        """
+        Return the random forest based on data from a file.
+
+        Parameters:
+        ---------------
+        flnm : file name of the score file
+        """
+        from rpy2 import robjects as robj
+        from rpy2.robjects import r as R
+        self.R = R
+        R("rm(list = ls(all = TRUE))")  # clear workspace
+        R.library("randomForest")
+        R("score <- read.table('%s')"%flnm)
+        R("""y <- score$class
+        nm <- names(score)
+        x=score[,nm != 'class']
+        if (class(x) != 'data.frame'){
+        x <- data.frame(x)
+        }""")
+        if mtry is None:
+            R("rf <- randomForest(x=x, y=y, ntree=%i, nodesize=%i)" % (ntree, ndsz))
+        else:
+            R("rf <- randomForest(x=x, y=y, mtry=%i, ntree=%i, nodesize=%i)" % (int(mtry),ntree, ndsz))
+        R("save(rf, file='%s')"% (flnm+'.RF'))
+
+        self.R("""# making sure that orders match between labels and votes
+        tmp <- rf$votes
+        order <- match(rownames(score), rownames(tmp))
+        votes <- tmp[order,]
+        # take positive vote counts as the score for ROC
+        pos.votes <- votes[,2]
+        labels <- ordered(score$class)
+        suppressPackageStartupMessages(library(ROCR))
+        pred <- prediction(pos.votes, labels)""")
+        
+        
+        self.acc = 1-self.R("rf$err.rate[500,1]")[0]
+        self.R(" auc <- performance(pred, 'auc') ")
+        self.auc =  self.R("attr(auc,'y.values')[[1]]")[0]
+
+if '__main__' == __name__:
+    from optparse import OptionParser
+    parser = OptionParser()
+    parser.add_option('-c', '--classifier', dest='classifier', help='specify the classifier; svm or rf.')
+    parser.add_option('-G', dest='Gamma', default=None, help='Gamma value for SVM.')
+    parser.add_option('-C', dest='C', default=1., help='C value for SVM')
+    parser.add_option('-n', dest='ntree', default=500, help='number of trees in random forest')
+    parser.add_option('-m', dest='mtry', default=20, help='mtry')
+    parser.add_option('-s', dest='ndsz', default=1, help='node size')
+    parser.add_option('-f', dest='fl', help='score file name')
+    opts, args = parser.parse_args()
+    if opts.classifier == 'svm':
+        res = RunSVM(opts.fl, opts.Gamma, opts.C)
+    elif opts.classifier == 'rf':
+        res = RunRF(opts.fl, opts.ndsz, opts.mtry, opts.ntree)
+    else:
+        print('Error: unknown classifier.')
+    print(res.auc)
 separate program, svm.py, will run SVM over existing score files.  The
 results are printed to the screen.
 
+Using components
+~~~~~~~~~~~~~~~~
+
+The components of the pipeline can also be used separately for flexibility.
+
+* partseq.py fasta frac
+
+    Random partitioning of sequence files in FASTA format.
+
+The following is an example of using components::
+
+  ./partseq ozsolak.positive.fa 0.33
+  ./partseq.py ozsolak.negative.fa 0.33
+  wordspy ozsolak.positive.part1.fa 12 wordspy.m10.txt -e ozsolak.negative.part1.fa
+    -r0 -f -s -b -d -c
+  ./score_discrete.py -w wordspy.m12.txt -p ozsolak.positive.part2.fa
+    -n ozsolak.negative.part2.fa -F 0.5 -S 0.15 -M 2
+  ./compute.py -f score.m12.txt -c 'rf'
+
 Result
 ++++++
 
 
   is a R program file that runs random forest for classification.
 
-file_utils.py
+utils.py
 
   contains various utility functions for files.
 
+#! /usr/bin/python
+from __future__ import with_statement
+import numpy as np
+class motifRec(object):
+    """Obtain the motifs from wordspy outputfile.
+    """
+    def __init__(self, flname, protein):
+        with open(flname, 'r') as f:
+            strings = f.read()
+            # parsing the wordspy output file to obtain relevant information
+            import re
+            refloat = r'\d+\.\d+'
+            if not protein:
+                motpat = re.compile(r'^\s*([ACGT]+)\s+(%s)\s+(-?%s)\s+null\s+(0\.\d+)\s+(\d+)\s+\d+\s+\{[acgt,]+\}'% (refloat,refloat),re.M|re.I)
+            else:
+                motpat = re.compile(r'^\s*([ACDEFGHIKLMNOPQRSTVW]+)\s+(%s)\s+(-?%s)\s+null\s+(0\.\d+)\s+(\d+)\s+\d+\s+\{[ACDEFGHIKLMNOPQRSTVW,]+\}'% (refloat,refloat),re.M|re.I)
+            self.dupMotifs = motpat.findall(strings)
+            # retain the last instance of motifs into unique motifs
+            mtfsgns = dict()
+            # only retain the latest stats for identical motifs
+            for x in self.dupMotifs:
+                mtfsgns[x[0]] = x
+                # convert it to a recarray
+            strlen = len( max( mtfsgns, key=lambda x: len(x) ) )
+            rectype = np.dtype( [('motif', 'S%d'%strlen), ('Zscore', np.float64), ('NZscore', np.float64), ('Freq', np.float64), ('Occur', np.int_)] )
+            self.uniMotifs = np.array(mtfsgns.values(), dtype=rectype)
+