Commits

Hao Xiong  committed 150e47f

Moved the source files to src directory

  • Participants
  • Parent commits 0f85f8c

Comments (0)

Files changed (4)

File compute.py

-#!/usr/bin/env python
-"""Compute the AUC from the scores.
-"""
-
-import numpy as np
-
-
-class RunSVM(object):
-
-    def __init__(self,fileName, Gamma=None, C=1):
-        """
-        Compute AUC using given file name.
-        
-        It is assumed that the file is generated originally for R
-        and therefore has a header line, which needs to be deleted.
-        """
-        from PyML import VectorDataSet,SVM, ker
-        data = VectorDataSet(fileName, labelsColumn=-1, idColumn=0, headerRow=True)
-        data.normalize(2)                   
-        # set the default value of Gamma=1/(# variables). This follows package e1071 of R.
-        Gamma = 1./data.numFeatures if Gamma == None else Gamma
-        
-        svm = SVM(ker.Gaussian(Gamma), C=C)                     # C=?
-        self.cv = svm.cv(data, numFolds=10)   # 10-fold cross-validation
-        self.cv.save(fileName + ".SVM")       # save the results
-    @property
-    def acc(self):
-        return self.cv.getSuccessRate()
-    @property
-    def auc(self):
-        return self.cv.getROC()
-class RunRF(object):
-
-    def __init__(self, flnm, ndsz=1, mtry=20, ntree=500):
-        """
-        Return the random forest based on data from a file.
-
-        Parameters:
-        ---------------
-        flnm : file name of the score file
-        """
-        from rpy2 import robjects as robj
-        from rpy2.robjects import r as R
-        self.R = R
-        R("rm(list = ls(all = TRUE))")  # clear workspace
-        R.library("randomForest")
-        R("score <- read.table('%s')"%flnm)
-        R("""y <- score$class
-        nm <- names(score)
-        x=score[,nm != 'class']
-        if (class(x) != 'data.frame'){
-        x <- data.frame(x)
-        }""")
-        if mtry is None:
-            R("rf <- randomForest(x=x, y=y, ntree=%i, nodesize=%i)" % (ntree, ndsz))
-        else:
-            R("rf <- randomForest(x=x, y=y, mtry=%i, ntree=%i, nodesize=%i)" % (int(mtry),ntree, ndsz))
-        R("save(rf, file='%s')"% (flnm+'.RF'))
-
-        self.R("""# making sure that orders match between labels and votes
-        tmp <- rf$votes
-        order <- match(rownames(score), rownames(tmp))
-        votes <- tmp[order,]
-        # take positive vote counts as the score for ROC
-        pos.votes <- votes[,2]
-        labels <- ordered(score$class)
-        suppressPackageStartupMessages(library(ROCR))
-        pred <- prediction(pos.votes, labels)""")
-        
-        
-        self.acc = 1-self.R("rf$err.rate[500,1]")[0]
-        self.R(" auc <- performance(pred, 'auc') ")
-        self.auc =  self.R("attr(auc,'y.values')[[1]]")[0]
-
-if '__main__' == __name__:
-    from optparse import OptionParser
-    parser = OptionParser()
-    parser.add_option('-c', '--classifier', dest='classifier', help='specify the classifier; svm or rf.')
-    parser.add_option('-G', dest='Gamma', default=None, help='Gamma value for SVM.')
-    parser.add_option('-C', dest='C', default=1., help='C value for SVM')
-    parser.add_option('-n', dest='ntree', default=500, help='number of trees in random forest')
-    parser.add_option('-m', dest='mtry', default=20, help='mtry')
-    parser.add_option('-s', dest='ndsz', default=1, help='node size')
-    parser.add_option('-f', dest='fl', help='score file name')
-    opts, args = parser.parse_args()
-    if opts.classifier == 'svm':
-        res = RunSVM(opts.fl, opts.Gamma, opts.C)
-    elif opts.classifier == 'rf':
-        res = RunRF(opts.fl, opts.ndsz, opts.mtry, opts.ntree)
-    else:
-        print('Error: unknown classifier.')
-    print(res.auc)

File motifs.py

-#! /usr/bin/python
-from __future__ import with_statement
-import numpy as np
-class motifRec(object):
-    """Obtain the motifs from wordspy outputfile.
-    """
-    def __init__(self, flname, protein):
-        with open(flname, 'r') as f:
-            strings = f.read()
-            # parsing the wordspy output file to obtain relevant information
-            import re
-            refloat = r'\d+\.\d+'
-            if not protein:
-                motpat = re.compile(r'^\s*([ACGT]+)\s+(%s)\s+(-?%s)\s+null\s+(0\.\d+)\s+(\d+)\s+\d+\s+\{[acgt,]+\}'% (refloat,refloat),re.M|re.I)
-            else:
-                motpat = re.compile(r'^\s*([ACDEFGHIKLMNOPQRSTVW]+)\s+(%s)\s+(-?%s)\s+null\s+(0\.\d+)\s+(\d+)\s+\d+\s+\{[ACDEFGHIKLMNOPQRSTVW,]+\}'% (refloat,refloat),re.M|re.I)
-            self.dupMotifs = motpat.findall(strings)
-            # retain the last instance of motifs into unique motifs
-            mtfsgns = dict()
-            # only retain the latest stats for identical motifs
-            for x in self.dupMotifs:
-                mtfsgns[x[0]] = x
-                # convert it to a recarray
-            strlen = len( max( mtfsgns, key=lambda x: len(x) ) )
-            rectype = np.dtype( [('motif', 'S%d'%strlen), ('Zscore', np.float64), ('NZscore', np.float64), ('Freq', np.float64), ('Occur', np.int_)] )
-            self.uniMotifs = np.array(mtfsgns.values(), dtype=rectype)
-        

File src/compute.py

+#!/usr/bin/env python
+"""Compute the AUC from the scores.
+"""
+
+import numpy as np
+
+
+class RunSVM(object):
+
+    def __init__(self,fileName, Gamma=None, C=1):
+        """
+        Compute AUC using given file name.
+        
+        It is assumed that the file is generated originally for R
+        and therefore has a header line, which needs to be deleted.
+        """
+        from PyML import VectorDataSet,SVM, ker
+        data = VectorDataSet(fileName, labelsColumn=-1, idColumn=0, headerRow=True)
+        data.normalize(2)                   
+        # set the default value of Gamma=1/(# variables). This follows package e1071 of R.
+        Gamma = 1./data.numFeatures if Gamma == None else Gamma
+        
+        svm = SVM(ker.Gaussian(Gamma), C=C)                     # C=?
+        self.cv = svm.cv(data, numFolds=10)   # 10-fold cross-validation
+        self.cv.save(fileName + ".SVM")       # save the results
+    @property
+    def acc(self):
+        return self.cv.getSuccessRate()
+    @property
+    def auc(self):
+        return self.cv.getROC()
+class RunRF(object):
+
+    def __init__(self, flnm, ndsz=1, mtry=20, ntree=500):
+        """
+        Return the random forest based on data from a file.
+
+        Parameters:
+        ---------------
+        flnm : file name of the score file
+        """
+        from rpy2 import robjects as robj
+        from rpy2.robjects import r as R
+        self.R = R
+        R("rm(list = ls(all = TRUE))")  # clear workspace
+        R.library("randomForest")
+        R("score <- read.table('%s')"%flnm)
+        R("""y <- score$class
+        nm <- names(score)
+        x=score[,nm != 'class']
+        if (class(x) != 'data.frame'){
+        x <- data.frame(x)
+        }""")
+        if mtry is None:
+            R("rf <- randomForest(x=x, y=y, ntree=%i, nodesize=%i)" % (ntree, ndsz))
+        else:
+            R("rf <- randomForest(x=x, y=y, mtry=%i, ntree=%i, nodesize=%i)" % (int(mtry),ntree, ndsz))
+        R("save(rf, file='%s')"% (flnm+'.RF'))
+
+        self.R("""# making sure that orders match between labels and votes
+        tmp <- rf$votes
+        order <- match(rownames(score), rownames(tmp))
+        votes <- tmp[order,]
+        # take positive vote counts as the score for ROC
+        pos.votes <- votes[,2]
+        labels <- ordered(score$class)
+        suppressPackageStartupMessages(library(ROCR))
+        pred <- prediction(pos.votes, labels)""")
+        
+        
+        self.acc = 1-self.R("rf$err.rate[500,1]")[0]
+        self.R(" auc <- performance(pred, 'auc') ")
+        self.auc =  self.R("attr(auc,'y.values')[[1]]")[0]
+
+if '__main__' == __name__:
+    from optparse import OptionParser
+    parser = OptionParser()
+    parser.add_option('-c', '--classifier', dest='classifier', help='specify the classifier; svm or rf.')
+    parser.add_option('-G', dest='Gamma', default=None, help='Gamma value for SVM.')
+    parser.add_option('-C', dest='C', default=1., help='C value for SVM')
+    parser.add_option('-n', dest='ntree', default=500, help='number of trees in random forest')
+    parser.add_option('-m', dest='mtry', default=20, help='mtry')
+    parser.add_option('-s', dest='ndsz', default=1, help='node size')
+    parser.add_option('-f', dest='fl', help='score file name')
+    opts, args = parser.parse_args()
+    if opts.classifier == 'svm':
+        res = RunSVM(opts.fl, opts.Gamma, opts.C)
+    elif opts.classifier == 'rf':
+        res = RunRF(opts.fl, opts.ndsz, opts.mtry, opts.ntree)
+    else:
+        print('Error: unknown classifier.')
+    print(res.auc)

File src/motifs.py

+#! /usr/bin/python
+from __future__ import with_statement
+import numpy as np
+class motifRec(object):
+    """Obtain the motifs from wordspy outputfile.
+    """
+    def __init__(self, flname, protein):
+        with open(flname, 'r') as f:
+            strings = f.read()
+            # parsing the wordspy output file to obtain relevant information
+            import re
+            refloat = r'\d+\.\d+'
+            if not protein:
+                motpat = re.compile(r'^\s*([ACGT]+)\s+(%s)\s+(-?%s)\s+null\s+(0\.\d+)\s+(\d+)\s+\d+\s+\{[acgt,]+\}'% (refloat,refloat),re.M|re.I)
+            else:
+                motpat = re.compile(r'^\s*([ACDEFGHIKLMNOPQRSTVW]+)\s+(%s)\s+(-?%s)\s+null\s+(0\.\d+)\s+(\d+)\s+\d+\s+\{[ACDEFGHIKLMNOPQRSTVW,]+\}'% (refloat,refloat),re.M|re.I)
+            self.dupMotifs = motpat.findall(strings)
+            # retain the last instance of motifs into unique motifs
+            mtfsgns = dict()
+            # only retain the latest stats for identical motifs
+            for x in self.dupMotifs:
+                mtfsgns[x[0]] = x
+                # convert it to a recarray
+            strlen = len( max( mtfsgns, key=lambda x: len(x) ) )
+            rectype = np.dtype( [('motif', 'S%d'%strlen), ('Zscore', np.float64), ('NZscore', np.float64), ('Freq', np.float64), ('Occur', np.int_)] )
+            self.uniMotifs = np.array(mtfsgns.values(), dtype=rectype)
+