Commits

Marko Toplak committed db7cc51

Remove obiAssess (superseded by obiGeneSetSig).
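
For reference, obiGeneSetSig exposes the same functionality as learner-like
transformation classes (e.g. CORGs, LLR). Below is a minimal migration sketch
based on the __main__ blocks visible in this diff; the iris data, gene sets,
matcher and class values are illustrative, and the import path and the use of
the transformed domain are assumptions rather than part of this commit.

    import Orange
    from orangecontrib.bio import obiGene, obiGeneSets, obiGeneSetSig

    data = Orange.data.Table("iris")            # illustrative data set
    gsets = obiGeneSets.collections({           # toy "gene sets" over iris features
        "f3": ['sepal length', 'sepal width', 'petal length'],
        "l3": ['sepal width', 'petal length', 'petal width'],
        })
    matcher = obiGene.matcher([])

    # Where obiAssess models returned a {gene set: score} dictionary per example,
    # obiGeneSetSig transformations produce data whose features are gene-set scores.
    scores = obiGeneSetSig.CORGs(data, matcher=matcher, gene_sets=gsets,
                                 class_values=["Iris-setosa", "Iris-versicolor"],
                                 min_part=0.0)
    # assumption: further examples can be mapped through the transformed domain
    new_scores = Orange.data.Table(scores.domain, data[:5])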

  • Parent commits 0d18fa2

Files changed (6)

File docs/reference-html/assess1.py

-import orange
-import obiAssess
-import obiGeneSets
-
-gs = obiGeneSets.collections([":kegg:hsa"])
-data = orange.ExampleTable("DLBCL.tab")
-
-asl = obiAssess.AssessLearner()
-ass = asl(data, "hsa", geneSets=gs)
-
-print "Enrichments for the first example (10 pathways)"
-enrichments = ass(data[0])
-for patw, enric in sorted(enrichments.items())[:10]:
-    print patw, enric
-

File docs/reference-html/assess2.py

-import orange
-import obiAssess
-import obiGeneSets
-
-gs = obiGeneSets.collections([":kegg:hsa"])
-data = orange.ExampleTable("DLBCL.tab")
-
-asl = obiAssess.AssessLearner()
-ass = asl(data, "hsa", geneSets=gs)
-
-def genesetsAsAttributes(data, ass, domain=None):
-    """
-    Construct new data set with gene sets as attributes from data
-    set "data" with assess model "ass".
-    """
-
-    ares = {}
-    for ex in data:
-        cres = ass(ex)
-        for name,val in cres.items():
-            aresl = ares.get(name, [])
-            aresl.append(val)
-            ares[name] = aresl
-
-    ares = sorted(ares.items())
-
-    if not domain: #construct new domain instance if needed
-        domain = orange.Domain([ orange.FloatVariable(name=name) \
-            for name in [ a[0] for a in ares]], data.domain.classVar )
-
-    examples = [ [ b[zap] for a,b in ares ] + \
-        [ data[zap][-1] ]   for zap in range(len(data)) ]
-
-    et = orange.ExampleTable(domain, examples)
-    return et
-
-tdata = genesetsAsAttributes(data, ass)
-
-print "First 10 attributes of the first example in transformed data set"
-for pathw, enric in zip(tdata.domain,tdata[0])[:10]:
-    print pathw.name, enric.value

File docs/reference-html/assess3.py

-import orange
-import obiAssess
-import obiGeneSets
-
-gs = obiGeneSets.collections([":kegg:hsa"])
-data = orange.ExampleTable("DLBCL.tab")
-
-asl = obiAssess.AssessLearner()
-
-def genesetsAsAttributes(data, ass, domain=None):
-    """
-    Construct new data set with gene sets as attributes from data
-    set "data" with assess model "ass".
-    """
-
-    ares = {}
-    for ex in data:
-        cres = ass(ex)
-        for name,val in cres.items():
-            aresl = ares.get(name, [])
-            aresl.append(val)
-            ares[name] = aresl
-
-    ares = sorted(ares.items())
-
-    if not domain: #construct new domain instance if needed
-        domain = orange.Domain([ orange.FloatVariable(name=name) \
-            for name in [ a[0] for a in ares]], data.domain.classVar )
-
-    examples = [ [ b[zap] for a,b in ares ] + \
-        [ data[zap][-1] ]   for zap in range(len(data)) ]
-
-    et = orange.ExampleTable(domain, examples)
-    return et
-
-offer = None
-
-def transformLearningS(data):
-    ass = asl(data, "hsa", geneSets=gs)
-    et = genesetsAsAttributes(data, ass)
-
-    global offer
-    offer = (et.domain, ass) #save assess model
-
-    return et
-   
-def transformTestingS(data):
-    global offer
-    if not offer:
-        a = fdfsdsdd #exception
-
-    domain, ass = offer
-    offer = None
-
-    return genesetsAsAttributes(data, ass, domain)
-
-
-import orngBayes, orngTest, orngStat
-learners = [ orngBayes.BayesLearner() ]
-
-resultsOriginal = orngTest.crossValidation(learners, data, folds=10)
-resultsTransformed = orngTest.crossValidation(learners, data, folds=10, 
-    pps = [("L", transformLearningS), ("T", transformTestingS)])
-
-print "Original", "CA:", orngStat.CA(resultsOriginal), "AUC:", orngStat.AUC(resultsOriginal)
-print "Transformed", "CA:", orngStat.CA(resultsTransformed), "AUC:", orngStat.AUC(resultsTransformed)
-

File docs/reference-html/obiAssess.htm

-<html>
-
-<head>
-<title>obiAssess: pathway enrichment for each sample</title>
-<link rel=stylesheet href="style.css" type="text/css">
-<link rel=stylesheet href="style-print.css" type="text/css" media=print>
-</head>
-
-<body>
-<h1>obiAssess: pathway enrichment for each sample</h1>
-<index name="modules/assess enrichment gsea">
-
-<p>Gene Set Enrichment Analysis (GSEA) is a method that tries to identify groups of genes that are regulated together. It computes pathway enrichments for the whole data set. ASSESS is inspired by GSEA, but computes enrichments for each sample in the data set.</p>
-
-<p>ASSESS takes gene expression data with sample phenotypes and computes enrichments for the given gene sets. First, pathway &quot;models&quot; have to be created with AssessLearner. Afterwards they are used to calculate enrichments for each pair of sample and pathway.</p>
-
-<h2>AssessLearner</h2>
-
-This class builds models that can later be used to determine enrichment scores for each example. Note that the domains of input data to <code>AssessLearner</code> and <code>Assess</code> instances must be the same.
-
-<dl class=attributes>
-
-<dt>__call__(self, data, organism, geneSets, minSize=3, maxSize=1000, minPart=0.1, classValues=None, rankingf=None)</dt>
-<dd>Function <code>__call__</code> returns an instance of class <code>Assess</code> which, given an example, returns its enrichment in all pathways. Argument descriptions follow.
-
-<dl class=arguments>
-
-  <dt>data</dt>
-  <dd>An <A href="ExampleTable.htm"><CODE>ExampleTable</CODE></A> with gene expression data. An example
-  should correspond to a sample with its phenotype (class value). Attributes represent individual genes. Their names 
-  should be meaningful gene aliases.</dd>
-
-  <dt>organism</dt>
-  <dd>Organism code as used in KEGG. Needed for matching gene names in data to those in gene sets. Some
-  examples: <code>hsa</code> for human, <code>mmu</code> for mouse. This is a required argument.</dd>
-
-  <dt>classValues</dt>
-  <dd>A pair of class values selecting the two distinct phenotypes on which gene correlations
-  are computed. Only examples with one of the chosen class values are considered for analysis. If not specified, the first
-  two class values in the <code>classVar</code> attribute descriptor are used.</dd>
-
-  <dt>geneSets</dt>
-  <dd>A Python dictionary of gene sets, where each key is a gene set name and each value is a list of gene aliases for
-  genes in the gene set. Default: gene sets in your collection directory.</dd>
-
-  <dt>minSize, maxSize</dt>
-  <dd>Minimum and maximum number of genes from the gene set also present in the data set for that gene set to be analysed.
-  Defaults: 3 and 1000.</dd>
-
-  <dt>minPart</dt>
-  <dd>Minimum fraction of genes from the gene set also present in the data set for that gene set to be analysed. Default: 0.1.</dd> 
-
-  <dt>rankingf</dt>
-  <dd>Used to specify the model type for individual gene sets. See the source code for reference. We recommend leaving the parameter unset; in that case, the parametric model from Edelman et al. (2006) is used.</dd>
-
-</dl>
-
-</dd>
-
-</dl>
-
-<h2>Assess</h2>
-
-<dl class=attributes>
-
-<dt>__init__(**kwargs)</dt>
-<dd>Function <code>__init__</code> is usually called only by <code>AssessLearner</code>. It is used to save built &quot;model&quot; data. It saves all keyword arguments into the object's namespace.</dd>
-
-<dt>__call__(example)</dt>
-<dd>Returns enrichments of all gene sets for this example. Enrichments are returned in a dictionary, where keys are gene set names and values their enrichments. Note that the example's domain must be the same as the domain on which the &quot;model&quot; was built.</dd>
-
-
-</dl>
-
-<h3>Example 1</h3>
-
-This example prints enrichments for the first sample in the data set. It uses KEGG as the gene set source.
-
-<p class="header"><a href="assess1.py">assess1.py</a> (uses <a href="http://www.ailab.si/orange/datasets/DLBCL.tab">DLBCL.tab</a>)</p>
-
-<xmp class=code>import orange
-import obiAssess
-import obiGeneSets
-
-gs = obiGeneSets.collections([":kegg:hsa"])
-data = orange.ExampleTable("DLBCL.tab")
-
-asl = obiAssess.AssessLearner()
-ass = asl(data, "hsa", geneSets=gs)
-
-print "Enrichments for the first example (10 pathways)"
-enrichments = ass(data[0])
-for patw, enric in sorted(enrichments.items())[:10]:
-    print patw, enric
-</xmp>
-
-<p>Output:</p>
-
-<xmp class=code>Enrichments for the first example (10 pathways)
-[KEGG] 1- and 2-Methylnaphthalene degradation -0.84674671525
-[KEGG] 3-Chloroacrylic acid degradation -0.587923507915
-[KEGG] ABC transporters - General -0.292198856631
-[KEGG] Acute myeloid leukemia 0.305086037192
-[KEGG] Adherens junction 0.387903973883
-[KEGG] Adipocytokine signaling pathway 0.404448748545
-[KEGG] Alanine and aspartate metabolism 0.400113861834
-[KEGG] Alkaloid biosynthesis I -0.677360944415
-[KEGG] Alkaloid biosynthesis II -0.437492650183
-[KEGG] Allograft rejection 0.491535468415
-</xmp>
-
-<h3>Example 2: transforming data sets</h3>
-
-This example builds a new data set where attributes are gene sets instead of genes. It prints the first 10 attributes for the first example of the transformed data set. Note that the output matches the previous example, apart from floating point discrepancies.
-
-<p class="header"><a href="assess2.py">assess2.py</a> (uses <a href="http://www.ailab.si/orange/datasets/DLBCL.tab">DLBCL.tab</a>)</p>
-
-<xmp class=code>import orange
-import obiAssess
-import obiGeneSets
-
-gs = obiGeneSets.collections([":kegg:hsa"])
-data = orange.ExampleTable("DLBCL.tab")
-
-asl = obiAssess.AssessLearner()
-ass = asl(data, "hsa", geneSets=gs)
-
-def genesetsAsAttributes(data, ass, domain=None):
-    """
-    Construct new data set with gene sets as attributes from data
-    set "data" with assess model "ass".
-    """
-
-    ares = {}
-    for ex in data:
-        cres = ass(ex)
-        for name,val in cres.items():
-            aresl = ares.get(name, [])
-            aresl.append(val)
-            ares[name] = aresl
-
-    ares = sorted(ares.items())
-
-    if not domain: #construct new domain instance if needed
-        domain = orange.Domain([ orange.FloatVariable(name=name) \
-            for name in [ a[0] for a in ares]], data.domain.classVar )
-
-    examples = [ [ b[zap] for a,b in ares ] + \
-        [ data[zap][-1] ]   for zap in range(len(data)) ]
-
-    et = orange.ExampleTable(domain, examples)
-    return et
-
-tdata = genesetsAsAttributes(data, ass)
-
-print "First 10 attributes of the first example in transformed data set"
-for pathw, enric in zip(tdata.domain,tdata[0])[:10]:
-    print pathw.name, enric.value
-</xmp>
-
-<p>Output:</p>
-
-<xmp class=code>First 10 attributes of the first example in transformed data set
-[KEGG] 1- and 2-Methylnaphthalene degradation -0.846746742725
-[KEGG] 3-Chloroacrylic acid degradation -0.587923526764
-[KEGG] ABC transporters - General -0.292198866606
-[KEGG] Acute myeloid leukemia 0.305086046457
-[KEGG] Adherens junction 0.387903988361
-[KEGG] Adipocytokine signaling pathway 0.404448747635
-[KEGG] Alanine and aspartate metabolism 0.400113850832
-[KEGG] Alkaloid biosynthesis I -0.6773609519
-[KEGG] Alkaloid biosynthesis II -0.437492638826
-[KEGG] Allograft rejection 0.491535454988
-</xmp>
-
-<h3>Example 3: testing transformed data set quality</h3>
-
-We measure CA and AUC of the transformed data set using cross-validation and compare them to the original data set. Care needs to be taken to prevent overfitting: we must not use any knowledge about the testing set when creating &quot;ASSESS models&quot;, and we have to use the same &quot;ASSESS model&quot; for both the learning and the testing set. We solve this by saving the model to a global variable.
-
-<p class="header">part of <a href="assess3.py">assess3.py</a> (uses <a href="http://www.ailab.si/orange/datasets/DLBCL.tab">DLBCL.tab</a>)</p>
-
-<xmp class=code>offer = None
-
-def transformLearningS(data):
-    ass = asl(data, "hsa", geneSets=gs)
-    et = genesetsAsAttributes(data, ass)
-
-    global offer
-    offer = (et.domain, ass) #save assess model
-
-    return et
-   
-def transformTestingS(data):
-    global offer
-    if not offer:
-        a = fdfsdsdd #exception
-
-    domain, ass = offer
-    offer = None
-
-    return genesetsAsAttributes(data, ass, domain)
-
-
-import orngBayes, orngTest, orngStat
-learners = [ orngBayes.BayesLearner() ]
-
-resultsOriginal = orngTest.crossValidation(learners, data, folds=10)
-resultsTransformed = orngTest.crossValidation(learners, data, folds=10, 
-    pps = [("L", transformLearningS), ("T", transformTestingS)])
-
-print "Original", "CA:", orngStat.CA(resultsOriginal), "AUC:", orngStat.AUC(resultsOriginal)
-print "Transformed", "CA:", orngStat.CA(resultsTransformed), "AUC:", orngStat.AUC(resultsTransformed)
-</xmp>
-
-<p>Output:</p>
-
-<xmp class=code>Original CA: [0.8214285714285714] AUC: [0.78583333333333338]
-Transformed CA: [0.80714285714285716] AUC: [0.84250000000000003]
-</xmp>
-
-<HR>
-<H2>References</H2>
-
-<p>Edelman E, Porrello A, Guinney J, Balakumaran B, Bild A, Febbo PG, Mukherjee S. Analysis of sample set enrichment scores: assaying the enrichment of sets of genes for individual samples in genome-wide expression profiles. Bioinformatics. 2006 Jul 15; 22(14):e108-16. </p>
-
-</body>
-</html>
-

File orangecontrib/bio/obiAssess.py

-"""
-Construction of gene set scores for each sample.
-
-Learners in this module build models needed for construction
-of features from individual genes.
-
-The other classes take an example and return a
-dictionary of { name: score } for that example.
-"""
-
-from __future__ import absolute_import
-
-from collections import defaultdict
-import math
-
-import numpy
-
-import orange, Orange, statc
-
-from . import obiExpression, obiGene, obiGsea, obiGeneSets, stats
-
-def normcdf(x, mi, st):
-    return 0.5*(2. - stats.erfcc((x - mi)/(st*math.sqrt(2))))
-
-class AT_edelmanParametric(object):
-
-    def __init__(self, **kwargs):
-        for a,b in kwargs.items():
-            setattr(self, a, b)
-
-    def __call__(self, nval):
-
-        if self.mi1 == None or self.mi2 == None or self.st1 == None or self.st2 == None:
-            return 0 
-
-        try:
-            val = nval.value
-            if nval.isSpecial():
-                return 0 
-        except:
-            val = nval
-
-        try:
-            if val >= self.mi1:
-                p1 = 1 - normcdf(val, self.mi1, self.st1)
-            else:
-                p1 = normcdf(val, self.mi1, self.st1)
-
-            if val >= self.mi2:
-                p2 = 1 - normcdf(val, self.mi2, self.st2)
-            else:
-                p2 = normcdf(val, self.mi2, self.st2)
-
-            #print p1, p2
-            return math.log(p1/p2)
-        except:
-            #print p1, p2, "exception"
-            return 0
-
-class AT_edelmanParametricLearner(object):
-    """
-    Returns an attribute transformer for the Edelman parametric measure
-    for a given attribute in the dataset.
-    Edelman et al., 2006.
-
-    Modified a bit.
-    """
-
-    def __init__(self, a=None, b=None):
-        """
-        a and b are the chosen class values.
-        """
-        self.a = a
-        self.b = b
-
-    def __call__(self, i, data):
-        cv = data.domain.classVar
-        #print data.domain
-
-        if self.a == None: self.a = cv.values[0]
-        if self.b == None: self.b = cv.values[1]
-
-        def avWCVal(value):
-            return [ex[i].value for ex in data if ex[-1].value == value and not ex[i].isSpecial() ]
-
-        list1 = avWCVal(self.a)
-        list2 = avWCVal(self.b)
-
-        mi1 = mi2 = st1 = st2 = None
-
-        try:
-            mi1 = statc.mean(list1)
-            st1 = statc.std(list1)
-        except:
-            pass
-    
-        try:
-            mi2 = statc.mean(list2)
-            st2 = statc.std(list2)
-        except:
-            pass
-
-        return AT_edelmanParametric(mi1=mi1, mi2=mi2, st1=st1, st2=st2)
-
-class AT_loess(object):
-
-    def __init__(self, **kwargs):
-        for a,b in kwargs.items():
-            setattr(self, a, b)
-
-    def __call__(self, nval):
-
-        val = nval.value
-        if nval.isSpecial():
-            return 0.0 #middle value
-        #return first class probability
-
-        def saveplog(a,b):
-            try:
-                return math.log(a/b)
-            except:
-                if a < b:
-                    return -10
-                else:
-                    return +10
-
-        try:
-            ocene = self.condprob(val)
-            if sum(ocene) < 0.01:
-                return 0.0
-            return saveplog(ocene[0], ocene[1])
-
-        except:
-            return 0.0
-
-class AT_loessLearner(object):
-
-    def __call__(self, i, data):
-        cv = data.domain.classVar
-        #print data.domain
-        try:
-            ca = orange.ContingencyAttrClass(data.domain.attributes[i], data)
-            a = orange.ConditionalProbabilityEstimatorConstructor_loess(ca, nPoints=5)
-            return AT_loess(condprob=a)
-        except:
-            return AT_loess(condprob=None)
-
-def nth(l, n):
-    return [a[n] for a in l]
-
-class Assess(object):
-
-    def __init__(self, **kwargs):
-        for a,b in kwargs.items():
-            setattr(self, a, b)
-
-    def __call__(self, example):
-        enrichmentScores = [] 
-
-        lcor = [ self.attrans[at](example[at]) for at in range(len(self.attrans)) ]
-
-        ordered = obiGsea.orderedPointersCorr(lcor)
-
-        def rev(l):
-           return numpy.argsort(l)
-
-        rev2 = rev(ordered)
-
-        gsetsnumit = self.gsetsnum.items()
-
-        enrichmentScores = dict( 
-            (name, obiGsea.enrichmentScoreRanked(subset, lcor, ordered, rev2=rev2)[0]) \
-            for name,subset in gsetsnumit)
-
-        return enrichmentScores
-
-
-class AssessLearner(object):
-    """
-    Uses the underlying GSEA code to select genes.
-    Takes data and creates attribute transformations.
-    """
-    
-    def __call__(self, data, matcher, geneSets, minSize=3, maxSize=1000, minPart=0.1, classValues=None, rankingf=None):
-        data, oknames, gsetsnum = selectGenesetsData(data, matcher, geneSets, \
-            minSize=minSize, maxSize=maxSize, minPart=minPart, classValues=classValues)
-        
-        if rankingf == None:
-            rankingf = AT_edelmanParametricLearner()
-
-        #rank individual attributes on the training set
-        attrans = [ rankingf(iat, data) for iat, at in enumerate(data.domain.attributes) ]
-
-        return Assess(attrans=attrans, gsetsnum=gsetsnum)
-
-def selectGenesetsData(data, matcher, geneSets, minSize=3, maxSize=1000, minPart=0.1, classValues=None):
-    """
-    Returns gene sets and data which fall under the above criteria.
-    """
-    gso = obiGsea.GSEA(data, matcher=matcher, classValues=classValues, atLeast=0)
-    gso.addGenesets(geneSets)
-    okgenesets = gso.selectGenesets(minSize=minSize, maxSize=maxSize, minPart=minPart).keys()
-    gsetsnum = gso.to_gsetsnum(okgenesets)
-    return gso.data, okgenesets, gsetsnum
-
-def corgs_activity_score(ex, corg):
-    """ activity score for a sample for pathway given by corgs """
-    #print [ ex[i].value for i in corg ] #FIXME what to do with unknown values?
-    return sum(ex[i].value if ex[i].value != '?' else 0.0 for i in corg)/len(corg)**0.5
-
-class CORGs(object):
-
-    def __init__(self, **kwargs):
-        for a,b in kwargs.items():
-            setattr(self, a, b)
-
-    def __call__(self, example):
-        return dict( (name,corgs_activity_score(example, corg)) \
-            for name, corg in self.corgs.items() )
-
-class CORGsLearner(object):
-    
-    def __call__(self, data, matcher, geneSets, minSize=3, maxSize=1000, minPart=0.1, classValues=None):
-        """
-        WARNING: input has to be z_ij table! each gene needs to be normalized
-        (mean=0, stdev=1) for all samples.
-        """
-        data, oknames, gsetsnum = selectGenesetsData(data, matcher, geneSets, \
-            minSize=minSize, maxSize=maxSize, minPart=minPart, classValues=classValues)
-    
-        tscorecache = {}
-
-        def tscorec(data, at, cache=None):
-            """ Cached attribute  tscore calculation """
-            if cache != None and at in cache: return cache[at]
-            ma = obiExpression.MA_t_test()(at,data)
-            if cache != None: cache[at] = ma
-            return ma
-
-        def compute_corg(data, inds):
-            """
-            Compute CORG for this geneset specified with gene inds
-            in the example table. Output is the list of gene inds
-            in CORG.
-
-            """
-            #order member genes by their t-scores: decreasing, if av(t-score) >= 0,
-            #else increasing
-            tscores = [ tscorec(data, at, tscorecache) for at in inds ]
-            sortedinds = nth(sorted(zip(inds,tscores), key=lambda x: x[1], \
-                reverse=statc.mean(tscores) >= 0), 0)
-
-            def S(corg):
-                """ Activity score separation - S(G) in 
-                the article """
-                asv = orange.FloatVariable(name='AS')
-                asv.getValueFrom = lambda ex,rw: orange.Value(asv, corgs_activity_score(ex, corg))
-                data2 = orange.ExampleTable(orange.Domain([asv], data.domain.classVar), data)
-                return abs(tscorec(data2, 0)) #FIXME absolute - nothing in the article about it
-                    
-            #greedily find CORGs producing the best separation
-            g = S(sortedinds[:1])
-            bg = 1
-            for a in range(2, len(sortedinds)+1):
-                tg = S(sortedinds[:a])
-                if tg > g:
-                    g = tg
-                    bg = a
-                else:
-                    break
-                
-            return sortedinds[:a]
-
-        #now, on the learning set produce the CORGS for each geneset and
-        #save it for use in further prediction
-
-        corgs = {}
-
-        for name, inds in gsetsnum.items():
-            inds = sorted(set(inds)) # take each gene only once!
-            corgs[name] = compute_corg(data, inds)
-
-        return CORGs(corgs=corgs)
-
-class GSA(object):
-
-    def __init__(self, **kwargs):
-        for a,b in kwargs.items():
-            setattr(self, a, b)
-
-    def __call__(self, example):
-        return dict( (name, statc.mean([example[i].value for i in inds if example[i].value != "?"]) ) \
-            for name, inds in self.subsets.items() )
-
-class GSALearner(object):
-    
-    def __call__(self, data, matcher, geneSets, minSize=3, maxSize=1000, minPart=0.1, classValues=None):
-        """
-        WARNING: input has to be z_ij table! each gene needs to be normalized
-        (mean=0, stdev=1) for all samples.
-        """
-        import scipy.stats
-
-        data, oknames, gsetsnum = selectGenesetsData(data, matcher, geneSets, \
-            minSize=minSize, maxSize=maxSize, minPart=minPart, classValues=classValues)
-    
-        def tscorec(data, at, cache=None):
-            ma = obiExpression.MA_t_test()(at,data)
-            return ma
-
-        tscores = [ tscorec(data, at) for at in data.domain.attributes ]
-
-        def to_z_score(t):
-            return float(scipy.stats.norm.ppf(scipy.stats.t.cdf(t, len(data)-2)))
-
-        zscores = map(to_z_score, tscores)
-
-        subsets = {}
-
-        for name, inds in gsetsnum.items():
-            inds = sorted(set(inds)) # take each gene only once!
-
-            D = statc.mean([max(zscores[i],0) for i in inds]) \
-                + statc.mean([min(zscores[i],0) for i in inds])
-
-            if D >= 0:
-                subsets[name] = [ i for i in inds if zscores[i] > 0.0 ]
-            else:
-                subsets[name] = [ i for i in inds if zscores[i] < 0.0 ]
-
-        return GSA(subsets=subsets)
-
-def pls_transform(example, constt):
-    """
-    Uses calculated PLS weights to transform the given example.
-    """
-
-    inds, xmean, W, P = constt
-    dom = orange.Domain([example.domain.attributes[i1] for i1 in inds ], False)
-    newex = orange.ExampleTable(dom, [example])
-    ex = newex.toNumpy()[0]
-    ex = ex - xmean # same input transformation
-
-    nc = W.shape[1]
-
-    TR = numpy.empty((1, nc))
-    XR = ex
-
-    dot = numpy.dot
-
-    for i in range(nc):
-       t = dot(XR, W[:,i].T)
-       XR = XR - t*numpy.array([P[:,i]])
-       TR[:,i] = t
-
-    return TR
-
-def PLSCall(data, y=None, x=None, nc=None, weight=None, save_partial=False):
-
-    def normalize(vector):
-        return vector / numpy.linalg.norm(vector)
-
-    if y == None:
-        y = [ data.domain.classVar ]
-    if x == None:
-        x = [v for v in data.domain.variables if v not in y]
-
-    Ncomp = nc if nc is not None else len(x)
-        
-    dataX = orange.ExampleTable(orange.Domain(x, False), data)
-    dataY = orange.ExampleTable(orange.Domain(y, False), data)
-
-    # transformation to numpy arrays
-    X = dataX.toNumpy()[0]
-    Y = dataY.toNumpy()[0]
-
-    # data dimensions
-    n, mx = numpy.shape(X)
-    my = numpy.shape(Y)[1]
-
-    # Z-scores of original matrices
-    YMean = numpy.mean(Y, axis = 0)
-    XMean = numpy.mean(X, axis = 0)
-    
-    X = (X-XMean)
-    Y = (Y-YMean)
-
-    P = numpy.empty((mx,Ncomp))
-    T = numpy.empty((n,Ncomp))
-    W = numpy.empty((mx,Ncomp))
-    E,F = X,Y
-
-    dot = numpy.dot
-    norm = numpy.linalg.norm
-
-    #PLS1 - from Gutkin, Shamir, Dror: SlimPLS
-
-    for i in range(Ncomp):
-        w = dot(E.T,F)
-        w = w/norm(w) #normalize w; in Gutkin et al. they do w*c, where c is 1/norm(w)
-        t = dot(E, w) #t_i -> a row vector
-        p = dot(E.T, t)/dot(t.T, t) #p_i t.T is a row vector - this is inner(t.T, t.T)
-        q = dot(F.T, t)/dot(t.T, t) #q_i
-            
-        E = E - dot(t, p.T)
-        F = F - dot(t, q.T)
-
-        T[:,i] = t.T
-        W[:,i] = w.T
-        P[:,i] = p.T
-
-    return XMean, W, P, T
-
-class PLS(object):
-
-    def __init__(self, **kwargs):
-        for a,b in kwargs.items():
-            setattr(self, a, b)
-
-    def __call__(self, example):
-
-        od = {}
-
-        for name, constt in self.constructt.items():
-            ts = pls_transform(example, constt)[0]
-            if len(ts) == 1:
-                od[name] = ts[0]
-            else:
-                for i,s in enumerate(ts):
-                   od[name + "_LC_" + str(i+1)] = s
- 
-        return od
-
-class PLSLearner(object):
-    """ Transforms gene sets using Principal Leasts Squares. """
-    
-    def __call__(self, data, matcher, geneSets, minSize=3, maxSize=1000, minPart=0.1, classValues=None, components=1):
-        """
-        If more than one component is used, "_LC_<componentNumber>" is appended to
-        the name of the gene set.
-        """
-
-        data, oknames, gsetsnum = selectGenesetsData(data, matcher, geneSets, \
-            minSize=minSize, maxSize=maxSize, minPart=minPart, classValues=classValues)
-    
-        constructt = {}
-
-        #build weight matrices for every gene set
-        for name, inds in gsetsnum.items():
-            dom2 = orange.Domain([ data.domain.attributes[i1] for i1 in inds ], data.domain.classVar)
-            data_gs = orange.ExampleTable(dom2, data)
-            xmean, W, P, T = PLSCall(data_gs, nc=components, y=[data_gs.domain.classVar], save_partial=True)
-            constructt[name] = inds, xmean, W, P
-
-        return PLS(constructt=constructt)
-
-class PCA(object):
-
-    def __init__(self, **kwargs):
-        for a,b in kwargs.items():
-            setattr(self, a, b)
-
-    def __call__(self, example):
-        od = {}
-
-        for name, constt in self.constructt.items():
-            ts = pca_transform(example, constt)[0]
-            od[name] = ts
-
-        return od
-
-def pca_transform(example, constt):
-    inds, evals, evect, xmean = constt
-    dom = orange.Domain([example.domain.attributes[i1] for i1 in inds ], False)
-    newex = orange.ExampleTable(dom, [example])
-    arr = newex.toNumpy()[0]
-    arr = arr - xmean # same input transformation - a row in a matrix
-
-    ev0 = evect[0] #this is a row in a matrix - do a dot product
-    a = numpy.dot(arr, ev0)
-    return a
-
-def pca(data, snapshot=0):
-    "Perform PCA on M, return eigenvectors and eigenvalues, sorted."
-    M = data.toNumpy("a")[0]
-    XMean = numpy.mean(M, axis = 0)
-    M = M - XMean
-
-    T, N = numpy.shape(M)
-    # if there are less rows T than columns N, use snapshot method
-    if (T < N) or snapshot:
-        C = numpy.dot(M, numpy.transpose(M))
-        evals, evecsC = numpy.linalg.eigh(C) #columns of evecsC are eigenvectors
-        evecs = numpy.dot(M.T, evecsC)/numpy.sqrt(numpy.abs(evals))
-    else:
-        K = numpy.dot(numpy.transpose(M), M)
-        evals, evecs = numpy.linalg.eigh(K)
-    
-    evecs = numpy.transpose(evecs)
-
-    # sort the eigenvalues and eigenvectors, descending order
-    order = (numpy.argsort(numpy.abs(evals))[::-1])
-    evecs = numpy.take(evecs, order, 0)
-    evals = numpy.take(evals, order)
-    return evals, evecs, XMean
-
-class PCALearner(object):
-    """ Transforms gene sets using Principal Leasts Squares. """
-    
-    def __call__(self, data, matcher, geneSets, minSize=3, maxSize=1000, minPart=0.1, classValues=None):
-
-        data, oknames, gsetsnum = selectGenesetsData(data, matcher, geneSets, \
-            minSize=minSize, maxSize=maxSize, minPart=minPart, classValues=classValues)
-    
-        constructt = {}
-
-        #build weight matrices for every gene set
-        for name, inds in gsetsnum.items():
-            dom2 = orange.Domain([ data.domain.attributes[i1] for i1 in inds ], data.domain.classVar)
-
-            data_gs = orange.ExampleTable(dom2, data)
-            evals, evect, xmean = pca(data_gs)
-            constructt[name] = inds, evals, evect, xmean
-
-        return PCA(constructt=constructt)
-
-
-class SimpleFun(object):
-
-    def __init__(self, **kwargs):
-        for a,b in kwargs.items():
-            setattr(self, a, b)
-
-    def __call__(self, example):
-        #for  name,ids in self.gsets.items():
-        #    print name, [example[i].value for i in ids], self.fn([example[i].value for i in ids])
-        return dict( (name, self.fn([example[i].value for i in ids])) \
-            for name,ids in self.gsets.items() )
-
-class SimpleFunLearner(object):
-    """
-    Applies a given function to an example's attribute values
-    for each gene set.
-    """
-    def __call__(self, data, matcher, geneSets, minSize=3, maxSize=1000, minPart=0.1, classValues=None, fn=None):
-        data, oknames, gsetsnum = selectGenesetsData(data, matcher, geneSets, \
-            minSize=minSize, maxSize=maxSize, minPart=minPart, classValues=classValues)
-        return SimpleFun(gsets=gsetsnum, fn=fn)
-
-class MedianLearner(object):
-    
-    def __call__(self, data, matcher, geneSets, minSize=3, maxSize=1000, minPart=0.1, classValues=None, fn=None):
-       sfl =  SimpleFunLearner()
-       return sfl(data, matcher, geneSets, \
-            minSize=minSize, maxSize=maxSize, minPart=minPart, classValues=classValues, fn=statc.median)
-
-class MeanLearner(object):
-    
-    def __call__(self, data, matcher, geneSets, minSize=3, maxSize=1000, minPart=0.1, classValues=None, fn=None):
-       sfl =  SimpleFunLearner()
-       return sfl(data, matcher, geneSets, \
-            minSize=minSize, maxSize=maxSize, minPart=minPart, classValues=classValues, fn=statc.mean)
-
-def impute_missing(data):
-    #remove attributes with only unknown values
-    newatts = []
-    for at in data.domain.attributes:
-        svalues = [ 1 for a in data if a[at].isSpecial() ]
-        real = len(data) - len(svalues)
-        if real > 0:
-            newatts.append(at)
-
-    dom2 = orange.Domain(newatts, data.domain.classVar)
-    data = orange.ExampleTable(dom2, data)
-
-    #impute
-    from Orange.orng import orngTree 
-    imputer = orange.ImputerConstructor_model() 
-    imputer.learnerContinuous = imputer.learnerDiscrete = orange.MajorityLearner()
-    imputer = imputer(data)
-
-    data = imputer(data)
-    return data
-
-def setSig_example(ldata, ex, genesets):
-    """
-    Create a dictionary with (geneset_name, geneset_expression) for every
-    geneset on input.
-
-    ldata is class-annotated data
-    """
-    #gene set scores for each example in the learning set
-    pathways = {}
-
-    def setSig_example_geneset(ex, data):
-        """ ex contains only selected genes """
-
-        distances = [ [], [] ]    
-
-        def pearsonr(v1, v2):
-            try:
-                return statc.pearsonr(v1, v2)[0]
-            except:
-                return numpy.corrcoef([v1, v2])[0,1]
-
-        def pearson(ex1, ex2):
-            attrs = range(len(ex1.domain.attributes))
-            vals1 = [ ex1[i].value for i in attrs ]
-            vals2 = [ ex2[i].value for i in attrs ]
-            return pearsonr(vals1, vals2)
-
-        def ttest(ex1, ex2):
-            try:
-                return stats.lttest_ind(ex1, ex2)[0]
-            except:
-                return 0.0
-        
-        #maps class value to its index
-        classValueMap = dict( [ (val,i) for i,val in enumerate(data.domain.classVar.values) ])
-     
-        #create distances to all learning data - same or other class
-        for c in data:
-            distances[classValueMap[c[-1].value]].append(pearson(c, ex))
-
-        return ttest(distances[0], distances[1])
-           
-    for name, gs in genesets.items(): #for each geneset
-        #for each gene set: take the attribute subset and work on the attribute subset only
-        #only select the subset of genes from the learning data
-        domain = orange.Domain([ldata.domain.attributes[ai] for ai in gs], ldata.domain.classVar)
-        datao = orange.ExampleTable(domain, ldata)
-        example = orange.Example(domain, ex) #domains need to be the same
-      
-        ocena = setSig_example_geneset(example, datao)
-        pathways[name] = ocena
-        
-    return pathways
-
-class SetSig(object):
-
-    def __init__(self, **kwargs):
-        for a,b in kwargs.items():
-            setattr(self, a, b)
-
-    def __call__(self, example):
-        return setSig_example(self.learndata, example, self.genesets)
-
-class SetSigLearner(object):
-
-    def __call__(self, data, matcher, geneSets, minSize=3, maxSize=1000, minPart=0.1, classValues=None):
-        data, oknames, gsetsnum = selectGenesetsData(data, matcher, geneSets, \
-            minSize=minSize, maxSize=maxSize, minPart=minPart, classValues=classValues)
-        return SetSig(learndata=data, genesets=gsetsnum)
-
-if __name__ == "__main__":
-
-    data = Orange.data.Table("iris")
-    gsets = obiGeneSets.collections({
-        #"ALL": ['sepal length', 'sepal width', 'petal length', 'petal width'],
-        "f3": ['sepal length', 'sepal width', 'petal length'],
-        "l3": ['sepal width', 'petal length', 'petal width'],
-        })
-
-    fp = 120
-    ldata = orange.ExampleTable(data.domain, data[:fp])
-    tdata = orange.ExampleTable(data.domain, data[fp:])
-
-    matcher = obiGene.matcher([])
-
-    choosen_cv = ["Iris-setosa", "Iris-versicolor"]
-    #ass = AssessLearner()(data, matcher, gsets, rankingf=AT_loessLearner())
-    ass = AssessLearner()(data, matcher, gsets, minPart=0.0)
-    #ass = MeanLearner()(data, matcher, gsets, default=False)
-    #ass = CORGsLearner()(data, matcher, gsets)
-    #ass = MedianLearner()(data, matcher, gsets)
-    #ass = PLSLearner()(data, matcher, gsets, classValues=choosen_cv, minPart=0.0)
-    #ass = SetSigLearner()(ldata, matcher, gsets, classValues=choosen_cv, minPart=0.0)
-    #ass = PCALearner()(ldata, matcher, gsets, classValues=choosen_cv, minPart=0.0)
-    #ass = GSALearner()(ldata, matcher, gsets, classValues=choosen_cv, minPart=0.0)
-
-    ar = defaultdict(list)
-    for d in (list(ldata) + list(tdata))[:5]:
-        for a,b in ass(d).items():
-            ar[a].append(b)
-
-    def pp1(ar):
-        ol =  sorted(ar.items())
-        print '\n'.join([ a.id + ": " +str(b) for a,b in ol])
-
-    pp1(ar)
-

File orangecontrib/bio/obiGeneSetSig.py

 from collections import defaultdict
 
 import scipy.stats
-
 import numpy
-
 import Orange, Orange.utils, statc
 
 if __name__ == "__main__":
     __package__ = "Orange.bio"
 
-from .obiGsea import takeClasses
-from .obiAssess import pca, PLSCall, corgs_activity_score
 from . import obiExpression, obiGene, obiGeneSets, obiGsea, stats
 
+
+def corgs_activity_score(ex, corg):
+    """ activity score for a sample for pathway given by corgs """
+    #print [ ex[i].value for i in corg ] #FIXME what to do with unknown values?
+    return sum(ex[i].value if ex[i].value != '?' else 0.0 for i in corg)/len(corg)**0.5
+
+
+def PLSCall(data, y=None, x=None, nc=None, weight=None, save_partial=False):
+
+    def normalize(vector):
+        return vector / numpy.linalg.norm(vector)
+
+    if y == None:
+        y = [ data.domain.classVar ]
+    if x == None:
+        x = [v for v in data.domain.variables if v not in y]
+
+    Ncomp = nc if nc is not None else len(x)
+        
+    dataX = Orange.data.Table(Orange.data.Domain(x, False), data)
+    dataY = Orange.data.Table(Orange.data.Domain(y, False), data)
+
+    # transformation to numpy arrays
+    X = dataX.toNumpy()[0]
+    Y = dataY.toNumpy()[0]
+
+    # data dimensions
+    n, mx = numpy.shape(X)
+    my = numpy.shape(Y)[1]
+
+    # Z-scores of original matrices
+    YMean = numpy.mean(Y, axis = 0)
+    XMean = numpy.mean(X, axis = 0)
+    
+    X = (X-XMean)
+    Y = (Y-YMean)
+
+    P = numpy.empty((mx,Ncomp))
+    T = numpy.empty((n,Ncomp))
+    W = numpy.empty((mx,Ncomp))
+    E,F = X,Y
+
+    dot = numpy.dot
+    norm = numpy.linalg.norm
+
+    #PLS1 - from Gutkin, Shamir, Dror: SlimPLS
+
+    for i in range(Ncomp):
+        w = dot(E.T,F)
+        w = w/norm(w) #normalize w; in Gutkin et al. they do w*c, where c is 1/norm(w)
+        t = dot(E, w) #t_i -> a row vector
+        p = dot(E.T, t)/dot(t.T, t) #p_i t.T is a row vector - this is inner(t.T, t.T)
+        q = dot(F.T, t)/dot(t.T, t) #q_i
+            
+        E = E - dot(t, p.T)
+        F = F - dot(t, q.T)
+
+        T[:,i] = t.T
+        W[:,i] = w.T
+        P[:,i] = p.T
+
+    return XMean, W, P, T
+
+
+def pca(data, snapshot=0):
+    "Perform PCA on M, return eigenvectors and eigenvalues, sorted."
+    M = data.toNumpy("a")[0]
+    XMean = numpy.mean(M, axis = 0)
+    M = M - XMean
+
+    T, N = numpy.shape(M)
+    # if there are less rows T than columns N, use snapshot method
+    if (T < N) or snapshot:
+        C = numpy.dot(M, numpy.transpose(M))
+        evals, evecsC = numpy.linalg.eigh(C) #columns of evecsC are eigenvectors
+        evecs = numpy.dot(M.T, evecsC)/numpy.sqrt(numpy.abs(evals))
+    else:
+        K = numpy.dot(numpy.transpose(M), M)
+        evals, evecs = numpy.linalg.eigh(K)
+    
+    evecs = numpy.transpose(evecs)
+
+    # sort the eigenvalues and eigenvectors, descending order
+    order = (numpy.argsort(numpy.abs(evals))[::-1])
+    evecs = numpy.take(evecs, order, 0)
+    evals = numpy.take(evals, order)
+    return evals, evecs, XMean
+
+
 class GeneSetTrans(object):
 
     __new__ = Orange.utils._orange__new__(object)
     def __call__(self, data, weight_id=None):
 
         #selection of classes and gene sets
-        data = takeClasses(data, classValues=self.class_values)
+        data = obiGsea.takeClasses(data, classValues=self.class_values)
         nm,_ =  self._mat_ni(data)
         gene_sets = select_genesets(nm, self.gene_sets, self.min_size, self.max_size, self.min_part)
 
         ol =  sorted(ar.items())
         print '\n'.join([ a + ": " +str(b) for a,b in ol])
 
-    ass = LLR(data, matcher=matcher, gene_sets=gsets, class_values=choosen_cv, min_part=0.0, normalize=True)
+    #ass = LLR(data, matcher=matcher, gene_sets=gsets, class_values=choosen_cv, min_part=0.0, normalize=True)
     #ass = LLR_slow(data, matcher=matcher, gene_sets=gsets, class_values=choosen_cv, min_part=0.0)
     ass = CORGs(data, matcher=matcher, gene_sets=gsets, class_values=choosen_cv, min_part=0.0, cv=True)
     ar = to_old_dic(ass.domain, data[:5])