Commits

Marko Toplak  committed 9728b6d

GSEA documentation.

  • Participants
  • Parent commits e39bd86

Comments (0)

Files changed (11)

File docs/reference-html/gsea1.py

-import Orange
-from Orange.bio import obiGsea, obiGene
-
-data = Orange.data.Table("iris")
-
-gen1 = dict([
-    ("sepal",["sepal length", "sepal width"]), 
-    ("petal",["petal length", "petal width", "petal color"])
-    ])
-
-res = obiGsea.runGSEA(data, matcher=obiGene.matcher([]), minSize=2, geneSets=gen1)
-print "%5s  %6s %6s %s" % ("LABEL", "NES", "P-VAL", "GENES")
-for gs,resu in res.items():
-    print "%5s  %6.3f %6.3f %s" % (gs.id, resu["nes"], resu["p"], str(resu["genes"]))

File docs/reference-html/gsea2.py

-import Orange
-from Orange.bio import obiDicty, obiGeneSets, obiGsea,obiGene
-
-dbc = obiDicty.DatabaseConnection()
-data = dbc.get_single_data(sample='pkaC-', time="8")
-
-#select the first chip (the first attribute)
-data = data.translate([data.domain.attributes[0]], True)
-
-matcher = obiGene.matcher([[obiGene.GMKEGG("dicty"), obiGene.GMDicty()]])
-genesets =  obiGeneSets.collections((("KEGG",), "dicty"))
-
-res = obiGsea.runGSEA(data, matcher=matcher, minPart=0.05, 
-    geneSets=genesets, permutation="gene")
-
-print "%-40s %6s %6s %6s %7s" % ("LABEL", "NES", "P-VAL", "SIZE", "MATCHED")
-for name,resu in sorted(res.items()[:10], key=lambda x: x[1]["p"]): 
-    print "%-40s %6.3f %6.3f %6d %7d" % (name.name[:35], resu["nes"],
-        resu["p"], resu["size"], resu["matched_size"]) 
-

File docs/reference-html/gsea3.py

-import Orange
-from Orange.bio import obiDicty, obiGeneSets, obiGsea, obiGene, obiGEO
-
-gds = obiGEO.GDS("GDS10")
-data = gds.getdata() 
-
-print "Possible phenotype descriptors:"
-print map(lambda x: x[0], obiGsea.allgroups(data).items())
-
-matcher = obiGene.matcher([obiGene.GMKEGG("Homo sapiens")])
-genesets = obiGeneSets.collections((("KEGG",), "Homo sapiens"))
-
-#the number of permutations (n) should be much higher
-res = obiGsea.runGSEA(data, matcher=matcher, minPart=0.05, geneSets=genesets, 
-    permutation="class", n=10, phenVar="tissue", geneVar="gene") 
-
-print
-print "GSEA results (descriptor: tissue)"
-print "%-40s %6s %6s %6s %7s" % ("LABEL", "NES", "FDR", "SIZE", "MATCHED") 
-for gs, resu in sorted(res.items(), key=lambda x: x[1]["fdr"])[:10]: 
-    print "%-40s %6.3f %6.3f %6d %7d" % (gs.name[:30],
-        resu["nes"], resu["fdr"], resu["size"], resu["matched_size"]) 

File docs/reference-html/obiKEGG.htm

-<html>
-
-<head>
-<title>obiKEGG: Interface to KEGG(Kyoto Encyclopedia of Genes and Genomes)</title>
-<link rel=stylesheet href="style.css" type="text/css">
-<link rel=stylesheet href="style-print.css" type="text/css" media=print>
-</head>
-
-<body>
-<h1>obiKEGG: Interface to KEGG (Kyoto Encyclopedia of Genes and Genomes)</h1>
-<p>obiKEGG is an interface to data and information from <a href="http://www.genome.jp/kegg/">Kyoto Encyclopedia of Genes and Genomes</a>. The module supports a programmable access to KEGG pathways and information on genes.</p>
-
-<h2>Local data manipulation</h2>
-<h2><index>Update</index></h2>
-<p>Is a class used for managing local KEGG database copy.</p>
-<p class=section>Methods</p>
-<dl class=attributes>
-	<dt>getinstance(local_database_path=None, progressCallback=None)</dt>
-	<dd>Return an instance of the Update class initialized to local_database_path. progressCallback will be used to report progress an all download progress.</dd>
-	
-	<dt>UpdateOrganism(org)</dt>
-	<dd>Download organism specific pathways and genes to the local database</dd>
-	
-	<dt>UpdateReference()</dt>
-	<dd>Download reference pathways</dd>
-	
-	<dt>UpdateEnzymeAndCompounds()</dt>
-	<dd>Downloads enzyme and compound data</dd>
-</dl>
-
-<h2><index>KEGGOrganism</index></h2>
-<p><code>KEGGOrgansim</code> is a class for easy access to organism specific data.</p>
-<p class=section>Attributes</p>
-<dl class=attributes>
-	<dt>org</dt>
-	<dd>Kegg organism code (e.g. "hsa")</dd>
-	<dt>local_database_path</dt>
-	<dd>Path to the local database. If <code>None</code> the <code>obiKEGG.default_database_path</code> will be used</dd>
-</dl>
-<p class="header">Example: construction</p>
-<xmp class="code">org = KEGGOrganism("hsa", local_database_path="C:\\kegg")</xmp>
-<p class=section>Methods</p>
-<dl class=attributes>
-	<dt>list_pathways()</dt>
-	<dd>Return a list of all organism specific pathways.</dd>
-	
-	<dt>get_linked_pathways(pathway_id)</dt>
-	<dd>Return a list of all organism specific pathways that pathway with <code>pathway_id</code> links to.</dd>
-	
-	<dt>get_genes()</dt>
-	<dd>Return a list of all specific genes</dd>
-	
-	<dt>get_genes_by_pathway(pathway_id)</dt>
-	<dd>Return a set of all organism specific genes that are in the pathway with <code>pathway_id</code>.</dd>
-	
-	<dt>get_enzymes_by_pathway(pathway_id)</dt>
-	<dd>Return a set of all organism specific enzymes that are in the pathway with <code>pathway_id</code>.</dd>
-	
-	<dt>get_compounds_by_pathway(pathway_id)</dt>
-	<dd>Return a set of all organism specific compounds that are in the pathway with <code>pathway_id</code>.</dd>
-	
-	<dt>get_pathways_by_genes(genes)</dt>
-	<dd>Return a list of all organism specific pathways that contain all the genes.</dd>
-	
-	<dt>get_enriched_pathways_by_genes(genes, reference=None, callback=None)</dt>
-	<dd>Return a dictionary with enriched pathways ids as keys and (list_of_genes, p_value, num_of_reference_genes) tuples as items.</dd>
-	
-	<dt>get_pathways_by_enzymes(enzymes)</dt>
-	<dd>Return a list of all organism specific pathways that contain all the enzymes.</dd>
-	
-	<dt>get_pathways_by_compounds(compounds)</dt>
-	<dd>Return a list of all organism specific pathways that contain all the compounds.</dd>
-	
-	<dt>get_enzymes_by_compound(compound)</dt>
-	<dd>Return a list of all organism specific enzymes that are involved in a reaction with <code>compound</code>.</dd>
-	
-	<dt>get_compounds_by_enzyme(enzyme)</dt>
-	<dd>Return a list of all compounds that are involved in a reaction with the <code>enzyme</code>.</dd>
-	
-	<dt>get_genes_by_enzyme(enzyme)</dt>
-	<dd>Return a list of all genes that are involved with the production of <code>enzyme</code>.</dd>
-	
-	<dt>get_enzymes_by_gene(gene)</dt>
-	<dd>Return a list of all enzymes that are a product of <code>gene</code>.</dd>
-	
-	<dt>get_unique_gene_ids(genes, caseSensitive=True)</dt>
-	<dd>Return a tuple with three elements. The first is a dictionary mapping from unique gene ids to gene names in <code>genes</code>, the second is a list of conflicting gene names and the third is a list of unknown genes.</dd>
-	
-</dl>
-
-<h3>Examples<h3>
-
-<p class="header">Getting enriched pathways</p>
-<xmp class="code">org = KEGGOrganism("hsa")
-genes, _, _ = org.get_unique_gene_ids([...])
-res = orng.get_enriched_pathways(genes)
-for p_id, (genes, p_value, refCount) in res.items():
-	print "Pathway id %s with p-value %.4f" %(p_id, p_value)</xmp>
-
-
-<h2><index>KEGGPathway</index></h2>
-<p><code>KEGGPathway</code> is a class for pathway specific data</p>
-<p class=section>Attributes</p>
-<dl class=attributes>
-	<dt>pathway_id</dt>
-	<dd>Kegg pathway id code (e.g. "hsa:00052")</dd>
-	<dt>local_database_path</dt>
-	<dd>Path to the local database. If <code>None</code> the <code>obiKEGG.default_database_path</code> will be used</dd>
-</dl>
-<p class=section>Methods</p>
-<dl class=attributes>
-	<dt>get_image()</dt>
-	<dd>Return an PIL image of the pathway</dd>
-	
-	<dt>get_colored_image(objects)</dt>
-	<dd>Return an PIL image of the pathway with marked <code>objects</code></dd>
-	
-	<dt>get_bounding_box(object)</dt>
-	<dd>Return a bounding box of the form (x1, y1, x2, y2) of <code>object</code> on the pathway image</dd>
-	
-	<dt>get_bounding_box_dict()</dt>
-	<dd>Return a dictionary mapping all objects on the pathways to bounding boxes (x1, y1, x2, y2) on the pathway image</dd>
-	
-	<dt>get_genes()</dt>
-	<dd>Return all genes on the pathway</dd>
-	
-	<dt>get_enzymes()</dt>
-	<dd>Return all enzymes on the pathway</dd>
-	
-	<dt>get_compounds()</dt>
-	<dd>Return all compounds on the pathway</dd>
-	
-</dl>
-
-<h3>Example<h3>
-<p class="header">Retrieval of enriched pathway images with marked genes</p>
-<xmp class="code">for p_id, (genes, _, _) in res.items():
-	pathway = KEGGPathway(p_id)
-	image = pathway.get_colored_image(genes)
-	image.save(p_id)</xmp>

File docs/rst/index.rst

    reference/geo.rst
    reference/omim.rst
    reference/go.rst
+   reference/gsea.rst
 
 Installation
 ------------

File docs/rst/reference/code/gsea1.py

+import Orange.bio.gsea
+import Orange.bio.gene
+import Orange.bio.geneset
+
+data = Orange.data.Table("iris")
+
+gen1 = Orange.bio.geneset.GeneSets(dict([
+    ("sepal", ["sepal length", "sepal width"]), 
+    ("petal", ["petal length", "petal width", "petal color"])
+    ]))
+
+res = Orange.bio.gsea.runGSEA(data, matcher=Orange.bio.gene.matcher([]), minSize=2, geneSets=gen1)
+print "%5s  %6s %6s %s" % ("LABEL", "NES", "P-VAL", "GENES")
+for gs,resu in res.items():
+    print "%5s  %6.3f %6.3f %s" % (gs.id, resu["nes"], resu["p"], str(resu["genes"]))

File docs/rst/reference/code/gsea2.py

+import Orange
+from Orange.bio import dicty, geneset, gsea, gene
+
+dbc = dicty.DatabaseConnection()
+data = dbc.get_single_data(sample='pkaC-', time="8")
+
+#select the first chip (the first attribute)
+data = data.translate([data.domain.attributes[0]], True)
+
+matcher = gene.matcher([[gene.GMKEGG("dicty"), gene.GMDicty()]])
+genesets =  geneset.collections((("KEGG",), "dicty"))
+
+res = gsea.runGSEA(data, matcher=matcher, minPart=0.05, 
+    geneSets=genesets, permutation="gene")
+
+print "%-40s %6s %6s %6s %7s" % ("LABEL", "NES", "P-VAL", "SIZE", "MATCHED")
+for name,resu in sorted(res.items()[:10], key=lambda x: x[1]["p"]): 
+    print "%-40s %6.3f %6.3f %6d %7d" % (name.name[:35], resu["nes"],
+        resu["p"], resu["size"], resu["matched_size"]) 
+

File docs/rst/reference/code/gsea3.py

+import Orange
+from Orange.bio import dicty, geneset, gsea, gene, geo
+
+gds = geo.GDS("GDS10")
+data = gds.getdata() 
+
+matcher = gene.matcher([gene.GMKEGG("Homo sapiens")])
+genesets = geneset.collections((("KEGG",), "Homo sapiens"))
+
+#the number of permutations (n) should be much higher
+res = gsea.runGSEA(data, matcher=matcher, minPart=0.05, 
+    geneSets=genesets, permutation="class", n=10, 
+    phenVar="tissue", geneVar="gene") 
+
+print
+print "GSEA results (descriptor: tissue)"
+print "%-40s %6s %6s %6s %7s" % ("LABEL", "NES", "FDR", "SIZE", "MATCHED") 
+for gs, resu in sorted(res.items(), key=lambda x: x[1]["fdr"])[:10]: 
+    print "%-40s %6.3f %6.3f %6d %7d" % (gs.name[:30],
+        resu["nes"], resu["fdr"], resu["size"], resu["matched_size"]) 

File docs/rst/reference/code/gsea4.py

+import Orange
+from Orange.bio import dicty, geneset, gsea, gene, geo
+
+gds = geo.GDS("GDS10")
+data = gds.getdata(transpose=True) 
+
+matcher = gene.matcher([gene.GMKEGG("Homo sapiens")])
+genesets = geneset.collections((("KEGG",), "Homo sapiens"))
+
+#the number of permutations (n) should be much higher
+res = gsea.runGSEA(data, matcher=matcher, minPart=0.05, 
+    geneSets=genesets, permutation="class", n=10, 
+    phenVar=data.domain["tissue"], geneVar=True) 
+
+print
+print "GSEA results (descriptor: tissue)"
+print "%-40s %6s %6s %6s %7s" % ("LABEL", "NES", "FDR", "SIZE", "MATCHED") 
+for gs, resu in sorted(res.items(), key=lambda x: x[1]["fdr"])[:10]: 
+    print "%-40s %6.3f %6.3f %6d %7d" % (gs.name[:30],
+        resu["nes"], resu["fdr"], resu["size"], resu["matched_size"]) 

File docs/rst/reference/gsea.rst

+============================================================
+Gene Set Enrichment Analysis (GSEA, :mod:`gsea`)
+============================================================
+
+.. py:currentmodule:: Orange.bio.gsea
+
+
+Gene Set Enrichment Analysis (GSEA) aims to identify enriched gene sets
+given gene expression data for multiple samples with their phenotypes.
+
+.. autofunction:: Orange.bio.gsea.runGSEA
+
+Examples: gene expression data
+------------------------------
+
+The following examples use a gene expression data set from the GEO database.
+We show the same analysis on two formats of data.
+
+With samples as instances (in rows):
+
+.. literalinclude:: code/gsea4.py
+
+With samples as features (in columns):
+
+.. literalinclude:: code/gsea3.py
+
+Both scripts output::
+
+    GSEA results (descriptor: tissue)
+    LABEL                                       NES    FDR   SIZE MATCHED
+    Porphyrin and chlorophyll meta           -1.817  0.000     43      23
+    Staphylococcus aureus infectio           -1.998  0.000     59      28
+    Non-homologous end-joining                1.812  0.000     13      12
+    Fanconi anemia pathway                    1.911  0.000     53      27
+    Cell cycle                                1.777  0.000    124     106
+    Glycine, serine and threonine            -2.068  0.000     39      29
+    HIF-1 signaling pathway                  -1.746  0.000    106      90
+    Ether lipid metabolism                   -1.788  0.000     42      27
+    Fc epsilon RI signaling pathwa           -1.743  0.000     70      53
+    B cell receptor signaling path           -1.782  0.000     72      62
+
+Example: our own gene sets
+--------------------------
+
+We present a simple example on iris data set. Because data set
+is not a gene expression data set, we had to specify our own
+sets of features that belong together.
+
+.. literalinclude:: code/gsea1.py
+
+The output::
+
+    LABEL     NES  P-VAL GENES
+    sepal   1.087  0.630 ['sepal width', 'sepal length']
+    petal  -1.117  0.771 ['petal width', 'petal length']
+
+
+Example: directly passing correlation data
+------------------------------------------
+
+GSEA can also directly use correlation data between individual genes
+and a phenotype. If (1) input data with only one example (attribute names 
+are gene names) or (2) there is only one continuous feature in the given 
+data set (gene names are in the first :obj:`Orange.feature.String`.
+The following outputs ten pathways with smallest p-values.
+
+.. literalinclude:: code/gsea2.py
+
+The output::
+
+    LABEL                                       NES  P-VAL   SIZE MATCHED
+    Biosynthesis of amino acids               1.407  0.056     58      40
+    beta-Alanine metabolism                   1.165  0.232     13      10
+    Taurine and hypotaurine metabolism        1.160  0.413      4       3
+    Porphyrin and chlorophyll metabolis      -0.990  0.517     14       5
+    Valine, leucine and isoleucine degr       0.897  0.585     29      21
+    Ether lipid metabolism                    0.713  0.857     10       6
+    Biosynthesis of unsaturated fatty a       0.659  0.922     10       6
+    Protein processing in endoplasmic r       0.647  0.941     71      40
+    RNA polymerase                            0.550  0.943     24       7
+    Glycosylphosphatidylinositol(GPI)-a      -0.540  0.946     19       4
+
+

File orangecontrib/bio/gsea.py

 
         #transpose is not needed if phenVar is classVar or phenVar is False
         #and there is only one sample
-        if phenVar == data.domain.classVar or \
+        if isinstance(phenVar, Orange.feature.Descriptor) or \
             (phenVar == False and len(data) == 1):
 
             if geneVar == None: #if not specified, set as true in this stage
             dom = orange.Domain(floatvars, phenVar)
             return orange.ExampleTable(dom, data)
 
-        elif phenVar == False or phenVar != data.domain.classVar:
+        elif phenVar == False or isinstance(phenVar, basestring):
 
             cands = allgroups(data)
             pv = False
         matcher=None, geneVar=None, phenVar=None, caseSensitive=False, 
         **kwargs):
     """
-    phenVar and geneVar specify the phenotype and gene variable.
+    Run Gene Set Enrichment Analysis.
+
+    :param str organism: The organism taxonomy id (or name). Needed 
+       to build default gene sets and gene matcher.
+    :param Orange.data.Table data: Gene expression data.
+    :param Orange.bio.gene.Matcher matcher: Initialized gene matcher. If not given, a gene matcher is built with the KEGG database.
+    :param tuple classValues: A pair of values describing two distinct phenotypes. Each element can also be a list of values.
+       are computed. Only examples with a chosen phenotypes are analysised. Default: use all phenotypes in the data.
+    :param Orange.bio.geneset.GeneSets geneSets: Gene sets. Default: ???.
+    :param n: Number of permutations for significance computation. Default: 100.
+    :param str permutation: Permutation type, "class" (default) for phenotypes, "gene" for genes. We recommend "gene" permutations for data sets with less than 10 samples even though they ignore gene-gene interactions.
+    :param int minSize:
+    :param int maxSize: Minimum and maximum allowed number of genes from gene set also the data set. Defaults: 3 and 1000.
+    :param float minPart: Minimum fraction of genes from the gene set also in the data set. Default: 0.1.
+    :param int atLeast: Minimum number of valid gene values for each phenotype (the rest are ignored). Default: 3.
+    :param phenVar: Location of data on phenotypes. By default the `data.domain.class_var` is used if it exists. If :obj:`phenVar` is False, the genes are already taken as ranked. If string, the corresponding entry from ``attributes`` dictionary of individual features specifies the phenotype. In latter case, each attribute represents one sample.
+    :param geneVar: Locations of gene names. If True, gene names are attribute names. If a string, that entry of the individual features'``attributes`` dictionary is used. If each attribute specifies a sample, then the user should pass the meta variable containing the gene names. Defaults to attribute names if each example specifies one sample.
+    
+    :return: a dictionary where key is a gene set label and 
+        its value a dictionary of 
+        (1) enrichment score (es), 
+        (2) normalized enrichment score (nes), 
+        (3) P-value (p), 
+        (4) FDR (fdr), 
+        (5) whole gene set size (size), 
+        (6) number of matched genes from the gene set (matched_size), 
+        (7) gene names of matched genes in the data set (genes).
+
     """
     gso = GSEA(data, organism=organism, matcher=matcher, 
         classValues=classValues, atLeast=atLeast, caseSensitive=caseSensitive,