Commits

Marko Toplak committed 7e1dc12

Converted obiGEO documentation to rst.

  • Participants
  • Parent commits 236efba

Comments (0)

Files changed (8)

docs/rst/reference/code/geo_gds1.py

+"""
+Print out some information on specific GEO's data set.
+Does not download the data set.
+"""
+
+from Orange.bio import obiGEO
+import textwrap
+
+gdsinfo = obiGEO.GDSInfo()
+gds = gdsinfo["GDS10"]
+
+print "ID:", gds["dataset_id"]
+print "Features:", gds["feature_count"]
+print "Genes:", gds["gene_count"]
+print "Organism:", gds["platform_organism"]
+print "PubMed ID:", gds["pubmed_id"]
+print "Sample types:"
+for sampletype in set([sinfo["type"] for sinfo in gds["subsets"]]):
+    ss = [sinfo["description"] for sinfo in gds["subsets"] if sinfo["type"]==sampletype]
+    print "  %s (%s)" % (sampletype, ", ".join(ss))
+print
+print "Description:"
+print "\n".join(textwrap.wrap(gds["description"], 70))

docs/rst/reference/code/geo_gds2.py

+from Orange.bio import obiGEO
+
+gds = obiGEO.GDS("GDS1210")
+
+data = gds.getdata(report_genes=True, transpose=False)
+print "report_genes=True, transpose=False"
+print "Report=Genes, Rows=Genes/Spots"
+print "rows=%d cols=%d has_class=%s" % (len(data), len(data.domain.attributes), data.domain.classVar<>None)
+print
+
+data = gds.getdata(report_genes=False, transpose=False)
+print "report_genes=False, transpose=False"
+print "Report=Spots, Rows=Genes/Spots"
+print "rows=%d cols=%d has_class=%s" % (len(data), len(data.domain.attributes), data.domain.classVar<>None)
+print
+
+data = gds.getdata(report_genes=True, transpose=True)
+print "report_genes=True, transpose=True"
+print "Report=Genes, Rows=Samples"
+print "rows=%d cols=%d has_class=%s" % (len(data), len(data.domain.attributes), data.domain.classVar<>None)
+print "Class values:", " ".join([str(cv) for cv in data.domain.classVar.values]) 
+print
+
+
+data = gds.getdata(report_genes=True, transpose=True, sample_type="tissue")
+print 'report_genes=True, transpose=True sample_type="tissue"'
+print "Report=Genes, Rows=Samples"
+print "rows=%d cols=%d has_class=%s" % (len(data), len(data.domain.attributes), data.domain.classVar<>None)
+print "Class values:", " ".join([str(cv) for cv in data.domain.classVar.values]) 
+print

docs/rst/reference/code/geo_gds3.py

+from Orange.bio import obiGEO
+
+gds = obiGEO.GDS("GDS1676")
+data = gds.getdata(sample_type="infection")
+print "Genes: %d, Samples: %d" % (len(data), len(data.domain.attributes))
+
+for a in data.domain.attributes:
+    print a.name, a.attributes

docs/rst/reference/code/geo_gds4.py

+import Orange
+import glob
+import re
+
+filenames = glob.glob(Orange.utils.serverfiles.localpath("GEO") + "/GDS*.soft.gz")
+m = re.compile("(GDS[0-9]*).soft")
+print "%d data files cached:" % len(filenames)
+print " ".join([m.search(fn).group(1) for fn in filenames])
+

docs/rst/reference/code/geo_gds5.py

+"""
+Check all data files from GEO, find those which include at least N
+samples in all sample subsets of at least one sample type. Useful
+when, for instance, filtering out the data sets that could be used for
+supervised machine learning.
+"""
+
+from Orange.bio import obiGEO
+
+def valid(info, n=40):
+    """Return a set of subset types containing more than n samples in every subset"""
+    invalid = set()
+    subsets = set([sinfo["type"] for sinfo in info["subsets"]])
+    for sampleinfo in info["subsets"]:
+        if len(sampleinfo["sample_id"]) < n:
+            invalid.add(sampleinfo["type"])
+    return subsets.difference(invalid)
+
+def report(stypes, info):
+    """Pretty-print GDS and valid susbset types"""
+    for id, sts in stypes:
+        print id
+        for st in sts:
+            print "  %s:" % st,
+            gds = info[id]
+            print ", ".join(["%s/%d" % (sinfo["description"], len(sinfo["sample_id"])) \
+                             for sinfo in gds["subsets"] if sinfo["type"]==st])
+
+gdsinfo = obiGEO.GDSInfo()
+valid_subset_types = [(id, valid(info)) for id, info in gdsinfo.items() if valid(info)]
+report(valid_subset_types, gdsinfo)
+
+print 'N =', len(valid_subset_types)

docs/rst/reference/code/geo_gds6.py

+import Orange
+from Orange.bio import obiGEO
+
+gds = obiGEO.GDS("GDS2960")
+data = gds.getdata(sample_type="disease state", transpose=True)
+print "Samples: %d, Genes: %d" % (len(data), len(data.domain.attributes))
+
+learners = [ Orange.classification.logreg.LibLinearLogRegLearner ]
+results = Orange.evaluation.testing.cross_validation(learners, data, folds=10)
+print "AUC = %.3f" % Orange.evaluation.scoring.AUC(results)[0]

docs/rst/reference/geo.rst

 .. index:: microarray data sets
 
 **************************************************************
-An interface to NCBI's Gene Expression Omnibus (:mod:`obiGEO`)
+NCBI's Gene Expression Omnibus interface (:mod:`obiGEO`)
 **************************************************************
 
 :obj:`obiGEO` provides an interface to `NCBI
     >>> data.domain.attributes[0].attributes
     Out[191]: {'dose': '20 U/ml IL-2', 'infection': 'acute ', 'time': '1 d'}
 
-GDS
-===
+GDS classes
+===========
 
 .. autoclass:: GDSInfo
    :members:
 
-::
+An example that uses obj:`GDSInfo`::
 
-    >>> import obiGEO
+    >>> from Orage import obiGEO
     >>> info = obiGEO.GDSInfo()
     >>> info.keys()[:5]
     >>> ['GDS2526', 'GDS2524', 'GDS2525', 'GDS2522', 'GDS1618']
 
 
 Examples
-==================
+========
 
-Genes can have multiple aliases. When we combine data from different
-sources, for example expression data with GO gene sets, we have to
-match gene aliases representing the same genes. All implemented matching
-methods are based on sets of gene aliases for one gene.
+The following script prints out some information about a specific data
+set. It does not download the data set, just uses the (local) GEO data
+sets information file (:download:`geo_gds1.py <code/geo_gds1.py>`).
 
-.. autoclass:: Matcher
-   :members:
+.. literalinclude:: code/geo_gds1.py
+   :lines: 6-
 
-This modules provides the following gene matchers:
+The output of this script is::
 
-.. autoclass:: MatcherAliasesKEGG
+    ID: GDS10
+    Features: 39114
+    Genes: 19883
+    Organism: Mus musculus
+    PubMed ID: 11827943
+    Sample types:
+      disease state (diabetic, diabetic-resistant, nondiabetic)
+      strain (NOD, Idd3, Idd5, Idd3+Idd5, Idd9, B10.H2g7, B10.H2g7 Idd3)
+      tissue (spleen, thymus)
 
-.. autoclass:: MatcherAliasesGO
+    Description:
+    Examination of spleen and thymus of type 1 diabetes nonobese diabetic
+    (NOD) mouse, four NOD-derived diabetes-resistant congenic strains and
+    two nondiabetic control strains.
 
-.. autoclass:: MatcherAliasesDictyBase
 
-.. autoclass:: MatcherAliasesNCBI
+GEO data sets provide a sort of mini ontology for sample labeling. Samples
+belong to sample subsets, which in turn belong to specific types. Like
+above GDS10, which has three sample types, of which the subsets for
+the tissue type are spleen and thymus. For supervised data mining it
+would be useful to find out which data sets provide enough samples for
+each label. It is (semantically) convenient to perform classification
+within sample subsets of the same type. We therefore need a script
+that goes through the entire set of data sets and finds those, where
+there are enough samples within each of the subsets for a specific
+type. The following script does the work. The function ``valid``
+determines which subset types (if any) satisfy our criteria. The
+number of requested samples in the subset is by default set to ``n=40``
+(:download:`geo_gds5.py <code/geo_gds5.py>`).
 
-.. autoclass:: MatcherAliasesEnsembl
+.. literalinclude:: code/geo_gds5.py
+   :lines: 8-
 
-.. autoclass:: MatcherDirect
+The requested number of samples, ``n=40``, seems to be a quite
+a stringent criteria met - at the time of writing of this documentation -
+by 35 sample subsets. The output starts with::
 
-Gene name matchers can applied in sequence (until the first match) or combined (overlapping sets of gene aliases of multiple gene matchers are combined) with the :obj:`matcher` function.
+    GDS1611
+      genotype/variation: wild type/48, upf1 null mutant/48
+    GDS3553
+      cell type: macrophage/48, monocyte/48
+    GDS3953
+      protocol: training set/46, validation set/47
+    GDS3704
+      protocol: PUFA consumption/42, SFA consumption/42
+    GDS3890
+      agent: vehicle, control/46, TCE/48
+    GDS1490
+      other: non-neural/50, neural/100
+    GDS3622
+      genotype/variation: wild type/56, Nrf2 null/54
+    GDS3715
+      agent: untreated/55, insulin/55
 
-.. autofunction:: matcher
+Let us now pick data set GDS2960 and see if we can predict the disease
+state. We will use logistic regression, and within 10-fold cross
+validation measure AUC, the area under ROC. AUC is the probably for
+correctly distinguishing between two classes if picking the sample from
+target (e.g., the disease) and non-target class (e.g., control). From
+(:download:`geo_gds6.py <code/geo_gds6.py>`)
 
-The following example tries to match input genes onto KEGG gene aliases (:download:`genematch2.py <code/genematch2.py>`).
+.. literalinclude:: code/geo_gds6.py
 
-.. literalinclude:: code/genematch2.py
+The output of this script is::
+    
+    Samples: 101, Genes: 4068
+    AUC = 0.960
 
-Results show that GO aliases can not match onto KEGG gene IDs. For the last gene only joined GO and KEGG aliases produce a match::
-
-        gene         KEGG           GO      KEGG+GO
-        cct7    hsa:10574         None    hsa:10574
-        pls1     hsa:5357         None     hsa:5357
-        gdi1     hsa:2664         None     hsa:2664
-       nfkb2     hsa:4791         None     hsa:4791
-      a2a299         None         None     hsa:7052
-
-
-The following example finds KEGG pathways with given genes (:download:`genematch_path.py <code/genematch_path.py>`).
-
-.. literalinclude:: code/genematch_path.py
-
-Output::
-
-    Fndc4 is in
-      /
-    Itgb8 is in
-      PI3K-Akt signaling pathway
-      Focal adhesion
-      ECM-receptor interaction
-      Cell adhesion molecules (CAMs)
-      Regulation of actin cytoskeleton
-      Hypertrophic cardiomyopathy (HCM)
-      Arrhythmogenic right ventricular cardiomyopathy (ARVC)
-      Dilated cardiomyopathy
-    Cdc34 is in
-      Ubiquitin mediated proteolysis
-      Herpes simplex infection
-    Olfr1403 is in
-      Olfactory transduction
-
-
+The AUC for this data set is very high, indicating that using these gene
+expression data it is almost trivial to separate the two classes.

orangecontrib/bio/obiGEO.py

     loaded locally, else it downloads it from `NCBI's GEO FTP site
     <ftp://ftp.ncbi.nih.gov/pub/geo/DATA/SOFT/GDS/>`_.  The compressed
     data file resides in the cache directory after the call of the
-    constructor (call to ``orngServerFiles.localpath("GEO")`` reveals
+    constructor (call to ``Orange.utils.serverfiles.localpath("GEO")`` reveals
     the path of this directory).
 
     :param gdsname: an NCBI's ID for the data set in the form "GDSn"