Marko Toplak avatar Marko Toplak committed 7b87ecd

Gene matching documentation.

Comments (0)

Files changed (6)


+   reference/gene.rst


+from import obiGene
+#matching targets are NCBI gene IDs
+targets = obiGene.NCBIGeneInfo("Homo sapiens").keys()
+gm = obiGene.GMNCBI("9606")
+for gene in [ "cct7", "pls1", "gdi1", "nfkb2", "dlg7" ]:
+    print 'Gene', gene, 'is NCBI gene', gm.umatch(gene)


+from import obiGene, obiKEGG
+targets = obiKEGG.KEGGOrganism("9606").get_genes() #KEGG gene IDs
+gmkegg = obiGene.GMKEGG("9606")
+gmgo = obiGene.GMGO("9606")
+gmkegggo = obiGene.matcher([[gmkegg, gmgo]], direct=False) #joined matchers
+genes = [ "cct7", "pls1", "gdi1", "nfkb2", "a2a299" ]
+print "%12s" % "gene", "%12s" % "KEGG", "%12s" % "GO", "%12s" % "KEGG+GO"
+for gene in genes:
+    print "%12s" % gene, "%12s" % gmkegg.umatch(gene), \
+          "%12s" % gmgo.umatch(gene), \
+          "%12s" % gmkegggo.umatch(gene)


+from import obiKEGG, obiGene
+keggorg = obiKEGG.KEGGOrganism("mmu")
+kegg_genes = keggorg.get_genes() 
+query = [ "Fndc4", "Itgb8", "Cdc34", "Olfr1403" ] 
+gm = obiGene.GMKEGG("mmu") #use KEGG aliases for gene matching
+gm.set_targets(kegg_genes) #set KEGG gene aliases as targets
+for name in query:
+    match = gm.umatch(name)
+    if match:
+    	pwys = keggorg.get_pathways_by_genes([match])
+        print name, "is in"
+        pathways = [ obiKEGG.KEGGPathway(p).title for p in pwys ]
+        if pathways:
+            for a in pathways:
+                print ' ', a
+        else:
+            print '  /'


+.. py:currentmodule::
+.. index:: gene matching
+.. index:: gene name matching
+.. index:: matching
+.. index:: NCBI
+.. index:: gene info
+Gene name matching (:mod:`obiGene`)
+To use gene matchers,
+first set the target gene names with :obj:`~Matcher.set_targets` and then
+match with the :obj:`~Matcher.match` or :obj:`~Matcher.umatch` functions. The
+following example (:download:` <code/>`)
+matches gene names to NCBI gene IDs:
+.. literalinclude:: code/
+Gene name matching
+Genes can have multiple aliases. When we combine data from different
+sources, for example expression data with GO gene sets, we have to
+match gene aliases representing the same genes. All implemented matching
+methods are based on sets of gene aliases for one gene.
+.. autoclass:: Matcher
+   :members:
+This module provides the following gene matchers:
+.. autoclass:: MatcherAliasesKEGG
+.. autoclass:: MatcherAliasesGO
+.. autoclass:: MatcherAliasesDictyBase
+.. autoclass:: MatcherAliasesNCBI
+.. autoclass:: MatcherAliasesEnsembl
+.. autoclass:: MatcherDirect
+Gene name matchers can be applied in sequence (until the first match) or combined (overlapping sets of gene aliases of multiple gene matchers are combined) with the :obj:`matcher` function.
+.. autofunction:: matcher
+The following example tries to match input genes onto KEGG gene aliases (:download:` <code/>`).
+.. literalinclude:: code/
+Results show that GO aliases cannot match onto KEGG gene IDs. For the last gene, only the joined GO and KEGG aliases produce a match::
+        gene         KEGG           GO      KEGG+GO
+        cct7    hsa:10574         None    hsa:10574
+        pls1     hsa:5357         None     hsa:5357
+        gdi1     hsa:2664         None     hsa:2664
+       nfkb2     hsa:4791         None     hsa:4791
+      a2a299         None         None     hsa:7052
+The following example finds KEGG pathways with given genes (:download:` <code/>`).
+.. literalinclude:: code/
+    Fndc4 is in
+      /
+    Itgb8 is in
+      PI3K-Akt signaling pathway
+      Focal adhesion
+      ECM-receptor interaction
+      Cell adhesion molecules (CAMs)
+      Regulation of actin cytoskeleton
+      Hypertrophic cardiomyopathy (HCM)
+      Arrhythmogenic right ventricular cardiomyopathy (ARVC)
+      Dilated cardiomyopathy
+    Cdc34 is in
+      Ubiquitin mediated proteolysis
+      Herpes simplex infection
+    Olfr1403 is in
+      Olfactory transduction


 class Matcher(object):
-    Gene matcher tries to match an input gene to some target.
+    Matches an input gene to some target gene (set in advance).
     def copy(self):
     def set_targets(self, targets):
-        Set input list of gene names as targets. 
-        Abstract function.
+        Set input list of gene names (a list of strings) as target genes.
     def match(self, gene):
-        """Returns a list of matching target gene names."""
+        """Return a list of target gene aliases which share a set of aliases with the input gene (can be empty)."""
     def umatch(self, gene):
-        """Returns an unique (only one matching target) target or None"""
+        """Return the single (unique) matching target gene, or None if there are no matches or multiple matches."""
         mat = self.match(gene)
         return mat[0] if len(mat) == 1 else None
     def explain(self, gene):
-        Returns an gene matches with explanations as lists of tuples. 
-        Each tuple consists of a list of target genes in a set
-        of aliases matched to input gene, returned as a second part
-        of the tuple.
+        Return gene matches with explanations as lists of tuples:
+        a list of matched target genes and the corresponding set of gene aliases.
 class MatcherAliasesKEGG(MatcherAliasesPickled):
+    """ Alias: GMKEGG. 
+    """
     def _organism_name(self, organism):
         return obiKEGG.organism_name_search(organism)
 class MatcherAliasesGO(MatcherAliasesPickled):
+    """ Alias: GMGO.
+    """
     def _organism_name(self, organism):
         """ Returns internal GO organism name. Used to define file name. """
         MatcherAliasesPickled.__init__(self, ignore_case=ignore_case)
 class MatcherAliasesDictyBase(MatcherAliasesPickled):
+    """ Alias: GMDicty.
+    """
     def create_aliases(self):
         from . import obiDicty
         MatcherAliasesPickled.__init__(self, ignore_case=ignore_case)
 class MatcherAliasesNCBI(MatcherAliasesPickled):
+    """ Alias: GMNCBI.
+    """
     def _organism_name(self, organism):
         return NCBIGeneInfo.organism_name_search(organism)
 class MatcherAliasesEnsembl(MatcherAliasesPickled):
-    """ A matcher for Ensemble ids
+    """ A matcher for Ensembl ids. Alias: GMEnsemble.
     DEF_ATTRS = ["ensembl_gene_id", "external_gene_id", "entrezgene"]
     # taxid: (dataset_name, [name_attr1, name_attr2 ...])
 class MatcherDirect(Matcher):
-    Direct matching to targets.
+    Directly match target names. Can ignore case. Alias: GMDirect.
     def __init__(self, ignore_case=True):
 def matcher(matchers, direct=True, ignore_case=True):
-    Build a matcher from a sequence of matchers. If a sequence element is a
-    sequence, join matchers in the subsequence.
+    Builds a new matcher from a list of gene matchers. Apply matchers in
+    the input list successively until a match is found. If a list element
+    is a list, join matchers in the list by joining overlapping sets
+    of aliases.
-    direct - if True, add a direct matcher to targets
-    ignore_case - if True, ignores case with optionally added direct matcher 
+    :param matchers: gene matchers.  
+    :param direct: If True, first try
+      to match gene directly (a :obj:`MatcherDirect` is inserted in front of the
+      gene matcher sequence).  
+    :param ignore_case: passed to the added
+      direct matcher.
     seqmat = []
     if direct:
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.