Commits

Aleš Erjavec committed cf24e5d

Added obiExperiments module (functions from Genotype Distances widget).

Comments (0)

Files changed (1)

obiExperiments.py

+
+from collections import defaultdict
+from operator import add
+import numpy
+import math
+
+def data_type(vals):
+    """ Returns the most specific conversion (int, float, or identity)
+    that can be applied to all the given values. """
+    try:
+        _ = [ int(a) for a in vals ]
+        return int
+    except ValueError:
+        try:
+            _ = [ float(a) for a in vals ]
+            return float
+        except ValueError:
+            return lambda x: x
+
+def separate_by(data, separate, ignore=[], consider=None, add_empty=True):
+    """
+    data: the data; annotations are read from the .attributes of each attribute
+    separate: keys of at.attributes by which to separate the attributes into groups
+    ignore: ignore values of these annotations
+    consider: consider only these annotations
+    add_empty: pad groups with placeholders so that all groups cover the same relevant annotation values
+    """
+    ignore = set(ignore)
+
+    annotations = [ at.attributes for at in data.domain.attributes ]
+
+    all_values = defaultdict(set)
+    for a in annotations:
+        for k,v in a.iteritems():
+            all_values[k].add(v)
+
+    types = {}
+    for k,vals in all_values.iteritems():
+        types[k] = data_type(vals)
+    
+    groups = defaultdict(list)
+    for i,a in enumerate(annotations):
+        groups[tuple(a[k] for k in separate)].append(i)
+
+    different_in_all = set(k \
+        for k,vals in all_values.iteritems() \
+        if len(vals) == len(annotations) or len(vals) == 1)
+
+    other_relevant = set(all_values.keys()) - different_in_all - ignore - set(separate)
+    if consider is not None:
+        other_relevant &= set(consider)
+    other_relevant = sorted(other_relevant) #TODO how to order them?
+
+    def relevant_vals(annotation):
+        if isinstance(annotation, tuple):
+            return annotation
+        return tuple(types[v](annotation[v]) for v in other_relevant)
+
+    other_relevant_d2 = defaultdict(int) #"multiset": maximum number of
+    #occurrences of a relevant value in any single group
+    for _,g in groups.items():
+        d = defaultdict(int)
+        for i in g:
+            d[relevant_vals(annotations[i])] += 1
+        for rv,n in d.items():
+            if n > other_relevant_d2[rv]:
+                other_relevant_d2[rv] = n
+    
+    if add_empty: #fill in with "empty" relevant vals
+        ngroups = {}
+        for g in groups:
+            need_to_fill = other_relevant_d2.copy()
+            for i in groups[g]:
+                need_to_fill[relevant_vals(annotations[i])] -= 1
+            padding = []
+            for rv,num in need_to_fill.items():
+                for a in range(num):
+                    padding.append(rv)
+            ngroups[g] = groups[g] + padding
+        groups = ngroups
+
+    ngroups = {}
+    uniquepos = {} #which positions are unique
+    for g in groups:
+        elements = list(groups[g])
+
+        rv2 = lambda x: relevant_vals(annotations[x] if isinstance(x,int) else x)
+
+        ngroups[g] = map(lambda x: x if isinstance(x,int) else None,
+            sorted(elements, key=rv2))
+
+        d = defaultdict(int) #count occurrences of each relevant value
+        for i in elements:
+            d[rv2(i)] += 1
+        
+        uniquepos[g] = map(lambda x: not d[rv2(x)] > 1,
+             sorted(elements, key=rv2))
+    
+    return ngroups, uniquepos
+
+def float_or_none(value):
+    return value.value if value.value != "?" else None
+
+def linearize(data, ids):
+    """ Returns a list of floats in the data subspace (or None's
+    if the values are unknown or not present. """
+    l = [ [ None ] * len(data) if id1 == None \
+        else [ float_or_none(ex[id1]) for ex in data ] for id1 in ids ]
+    l = reduce(add, l)
+    return l
+
+def pearson_lists(l1, l2):
+    """ Returns pearson correlation between two lists. Ignores elements
+    which are None."""
+    okvals = [ (a,b) for a,b in zip(l1,l2) if a != None and b != None ]
+    return numpy.corrcoef([ [ v[0] for v in okvals], [ v[1] for v in okvals] ])[0,1]
+
+def euclidean_lists(l1, l2):
+    """ Returns pearson correlation between two lists. Ignores elements
+    which are None."""
+    okvals = [ (a,b) for a,b in zip(l1,l2) if a != None and b != None ]
+    return math.sqrt( sum((a-b)*(a-b) for a,b in okvals ))
+
+def spearman_lists(l1, l2):
+    """ Returns pearson correlation between two lists. Ignores elements
+    which are None."""
+    import scipy.stats
+    okvals = [ (a,b) for a,b in zip(l1,l2) if a != None and b != None ]
+    #print okvals, len(okvals)
+    return scipy.stats.spearmanr([ v[0] for v in okvals], [ v[1] for v in okvals] )[0]
+
+def dist_spearman(l1, l2):
+    return (1.-spearman_lists(l1, l2))/2
+
+def dist_pcorr(l1, l2):
+    #normalized to 0..1
+    return (1.-pearson_lists(l1, l2))/2
+
+def dist_eucl(l1, l2):
+    return euclidean_lists(l1, l2)
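
A minimal usage sketch (hypothetical, not part of this commit), assuming Orange 2.x and an ExampleTable whose column attributes carry a "strain" annotation in their .attributes, showing how separate_by, linearize and the distance functions could be combined:

    import orange
    from obiExperiments import separate_by, linearize, dist_pcorr

    data = orange.ExampleTable("expressions.tab")  # hypothetical expression file

    # group attribute (column) indices by strain; groups are padded with
    # None placeholders so that all groups align on the same annotation values
    groups, uniquepos = separate_by(data, ["strain"])

    keys = sorted(groups.keys())
    profiles = [ linearize(data, groups[k]) for k in keys ]

    # pairwise distances between strains: 1 - Pearson correlation, scaled to 0..1
    for i in range(len(keys)):
        for j in range(i + 1, len(keys)):
            print keys[i], keys[j], dist_pcorr(profiles[i], profiles[j])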