Commits

Miha Stajdohar committed 22ee2f8

Compute the class entropy and the conditional class entropy given projections.

  • Parent commit c6b442c

Files changed (1)

File examples/projections/entropy_vr_mm.py

+__author__ = '"Miha Stajdohar" <miha.stajdohar@gmail.com>'
+
+import matplotlib
+matplotlib.use('Agg')
+
+import os.path, sys, itertools
+import numpy as np
+import _modelmaps as mm
+import cPickle as pickle
+
+import scatterplot
+import radviz
+
+from Orange import clustering, data, utils
+
+ROOT = "/home/miha/work/res/modelmaps"
+ROOT = "C:\\Users\\Miha\\work\\res\\modelmaps"
+#ROOT = "/Network/Servers/xgridcontroller.private/lab/mihas/modelmaps"
+
+def parse_arg(X):
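+    # Accept either a row of the model meta-data table (taking its stored
+    # per-instance predictions) or a plain ndarray of class values.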
+    if isinstance(X, data.Instance) and "model" in X.domain:
+        return X["model"].value.instance_predictions
+    elif isinstance(X, np.ndarray):
+        return X
+    else:
+        raise AttributeError("Argument must be an Instance from the model meta-data table or an ndarray of class values.")
+
+def entropy_slower(*X):
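+    # Joint Shannon entropy H(X1, ..., Xn) = -sum_c p(c) log2 p(c), where c
+    # ranges over all combinations of class values and p(c) is the fraction
+    # of instances whose predictions jointly equal c. Enumerates the full
+    # product of class-value sets, hence slower than entropy() below.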
+    X = [parse_arg(x) for x in X]
+    return sum(-p * np.log2(p) if p > 0 else 0 for p in
+        (np.mean(reduce(np.logical_and, (predictions == c for predictions, c in zip(X, classes))))
+            for classes in itertools.product(*[set(x) for x in X])))
+
+def entropy(*X, **kwargs):
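+    # Same joint entropy as entropy_slower(), computed recursively: v masks
+    # the instances consistent with the class values fixed so far, H carries
+    # the accumulated sum, and branches with an empty mask are pruned.
+    # For example, entropy(np.array([0, 0, 1, 1])) gives 1.0 bit and
+    # entropy(np.array([0, 0, 1, 1]), np.array([0, 1, 0, 1])) gives 2.0 bits.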
+    predictions = parse_arg(X[0])
+    H = kwargs["H"] if "H" in kwargs else 0
+    v = kwargs["v"] if "v" in kwargs else np.array([True] * len(predictions))
+
+    if not np.sum(v):
+        return H
+
+    for c in set(predictions):
+        if len(X) > 1:
+            H = entropy(*X[1:], v=np.logical_and(v, predictions == c), H=H)
+        else:
+            p = np.mean(np.logical_and(v, predictions == c))
+            H -= p * np.log2(p) if p > 0 else 0
+    return H
+
+def conditional_entropy(X, Y):
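+    # Conditional entropy H(X|Y) = sum_{x,y} p(x,y) log2(p(y) / p(x,y)).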
+    predictions_x = parse_arg(X)
+    predictions_y = parse_arg(Y)
+
+    H = 0
+    for c_x in set(predictions_x):
+        for c_y in set(predictions_y):
+            p_xy = np.mean(np.logical_and(predictions_x == c_x, predictions_y == c_y))
+            p_y = np.mean(predictions_y == c_y)
+            H += p_xy * np.log2(p_y / p_xy) if p_xy > 0 else 0
+
+    return H
+
+def mutual_information(X, Y):
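+    # Mutual information I(X;Y) = H(X) - H(X|Y).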
+    return entropy(X) - conditional_entropy(X, Y)
+
+def conditional_mutual_information(X, Y, Z):
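+    # Conditional mutual information I(X;Y|Z) = H(X,Z) + H(Y,Z) - H(X,Y,Z) - H(Z).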
+    return entropy(X, Z) + entropy(Y, Z) - entropy(X, Y, Z) - entropy(Z)
+
+def mutual_information3(X, Y, Z):
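+    # Interaction information I(X;Y;Z) = I(X;Y) - I(X;Y|Z).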
+    return mutual_information(X, Y) - conditional_mutual_information(X, Y, Z)
+
+def print_mi(models):
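+    # Print the mutual information of every pair of models and the sum over
+    # all pairs.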
+    smi = 0
+    for i in range(len(models)):
+        for j in range(i):
+            mi = mutual_information(models[i], models[j])
+            smi += mi
+            print mi
+    print smi
+
+def class_uncertainty(models):
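+    # For each prefix M1..Mi of the model list compute
+    #   j[i] = H(M1, ..., Mi), the joint entropy of the predictions, and
+    #   u[i] = H(C, M1, ..., Mi) - H(M1, ..., Mi) = H(C | M1, ..., Mi),
+    # the class uncertainty remaining after the first i projections, along
+    # with the models' P scores and the cumulative count of distinct
+    # attributes they use.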
+    classes = models[0]["model"].value.instance_classes
+
+    j = [entropy(*(models[:i + 1])) for i in range(len(models))]
+    u = [entropy(*([classes] + models[:i + 1])) - j[i] for i in range(len(models))]
+    p = [model["P"].value for model in models[:len(u)]]
+    a = []
+    seen = set()
+    for model in models[:len(u)]:
+        seen.update(model["attributes"].value.split(", "))
+        a.append(len(seen))
+
+    return u, p, a, j
+
+def compare_model_maps(mm1, mm2):
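+    # True iff the two model maps' matrices hold identical values.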
+    return mm1.get_values() == mm2.get_values()
+
+def compare_model_tables(t1, t2):
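+    # True iff corresponding models in both tables produced identical
+    # per-instance probability estimates.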
+    return np.mean([np.mean(t1[i]["model"].value.probabilities == t2[i]["model"].value.probabilities) for i in range(len(t1))]) == 1.
+
+def print_res(txt):
+    with open(os.path.join(ROOT, "_projections_", "entropy_results_%s.txt" % sys.platform), "a") as fp:
+        fp.write(txt + "\n")
+
+DATASETS = [("zoo", None), ("breast-cancer-wisconsin", None), ("wine", None), ("voting", None), ("adult_sample", None),
+            ("car", None), ("glass", None), ("lenses", None), ("marketing", None), ("dermatology", None),
+            ("mushroom", None), ("vehicle",  None), ("wdbc", None), ("primary-tumor", None)]
+
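+# Map each dataset name to its settings; maxproj caps the number of
+# projections used (None falls back to the default of 15 below).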
+DATASETS = {DATASET: {"maxproj": maxproj} for DATASET, maxproj in DATASETS}
+
+DO = ["breast-cancer-wisconsin", "voting", "zoo", "mushroom"]
+#DO = ["adult_sample", "glass", "marketing", "primary-tumor", "vehicle", "wdbc"] # "dermatology",
+
+DO = ["mushroom"]
+
+for DATASET in DATASETS.iterkeys():
+    fname = os.path.join(utils.environ.dataset_install_dir, "%s.tab" % DATASET)
+    if not os.path.isfile(fname):
+        fname = os.path.join(ROOT, "tab", "%s.tab" % DATASET)
+
+        if not os.path.isfile(fname):
+            raise IOError("File %s not found." % fname)
+
+for DATASET in DO:
+    print "DATASET:", DATASET
+    print "building Scatterplot map..."
+    scatterplot.build_scatterplots(DATASET, ROOT)
+    print "building Radviz 3 map..."
+    radviz.build_radviz(DATASET, ROOT, 3)
+    print "building Radviz 4 map..."
+    radviz.build_radviz(DATASET, ROOT, 4)
+
+method = [#("scatterplots", scatterplot.scatterplots_in_vr_mm),
+          #("radviz_3", radviz.radviz_in_vr_mm),
+          ("radviz_4", radviz.radviz_in_vr_mm_4)
+]
+
+for key in DO:
+    val = DATASETS[key]
+    for method_name, method_func in method:
+
+        print "Dataset: %s" % key
+        print "VizRank: %s" % method_name
+
+        fname = os.path.join(ROOT, "_projections_", "%s_%s_%s.bz2" % (method_name, key, sys.platform))
+        if not os.path.isfile(fname):
+            print "Warning: file %s not found." % fname
+            continue
+
+        _, model_data, original_data = mm.load(fname)
+
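+        # Cap large model tables at the 9000 models with the highest P score.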
+        if len(model_data) > 9000:
+            indices = sorted([(i, ex) for i, ex in enumerate(model_data)], key=lambda x: x[1]["P"].value, reverse=True)[:9000]
+            indices, _ = zip(*indices)
+            model_data = model_data.get_items_ref(list(indices))
+
+        models = sorted(model_data, key=lambda x: x["P"].value, reverse=True)
+
+        print_res("Dataset: %s" % key)
+        print_res("Method: %s" % method_name)
+
+        model_limit = 15 if val["maxproj"] is None else val["maxproj"]
+
+        uncertainty, score, attributes, entropy_ = class_uncertainty(models[:model_limit])
+        n_models = len(uncertainty)
+
+        print_res("VizRank")
+        print_res("uncertainty: %s" % ", ".join([str(s) for s in uncertainty]))
+        print_res("entropy: %s" % ", ".join([str(s) for s in entropy_]))
+        print_res("score: %s" % ", ".join([str(s) for s in score]))
+        print_res("attributes: %s" % ", ".join([str(s) for s in attributes]))
+
+#        print "Model Map: kmeans (%d models)" % len(model_data)
+#        _, models = method_func(None, model_data, original_data, key, ROOT, clusters=n_models, seed=0)
+#        uncertainty, score, attributes, entropy_ = class_uncertainty(models[:n_models])
+#
+#        print_res("Model Map: kmeans")
+#        print_res("uncertainty: %s" % ", ".join([str(s) for s in uncertainty]))
+#        print_res("entropy: %s" % ", ".join([str(s) for s in entropy_]))
+#        print_res("score: %s" % ", ".join([str(s) for s in score]))
+#        print_res("attributes: %s" % ", ".join([str(s) for s in attributes]))
+#
+        print "Model Map: kmeans iterative"
+        _, models = method_func(None, model_data, original_data, key, ROOT, clusters=n_models, seed=0, iterative_clustering=True)
+        uncertainty, score, attributes, entropy_ = class_uncertainty(models[:n_models])
+
+#        pickle.dump(data.Table(models[0].domain, models), open(os.path.join(ROOT, "_projections_", "entropy_%s_%s_%s.pkl" % (key, method_name, sys.platform)), "wb"), -1)
+
+        print_res("Model Map: kmeans iterative")
+        print_res("uncertainty: %s" % ", ".join([str(s) for s in uncertainty]))
+        print_res("entropy: %s" % ", ".join([str(s) for s in entropy_]))
+        print_res("score: %s" % ", ".join([str(s) for s in score]))
+        print_res("attributes: %s" % ", ".join([str(s) for s in attributes]))
+#
+#        print "Model Map: hierarchical AVERAGE linkage"
+#        _, models = method_func(None, model_data, original_data, key, ROOT, clusters=n_models, seed=0, linkage=clustering.hierarchical.AVERAGE)
+#        uncertainty, score, attributes, entropy_ = class_uncertainty(models[:n_models])
+#
+#        print_res("Model Map: hierarchical AVERAGE")
+#        print_res("uncertainty: %s" % ", ".join([str(s) for s in uncertainty]))
+#        print_res("entropy: %s" % ", ".join([str(s) for s in entropy_]))
+#        print_res("score: %s" % ", ".join([str(s) for s in score]))
+#        print_res("attributes: %s" % ", ".join([str(s) for s in attributes]))
+
+#        print "Model Map: hierarchical SINGLE linkage"
+#        _, models = method_func(None, model_data, original_data, key, ROOT, clusters=n_models, seed=0, linkage=clustering.hierarchical.SINGLE)
+#        uncertainty, score, attributes, entropy_ = class_uncertainty(models[:n_models])
+#
+#        print_res("Model Map: hierarchical SINGLE")
+#        print_res("uncertainty: %s" % ", ".join([str(s) for s in uncertainty]))
+#        print_res("entropy: %s" % ", ".join([str(s) for s in entropy_]))
+#        print_res("score: %s" % ", ".join([str(s) for s in score]))
+#        print_res("attributes: %s" % ", ".join([str(s) for s in attributes]))
+#
+#        print "Model Map: hierarchical COMPLETE linkage"
+#        _, models = method_func(None, model_data, original_data, key, ROOT, clusters=n_models, seed=0, linkage=clustering.hierarchical.COMPLETE)
+#        uncertainty, score, attributes, entropy_ = class_uncertainty(models[:n_models])
+#        print
+#
+#        print_res("Model Map: hierarchical COMPLETE")
+#        print_res("uncertainty: %s" % ", ".join([str(s) for s in uncertainty]))
+#        print_res("entropy: %s" % ", ".join([str(s) for s in entropy_]))
+#        print_res("score: %s" % ", ".join([str(s) for s in score]))
+#        print_res("attributes: %s" % ", ".join([str(s) for s in attributes]))