Commits

Jure Žbontar committed 8f125f7

Test logistic regression and majority.

  • Parent commits 98409ac

Files changed (5)

+- No Table class.
+    OK, but we need the Table class for PostgreSQL.
+
+    Hm, what if the PostgreSQL Table behaved like a numpy ndarray?
+    That would be great, because then we could write algorithms for numpy
+    that would incidentally also work on PostgreSQL tables (see the
+    sketch after this list).
+- Why do we have both Classifier and Learner?
+- Less code, please! And what if we just always return the first one on a tie?
+- Isn't it better to keep classification and regression separate?
+- Why do we inherit at all? More code in one location.
+- Maybe it would be better if y were 1d.
+- For classification, y should be of type int.
+- Problem with domains in logistic regression and iris.
+
+- (Mine) For classification, bincount can get the number of classes wrong
+  (see the sketch after MajorityClassifier below).
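
A hedged sketch of the ndarray idea above: a hypothetical PostgreSQLTable
(the class name, cursor handling, and query are all illustrative, not part
of this commit) could implement numpy's __array__ protocol, so np.asarray
and numpy-only algorithms would incidentally work on it:

    import numpy as np

    class PostgreSQLTable:
        """Hypothetical table backed by a PostgreSQL query result."""
        def __init__(self, cursor, query):
            self.cursor = cursor
            self.query = query

        def __array__(self, dtype=None):
            # numpy calls this whenever it needs an ndarray,
            # e.g. in np.asarray(table) or np.mean(table).
            self.cursor.execute(self.query)
            return np.array(self.cursor.fetchall(), dtype=dtype)

    # np.asarray(PostgreSQLTable(cur, 'SELECT * FROM iris')) would then
    # behave like an ordinary 2d ndarray inside numpy-based learners.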

File Orange/classification/majority.py

 import collections
+import warnings
+
+from scipy.optimize import fmin_l_bfgs_b
+import bottleneck as bn
 import numpy as np
-import bottleneck as bn
+import scipy.sparse as sp
+
 from Orange import classification
 import Orange.data
 
-'''
-- No Table class.
-    OK, but we need the Table class for PostgreSQL.
-
-    Hm, what if the PostgreSQL Table behaved like a numpy ndarray?
-    That would be great, because then we could write algorithms for numpy
-    that would incidentally also work on PostgreSQL tables.
-- Why do we have both Classifier and Learner?
-- Less code, please! And what if we just always return the first one on a tie?
-- Isn't it better to keep classification and regression separate?
-- Why do we inherit at all? More code in one location.
-- Maybe it would be better if y were 1d.
-- For classification, y should be of type int.
-
-- (Mine) For classification, bincount can get the number of classes wrong.
-'''
+### utils
+def atleast2d(X):
+    # Sparse matrices are already 2d and np.atleast_2d does not handle
+    # them, so pass them through unchanged.
+    if sp.issparse(X):
+        return X
+    return np.atleast_2d(X)
 
 ### Majority
 class MajorityRegressor:
         self.median = np.median(y)
 
     def predict(self, X):
-        X = np.atleast_2d(X)
-        return np.tile(self.median, X.shape[0])
+        return np.tile(self.median, atleast2d(X).shape[0])
         
 
 class MajorityClassifier:
         self.dist = np.bincount(y, weights=w).astype(float) / X.shape[0]
 
     def predict_proba(self, X):
-        X = np.atleast_2d(X)
-        return np.tile(self.dist, (X.shape[0], 1))
+        return np.tile(self.dist, (atleast2d(X).shape[0], 1))
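
On the "(Mine)" bincount note from the TODO list: np.bincount only counts
up to the largest label that actually occurs, so a class missing from y
silently shrinks dist. np.bincount's minlength argument fixes this when
the number of classes is known:

    import numpy as np

    y = np.array([0, 0, 1])                 # class 2 never occurs
    print(np.bincount(y))                   # [2 1]    -> only two bins
    print(np.bincount(y, minlength=3))      # [2 1 0]  -> one bin per class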
 
 
 ### Logistic regression
-import warnings
-
-from scipy.optimize import fmin_l_bfgs_b
-import numpy as np
-import scipy.sparse as sp
-
 # helper functions
 def append_ones(X):
     if sp.issparse(X):
         return j, grad
 
     def fit(self, X, y):
+        assert np.issubdtype(y.dtype, int)
+        assert y.ndim == 1
+        assert y.min() == 0 and y.max() == 1
+
         theta = np.zeros(X.shape[1])
-        theta, _, ret = fmin_l_bfgs_b(self.cost_grad, theta, args=(X, y, lambda_))
+        self.theta, _, ret = fmin_l_bfgs_b(self.cost_grad, theta, 
+            args=(X, y, self.lambda_))
         if ret['warnflag'] != 0:
             warnings.warn('L-BFGS failed to converge')
-        return theta
 
-    def predict(self, X):
-        return sigmoid(X.dot(theta))
+    def predict_proba(self, X):
+        p1 = sigmoid(atleast2d(X).dot(self.theta))
+        return np.column_stack((np.ones_like(p1) - p1, p1))
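
Only the tail of cost_grad (return j, grad) survives in the hunk above.
A hedged sketch of what an L2-regularized negative log-likelihood and its
gradient could look like here (the exact penalty convention is an
assumption; sigmoid is the helper defined in this file):

    def cost_grad(self, theta, X, y, lambda_):
        # sketch only: the exact form in the commit is not shown
        sx = sigmoid(X.dot(theta))                       # P(y=1 | x)
        # negative log-likelihood plus L2 penalty
        j = -np.sum(y * np.log(sx) + (1 - y) * np.log(1 - sx))
        j += lambda_ * theta.dot(theta)
        # gradient with respect to theta
        grad = X.T.dot(sx - y) + 2 * lambda_ * theta
        return j, grad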
 
 '''
Maybe bloatify can solve my problems (I hope not)
 '''
 
+# ugly, ugly, ugly
 def bloatify(model):
     def classifier(*args, **kwargs):
         def fit(train):
-            def predict(test):
-                return m.predict(test)
-            m.fit(train)
+            def predict(test, what='values'):
+                if isinstance(test, Orange.data.Instance):
+                    X_test = test._x
+                else:
+                    X_test = test.X
+                if what == 'probabilities':
+                    P = m.predict_proba(X_test)
+                    return P
+                elif what == 'values':
+                    if hasattr(m, 'predict'):
+                        return m.predict(X_test)
+                    P = m.predict_proba(X_test)
+                    ind = np.argmax(P, axis=1)
+                    return np.array(train.domain.class_vars[0].values)[ind]
+            m.fit(train.X, train.Y)
+            return predict
         m = model(*args, **kwargs)
+        return fit
     return classifier
         
 MajorityClassifierBloat = bloatify(MajorityClassifier)
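
For reference, the closure chain in bloatify unrolls like this (mirroring
the __main__ block below):

    m = MajorityClassifierBloat()        # returns the fit closure
    c = m(data)                          # fits the model, returns predict
    print(c(data[0], what='values'))     # predicts a single Instance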
             raise NotImplementedError(
                 "Majority learner does not support multiple classes")
         class_var = data.domain.class_var
-        y = data.Y.ravel()
+        y = data.Y
         if isinstance(data.domain.class_var, Orange.data.ContinuousVariable):
             return DefaultClassifier(data.domain, bn.nanmedian(y))
         else:
             distr = np.asarray(distr, np.float)[:n_values]
             return DefaultClassifier(data.domain, distr=distr)
 
-
 class DefaultClassifier(classification.Classifier):
     def __init__(self, value=None, distr=None):
         if value is None:
         else:
             return np.tile(self.distr, len(x)).reshape(len(x), -1)
 
+
 if __name__ == '__main__':
+    ## classification
+    data = Orange.data.Table('../doc/datasets/iris') 
+    
+    ### majority
+    X, y = data.X, data.Y
 
-    data = Orange.data.Table('../doc/datasets/iris') 
     m = MajorityClassifier()
-    m.fit(data.X, data.Y.ravel().astype(int))
-    print(m.predict_proba(data.X[[0]]))
+    m.fit(X, y)
+    print(m.predict_proba(X[0]))
 
-    m = MajorityLearner()
+    m = MajorityClassifierBloat()
     c = m(data)
-    print(c(data[0]))
+    print(c(data[0], what='values'))
 
-    data = Orange.data.Table('../doc/datasets/housing') 
-    m = MajorityRegressor()
-    m.fit(data.X, data.Y.ravel())
+    ### logistic regression
+    y[y != 0] = 1
 
+    m = LogisticRegression()
+    m.fit(X, y)
+    print(m.predict_proba(X[0]))
 
+    m = LogisticRegressionBloat()
+    c = m(data)
+    print(c(data[0], what='probabilities'))
+#    print(c(data, what='values'))  # Oh right, that doesn't work with domains anyway
+
+
+#    ## regression
+#    data = Orange.data.Table('../doc/datasets/housing') 
+#    m = MajorityRegressor()
+#    m.fit(data.X, data.Y)
+#    print(m.predict(data.X[0]))

File Orange/data/instance.py

         if n_attrs > 5:
             res += ", ..."
         if self.domain.class_vars:
-            res += " | " + ", ".join(var.str_val(value) for var, value in
-                zip(self.domain.class_vars, self._y[:5]))
+            res += " | " + self.domain.class_vars[0].str_val(self._y)
+#            res += " | " + ", ".join(var.str_val(value) for var, value in
+#                zip(self.domain.class_vars, self._y[:5]))
         res += "]"
         if self.domain.metas:
             res += " {"
 
 
     def checksum(self):
-        return zlib.adler32(self._metas, zlib.adler32(self._values))
+        return zlib.adler32(self._metas, zlib.adler32(self._values))

File Orange/data/io.py

         table = Table.new_from_domain(domain, nExamples, self.weight_column >= 0)
         self.read_data(filename, table)
         self.reorder_values(table)
+        if len(domain.class_vars) == 1:
+            table._Y = table._Y.ravel()
         return table
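
This is the "maybe y should be 1d" TODO note in action: with exactly one
class variable, _Y drops from a column vector to a flat array. A tiny
illustration (shapes assumed for a 150-row table like iris):

    import numpy as np
    Y = np.zeros((150, 1))   # one class variable: column vector
    y = Y.ravel()            # shape (150,), what fit(X, y) now receives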
 

File Orange/data/table.py

 
 from .instance import *
 from Orange.data import io
+import Orange.data
 
 class RowInstance(Instance):
     def __init__(self, table, row_index):
         self.domain = domain
         self.n_rows = n_rows
         self._X = np.zeros((n_rows, len(domain.attributes)))
-        self._Y = np.zeros((n_rows, len(domain.class_vars)))
+
+        if isinstance(domain.class_var, Orange.data.ContinuousVariable):
+            dtype = float
+        else:
+            dtype = int
+
+        self._Y = np.zeros((n_rows, len(domain.class_vars)), dtype=dtype)
         if weights:
             self._W = np.ones(n_rows)
         else:
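
The int dtype above connects to the "y should be of type int" note:
np.bincount rejects float input, so the majority counting only works once
_Y is integral for discrete classes. A minimal illustration:

    import numpy as np
    y = np.array([0., 1., 1.])
    # np.bincount(y) raises TypeError (cannot safely cast float64 to int64)
    print(np.bincount(y.astype(int)))   # [1 2]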