1. Aleš Erjavec
  2. orangecontrib-earth

Commits

Aleš Erjavec  committed 247c4c0

Rename top level package.

  • Participants
  • Parent commits 00b1a7f
  • Branches default

Comments (0)

Files changed (22)

File orangecontrib/__init__.py

View file
"""
Namespace stub
"""
import pkg_resources

# Declare 'orangecontrib' as a setuptools namespace package so that other
# add-on distributions can install their own subpackages under it.
pkg_resources.declare_namespace(__name__)

File orangecontrib/earth/__init__.py

View file
+
+from .earth import EarthLearner, EarthClassifier, ScoreEarthImportance
+
+from .evimp import evimp, plot_evimp, bagged_evimp
+
+from .core import gcv

File orangecontrib/earth/core.py

View file
+"""
+EARTH interface
+"""
+
+from __future__ import division
+
+import ctypes
+import numpy
+
+from numpy import ctypeslib
+
+from . import _earth
+
# Load the compiled Earth shared library that lives alongside the '_earth'
# extension module, and bind its C-level 'Earth' entry point.
_c_earth_lib = ctypeslib.load_library(_earth.__file__, "")

_c_earth = _c_earth_lib.Earth

# Argument specification of the C Earth routine. Array arguments are
# declared as Fortran-contiguous ndpointers because the C code indexes
# them column-major; each argument is documented inline below.
_c_earth.argtypes = [
    # pBestGcv: GCV of the best model.
    ctypes.POINTER(ctypes.c_double),
    # pnTerms: number of terms in the final model
    ctypes.POINTER(ctypes.c_int),
    # BestSet: nMaxTerms x 1, indices of the best set of cols of bx
    ctypeslib.ndpointer(dtype=ctypes.c_bool),
    # bx: nCases x nMaxTerms basis matrix
    ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"),
    # Dirs: nMaxTerms x nPreds, -1,0,1,2 for iTerm, iPred
    ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=2, flags="F_CONTIGUOUS"),
    # Cuts: nMaxTerms x nPreds, cut for iTerm, iPred
    ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"),
    # Residuals: nCases x nResp
    ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"),
    # Betas: nMaxTerms x nResp
    ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"),
    # x: nCases x nPreds
    ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"),
    # y: nCases x nResp
    ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"),
    # WeightsArg: nCases x 1, can be NULL, currently ignored
    ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=1),
    # nCases: number of rows in x and elements in y
    ctypes.c_int,
    # nResp: number of cols in y
    ctypes.c_int,
    # nPred: number of cols in x
    ctypes.c_int,
    # nMaxDegree: Friedman's mi
    ctypes.c_int,
    # nMaxTerms: includes the intercept term
    ctypes.c_int,
    # Penalty: GCV penalty per knot
    ctypes.c_double,
    # Thresh: forward step threshold
    ctypes.c_double,
    # nMinSpan: set to non zero to override internal calculation
    ctypes.c_int,
    # Prune: do backward pass
    ctypes.c_bool,
    # nFastK: Fast MARS K
    ctypes.c_int,
    # FastBeta: Fast MARS aging coef
    ctypes.c_double,
    # NewVarPenalty: penalty for adding a new variable
    ctypes.c_double,
    # LinPreds: nPreds x 1, 1 if predictor must enter linearly
    ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=1),
    # UseBetaCache: 1 to use the beta cache, for speed
    ctypes.c_bool,
    # Trace: 0 none 1 overview 2 forward 3 pruning 4 more pruning
    ctypes.c_double,
    # sPredNames: predictor names in trace printfs, can be NULL
    ctypes.c_char_p
]
+
+
def forward_pass(x, y, weights=None, degree=1, terms=None, penalty=None,
                 thresh=0.001, min_span=0, fast_k=21, fast_beta=1,
                 new_var_penalty=2, trace=0):
    """
    Earth forward pass.

    Parameters
    ----------
    x : numpy.array
        An N x M array of predictors
    y : numpy.array
        An N x R array of responses
    weights : numpy.array
        An N x 1 array of instance weights (currently ignored; uniform
        weights are always passed to the C routine)
    degree : int
        Maximum term degree (default 1)
    terms : int
        Maximum number of terms to induce. If None then
        ``min(200, max(20, 2 * n_preds)) + 1`` is used (the same default
        as the earth R package).
    penalty : float
        GCV penalty per knot.
    thresh : float
        RSS stopping threshold.
    min_span : int
        Set to a non-zero value to override the internal calculation.
    new_var_penalty : int
        Penalty for adding a new variable in the model.
    fast_k : int
        Fast MARS K.
    fast_beta : int
        Fast MARS aging coefficient.
    trace : int
        Trace level for the C library (0 none ... 4 more pruning).

    Returns
    -------
    (n_terms, best_set, bx, dirs, cuts) as filled in by the C routine.

    """
    x = numpy.asfortranarray(x, dtype=ctypes.c_double)
    y = numpy.asfortranarray(y, dtype=ctypes.c_double)

    if x.shape[0] != y.shape[0]:
        raise ValueError("First dimensions of x and y must be the same.")

    if y.ndim == 1:
        y = y.reshape((-1, 1), order="F")

    if penalty is None:
        penalty = 3. if degree > 1. else 2.
    elif penalty < 0 and penalty != -1.:
        raise ValueError("Invalid 'penalty'")

    n_cases, n_preds = x.shape

    _, n_resp = y.shape

    if terms is None:
        # Automatic maximum number of terms (same default as the earth
        # R package and EarthLearner).
        terms = min(200, max(20, 2 * n_preds)) + 1

    # Output variables
    p_best_gcv = ctypes.c_double()
    n_term = ctypes.c_int()
    best_set = numpy.zeros((terms,), dtype=ctypes.c_bool, order="F")
    bx = numpy.zeros((n_cases, terms), dtype=ctypes.c_double, order="F")
    dirs = numpy.zeros((terms, n_preds), dtype=ctypes.c_int, order="F")
    cuts = numpy.zeros((terms, n_preds), dtype=ctypes.c_double, order="F")
    residuals = numpy.zeros((n_cases, n_resp), dtype=ctypes.c_double, order="F")
    betas = numpy.zeros((terms, n_resp), dtype=ctypes.c_double, order="F")

    # NOTE: the 'weights' argument is currently ignored; the C routine
    # receives uniform weights.
    weights = numpy.ones((n_cases,), dtype=ctypes.c_double, order="F")
    lin_preds = numpy.zeros((n_preds,), dtype=ctypes.c_int, order="F")
    use_beta_cache = True
    prune = False

    # These tests are performed in ForwardPass, and if they fail the function
    # calls exit. So we must check them here and raise an exception to avoid
    # a process shutdown.
    if n_cases < 8:
        raise ValueError("Need at least 8 data instances.")
    if n_cases > 1e8:
        raise ValueError("Too many data instances.")
    if n_resp < 1:
        raise ValueError("No response column.")
    if n_resp > 1e6:
        raise ValueError("Too many response columns.")
    if n_preds < 1:
        raise ValueError("No predictor columns.")
    if n_preds > 1e5:
        raise ValueError("Too many predictor columns.")
    if degree <= 0 or degree > 100:
        raise ValueError("Invalid 'degree'.")
    if terms < 3 or terms > 10000:
        raise ValueError("'terms' must be >= 3 and <= 10000.")
    if penalty < 0 and penalty != -1:
        raise ValueError("Invalid 'penalty' (the only legal negative value "
                         "is -1).")
    if penalty > 1000:
        raise ValueError("Invalid 'penalty' (must be <= 1000).")
    if thresh < 0.0 or thresh >= 1.0:
        raise ValueError("Invalid 'thresh' (must be in [0.0, 1.0) ).")
    if fast_beta < 0 or fast_beta > 1000:
        raise ValueError("Invalid 'fast_beta' (must be in [0, 1000] ).")
    if new_var_penalty < 0 or new_var_penalty > 10:
        raise ValueError("Invalid 'new_var_penalty' (must be in [0, 10] ).")
    if (numpy.var(y, axis=0) <= 1e-8).any():
        raise ValueError("Variance of y is zero (or near zero).")

    _c_earth(ctypes.byref(p_best_gcv), ctypes.byref(n_term),
             best_set, bx, dirs, cuts, residuals, betas,
             x, y, weights,
             n_cases, n_resp, n_preds,
             degree, terms, penalty, thresh, min_span, prune,
             fast_k, fast_beta, new_var_penalty, lin_preds,
             use_beta_cache, trace, None)

    return n_term.value, best_set, bx, dirs, cuts
+
+
def gcv(rss, n, n_effective_params):
    """
    Return the generalized cross validation (GCV) score.

    .. math:: gcv = rss / (n * (1 - NumEffectiveParams / n) ^ 2)

    Parameters
    ----------
    rss : array_like
        Residual sum of squares.
    n : float
        Number of training instances.
    n_effective_params : array_like
        Number of effective parameters.

    """
    shrink_factor = 1. - n_effective_params / n
    return rss / (n * shrink_factor ** 2)
+
+
def pruning_pass(bx, y, penalty, max_pruned_terms=None):
    """
    Earth pruning pass.

    Evaluate all backward-elimination subsets of the basis terms and
    select the one with the lowest GCV score.

    Parameters
    ----------
    bx : array_like
        An N x P basis matrix as induced by :func:`forward_pass`.
    y : array_like
        An N x R array of responses.
    penalty : float
        GCV penalty.
    max_pruned_terms : int
        Maximum number of terms to leave in the model. If `None` then
        all model sizes are considered.

    """
    subsets, rss_by_size = subset_selection_xtx(bx, y)

    n_cases, n_terms = bx.shape
    assert rss_by_size.shape == (n_terms,)

    # Effective number of parameters for every subset size: one per term
    # plus the per-knot penalty.
    eff_params = numpy.arange(n_terms) + 1.0
    eff_params = eff_params + penalty * (eff_params - 1.0) / 2.0

    gcv_by_size = gcv(rss_by_size, n_cases, eff_params)

    size_limit = n_terms if max_pruned_terms is None else max_pruned_terms

    best_size = numpy.argmin(gcv_by_size[:size_limit])

    selected = numpy.zeros((n_terms), dtype=bool)
    selected[subsets[best_size, :best_size + 1]] = True

    return selected, subsets, rss_by_size, gcv_by_size
+
+
def base_term_transform(X, cuts, directions, intercept=False,
                        out=None):
    """
    Transform a data array onto a set of earth basis terms.

    The terms are defined by an earth model parameterized by
    cuts and directions.

    Parameters
    ----------
    X : array_like
        An N x P input array.
    cuts : array_like
        A T x P array of hinge term knots.
    directions : array_like
        A T x P array of hinge term directions (-1, 0, 1, or 2; 2 means
        the predictor enters the term linearly).
    intercept : bool
        If True then the first row of directions and cuts is taken to be
        the intercept term (directions[0] must have all 0 elements).
    out : numpy.ndarray, optional
        An N x T array to store the result in (a new array is allocated
        when not supplied).

    Returns
    -------
    out : numpy.ndarray
        The N x T array of computed term values.

    """
    X = numpy.asanyarray(X)
    directions = numpy.asarray(directions)
    cuts = numpy.asarray(cuts)

    if directions.ndim == 1:
        directions = directions.reshape((1, -1))

    if cuts.ndim == 1:
        cuts = cuts.reshape((1, -1))

    if cuts.shape != directions.shape:
        raise ValueError("'cuts' and 'directions' must have the same shape")

    N, P = X.shape
    M, P_1 = directions.shape
    if P != P_1:
        raise ValueError("Dimension mismatch: 'X' has %i columns but "
                         "'directions' has %i." % (P, P_1))

    if out is None:
        out = numpy.zeros((N, M))
    elif out.shape != (N, M):
        raise ValueError("'out' is of wrong size")

    start_term = 0
    if intercept:
        if numpy.any(directions[0]):
            raise ValueError("The intercept term must have all zero "
                             "directions (directions[0]).")
        out[:, 0] = 1.0

        start_term = 1

    for termi in range(start_term, M):
        term_dirs = directions[termi]
        term_cuts = cuts[termi]

        # Columns entering as up/down hinges or linearly (direction 2).
        dir_p1 = numpy.where(term_dirs == 1)[0]
        dir_m1 = numpy.where(term_dirs == -1)[0]
        dir_2 = numpy.where(term_dirs == 2)[0]

        x1 = X[:, dir_p1] - term_cuts[dir_p1]
        x2 = term_cuts[dir_m1] - X[:, dir_m1]
        x3 = X[:, dir_2]

        # Clamp the hinges at zero: max(0, x - cut) and max(0, cut - x).
        x1 = numpy.where(x1 > 0.0, x1, 0.0)
        x2 = numpy.where(x2 > 0.0, x2, 0.0)

        # The term value is the product of all of its hinge components.
        Term = numpy.cumprod(numpy.hstack((x1, x2, x3)), axis=1)
        # Term can have a (N, 0) shape if all(term_dirs == 0)
        out[:, termi] = Term[:, -1] if Term.size else 0.0

    return out
+
+
def subset_selection_xtx(X, Y, lstsq=None):
    """
    A re-implementation of EvalSubsetsUsingXtx in the Earth c code.

    Using numpy least squares fitter.

    For every subset size k in 1..P pick a subset of k columns of `X`
    by greedy backward elimination (always keeping column 0, the
    intercept) and record its residual sum of squares.

    Parameters
    ----------
    X : array_like
        An N x P basis matrix.
    Y : array_like
        An N x R response array.
    lstsq : callable, optional
        A least squares solver with the ``numpy.linalg.lstsq`` call
        convention (defaults to ``numpy.linalg.lstsq``).

    Returns
    -------
    subsets : numpy.ndarray
        A P x P integer array; row k holds the column indices of the
        selected subset of size k + 1 in its first k + 1 entries.
    rss_vec : numpy.ndarray
        Residual sum of squares for each subset size.

    """
    X = numpy.asarray(X)
    Y = numpy.asarray(Y)

    if lstsq is None:
        lstsq = numpy.linalg.lstsq

    n_pred = X.shape[1]
    rss_vec = numpy.zeros(n_pred)
    # Must be a mutable list (not a bare `range`) because terms are
    # removed in-place during backward elimination; also required for
    # Python 3 compatibility where `range` is not a list.
    working_set = list(range(n_pred))
    subsets = numpy.zeros((n_pred, n_pred), dtype=int)

    for subset_size in reversed(range(n_pred)):
        subsets[subset_size, :subset_size + 1] = working_set
        X_work = X[:, working_set]
        b = lstsq(X_work, Y)[0]

        rss_vec[subset_size] = numpy.sum((Y - numpy.dot(X_work, b)) ** 2)

        XtX = numpy.dot(X_work.T, X_work)
        iXtX = numpy.linalg.pinv(XtX)
        diag = numpy.diag(iXtX).reshape((-1, 1))

        if subset_size == 0:
            break

        # Drop the term whose removal increases the RSS the least.
        delta_rss = b ** 2 / diag
        delta_rss = numpy.sum(delta_rss, axis=1)
        delete_i = numpy.argmin(delta_rss[1:]) + 1  # Keep the intercept
        del working_set[delete_i]

    return subsets, rss_vec

File orangecontrib/earth/earth.py

View file
+"""
+Earth Orange interface
+
+Example::
+
+    >>> import Orange, orangecontrib.earth
+    >>> data = Orange.data.Table("housing")
+    >>> c = orangecontrib.earth.EarthLearner(data, degree=2, terms=10)
+    >>> print c.to_string(precision=2)
+    MEDV =
+       23.59
+       -0.61 * max(0, LSTAT - 6.12)
+       +11.90 * max(0, RM - 6.43)
+       +1.14 * max(0, 6.43 - RM)
+       -228.80 * max(0, NOX - 0.65) * max(0, RM - 6.43)
+       +0.02 * max(0, TAX - 307.00) * max(0, 6.12 - LSTAT)
+       +0.03 * max(0, 307.00 - TAX) * max(0, 6.12 - LSTAT)
+
+"""
+
+from __future__ import division
+
+__all__ = ["EarthLearner", "EarthClassifier", "evimp", "bagged_evimp",
+           "ScoreEarthImportance"]
+
+import numpy
+
+import Orange
+
+from .core import forward_pass, pruning_pass, base_term_transform
+
+
def is_discrete(var):
    """Return True if `var` is an :class:`Orange.feature.Discrete`."""
    return isinstance(var, Orange.feature.Discrete)
+
+
def is_continuous(var):
    """Return True if `var` is an :class:`Orange.feature.Continuous`."""
    return isinstance(var, Orange.feature.Continuous)
+
+
def expand_discrete(var):
    """
    Expand a discrete variable ``var`` returning one continuous indicator
    variable for each value of ``var`` (if the number of values is greater
    than 2, else return only one indicator variable).

    """
    if len(var.values) > 2:
        values = var.values
    elif len(var.values) == 2:
        # A binary class needs a single indicator column (for the last
        # value); the other value is implied.
        values = var.values[-1:]
    else:
        values = var.values[:1]
    new_vars = []
    for value in values:
        new = Orange.feature.Continuous("{0}={1}".format(var.name, value))
        # Each indicator computes itself from `var` through a
        # Discrete2Continuous transformer (presumably 1.0 when
        # var == value, 0.0 otherwise — TODO confirm against Orange docs).
        new.get_value_from = cls = Orange.core.ClassifierFromVar(whichVar=var)
        cls.transformer = Orange.core.Discrete2Continuous()
        cls.transformer.value = int(Orange.core.Value(var, value))
        new.source_variable = var
        new_vars.append(new)
    return new_vars
+
+
def select_attrs(table, features, class_var=None,
                 class_vars=None, metas=None):
    """
    Return a new table containing only ``features`` from ``table``.
    """
    if class_vars is not None:
        new_domain = Orange.data.Domain(features, class_var,
                                        class_vars=class_vars)
    else:
        new_domain = Orange.data.Domain(features, class_var)

    if metas:
        new_domain.add_metas(metas)

    return Orange.data.Table(new_domain, table)
+
+
class EarthLearner(Orange.regression.base.BaseRegressionLearner):
    """
    Earth learner class.

    Supports both regression and classification problems. For classification,
    class values are expanded into continuous indicator columns (one for
    each value if the number of values is greater than 2), and a multi
    response model is fit to these new columns. The resulting classifier
    computes response values on new instances to select the final
    predicted class.

    :param int degree:
        Maximum degree (num. of hinge functions per term) of the terms
        in the model (default: 1).
    :param int terms:
        Maximum number of terms in the forward pass. If set to ``None``
        (default), ``min(200, max(20, 2 * n_attributes)) + 1`` will be
        used, like the default setting in earth R package.
    :param float penalty:
        Penalty for hinges in the GCV computation (used in the pruning pass).
        Default is 3.0 if ``degree`` is above 1, and 2.0 otherwise.
    :param float thresh:
        Threshold for RSS decrease in the forward pass (default: 0.001).
    :param int min_span:
        Minimum span; set to a non-zero value to override the internal
        calculation (default: 0).
    :param float new_var_penalty:
        Penalty for introducing a new variable in the model during the
        forward pass (default: 0).
    :param int fast_k: Fast k.
    :param float fast_beta: Fast beta.
    :param int pruned_terms:
        Maximum number of terms in the model after pruning (default:
        ``None``, no limit).
    :param bool scale_resp:
        Scale responses prior to forward pass (default: ``True``);
        Ignored for models with multiple responses.
    :param bool store_instances:
        Store training instances in the model (default: ``True``).

    """

    def __new__(cls, instances=None, weight_id=None, **kwargs):
        # Orange learner convention: constructing the learner with data
        # trains and returns a classifier immediately.
        self = Orange.regression.base.BaseRegressionLearner.__new__(cls)
        if instances is not None:
            self.__init__(**kwargs)
            return self.__call__(instances, weight_id)
        else:
            return self

    def __init__(self, degree=1, terms=None, penalty=None, thresh=1e-3,
                 min_span=0, new_var_penalty=0, fast_k=20, fast_beta=1,
                 pruned_terms=None, scale_resp=True, store_instances=True,
                **kwds):

        super(EarthLearner, self).__init__()

        self.degree = degree
        self.terms = terms
        if penalty is None:
            # Default penalty follows the earth R package convention.
            penalty = 3 if degree > 1 else 2
        self.penalty = penalty
        self.thresh = thresh
        self.min_span = min_span
        self.new_var_penalty = new_var_penalty
        self.fast_k = fast_k
        self.fast_beta = fast_beta
        self.pruned_terms = pruned_terms
        self.scale_resp = scale_resp
        self.store_instances = store_instances
        self.__dict__.update(kwds)

        # The class column is expanded manually (see expand_discrete),
        # so the continuizer must leave it alone.
        self.continuizer.class_treatment = \
            Orange.data.preprocess.DomainContinuizer.Ignore

    def __call__(self, instances, weight_id=None):
        """
        Train an :class:`EarthClassifier` instance on the `instances`.

        :param Orange.data.Table instances: Training instances.

        .. note:: `weight_id` is ignored.

        """
        expanded_class = None
        multitarget = False

        if instances.domain.class_var:
            instances = self.impute_table(instances)
            instances = self.continuize_table(instances)

            if is_discrete(instances.domain.class_var):
                # Expand a discrete class with indicator columns
                expanded_class = expand_discrete(instances.domain.class_var)
                y_table = select_attrs(instances, expanded_class)
                (y, ) = y_table.to_numpy_MA("A")
                (x, ) = instances.to_numpy_MA("A")
            elif is_continuous(instances.domain.class_var):
                x, y, _ = instances.to_numpy_MA()
                y = y.reshape((-1, 1))
            else:
                raise ValueError("Cannot handle the response.")
        elif instances.domain.class_vars:
            # Multi-target domain
            if not all(map(is_continuous, instances.domain.class_vars)):
                raise TypeError("Only continuous multi-target classes are "
                                "supported.")
            x_table = select_attrs(instances, instances.domain.attributes)
            y_table = select_attrs(instances, instances.domain.class_vars)

            # Impute and continuize only the x_table
            x_table = self.impute_table(x_table)
            x_table = self.continuize_table(x_table)

            (x, ) = x_table.to_numpy_MA("A")
            (y, ) = y_table.to_numpy_MA("A")

            multitarget = True
        else:
            raise ValueError("Class variable expected.")

        # check for non-finite values in y.
        if not numpy.isfinite(y).all():
            raise ValueError("Non-finite values present in Y")

        # mask non-finite values in x.
        x = numpy.ma.masked_invalid(x, copy=False)

        if self.scale_resp and y.shape[1] == 1:
            # Standardize a single response column (zero mean, unit std).
            sy = y - numpy.ma.mean(y, axis=0)
            sy = sy / numpy.ma.std(sy, axis=0)
        else:
            sy = y

        # replace masked values with means.
        if numpy.ma.is_masked(sy):
            mean_sy = numpy.ma.mean(sy, axis=0)
            sy = numpy.where(sy.mask, mean_sy, sy)

        if numpy.ma.is_masked(x):
            mean_x = numpy.ma.mean(x, axis=0)
            x = numpy.where(x.mask, mean_x, x)

        terms = self.terms
        if terms is None:
            # Automatic maximum number of terms
            terms = min(200, max(20, 2 * x.shape[1])) + 1

        # Forward pass is run on the (possibly scaled) response sy ...
        n_terms, used, bx, dirs, cuts = forward_pass(
            x, sy, degree=self.degree, terms=terms, penalty=self.penalty,
            thresh=self.thresh, min_span=self.min_span,
            fast_k=self.fast_k, fast_beta=self.fast_beta,
            new_var_penalty=self.new_var_penalty
        )

        # discard unused terms from bx, dirs, cuts
        bx = bx[:, used]
        dirs = dirs[used, :]
        cuts = cuts[used, :]

        # ... while pruning and the final fit use the original y.
        used, subsets, rss_per_subset, gcv_per_subset = \
            pruning_pass(bx, y, self.penalty,
                         max_pruned_terms=self.pruned_terms)

        # Fit final betas to the selected subset of basis terms.
        bx_used = bx[:, used]
        betas, res, rank, s = numpy.linalg.lstsq(bx_used, y)

        return EarthClassifier(
                   instances.domain, used, dirs, cuts, betas.T,
                   subsets, rss_per_subset, gcv_per_subset,
                   instances=instances if self.store_instances else None,
                   multitarget=multitarget,
                   expanded_class=expanded_class
               )
+
+
def soft_max(values):
    """
    Return the softmax of `values` (non-negative weights summing to 1).

    The input is shifted by its maximum before exponentiation; this
    leaves the result unchanged mathematically but prevents ``exp``
    from overflowing for large inputs.
    """
    values = numpy.asarray(values, dtype=float)
    exponents = numpy.exp(values - numpy.max(values))
    return exponents / numpy.sum(exponents)
+
+
class EarthClassifier(Orange.core.ClassifierFD):
    """
    Earth classifier.

    Holds a fitted earth model — basis terms defined by `dirs` and
    `cuts`, term selection in `best_set`, coefficients in `betas` —
    and predicts by evaluating the basis terms on new instances.
    """
    def __init__(self, domain, best_set, dirs, cuts, betas, subsets=None,
                 rss_per_subset=None, gcv_per_subset=None, instances=None,
                 multitarget=False, expanded_class=None,
                 **kwargs):
        self.multitarget = multitarget
        self.domain = domain
        self.class_var = domain.class_var
        if self.multitarget:
            self.class_vars = domain.class_vars

        self.best_set = best_set
        self.dirs = dirs
        self.cuts = cuts
        self.betas = betas
        self.subsets = subsets
        self.rss_per_subset = rss_per_subset
        self.gcv_per_subset = gcv_per_subset
        self.instances = instances
        self.expanded_class = expanded_class
        self.__dict__.update(kwargs)

    def __call__(self, instance, result_type=Orange.core.GetValue):
        """
        Predict the response value on `instance`.

        :param Orange.data.Instance instance:
            Input data instance.

        """
        # Determine the response variables the raw predictions map to.
        if self.multitarget and self.domain.class_vars:
            resp_vars = list(self.domain.class_vars)
        elif is_discrete(self.class_var):
            resp_vars = self.expanded_class
        else:
            resp_vars = [self.class_var]

        vals = self.predict(instance)
        vals = [var(val) for var, val in zip(resp_vars, vals)]

        from Orange.statistics.distribution import Distribution

        if not self.multitarget and is_discrete(self.class_var):
            # Map continuous indicator responses back to a class
            # distribution and pick its mode.
            dist = Distribution(self.class_var)

            # Random gen. for tie breaking.
            dist.random_generator = Orange.misc.random.Random(hash(instance))

            if len(self.class_var.values) == 2:
                # Single indicator column for a binary class.
                probs = [1 - float(vals[0]), float(vals[0])]
            else:
                probs = soft_max(map(float, vals))

            for val, p in zip(self.class_var.values, probs):
                dist[val] = p
            value = dist.modus()
            vals, probs = [value], [dist]
        else:
            # Continuous responses: degenerate distributions.
            probs = []
            for var, val in zip(resp_vars, vals):
                dist = Distribution(var)
                dist[val] = 1.0
                probs.append(dist)

        if not self.multitarget:
            vals, probs = vals[0], probs[0]

        if result_type == Orange.core.GetValue:
            return vals
        elif result_type == Orange.core.GetBoth:
            return vals, probs
        else:
            return probs

    def base_matrix(self, instances=None):
        """
        Return the base matrix (bx) of the Earth model for the table.

        Base matrix is a len(instances) * num_terms matrix of computed values
        of terms in the model (not multiplied by beta) for each instance.

        If table is not supplied, the base matrix of the training instances
        is returned.

        :param Orange.data.Table instances:
            Input instances for the base matrix.

        """
        if instances is None:
            instances = self.instances
        instances = select_attrs(instances, self.domain.attributes)
        (data,) = instances.to_numpy_MA("A")
        bx = base_term_transform(data, self.cuts, self.dirs, intercept=True)
        return bx

    def base_features(self):
        """
        Return a list of constructed features of Earth terms.

        The features can be used in Orange's domain translation
        (i.e. they define the proper :func:`get_value_from` functions).

        :rtype: list of :class:`Orange.feature.Descriptor`

        """
        terms = []
        dirs = self.dirs[self.best_set]
        cuts = self.cuts[self.best_set]
        # For faster domain translation all the features share
        # this _instance_cache.
        _instance_cache = {}
        for dir, cut in zip(dirs[1:], cuts[1:]):  # Drop the intercept
            hinge = [_format_knot(attr.name, dir1, cut1)
                     for (attr, dir1, cut1) in
                     zip(self.domain.attributes, dir, cut)
                     if dir1 != 0.0]
            term_name = " * ".join(hinge)
            term = Orange.feature.Continuous(term_name)
            term.get_value_from = term_computer(
                term, self.domain, dir, cut,
                _instance_cache=_instance_cache
            )

            terms.append(term)
        return terms

    def predict(self, instance):
        """
        Predict the response values.

        :param Orange.data.Instance instance:
            Data instance

        """
        data = Orange.data.Table(self.domain, [instance])
        bx = self.base_matrix(data)
        bx_used = bx[:, self.best_set]
        vals = numpy.dot(bx_used, self.betas.T).ravel()
        return vals

    def used_attributes(self, term=None):
        """
        Return the used features in `term` (index).

        If no term is given, return all features used in the model.

        :param int term: Term index

        """
        if term is None:
            # NOTE: relies on the Python 2 builtin `reduce`.
            return reduce(set.union, [self.used_attributes(i) \
                                      for i in range(self.best_set.size)],
                          set())

        attrs = self.domain.attributes

        used_mask = self.dirs[term, :] != 0.0
        return [a for a, u in zip(attrs, used_mask) if u]

    def evimp(self, used_only=True):
        """
        Return the estimated variable importances.

        :param bool used_only: If True return only used attributes.

        """
        return evimp(self, used_only)

    def __reduce__(self):
        # Pickle support: reconstruct from the five required constructor
        # arguments; everything else is restored through __dict__.
        return (type(self), (self.domain, self.best_set, self.dirs,
                            self.cuts, self.betas),
                dict(self.__dict__))

    def to_string(self, precision=3, indent=3):
        """
        Return a string representation of the model.

        This is also the default string representation of the model
        (as returned by :func:`str`)

        """
        if self.multitarget:
            r_vars = list(self.domain.class_vars)
        elif is_discrete(self.class_var):
            r_vars = self.expanded_class
        else:
            r_vars = [self.class_var]

        feature_names = [f.name for f in self.domain.features]
        r_names = [v.name for v in r_vars]
        betas = self.betas

        resp = []
        # NOTE(review): the loop variable shadows the outer `betas`;
        # harmless because zip() has already consumed the array, but
        # worth renaming on a future pass.
        for name, betas in zip(r_names, betas):
            resp.append(
                _format_response(name, betas, self.cuts[self.best_set],
                                 self.dirs[self.best_set], feature_names,
                                 precision, indent))

        return "\n\n".join(resp)

    def __str__(self):
        return self.to_string()
+
+
+def _format_response(resp_name, betas, cuts, dirs, pred_names,
+                     precision=3, indent=3):
+    header = "%s =" % resp_name
+    indent = " " * indent
+    fmt = "%." + str(precision) + "f"
+    terms = [([], fmt % betas[0])]  # Intercept
+
+    for beta, cut1, dir1 in zip(betas[1:], cuts[1:], dirs[1:]):
+        knots = [_format_knot(name, d, c, precision)
+                 for d, c, name in zip(dir1, cut1, pred_names)
+                 if d != 0]
+        term_attrs = [name for name, d in zip(pred_names, dir1) if d != 0]
+        sign = "-" if beta < 0 else "+"
+        beta = fmt % abs(beta)
+        if knots:
+            terms.append((term_attrs,
+                          sign + " * ".join([beta] + knots)))
+        else:
+            raise Exception
+
+    terms = sorted(terms, key=lambda t: len(t[0]))
+    return "\n".join([header] + [indent + t for _, t in terms])
+
+
+def _format_knot(name, direction, cut, precision=3):
+    fmt = "%%.%if" % precision
+    if direction == 1:
+        txt = ("max(0, %s - " + fmt + ")") % (name, cut)
+    elif direction == -1:
+        txt = ("max(0, " + fmt + " - %s)") % (cut, name)
+    elif direction == 2:
+        txt = name
+    return txt
+
+
class term_computer(Orange.core.ClassifierFD):
    """
    A utility class for computing basis terms. Can be used as
    a :obj:`~Orange.feature.Descriptor.get_value_from` member.

    """
    def __init__(self, term_var=None, domain=None, dirs=None, cuts=None,
                 _instance_cache=None):
        self.class_var = term_var
        self.domain = domain

        self.dirs = dirs
        self.cuts = cuts

        # Precompute the attributes participating in this term and their
        # directions/cuts in contiguous form (see __call__).
        self.mask = self.dirs != 0
        self.masked_dirs = numpy.ascontiguousarray(self.dirs[self.mask])
        self.masked_cuts = numpy.ascontiguousarray(self.cuts[self.mask])

        self._instance_cache = _instance_cache

    def __call__(self, instance, return_what=Orange.core.GetValue):
        instance = self._instance_as_masked_array(instance)

        values = instance[self.mask]
        if numpy.ma.is_masked(values):
            # Can't compute the term.
            return self.class_var("?")

        # Works faster with contiguous arrays
        values = numpy.ascontiguousarray(values)
        # Term value: prod(max(0, dir * (x - cut))) over participating
        # attributes.
        # NOTE(review): for a linear predictor (dir == 2) this yields
        # 2 * max(0, x - cut), whereas base_term_transform uses the raw
        # x — these look inconsistent; confirm which is intended.
        values -= self.masked_cuts
        values *= self.masked_dirs

        values[values < 0] = 0
        value = numpy.prod(values)

        return self.class_var(value)

    def _instance_as_masked_array(self, instance):
        # Translate `instance` into this term's domain, caching the last
        # translation so sibling terms sharing the cache can reuse it.
        array = None
        if self._instance_cache is not None:
            array = self._instance_cache.get(instance, None)

        if array is None:
            table = Orange.data.Table(self.domain, [instance])
            (array,) = table.to_numpy_MA("A")
            array = array[0]

            if self._instance_cache is not None:
                # Keep only the most recent instance in the shared cache.
                self._instance_cache.clear()
                self._instance_cache[instance] = array
        return array

    def __reduce__(self):
        # Pickle support; derived members (mask, masked_*) are restored
        # through __dict__.
        return (type(self),
                (self.class_var, self.domain, self.dirs, self.cuts),
                dict(self.__dict__.items()))
+
+
+from .evimp import evimp, bagged_evimp, ScoreEarthImportance

File orangecontrib/earth/evimp.py

View file
+"""
+"""
+from collections import defaultdict
+
+import numpy
+
+import Orange
+from Orange.ensemble import bagging
+from Orange.feature import scoring
+
+from .earth import EarthLearner, EarthClassifier
+from .core import subset_selection_xtx
+
+__all__ = ["evimp", "bagged_evimp", "plot_evimp", "ScoreEarthImportance"]
+
+
def evimp(model, used_only=True):
    """
    Return the estimated variable importance for the model.

    Parameters
    ----------
    model : EarthClassifier
        Earth model.
    used_only : bool
        If True (default) include only attributes that appear in at
        least one term subset.

    Returns
    -------
    list of (attribute, (n_subsets, gcv, rss)) tuples, sorted by
    decreasing importance.

    """
    if model.subsets is None:
        raise ValueError("No subsets. Use the learner with 'prune=True'.")

    subsets = model.subsets
    n_subsets = numpy.sum(model.best_set)

    # Per-step decrease of RSS/GCV when growing the subset by one term.
    rss = -numpy.diff(model.rss_per_subset)
    gcv = -numpy.diff(model.gcv_per_subset)
    attributes = list(model.domain.variables)

    attr2ind = dict(zip(attributes, range(len(attributes))))
    # Columns: [attribute index, n_subsets, gcv importance, rss importance]
    importances = numpy.zeros((len(attributes), 4))
    importances[:, 0] = range(len(attributes))

    for i in range(1, n_subsets):
        term_subset = subsets[i, :i + 1]
        # NOTE: relies on the Python 2 builtin `reduce`.
        used_attributes = reduce(set.union, [model.used_attributes(term) \
                                             for term in term_subset], set())
        for attr in used_attributes:
            importances[attr2ind[attr]][1] += 1.0
            importances[attr2ind[attr]][2] += gcv[i - 1]
            importances[attr2ind[attr]][3] += rss[i - 1]
    imp_min = numpy.min(importances[:, [2, 3]], axis=0)
    imp_max = numpy.max(importances[:, [2, 3]], axis=0)

    # Normalize importances.
    importances[:, [2, 3]] = 100.0 * (importances[:, [2, 3]] \
                            - [imp_min]) / ([imp_max - imp_min])

    importances = list(importances)
    # Sort by n_subsets and gcv.
    importances = sorted(importances, key=lambda row: (row[1], row[2]),
                         reverse=True)
    importances = numpy.array(importances)

    if used_only:
        importances = importances[importances[:, 1] > 0.0]

    res = [(attributes[int(row[0])], tuple(row[1:])) for row in importances]
    return res
+
+
+def bagged_evimp(classifier, used_only=True):
+    """
+    Extract combined (average) :func:`evimp` from an instance of
+    :class:`~Orange.ensemble.bagging.BaggedClassifier` using
+    :class:`EarthLearner` as a `base_learner`.
+
+    The per-model scores are matched up by attribute *name* and
+    averaged component-wise; the result is sorted by (nsubsets, gcv),
+    descending.
+
+    Example::
+
+        from Orange.ensemble.bagging import BaggedLearner
+        bc = BaggedLearner(EarthLearner(degree=3, terms=10), data)
+        bagged_evimp(bc)
+
+    .. seealso:: :class:`ScoreEarthImportance`
+
+    """
+    def assert_type(obj, class_):
+        # Raise TypeError unless obj is an instance of class_.
+        if not isinstance(obj, class_):
+            raise TypeError("Instance of %r expected." % (class_))
+
+    assert_type(classifier, bagging.BaggedClassifier)
+    bagged_imp = defaultdict(list)
+    attrs_by_name = defaultdict(list)
+    for c in classifier.classifiers:
+        assert_type(c, EarthClassifier)
+        imp = evimp(c, used_only=used_only)
+        for attr, score in imp:
+            bagged_imp[attr.name].append(score)  # map by name
+            attrs_by_name[attr.name].append(attr)
+
+    # Average the (nsubsets, gcv, rss) triples over all models.
+    for attr, scores in bagged_imp.items():
+        scores = numpy.average(scores, axis=0)
+        bagged_imp[attr] = tuple(scores)
+
+    # Sort by average nsubsets, then average gcv.
+    bagged_imp = sorted(bagged_imp.items(),
+                        key=lambda t: (t[1][0], t[1][1]),
+                        reverse=True)
+
+    # Map names back to a representative attribute descriptor.
+    bagged_imp = [(attrs_by_name[name][0], scores)
+                  for name, scores in bagged_imp]
+
+    if used_only:
+        bagged_imp = [(a, r) for a, r in bagged_imp if r[0] > 0]
+    return bagged_imp
+
+
+def plot_evimp(evimp):
+    """
+    Plot the variable importances as returned from
+    :func:`EarthClassifier.evimp`.
+
+    ::
+
+        import Orange, orangecontrib.earth
+        data = Orange.data.Table("housing")
+        c = orangecontrib.earth.EarthLearner(data, degree=3)
+        orangecontrib.earth.plot_evimp(c.evimp())
+
+    .. image:: images/earth-evimp.png
+
+    The left axis is the nsubsets measure and on the right are the normalized
+    RSS and GCV.
+
+    """
+    # Imported lazily so matplotlib is only required when plotting.
+    import pylab
+
+    # Accept a classifier (bagged or plain) and compute evimp ourselves.
+    if isinstance(evimp, EarthClassifier):
+        evimp = evimp.evimp()
+    elif isinstance(evimp, bagging.BaggedClassifier):
+        evimp = bagged_evimp(evimp)
+
+    fig = pylab.figure()
+    axes1 = fig.add_subplot(111)
+    attrs = [a for a, _ in evimp]
+    imp = [s for _, s in evimp]
+    imp = numpy.array(imp)
+    X = range(len(attrs))
+    # nsubsets on the left axis; gcv/rss share a twin right axis.
+    l1 = axes1.plot(X, imp[:, 0], "b-", label="nsubsets")
+    axes2 = axes1.twinx()
+
+    l2 = axes2.plot(X, imp[:, 1], "g-", label="gcv")
+    l3 = axes2.plot(X, imp[:, 2], "r-", label="rss")
+
+    x_axis = axes1.xaxis
+    x_axis.set_ticks(X)
+    x_axis.set_ticklabels([a.name for a in attrs], rotation=90)
+
+    axes1.yaxis.set_label_text("nsubsets")
+    axes2.yaxis.set_label_text("normalized gcv or rss")
+
+    axes1.legend((l1[0], l2[0], l3[0]), ("nsubsets", "gcv", "rss"))
+
+    axes1.set_title("Variable importance")
+    fig.show()
+
+
+#########################################################
+# High level interface for measuring variable importance
+# (compatible with Orange.feature.scoring module).
+#########################################################
+
+def collect_source(variables):
+    """
+    Given a list of variables ``variables``, return a mapping from source
+    variables (``source_variable`` or ``get_value_from.variable`` members)
+    back to the variables in ``variables`` (assumes the default preprocessor
+    in EarthLearner).
+
+    A variable with no recognizable source maps to itself.
+
+    """
+    source = defaultdict(list)
+    for var in variables:
+        if var.source_variable:
+            source[var.source_variable].append(var)
+        elif isinstance(var.get_value_from, Orange.core.ClassifierFromVar):
+            # Derived directly from another variable.
+            source[var.get_value_from.variable].append(var)
+        elif isinstance(var.get_value_from, Orange.core.ImputeClassifier):
+            # Derived through an imputation wrapper.
+            source[var.get_value_from.classifier_from_var.variable].append(var)
+        else:
+            source[var].append(var)
+    return dict(source)
+
+
+class ScoreEarthImportance(scoring.Score):
+    """
+    A subclass of :class:`Orange.feature.scoring.Score` that scores
+    features based on their importance in the Earth model using
+    ``bagged_evimp``.
+
+    :param int t:
+        Number of earth models to train on the data.
+
+    :param int degree:
+        The maximum degree of the induced models.
+
+    :param int terms:
+        Maximum number of terms induced in the forward pass.
+
+    :param int score_what:
+        What to return as a score. Can be one of the strings:
+
+        * ``"nsubsets"``
+        * ``"rss"``
+        * ``"gcv"``
+
+        or one of the class constants:
+
+        * ``NSUBSETS``
+        * ``RSS``
+        * ``GCV``
+
+    """
+    # Return types
+
+    #: The number of subsets the feature is included during the
+    #: pruning pass.
+    NSUBSETS = 0
+
+    #: Residual squared error increase when the feature was removed
+    #: during the pruning pass (averaged over all `t` models)
+    RSS = 1
+
+    #: GCV increase when the feature was removed during the pruning pass
+    #: (averaged over all `t` models)
+    GCV = 2
+
+    handles_discrete = True
+    handles_continuous = True
+    computes_thresholds = False
+    needs = scoring.Score.Generator
+
+    def __new__(cls, attr=None, data=None, weight_id=None, **kwargs):
+        # Orange scoring protocol: calling the class with (attr, data)
+        # immediately returns the score instead of an instance.
+        self = scoring.Score.__new__(cls)
+        if attr is not None and data is not None:
+            self.__init__(**kwargs)
+            # TODO: Should raise a warning, about caching
+            return self.__call__(attr, data, weight_id)
+        elif not attr and not data:
+            return self
+        else:
+            raise ValueError("Both 'attr' and 'data' arguments expected.")
+
+    def __init__(self, t=10, degree=2, terms=10, score_what="nsubsets",
+                 cached=True):
+        self.t = t
+        self.degree = degree
+        self.terms = terms
+        # Map a string name to the corresponding class constant.
+        if isinstance(score_what, basestring):
+            score_what = {"nsubsets": self.NSUBSETS, "rss": self.RSS,
+                          "gcv": self.GCV}.get(score_what, None)
+
+        if score_what not in range(3):
+            raise ValueError("Invalid  'score_what' parameter.")
+
+        self.score_what = score_what
+        self.cached = cached
+        # Cache of the last evaluated data table (by identity) and its evimp.
+        self._cache_ref = None
+        self._cached_evimp = None
+
+    def __call__(self, attr, data, weight_id=None):
+        """
+        Return the score for `attr` as evaluated on `data`.
+
+        :param Orange.feature.Descriptor attr:
+            A feature descriptor present in ``data.domain.features``.
+
+        :param Orange.data.Table data:
+            Model training data.
+
+        .. note:: `weight_id` is ignored.
+
+        """
+        # Reuse the cached importances when scoring several features
+        # on the same (identical) data table.
+        ref = self._cache_ref
+        if ref is not None and ref is data:
+            evimp = self._cached_evimp
+        else:
+            learner = bagging.BaggedLearner(
+                EarthLearner(degree=self.degree, terms=self.terms),
+                t=self.t
+            )
+            bc = learner(data, weight_id)
+            evimp = bagged_evimp(bc, used_only=False)
+            self._cache_ref = data
+            self._cached_evimp = evimp
+
+        evimp = dict(evimp)
+        score = evimp.get(attr, None)
+
+        if score is None:
+            # `attr` may have been transformed by the learner's
+            # preprocessing; fall back to its source variable(s).
+            source = collect_source(evimp.keys())
+            if attr in source:
+                # Return average of source var scores
+                return numpy.average([evimp[v][self.score_what] \
+                                      for v in source[attr]])
+            else:
+                raise ValueError("Attribute %r not in the domain." % attr)
+        else:
+            return score[self.score_what]
+
+
+class ScoreRSS(scoring.Score):
+    """
+    Score continuous features by the residual sum of squares decrease
+    attributed to them in a term-subset selection over the data
+    (computed with :func:`subset_selection_xtx`).
+    """
+
+    handles_discrete = False
+    handles_continuous = True
+    computes_thresholds = False
+
+    def __new__(cls, attr=None, data=None, weight_id=None, **kwargs):
+        # Orange scoring protocol: calling the class with (attr, data)
+        # immediately returns the score instead of an instance.
+        self = scoring.Score.__new__(cls)
+        if attr is not None and data is not None:
+            self.__init__(**kwargs)
+            # TODO: Should raise a warning, about caching
+            return self.__call__(attr, data, weight_id)
+        elif not attr and not data:
+            return self
+        else:
+            raise ValueError("Both 'attr' and 'data' arguments expected.")
+
+    def __init__(self):
+        # Cache of the last data table (by identity) and its rss vector.
+        self._cache_data = None
+        self._cache_rss = None
+
+    def __call__(self, attr, data, weight_id=None):
+        """
+        Return the RSS-based score of `attr` evaluated on `data`.
+
+        .. note:: `weight_id` is ignored.
+        """
+        ref = self._cache_data
+        if ref is not None and ref is data:
+            rss = self._cache_rss
+        else:
+            x, y = data.to_numpy_MA("1A/c")
+            subsets, rss = subset_selection_xtx(x, y)
+            # Per-step decrease when moving to the next larger subset.
+            rss_diff = -numpy.diff(rss)
+            rss = numpy.zeros_like(rss)
+            # Credit each column with the decrease of every subset
+            # that includes it.
+            for s_size in range(1, subsets.shape[0]):
+                subset = subsets[s_size, :s_size + 1]
+                rss[subset] += rss_diff[s_size - 1]
+            rss = rss[1:]  # Drop the intercept
+            self._cache_data = data
+            self._cache_rss = rss
+
+        index = list(data.domain.attributes).index(attr)
+        return rss[index]

File orangecontrib/earth/tests/__init__.py

Empty file added.

File orangecontrib/earth/tests/test_core.py

View file
+import unittest
+
+import numpy
+
+from .. import core
+
+
+class TestEarth(unittest.TestCase):
+    """Unit tests for the low-level earth routines in ``core``."""
+
+    def assertAllClose(self, a, b, rtol=1e-05, atol=1e-08):
+        # Element-wise numeric comparison with tolerance (numpy.allclose).
+        if numpy.allclose(a, b, rtol, atol):
+            return
+        else:
+            self.fail("%s != %s to within tolerance" % (a, b))
+
+    def test_forward(self):
+        """Forward pass on a 1-D hinge-shaped target recovers the knot."""
+        x = numpy.linspace(0, 1, num=100, endpoint=True)
+        # y is 0 on [0, 0.5) and linear afterwards - a single knot at 0.5.
+        y = numpy.hstack((numpy.zeros(50), (x[50:] - 0.5)))
+
+        n, best_set, bx, dirs, cuts = \
+            core.forward_pass(x.reshape(-1, 1), y, degree=1, terms=3,
+                               penalty=0, new_var_penalty=0)
+
+        self.assertEqual(n, 3)
+        self.assertTrue((best_set == [True, True, True]).all())
+
+        self.assertTrue(numpy.allclose(cuts.ravel(),
+                                       [0, 0.5, 0.5], atol=0.05))
+
+        self.assertTrue((dirs.ravel() == [0, 1, -1]).all())
+
+        # raise y by 1.0 and repeat
+        y += 1.0
+
+        n, best_set, bx, dirs, cuts = \
+            core.forward_pass(x.reshape(-1, 1), y, degree=1, terms=3,
+                               penalty=0, new_var_penalty=0)
+
+        self.assertEqual(n, 3)
+        self.assertTrue((best_set == [True, True, True]).all())
+
+        self.assertTrue(numpy.allclose(cuts.ravel(),
+                                       [0, 0.5, 0.5], atol=0.05))
+
+        self.assertTrue((dirs.ravel() == [0, 1, -1]).all())
+
+    def test_base_transform(self):
+        """base_term_transform evaluates hinge terms, w/ and w/o intercept."""
+        X = [[1., 2., 3.],
+             [0., 5., 10.]]
+
+        # H(X[0] - 0.5) * H(7 - X[2])
+        cuts = [0.5, 3., 7.]
+        dirs = [1, 0, -1]
+
+        bX = core.base_term_transform(X, cuts, dirs)
+        self.assertAllClose(bX, [[2.], [0.]])
+
+        # Prepend an all-zero-direction row: the intercept term.
+        cuts = [[0, 0, 0], cuts]
+        dirs = [[0, 0, 0], dirs]
+
+        bX = core.base_term_transform(X, cuts, dirs)
+        self.assertAllClose(bX, [[0., 2.], [0., 0.]])
+
+        bX = core.base_term_transform(X, cuts, dirs, intercept=True)
+        self.assertAllClose(bX, [[1., 2.], [1., 0.]])

File orangecontrib/earth/tests/test_earth.py

View file
+import unittest
+
+import numpy
+
+import Orange
+from Orange.testing import testing
+from Orange.testing.testing import datasets_driven, test_on_data
+
+from .. import earth
+
+@datasets_driven(datasets=testing.REGRESSION_DATASETS + \
+                 testing.CLASSIFICATION_DATASETS)
+class TestEarthLearner(testing.LearnerTestCase):
+    """Run the generic Orange learner test suite over EarthLearner."""
+
+    def setUp(self):
+        self.learner = earth.EarthLearner(degree=2, terms=10)
+
+    @test_on_data
+    def test_learner_on(self, dataset):
+        """Generic learner checks plus basis-feature domain translation."""
+        if len(dataset) < 30:
+            raise unittest.SkipTest("Not enough examples.")
+        testing.LearnerTestCase.test_learner_on(self, dataset)
+        str = self.classifier.to_string()
+        evimp = self.classifier.evimp()
+
+        # Test base_features (make sure the domain translation works)
+        basis_features = self.classifier.base_features()
+        basis_domain = Orange.data.Domain(basis_features, None)
+        basis_table = Orange.data.Table(basis_domain, dataset)
+        basis_matrix = self.classifier.base_matrix(dataset)
+        # Filter best set
+        basis_matrix = basis_matrix[:, self.classifier.best_set]
+        # Remove intercept
+        basis_matrix = basis_matrix[:, 1:]
+        basis_matrix_a = basis_table.to_numpy_MA("A")[0]
+        # Fill unknowns
+        basis_matrix[basis_matrix_a.mask] = 0
+        basis_matrix_a = basis_matrix_a.filled(0)
+        # The two routes must produce (nearly) identical matrices.
+        diff = basis_matrix - basis_matrix_a
+        self.assertAlmostEqual(numpy.max(diff), 0, places=3)
+
+    @test_on_data
+    def test_bagged_evimp(self, dataset):
+        """bagged_evimp runs on a bagged ensemble of Earth models."""
+        from Orange.ensemble.bagging import BaggedLearner
+        bagged_learner = BaggedLearner(earth.EarthLearner(terms=10, degree=2),
+                                       t=5)
+
+        bagged_classifier = bagged_learner(dataset)
+        evimp = earth.bagged_evimp(bagged_classifier, used_only=False)
+
+
+@datasets_driven(datasets=testing.REGRESSION_DATASETS + \
+                 testing.CLASSIFICATION_DATASETS)
+class TestScoreEarthImportance(testing.MeasureAttributeTestCase):
+    """Generic feature-scoring test suite over ScoreEarthImportance."""
+    def setUp(self):
+        self.measure = earth.ScoreEarthImportance(t=5, score_what="rss")
+
+
+@datasets_driven(datasets=["multitarget-synthetic"])
+class TestEarthMultitarget(unittest.TestCase):
+    """Test EarthLearner on multi-target (multiple class vars) data."""
+
+    @test_on_data
+    def test_multi_target_on_data(self, dataset):
+        self.learner = earth.EarthLearner(degree=2, terms=10)
+
+        self.predictor = self.multi_target_test(self.learner, dataset)
+
+        self.assertTrue(bool(self.predictor.multitarget))
+
+        s = str(self.predictor)
+        self.assertEqual(s, self.predictor.to_string())
+        self.assertNotEqual(s, self.predictor.to_string(3, 6))
+
+    def multi_target_test(self, learner, data):
+        """
+        Train on a 70/30 split and check the multi-target prediction
+        interface (values, distributions, and both) on the test part.
+        Returns the trained predictor.
+        """
+        indices = Orange.data.sample.SubsetIndices2(p0=0.3)(data)
+        learn = data.select(indices, 1)
+        test = data.select(indices, 0)
+
+        predictor = learner(learn)
+        self.assertIsInstance(predictor, Orange.classification.Classifier)
+        self.multi_target_predictor_interface(predictor, learn.domain)
+
+        from Orange.evaluation import testing as _testing
+
+        r = _testing.test_on_data([predictor], test)
+
+        def all_values(vals):
+            # Every prediction must be an Orange Value.
+            for v in vals:
+                self.assertIsInstance(v, Orange.core.Value)
+
+        def all_dists(dist):
+            # Every prediction must be an Orange Distribution.
+            for d in dist:
+                self.assertIsInstance(d, Orange.core.Distribution)
+
+        for ex in test:
+            preds = predictor(ex, Orange.core.GetValue)
+            all_values(preds)
+
+            dist = predictor(ex, Orange.core.GetProbabilities)
+            all_dists(dist)
+
+            preds, dist = predictor(ex, Orange.core.GetBoth)
+            all_values(preds)
+            all_dists(dist)
+
+            for d in dist:
+                if isinstance(d, Orange.core.ContDistribution):
+                    dist_sum = sum(d.values())
+                else:
+                    dist_sum = sum(d)
+
+                # Each distribution must (approximately) sum to 1.
+                self.assertGreater(dist_sum, 0.0)
+                self.assertLess(abs(dist_sum - 1.0), 1e-3)
+
+        return predictor
+
+    def multi_target_predictor_interface(self, predictor, domain):
+        """The predictor must expose class_vars matching the domain's."""
+        self.assertTrue(hasattr(predictor, "class_vars"))
+        self.assertIsInstance(predictor.class_vars, (list, Orange.core.VarList))
+        self.assertTrue(all(c1 == c2 for c1, c2 in \
+                            zip(predictor.class_vars, domain.class_vars)))
+
+
+def load_tests(loader, tests, ignore):
+    """unittest hook: also run the doctests in the `earth` module."""
+    import doctest
+    tests.addTests(doctest.DocTestSuite(earth))
+    return tests
+
+
+if __name__ == "__main__":
+    unittest.main()

File orangecontrib/earth/widgets/OWEarth.py

View file
+"""
+<name>Earth</name>
+<description>Multivariate Adaptive Regression Splines (MARS)</description>
+<category>Regression</category>
+<icon>icons/EarthMars.svg</icon>
+<priority>100</priority>
+<tags>MARS, Multivariate, Adaptive, Regression, Splines</tags>
+"""
+
+import Orange
+
+from OWWidget import *
+import OWGUI
+
+from orngWrap import PreprocessedLearner
+
+from orangekit import earth
+
+NAME = "Earth"
+DESCRIPTION = "Multivariate Adaptive Regression Splines (MARS)"
+ICON = "icons/EarthMars.svg"
+CATEGORY = "Regression"
+PRIORITY = 100
+
+KEYWORDS = ["MARS", "Multivariate", "Adaptive", "Regression", "Splines"]
+
+INPUTS = (
+    {"name": "Data",
+     "type": Orange.data.Table,
+     "handler": "set_data",
+     "doc": "Set input training data set."},
+
+    {"name": "Preprocessor",
+     "type": PreprocessedLearner,
+     "handler": "set_preprocessor",
+     "doc": "Data Preprocessor"}
+)
+
+OUTPUTS = (
+    ("Learner", earth.EarthLearner),
+    ("Predictor", earth.EarthClassifier),
+    ("Basis Matrix", Orange.data.Table)
+)
+
+REPLACES = ["Orange.OrangeWidgets.Regression.OWEarth.OWEarth"]
+
+
+class OWEarth(OWWidget):
+    """
+    Orange widget wrapping :class:`earth.EarthLearner`: exposes degree,
+    terms and knot penalty settings and outputs the learner, the trained
+    predictor and the basis matrix of the input data.
+    """
+    settingsList = ["name", "degree", "terms", "penalty"]
+
+    def __init__(self, parent=None, signalManager=None,
+                 title="Earth"):
+        OWWidget.__init__(self, parent, signalManager, title,
+                          wantMainArea=False)
+
+        # Defaults; overridden by loadSettings() below if saved.
+        self.name = "Earth Learner"
+        self.degree = 1
+        self.terms = 21
+        self.penalty = 2
+
+        self.loadSettings()
+
+        #####
+        # GUI
+        #####
+
+        OWGUI.lineEdit(self.controlArea, self, "name",
+                       box="Learner/Classifier Name",
+                       tooltip="Name for the learner/predictor")
+
+        box = OWGUI.widgetBox(self.controlArea, "Forward Pass", addSpace=True)
+        OWGUI.spin(box, self, "degree", 1, 3, step=1,
+                   label="Max. term degree",
+                   tooltip="Maximum degree of the terms derived "
+                           "(number of hinge functions).")
+        s = OWGUI.spin(box, self, "terms", 1, 200, step=1,
+                       label="Max. terms",
+                       tooltip="Maximum number of terms derived in the "
+                               "forward pass.")
+        # terms == 1 is displayed as "Automatic" (see apply()).
+        s.control.setSpecialValueText("Automatic")
+
+        box = OWGUI.widgetBox(self.controlArea, "Pruning Pass", addSpace=True)
+        OWGUI.doubleSpin(box, self, "penalty", min=0.0, max=10.0, step=0.25,
+                   label="Knot penalty")
+
+        OWGUI.button(self.controlArea, self, "&Apply",
+                     callback=self.apply)
+
+        self.data = None
+        self.preprocessor = None
+        self.resize(300, 200)
+
+        self.apply()
+
+    def set_data(self, data=None):
+        """
+        Set the input data set.
+        """
+        self.data = data
+
+    def set_preprocessor(self, pproc=None):
+        # Optional learner-wrapping preprocessor (see apply()).
+        self.preprocessor = pproc
+
+    def handleNewSignals(self):
+        self.apply()
+
+    def apply(self):
+        """
+        Build the learner from the current settings, train on the input
+        data (if any) and send the outputs.
+        """
+        learner = earth.EarthLearner(
+            degree=self.degree,
+            # terms < 2 means "Automatic" (let the learner decide).
+            terms=self.terms if self.terms >= 2 else None,
+            penalty=self.penalty,
+            name=self.name
+        )
+
+        predictor = None
+        basis_matrix = None
+        if self.preprocessor:
+            learner = self.preprocessor.wrapLearner(learner)
+
+        self.error(0)
+        if self.data is not None:
+            try:
+                predictor = learner(self.data)
+                predictor.name = self.name
+            except Exception, ex:
+                self.error(0, "An error during learning: %r" % ex)
+
+            if predictor is not None:
+                # Translate the input data into the basis-term domain.
+                base_features = predictor.base_features()
+                basis_domain = Orange.data.Domain(
+                    base_features,
+                    self.data.domain.class_var,
+                    self.data.domain.class_vars)
+                basis_domain.add_metas(self.data.domain.get_metas())
+                basis_matrix = Orange.data.Table(basis_domain, self.data)
+
+        self.send("Learner", learner)
+        self.send("Predictor", predictor)
+        self.send("Basis Matrix", basis_matrix)
+
+    def sendReport(self):
+        """Report the current learning parameters and input data."""
+        self.reportSettings(
+            "Learning parameters",
+            [("Degree", self.degree),
+             ("Terms", self.terms if self.terms >= 2 else "Automatic"),
+             ("Knot penalty", "%.2f" % self.penalty)
+             ])
+
+        self.reportData(self.data)
+
+# Standalone demo entry point.
+if __name__ == "__main__":
+    app = QApplication(sys.argv)
+    w = OWEarth()
+    w.set_data(Orange.data.Table("auto-mpg"))
+    w.show()
+    app.exec_()
+    w.saveSettings()

File orangecontrib/earth/widgets/__init__.py

View file
+
+# Intersphinx mapping consumed by the widget documentation build.
+intersphinx = (
+    ("{DEVELOP_ROOT}/docs/build/html/", None),
+)

File orangecontrib/earth/widgets/icons/EarthMars.svg

View file
Added
New image

File orangekit/__init__.py

-"""
-Namespace stub
-"""
-import pkg_resources
-pkg_resources.declare_namespace(__name__)

File orangekit/earth/__init__.py

-
-from .earth import EarthLearner, EarthClassifier, ScoreEarthImportance
-
-from .evimp import evimp, plot_evimp, bagged_evimp
-
-from .core import gcv

File orangekit/earth/core.py

-"""
-EARTH interface
-"""
-
-from __future__ import division
-
-import ctypes
-import numpy
-
-from numpy import ctypeslib
-
-from . import _earth
-
-_c_earth_lib = ctypeslib.load_library(_earth.__file__, "")
-
-_c_earth = _c_earth_lib.Earth
-
-_c_earth.argtypes = [
-    # pBestGcv: GCV of the best model.
-    ctypes.POINTER(ctypes.c_double),
-    # pnTerms: number of terms in the final model
-    ctypes.POINTER(ctypes.c_int),
-    # BestSet: nMaxTerms x 1, indices of the best set of cols of bx
-    ctypeslib.ndpointer(dtype=ctypes.c_bool),
-    # bx: nCases x nMaxTerms basis matrix
-    ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"),
-    # Dirs: nMaxTerms x nPreds, -1,0,1,2 for iTerm, iPred
-    ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=2, flags="F_CONTIGUOUS"),
-    # Cuts: nMaxTerms x nPreds, cut for iTerm, iPred
-    ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"),
-    # Residuals: nCases x nResp
-    ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"),
-    # Betas: nMaxTerms x nResp
-    ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"),
-    # x: nCases x nPreds
-    ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"),
-    # y: nCases x nResp
-    ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"),
-    # WeightsArg: nCases x 1, can be NULL, currently ignored
-    ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=1),
-    # nCases: number of rows in x and elements in y
-    ctypes.c_int,
-    # nResp: number of cols in y
-    ctypes.c_int,
-    # nPred: number of cols in x
-    ctypes.c_int,
-    # nMaxDegree: Friedman's mi
-    ctypes.c_int,
-    # nMaxTerms: includes the intercept term
-    ctypes.c_int,
-    # Penalty: GCV penalty per knot
-    ctypes.c_double,
-    # Thresh: forward step threshold
-    ctypes.c_double,
-    # nMinSpan: set to non zero to override internal calculation
-    ctypes.c_int,
-    # Prune: do backward pass
-    ctypes.c_bool,
-    # nFastK: Fast MARS K
-    ctypes.c_int,
-    # FastBeta: Fast MARS aging coef
-    ctypes.c_double,
-    # NewVarPenalty: penalty for adding a new variable
-    ctypes.c_double,
-    # LinPreds: nPreds x 1, 1 if predictor must enter linearly
-    ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=1),
-    # UseBetaCache: 1 to use the beta cache, for speed
-    ctypes.c_bool,
-    # Trace: 0 none 1 overview 2 forward 3 pruning 4 more pruning
-    ctypes.c_double,
-    # sPredNames: predictor names in trace printfs, can be NULL
-    ctypes.c_char_p
-]
-
-
-def forward_pass(x, y, weights=None, degree=1, terms=None, penalty=None,
-                 thresh=0.001, min_span=0, fast_k=21, fast_beta=1,
-                 new_var_penalty=2, trace=0):
-    """
-    Earth forward pass.
-
-    Parameters
-    ----------
-    x : numpy.array
-        An N x M array of predictors
-    y : numpy.array
-        An N x R array of responses
-    weights : numpy.array
-        An N x 1 array of instance weights (unused)
-    degree : int
-        Maximum term degree (default 1)
-    terms : int
-        Maximum number of terms to induce.
-    penalty : float
-        GCV penalty per knot.
-    thresh : float
-        RSS stopping threshold.
-    min_span: int
-    new_var_penalty : int
-        Penalty for addin a new variable in the model.
-    fast_k : int
-    fast_beta : int
-
-    """
-    x = numpy.asfortranarray(x, dtype=ctypes.c_double)
-    y = numpy.asfortranarray(y, dtype=ctypes.c_double)
-
-    if x.shape[0] != y.shape[0]:
-        raise ValueError("First dimensions of x and y must be the same.")
-
-    if y.ndim == 1:
-        y = y.reshape((-1, 1), order="F")
-
-    if penalty is None:
-        penalty = 3. if degree > 1. else 2.
-    elif penalty < 0 and penalty != -1.:
-        raise ValueError("Invalid 'penalty'")
-
-    n_cases, n_preds = x.shape
-
-    _, n_resp = y.shape
-
-    # Output variables
-    p_best_gcv = ctypes.c_double()
-    n_term = ctypes.c_int()
-    best_set = numpy.zeros((terms,), dtype=ctypes.c_bool, order="F")
-    bx = numpy.zeros((n_cases, terms), dtype=ctypes.c_double, order="F")
-    dirs = numpy.zeros((terms, n_preds), dtype=ctypes.c_int, order="F")
-    cuts = numpy.zeros((terms, n_preds), dtype=ctypes.c_double, order="F")
-    residuals = numpy.zeros((n_cases, n_resp), dtype=ctypes.c_double, order="F")
-    betas = numpy.zeros((terms, n_resp), dtype=ctypes.c_double, order="F")
-
-    weights = numpy.ones((n_cases,), dtype=ctypes.c_double, order="F")
-    lin_preds = numpy.zeros((n_preds,), dtype=ctypes.c_int, order="F")
-    use_beta_cache = True
-    prune = False
-
-    # These tests are performed in ForwardPass, and if they fail the function
-    # calls exit. So we must check it here and raise a exception to avoid a
-    # process shutdown.
-    if n_cases < 8:
-        raise ValueError("Need at least 8 data instances.")
-    if n_cases > 1e8:
-        raise ValueError("To many data instances.")
-    if n_resp < 1:
-        raise ValueError("No response column.")
-    if n_resp > 1e6:
-        raise ValueError("To many response columns.")
-    if n_preds < 1:
-        raise ValueError("No predictor columns.")
-    if n_preds > 1e5:
-        raise ValueError("To many predictor columns.")
-    if degree <= 0 or degree > 100:
-        raise ValueError("Invalid 'degree'.")
-    if terms < 3 or terms > 10000:
-        raise ValueError("'terms' must be in >= 3 and <= 10000.")
-    if penalty < 0 and penalty != -1:
-        raise ValueError("Invalid 'penalty' (the only legal negative value "
-                         "is -1).")
-    if penalty > 1000:
-        raise ValueError("Invalid 'penalty' (must be <= 1000).")
-    if thresh < 0.0 or thresh >= 1.0:
-        raise ValueError("Invalid 'thresh' (must be in [0.0, 1.0) ).")
-    if fast_beta < 0 or fast_beta > 1000:
-        raise ValueError("Invalid 'fast_beta' (must be in [0, 1000] ).")
-    if new_var_penalty < 0 or new_var_penalty > 10:
-        raise ValueError("Invalid 'new_var_penalty' (must be in [0, 10] ).")
-    if (numpy.var(y, axis=0) <= 1e-8).any():
-        raise ValueError("Variance of y is zero (or near zero).")
-
-    _c_earth(ctypes.byref(p_best_gcv), ctypes.byref(n_term),
-             best_set, bx, dirs, cuts, residuals, betas,
-             x, y, weights,
-             n_cases, n_resp, n_preds,
-             degree, terms, penalty, thresh, min_span, prune,
-             fast_k, fast_beta, new_var_penalty, lin_preds,
-             use_beta_cache, trace, None)
-
-    return n_term.value, best_set, bx, dirs, cuts
-
-
-def gcv(rss, n, n_effective_params):
-    """
-    Return the generalized cross validation (GCV).
-
-    .. math:: gcv = rss / (n * (1 - NumEffectiveParams / n) ^ 2)
-
-    Parameters
-    ----------
-    rss : array_like
-        Residual sum of squares.
-    n : float
-        Number of training instances.
-    n_effective_params : array_like
-        Number of effective parameters.
-
-    """
-    return  rss / (n * (1. - n_effective_params / n) ** 2)
-
-
-def pruning_pass(bx, y, penalty, max_pruned_terms=None):
-    """
-    Earth pruning pass.
-
-    Parameters
-    ----------
-    bx : array_like
-        An N x P array basis matrix as induced by :func:`forward_pass`.
-    y : array_like
-        An N x R array of responses.
-    penalty : float
-        GCV penalty.
-    max_pruned_terms : int
-        Maximum number of terms to leave in the model. If `None` then
-        all model sizes are considered.
-
-    """
-
-    subsets, rss_vec = subset_selection_xtx(bx, y)
-
-    cases, terms = bx.shape
-    assert rss_vec.shape == (terms,)
-
-    # Effective parameters for all subsets sizes (of terms)
-    n_effective_params = numpy.arange(terms) + 1.0
-    n_effective_params += penalty * (n_effective_params - 1.0) / 2.0
-
-    gcv_vec = gcv(rss_vec, cases, n_effective_params)
-
-    if max_pruned_terms is None:
-        max_pruned_terms = terms
-
-    min_i = numpy.argmin(gcv_vec[:max_pruned_terms])
-
-    used = numpy.zeros((terms), dtype=bool)
-
-    used[subsets[min_i, :min_i + 1]] = True
-
-    return used, subsets, rss_vec, gcv_vec
-
-
def base_term_transform(X, cuts, directions, intercept=False,
                        out=None):
    """
    Transform a data array onto a set of earth basis terms.

    The terms are defined by an earth model parameterized by
    cuts and directions.

    Parameters
    ----------
    X : array_like
        An N x P input array.
    cuts : array_like
        A T x P array of hinge term knots.
    directions : array_like
        A T x P array of hinge term directions (-1, 0, 1, or 2; 2 means
        the predictor enters the term linearly).
    intercept : bool
        If True then the first row of directions and cuts is taken to be
        the intercept term (directions[0] must have all 0 elements).
    out : numpy.ndarray, optional
        An N x T array filled in place; a new array is allocated when
        not supplied.

    Returns
    -------
    out : numpy.ndarray
        An N x T array of basis term values.

    """
    X = numpy.asanyarray(X)
    directions = numpy.asarray(directions)
    cuts = numpy.asarray(cuts)

    if directions.ndim == 1:
        directions = directions.reshape((1, -1))

    if cuts.ndim == 1:
        cuts = cuts.reshape((1, -1))

    if cuts.shape != directions.shape:
        raise ValueError("'cuts' and 'directions' must have the same shape")

    N, P = X.shape
    M, P_1 = directions.shape
    if P != P_1:
        raise ValueError("Dimension mismatch.")

    if out is None:
        out = numpy.zeros((N, M))
    elif out.shape != (N, M):
        raise ValueError("'out' is of wrong size")

    start_term = 0
    if intercept:
        # Fix: the original raised ValueError("") with an empty message.
        if numpy.any(directions[0]):
            raise ValueError("'directions[0]' must be all zeros for the "
                             "intercept term")
        out[:, 0] = 1.0

        start_term = 1

    for termi in range(start_term, M):
        term_dirs = directions[termi]
        term_cuts = cuts[termi]

        # Predictors entering as upward hinges, downward hinges, or linearly.
        dir_p1 = numpy.where(term_dirs == 1)[0]
        dir_m1 = numpy.where(term_dirs == -1)[0]
        dir_2 = numpy.where(term_dirs == 2)[0]

        x1 = X[:, dir_p1] - term_cuts[dir_p1]
        x2 = term_cuts[dir_m1] - X[:, dir_m1]
        x3 = X[:, dir_2]

        # Hinge functions: max(0, x - cut) and max(0, cut - x).
        x1 = numpy.where(x1 > 0.0, x1, 0.0)
        x2 = numpy.where(x2 > 0.0, x2, 0.0)

        # The term value is the product of all its factors (the last
        # column of the cumulative product).
        Term = numpy.cumprod(numpy.hstack((x1, x2, x3)), axis=1)
        # Term can have a (N, 0) shape if all(term_dirs == 0)
        out[:, termi] = Term[:, -1] if Term.size else 0.0

    return out
-
-
def subset_selection_xtx(X, Y, lstsq=None):
    """
    A re-implementation of EvalSubsetsUsingXtx in the Earth c code.

    Using numpy least squares fitter.

    Starting from the full set of columns of `X`, repeatedly drop the
    term whose removal increases the residual sum of squares the least;
    the intercept (column 0) is never dropped.

    Parameters
    ----------
    X : array_like
        An N x P basis matrix.
    Y : array_like
        An N x R response array.
    lstsq : callable, optional
        A least squares solver with a `numpy.linalg.lstsq` compatible
        interface (default `numpy.linalg.lstsq`).

    Returns
    -------
    subsets : numpy.ndarray
        A P x P int array; row ``i`` holds in its first ``i + 1``
        entries the column indices of the best subset of size ``i + 1``.
    rss_vec : numpy.ndarray
        Residual sum of squares for the best subset of each size.

    """
    X = numpy.asarray(X)
    Y = numpy.asarray(Y)

    if lstsq is None:
        lstsq = numpy.linalg.lstsq

    n_pred = X.shape[1]
    rss_vec = numpy.zeros(n_pred)
    # Fix: was `range(n_pred)`; a list is required because terms are
    # removed with `del` below (`range` objects are immutable on Python 3).
    working_set = list(range(n_pred))
    subsets = numpy.zeros((n_pred, n_pred), dtype=int)

    for subset_size in reversed(range(n_pred)):
        subsets[subset_size, :subset_size + 1] = working_set
        X_work = X[:, working_set]
        b = lstsq(X_work, Y)[0]

        rss_vec[subset_size] = numpy.sum((Y - numpy.dot(X_work, b)) ** 2)

        if subset_size == 0:
            # Hoisted above the XtX/pinv computation (the original
            # computed them and then discarded the result on break).
            break

        # Increase in RSS caused by dropping each term, from the
        # diagonal of (X'X)^-1.
        XtX = numpy.dot(X_work.T, X_work)
        iXtX = numpy.linalg.pinv(XtX)
        diag = numpy.diag(iXtX).reshape((-1, 1))

        delta_rss = b ** 2 / diag
        delta_rss = numpy.sum(delta_rss, axis=1)
        delete_i = numpy.argmin(delta_rss[1:]) + 1  # Keep the intercept
        del working_set[delete_i]

    return subsets, rss_vec

File orangekit/earth/earth.py

-"""
-Earth Orange interface
-
-Example::
-
-    >>> import Orange, orangekit.earth
-    >>> data = Orange.data.Table("housing")
-    >>> c = orangekit.earth.EarthLearner(data, degree=2, terms=10)
-    >>> print c.to_string(precision=2)
-    MEDV =
-       23.59
-       -0.61 * max(0, LSTAT - 6.12)
-       +11.90 * max(0, RM - 6.43)
-       +1.14 * max(0, 6.43 - RM)
-       -228.80 * max(0, NOX - 0.65) * max(0, RM - 6.43)
-       +0.02 * max(0, TAX - 307.00) * max(0, 6.12 - LSTAT)
-       +0.03 * max(0, 307.00 - TAX) * max(0, 6.12 - LSTAT)
-
-"""
-
-from __future__ import division
-
-__all__ = ["EarthLearner", "EarthClassifier", "evimp", "bagged_evimp",
-           "ScoreEarthImportance"]
-
-import numpy
-
-import Orange
-
-from .core import forward_pass, pruning_pass, base_term_transform
-
-
def is_discrete(var):
    """Return True if `var` is an Orange discrete feature."""
    return isinstance(var, Orange.feature.Discrete)
-
-
def is_continuous(var):
    """Return True if `var` is an Orange continuous feature."""
    return isinstance(var, Orange.feature.Continuous)
-