Commits

Olivier Grisel committed b491b5b

initial checkin with content from the launchpad bzr branch

Files changed (6)

+syntax: glob
+*.pyc
+*.swp
+*.egg-info
+eggs/*
+dist/*
+build/*
+lib/*
+include/*
+
+=====
+PySGD
+=====
+
+Experimental implementation of Stochastic Gradient Descent for linear SVM
+regression / learning.
+
+See the paper by L. Bottou and O. Bousquet (2008) for the reasons behind using
+such a seemingly bad optimizer for large scale machine learning problems and
+A.I.
+
+Roadmap (only approximately ordered):
+
+  [x] implement heuristic search of t0 from data
+
+  [ ] implement stopping criterion based on:
+
+     [x] test set cost evaluation by splitting training set
+
+     [ ] estimate of k (in [1, kappa^2]) from data to guess the optimal
+         number of epochs
+
+  [ ] implement pre-conditioning (see the sketch at the end of this README)
+
+     [ ] adjust average input vector norm to 1.0
+
+     [ ] fit component-wise variances to 1.0 in the input space
+
+  [ ] implement heuristic search of hyper parameter lambda
+
+  [ ] optimize sigmoid loss function (-> logistic regression)
+
+  [ ] implement parsing tool for libSVM text format and binary counterpart
+
+  [ ] implement a 2-layer perceptron with sigmoid units based on the
+      LeCun, Bottou 98 practical recommendations
+
+  [ ] implement a live monitoring utility that pipes, without blocking, the
+      current state of the model being trained into a unix socket so that it
+      can be evaluated/plotted live by a second process
+
+  [ ] plot early convergence behavior against libsvm and liblinear
+
+  [ ] implement Quasi-Newton Diagonal SGD and compare to First order SGD
+
+  [ ] implement pegasos scaling of ||w|| to 1 / sqrt(lambda) and compare
+      to plain SGD
+
+  [ ] play with a parallel (//) implementation using the multiprocessing
+      module
+
+  [ ] play with http://cython.org to speed up the algorithmic code
+
+  [ ] play with SIMD extensions
+
+  [ ] have a look at the MDP API and implementations of backprop
+
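+A minimal sketch of what the two pre-conditioning steps above could look like
+(illustration only, nothing of this is implemented yet and the helper names
+are made up)::
+
+  import numpy as np
+
+  def scale_variances(x):
+      # fit the component-wise variances to 1.0 in the input space
+      return x / x.std(axis=0)
+
+  def scale_norms(x):
+      # adjust the average input vector norm to 1.0
+      return x / np.sqrt((x ** 2).sum(axis=1)).mean()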
+

pysgd/logging.ini

+[loggers]
+keys=root
+
+[logger_root]
+level=INFO
+handlers=console
+
+[handlers]
+keys=console
+
+[handler_console]
+class=StreamHandler
+level=NOTSET
+formatter=default
+args=(sys.stderr,)
+
+[formatters]
+keys=default
+
+[formatter_default]
+format=%(asctime)s %(levelname)s %(message)s
+datefmt=
+class=logging.Formatter
+

pysgd/parallelsgd.py

+"""Multi process implementation of PySGD
+"""
+
+import multiprocessing as mp
+from numpy import random
+import pysgd
+
+
+class ChunkedDataset(list):
+    """Utility to split datasets into a list of smaller datasets
+
+    This is useful for the map API of the multiprocessing.Pool class in order
+    to dispatch the workload to several worker processes.
+
+    For instance let us build a dataset with 10000 input vectors and 10000
+    output values::
+
+      >>> dataset_size = 10000
+      >>> nb_features = 12
+      >>> x = random.ranf((dataset_size, nb_features)) * 2 - 1
+      >>> y = random.ranf(dataset_size) * 2 - 1
+
+    We can chunk it into 8 sub datasets::
+
+      >>> cd = ChunkedDataset(x, y, chunks=8)
+      >>> len(cd)
+      8
+
+    Each chunk carries approximately 1/8th of the input and output data::
+
+      >>> x0, y0 = cd[0]
+      >>> x0.shape
+      (1250, 12)
+
+      >>> y0.shape
+      (1250,)
+
+    """
+
+    def __new__(cls, x, y, chunks=None):
+        chunks = chunks or mp.cpu_count() * 4
+        chunk_size, remaining = divmod(len(x), chunks)
+        if remaining != 0:
+            chunk_size += 1
+
+        # build one (x, y) pair per chunk; the last chunks may be smaller
+        return [(x[i * chunk_size:(i + 1) * chunk_size],
+                 y[i * chunk_size:(i + 1) * chunk_size])
+                for i in xrange(chunks)]
+
+
+class ParallelSvmModel(pysgd.SvmModel):
+    """SVM model that uses a pool of processes for inference and training"""
+
+    def __init__(self, pool_size=None, **kw):
+        # initialize the underlying single process model (kernel, loss, ...)
+        pysgd.SvmModel.__init__(self, **kw)
+        self._pool = mp.Pool(pool_size)
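+
+    def predict(self, x):
+        """Parallel variant of SvmModel.predict (rough sketch only)
+
+        This is one possible way to combine the process pool with
+        ChunkedDataset; it assumes the model has already been trained so that
+        ``self.w`` and ``self.bias`` are set, and it returns a plain list of
+        predicted values.
+        """
+        chunked = ChunkedDataset(x, x, chunks=mp.cpu_count())
+        args = [(self.w, self.bias, x_chunk) for x_chunk, _ in chunked]
+        # flatten the per chunk results back into a single list
+        return [p for chunk in self._pool.map(_predict_chunk, args)
+                for p in chunk]
+
+
+def _predict_chunk(args):
+    """Hypothetical helper for the predict sketch above, defined at module
+    level so that it can be pickled by multiprocessing"""
+    w, bias, x_chunk = args
+    return [float(pysgd.dot(w, x_i) + bias) for x_i in x_chunk]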
+
+
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()
+r"""PySGD - A python implementation of Support Vector Machines with SGD
+
+This implementation is based on Stochastic Gradient Descent and is inspired by
+Leon Bottou's svmsgd project.
+
+The goal is to minimize the following objective function over the w vector::
+
+    f(w) = lambda / 2 * (||w||^2 + b^2) + 1/N * sum (x, y in S) l(w, b, (x, y))
+
+where S, of cardinality N, is the set of training points x with labels y, and
+l is a loss function such as the l1 loss or any regularized variant::
+
+    l(w, b, (x, y)) = abs(dot(w, x) + b - y)
+
+We solve the minimization problem directly in the primal using standard
+stochastic gradient descent.
+
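+In pseudo numpy code, the update performed for every training sample
+``(x_i, y_i)`` looks roughly like the following sketch (simplified: the actual
+``train`` method applies the weight decay only every ``skip`` iterations,
+scaled accordingly)::
+
+    d = dot(w, x_i) + b - y_i         # signed residual of the current model
+    eta = 1.0 / (t * lambda_)         # decreasing learning rate
+    dloss_w, dloss_b = dloss(d, x_i)  # gradient of the loss term
+    w -= eta * dloss_w
+    b -= eta * dloss_b
+    w *= 1 - eta * lambda_            # weight decay from the ||w||^2 term
+    b *= 1 - eta * lambda_            # idem for the b^2 term
+    t += 1
+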
+TODO: what about non linear kernels?
+
+Example: choose a random w_opt and b_opt and random x and generate y
+then try to train a model and compare w and b to w_opt and b_opt::
+
+  >>> from numpy import random, array, dot
+  >>> random.seed(0)
+
+First set up the problem dimensions::
+
+  >>> nb_features = 500
+  >>> nb_training_points = 10000
+
+Define a random optimal model that will be used to build the training and test
+sets::
+
+  >>> w_opt = random.ranf(nb_features) * 2 - 1
+  >>> b_opt = random.ranf() * 2 - 1
+  >>> def f_opt(x_i): return dot(w_opt, x_i) + b_opt
+
+  >>> x_train = random.ranf((nb_training_points, nb_features)) * 2 - 1
+  >>> y_train = array([f_opt(x_i) for x_i in x_train])
+
+  >>> x_test = random.ranf((nb_training_points/5, nb_features)) * 2 - 1
+  >>> y_test = array([f_opt(x_i) for x_i in x_test])
+
+Let us add some white noise to the datasets::
+
+  >>> y_train = y_train + random.normal(scale=0.01, size=y_train.size)
+  >>> y_test = y_test + random.normal(scale=0.01, size=y_test.size)
+
+Let us build a linear Support Vector Machine model that uses an absolute (l1)
+loss function::
+
+  >>> m = SvmModel(loss_type='l1')
+
+The time step is initialized to a large enough value, depending on the problem
+dimensions, so as to avoid overly large learning rates at the beginning. The
+skip parameter is also initialized from the data::
+
+  >>> m.callibrate(x_train, y_train)
+  >>> m.t0, m.t, m.skip
+  (1000000, 1000000, 16)
+
+Train the model on the training set while recording the loss evaluations so
+that the convergence can be plotted later. The train method returns the
+learned parameters: ``w``, the hyperplane defining vector of size
+``nb_features``, and ``b``, the bias::
+
+  >>> epochs = 3
+  >>> w, b = m.train(x_train, y_train, epochs=epochs, compute_loss=True)
+
+The default stopping criterion makes the algorithm stop before it scans the
+complete training set for the requested number of epochs::
+
+  >>> m.t <= m.t0 + epochs * nb_training_points
+  True
+
+The optimized model is rather close to the optimal model in the Euclidean
+parameter space::
+
+  >>> sqrt(sum((w_opt - w) ** 2) + (b_opt - b) ** 2) / (nb_features + 1) < 0.01
+  True
+
+Let us test the model accuracy by computing the average objective evaluation
+and the mean loss on the test set::
+
+  >>> objective, sample_loss = m.test(x_test, y_test)
+  >>> objective / nb_features < 0.01
+  True
+  >>> sample_loss / nb_features < 0.01
+  True
+
+As we recorded loss evaluations during training we can now plot the
+convergence::
+
+  >> m.plot()
+
+TODO: implement a multi processor version using either pyro or multiprocessing
+to easily leverage multicore machines and cloud computing clusters.
+
+TODO: implement live plot monitoring of the convergence during the training
+phase, possibly using an asynchronous process.
+
+TODO: plot 2D projections of the objective function evaluations against
+arbitrary features to qualitatively monitor the convergence and check the
+convexity of the optimization problem.
+
+"""
+
+import logging
+import logging.config
+
+from itertools import izip
+from time import time
+
+from numpy import abs
+from numpy import arange
+from numpy import array
+from numpy import average
+from numpy import cumsum
+from numpy import dot
+from numpy import exp
+from numpy import log
+from numpy import logspace
+from numpy import nonzero
+from numpy import sign
+from numpy import sqrt
+from numpy import zeros
+
+def l1_loss(d):
+    return abs(d)
+
+def l1_dloss(d, x):
+    s = sign(d)
+    return s * x, s
+
+def l2_loss(d):
+    return d ** 2
+
+def l2_dloss(d, x):
+    return 2 * d * x, 2 * d
+
+def movavg(s, n):
+    """Moving average of `s` over sliding windows of `n - 1` consecutive
+    values, computed from the cumulative sums"""
+    s = array(s)
+    c = cumsum(s)
+    return (c[n-1:] - c[:-n+1]) / float(n-1)
+
+class SvmModel(object):
+    """Linear SVM model solved in the primal using Stochastic Gradient Descent
+    """
+
+    logger = logging.getLogger()
+
+    _kernels = {
+        "linear": dot,
+    }
+
+    _loss_functions = {
+        "l1": l1_loss,
+        "l2": l2_loss,
+    }
+
+    _dloss_functions = {
+        "l1": l1_dloss,
+        "l2": l2_dloss,
+    }
+
+    _callibrated_for = 0
+
+
+    def __init__(self,  lambda_=1e-4, kernel_type="linear", loss_type="l1"):
+        self.kernel_type = kernel_type
+        self._kernel = self._kernels.get(kernel_type)
+        if self._kernel is None:
+            raise ValueError("%s is not a supported kernel" % kernel_type)
+
+        self.lambda_ = lambda_
+
+        try:
+            self.loss = self._loss_functions[loss_type]
+            self.dloss = self._dloss_functions[loss_type]
+        except KeyError:
+            raise ValueError(
+                "invalid loss type '%s', should be one of %r" % (
+                    loss_type, list(self._loss_functions.keys())))
+
+        self.loss_history = list()
+        self.t0 = self.t = None
+
+    def callibrate(self, x, y, max_callibration=1000):
+        """Estimate the skip and best t0 parameters from the data
+
+        Only the first `max_callibration` items are used if `x` is larger.
+        """
+        start_time = time()
+        x = x[:max_callibration]
+        y = y[:max_callibration]
+        n_features = x[0].size
+
+        # the weight decay skip is based on the density
+        density = float(len(nonzero(x)[0])) / (len(x) * n_features)
+        if density == 0:
+            # this should never happen otherwise the optimization is
+            # pointless
+            self.skip = 0
+        else:
+            self.skip = int(16. / density)
+
+        # reset any old model
+        self.w = zeros(n_features)
+        self.bias = 0
+
+        # mark the model as callibrated for the dimension of the x
+        # feature space
+        self._callibrated_for = n_features
+
+        # compute the best t0 value by training and testing on the callibration
+        # set for various values of t0 in the log range [1.0 - 10^10]
+        best_t0, best_t0_loss = None, None
+        for t0 in logspace(0, 10, 11):
+            self.t0 = self.t = int(t0)
+
+            self.train(x, y)
+            _, current_loss = self.test(x, y)
+            if best_t0_loss is None or current_loss < best_t0_loss:
+                best_t0 = self.t0
+                best_t0_loss = current_loss
+
+            # reinit the model parameters
+            self.w = zeros(n_features)
+            self.bias = 0
+
+        # save the callibrated value for t0
+        self.t0 = self.t = best_t0
+
+        self.logger.info(
+            "callibrated model with %d vectors of %d features: "
+            "density=%0.3f, skip=%d, t0=%d in %0.3fs",
+            len(x), n_features, density, self.skip, self.t0, time()-start_time)
+
+    def train(self, x, y, epochs=1, compute_loss=False,
+              stopping_size=1000, stopping_tol=0.001):
+        """Train the model using data `x` and labels `y`
+
+        Return the vector of weights `w` and bias `b`.
+
+        - compute_loss: if set to True, record the loss evaluations in
+          ``self.loss_history`` (needed for plotting and for early stopping)
+
+        - stopping_size, stopping_tol: every `stopping_size` iterations, stop
+          early if the average recorded loss over the last two windows of
+          `stopping_size` evaluations differs by less than `stopping_tol`
+
+        TODO: implement multiple epochs
+
+        """
+        n_features = x[0].size
+        if self._callibrated_for != n_features:
+            self.callibrate(x, y)
+
+        start_time = time()
+        start_t = self.t
+
+        # optimize data lookups by using local variables
+        b = self.bias
+        lambda_ = self.lambda_
+        t = self.t
+        loss = self.loss
+        dloss = self.dloss
+        w = self.w
+        count = skip = self.skip
+        loss_history = self.loss_history
+        should_stop = False
+
+        for epoch in xrange(epochs):
+            for x_i, y_i in izip(x, y):
+                d = dot(x_i, w) + b - y_i
+                eta = 1.0 / (t * lambda_)
+                # TODO: investigate why eta without dividing by lambda works
+                # much better in most cases
+                #eta = 1.0 / t
+
+                # perform a single stochastic gradient descent step on the
+                # data dependent term of the objective function, i.e. the
+                # gradient of the loss function
+                dloss_w, dloss_b = dloss(d, x_i)
+                w -= eta * dloss_w
+                b -= eta * dloss_b
+
+                count -= 1
+                if count <= 0:
+                    # rescale w every `skip` vectors: this is the
+                    # regularization term, which is data independent and hence
+                    # only needs to be applied once in a while
+                    w *= 1 - eta * lambda_ * skip
+                    b *= 1 - eta * lambda_ * skip
+                    count = skip
+
+                if compute_loss:
+                    loss_history.append(loss(d))
+                t += 1
+
+                if stopping_tol is not None and t % stopping_size == 0:
+                    if len(loss_history) >= 2 * stopping_size:
+                        ss = stopping_size
+                        previous_loss = average(loss_history[-2*ss:-ss])
+                        current_loss = average(loss_history[-ss:])
+                        if abs(previous_loss - current_loss) < stopping_tol:
+                            should_stop = True
+                            break
+
+            # perform remaining regularization scaling if needed
+            if count > 0 and count != skip:
+                w *= 1 - eta * lambda_ * skip
+                b *= 1 - eta * lambda_ * skip
+
+            if should_stop:
+                break
+
+        # update the model parameters
+        self.w = w
+        self.bias = b
+
+        # save the current time advancement to be able to resume training
+        self.t = t
+
+        duration = time() - start_time
+        iterations = self.t - start_t
+        bandwidth = 4 * n_features * iterations / duration / 1000
+        self.logger.info(
+            "trained model with %d vectors of %d features and %d iterations "
+            "in %0.3fs at %dkbps", len(x), n_features, iterations, duration,
+            bandwidth)
+
+        return self.w, self.bias
+
+    def plot(self, logscale=False):
+        """Helper method to quickly plot loss descent history
+
+        Training should be performed with loss evaluation enabled::
+
+          model.train(x, y, compute_loss=True)
+
+        """
+        import matplotlib.pyplot as plt
+
+        t = arange(len(self.loss_history)) + self.t0
+        window = 1000
+        p = plt.semilogy if logscale else plt.plot
+        p(t, self.loss_history)
+        p(t[window/2:-window/2], movavg(self.loss_history, window + 1))
+
+        plt.grid(True)
+        plt.show()
+
+    def predict(self, x):
+        return dot(x, self.w) + self.bias
+
+    # make any trained SVM model a predicting callable
+    __call__ = predict
+
+    def test(self, x, y):
+        """Compute sample mean objective and sample mean loss"""
+        t0 = time()
+        mean_loss = average(self.loss(self.predict(x) - y))
+        objective = self.lambda_ / 2 * dot(self.w, self.w) + mean_loss
+
+        self.logger.info(
+            "tested model with %d vectors of %d features: "
+            "mean objective=%0.3f, mean loss=%0.3f in %0.3fs",
+            len(x), len(x[0]), objective, mean_loss, time()-t0)
+
+        return objective, mean_loss
+
+
+def test():
+    """Run the embedded doctests"""
+    import doctest
+    doctest.testmod()
+
+def main():
+    """Enable the logs and launch the doctest"""
+    logging_configuration = "logging.ini"
+    logging.config.fileConfig(logging_configuration)
+    test()
+
+if __name__ == "__main__":
+    main()
+
+"""Utilities to make it possible to call SIMD otpimized code on numpy arrays"""
+
+import numpy as N
+
+def align16(n, dtype=N.float64):
+    """Allocate a 1D array of `n` items whose data buffer is 16 byte aligned
+
+    A slightly larger buffer is allocated and a view starting at the first
+    16 byte boundary is returned.
+    """
+    size = N.dtype(dtype).itemsize
+    over = 16 / size
+    data = N.empty(n + over, dtype=dtype)
+    # number of leading items to skip to reach the next 16 byte boundary
+    skip = (- data.ctypes.data % 16) / size
+    return data[skip:skip + n]
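+
+
+if __name__ == "__main__":
+    # quick sanity check (illustration only, not a real test suite): the
+    # returned view should start on a 16 byte boundary and expose exactly the
+    # requested number of items
+    a = align16(1000)
+    assert a.ctypes.data % 16 == 0
+    assert a.shape == (1000,)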