# HG changeset patch # User Ruben Martinez-Cantin # Date 1368548464 -7200 # Node ID bf97fb63a1f04e51e4dd4fed7bcc041df9723756 # Parent 98269d801d75e4e59f12e99d7861486f897c16d5 Missing files diff --git a/doxygen/introduction.dox b/doxygen/introduction.dox new file mode 100644 --- /dev/null +++ b/doxygen/introduction.dox @@ -0,0 +1,172 @@ +/*! \page bopttheory Bayesian optimization +\tableofcontents + +\section introbopt Introduction to Bayesian Optimization + +Many problems in engineering, computer science, economics, etc., +require to find the extremum of a real valued function. These +functions are typically continuous and sometimes smooth (e.g.: +Lipschitz continuous). However, those functions do not have a +closed-form expression or might be multimodal, where some of the local +extrema might have a bad outcome compared to the global extremum. The +evaluation of those functions might be costly. + +Global optimization is a special case of non-convex optimization where +we want to find the global extremum of a real valued function, that +is, the target function. The search is done by some pointwise +evaluation of the target function. + +The objective of a global optimization algorithm is to find the +sequence of points +\f[ +x_n \in \mathcal{A} \subset \mathbb{R}^m , \;\;\; n = 1,2,\ldots +\f] +which converges to the point \f$x^*\f$, that is, the extremum of the +target function, when \f$n\f$ is large. The algorithm should be able to +find that sequence at least for all functions from a given family. + +As explained in \cite Mockus94, this search procedure is a sequential +decision making problem where point at step \f$n+1\f$ is based on decision +\f$d_n\f$ which considers all previous data: +\f[ +x_{n+1} = d_n(x_{1:n},y_{1:n}) +\f] +where \f$y_i = f(x_i) + \epsilon_i\f$. For simplicity, many works assume +\f$\epsilon_i = 0\f$, that is, function evaluations are +deterministic. However, we can easily extend the description to +include stochastic functions (e.g.: homoscedastic noise \f$\epsilon_i +\sim \mathcal{N}(0,\sigma)\f$). + +The search method is the sequence of decisions \f$d = {d_0,\ldots, +d_{n-1}}\f$, which leads to the final decision \f$x_{n} = x_{n}(d)\f$. In +most applications, the objective is to optimize the response of the +final decisions. Then, the criteria relies on the \a optimality +\a error or \a optimality \a gap, which can be expressed as: +\f[ +\delta_n(f,d) = f\left(x_n\right) - f(x^*) +\f] +In other applications, the objective may require to converge to \f$x^*\f$ +in the input space. Then, we can use for example the Euclidean +distance error: +\f[ +\delta_n(f,d) = \|x_n - x^*\|_2 \label{eq:dist-error} +\f] +The previous equations can also be interpreted as variants of the +\a loss function for the decision at each step. Thus, the optimal +decision is defined as the function that minimizes the loss function: +\f[ +d_n = \arg \min_d \delta_n(f,d) +\f] +This requires full knowledge of function \f$f\f$, which is +unavailable. Instead, let assume that the target function \f$f = f(x)\f$ +belongs to a family of functions \f$f \in F\f$, e.g.: continuous functions +in \f$\mathbb{R}^m\f$. Let also assume that the function can be +represented as sample from a probability distribution over functions +\f$f \sim P(f)\f$. Then, the best response case analysis for the search +process is defined as the decision that optimizes the expectation of +the loss function: +\f[ +d^{BR}_n = \arg \min_d \mathbb{E}_{P(f)} \left[ +\delta_n(f,d)\right]= \arg \min_d \int_F \delta_n(f,d) \; dP(f) +\f] +where \f$P\f$ is a prior distribution over functions. + +However, we can improve the equation considering that, at decision +\f$d_n\f$ we have already \a observed the actual response of the +function at \f$n-1\f$ points, \f$\{x_{1:n-1},y_{1:n-1}\}\f$. Thus, the prior +information of the function can be updated with the observations and +the Bayes rule: +\f[ + P(f|x_{1:n-1},y_{1:n-1}) = \frac{P(x_{1:n-1},y_{1:n-1}|f) P(f)}{P(x_{1:n-1},y_{1:n-1})} +\f] +In fact, we can actually rewrite the equation to represent the updates +sequentially: +\f[ + P(f|x_{1:i},y_{1:i}) = \frac{P(x_{i},y_{i}|f) P(f|x_{1:i-1},y_{1:i-1})}{P(x_{i},y_{i})}, \qquad \forall \; i=1 \ldots n-1 +\f] +Thus, the previous equation can be rewritten as: +\f[ +d^{BO}_n = \arg \min_d \mathbb{E}_{P(f|x_{1:n-1},y_{1:n-1})} \left[ \delta_n(f,d)\right] = \arg \min_d \int_F \delta_n(f,d) \; dP(f|x_{1:n-1},y_{1:n-1}) +\f] +This equation is the root of Bayesian optimization, where the +Bayesian part comes from the fact that we are computing the +expectation with respect to the posterior distribution, also called +\a belief, over functions. Therefore, Bayesian optimization is a +memory-based optimization algorithm. + +As commented before, most of the theory of Bayesian optimization is +related to deterministic functions, we consider also stochastic +functions, that is, we assume there might be a random error in the +function output. In fact, evaluations can produce different outputs if +repeated. In that case, the target function is the expected +output. Furthermore, in a recent paper by \cite Gramacy2012 it has +been shown that, even for deterministic functions, it is better to +assume certain error in the observation. The main reason being that, +in practice, there might be some mismodelling errors which can lead to +instability of the recursion if neglected. + +\section models Bayesian optimization general model + +In order to simplify the description, we are going to use a special +case of Bayesian optimization model defined previously which +corresponds to the most common application. In subsequent Sections we +will introduce some generalizations for different applications. + +Without loss of generality, consider the problem of finding the +minimum of an unknown real valued function \f$f:\mathbb{X} \rightarrow +\mathbb{R}\f$, where \f$\mathbb{X}\f$ is a compact space, \f$\mathbb{X} +\subset \mathbb{R}^d, d \geq 1\f$. Let \f$P(f)\f$ be a prior distribution +over functions represented as a stochastic process, for example, a +Gaussian process \f$\mathbf{x}i(\cdot)\f$, with inputs \f$x \in \mathbb{X}\f$ and an +associate kernel or covariance function \f$k(\cdot,\cdot)\f$. Let also +assume that the target function is a sample of the stochastic process +\f$f \sim \mathbf{x}i(\cdot)\f$. + +In order to find the minimum, the algorithm has a maximum budget of +\f$N\f$ evaluations of the target function \f$f\f$. The purpose of the +algorithm is to find optimal decisions that provide a better +performance at the end. + +One advantage of using Gaussian processes as a prior distributions +over functions is that new observations of the target function +\f$(x_i,y_i)\f$ can be easily used to update the distribution over +functions. Furthermore, the posterior distribution is also a Gaussian +process \f$\mathbf{x}i_i = \left[ \mathbf{x}i(\cdot) | x_{1:i},y_{1:i} +\right]\f$. Therefore, the posterior can be used as an informative prior +for the next iteration in a recursive algorithm. + +In a more general setting, many authors have suggested to modify the +standard zero-mean Gaussian process for different variations that +include semi-parametric models \cite Huang06 \cite Handcock1993 \cite Jones:1998 \cite OHagan1992, use of hyperpriors on the parameters +\cite MartinezCantin09AR \cite Brochu:2010c \cite Hoffman2011, Student +t processes \cite Gramacy_Polson_2009 \cite Sacks89SS \cite Williams_Santner_Notz_2000, etc. + +We use a generalized linear model of the form: +\f[ + f(x) = \phi(\mathbf{x})^T \mathbf{w} + \epsilon(\mathbf{x}) +\f] +where +\f[ + \epsilon(\mathbf{x}) \sim \mathcal{NP} \left( 0, \sigma^2_s (\mathbf{K}(\theta) + \sigma^2_n \I) \right) +\f] +The term \f$\mathcal{NP}\f$ means a non-parametric process, which can +make reference to a Gaussian process \f$\mathcal{GP}\f$ or a Student's +t process \f$\mathcal{TP}\f$. In both cases, \f$\sigma^2_n\f$ is the +observation noise variance, sometimes called nugget, and it is problem +specific. Many authors decide to fix this value \f$\sigma^2_n = 0\f$ +when the function \f$f(x)\f$ is deterministic, for example, a computer +simulation. However, as cleverly pointed out in \cite Gramacy2012, +there might be more reasons to include this term appart from being the +observation noise, for example, to consider model inaccuracies. + +This model has been presented in different ways depending on the field +where it was used: +\li As a generalized linear model \f$\phi(\mathbf{x})^T\mathbf{w}\f$ with heteroscedastic +perturbation \f$\epsilon(\mathbf{x})\f$. +\li As a nonparametric process of the form \f$\mathcal{NP} \left(\phi(\mathbf{x})^T\mathbf{w}, +\sigma^2_s (\mathbf{K}(\theta) + \sigma^2_n \mathbf{I}) \right)\f$. +\li As a semiparametric model \f$f(\mathbf{x}) = f_{par}(\mathbf{x}) + f_{nonpar}(\mathbf{x}) = +\phi(\mathbf{x})^T\mathbf{w} + \mathcal{NP}(\cdot)\f$ + + +*/ \ No newline at end of file diff --git a/doxygen/models.dox b/doxygen/models.dox new file mode 100644 --- /dev/null +++ b/doxygen/models.dox @@ -0,0 +1,3 @@ +/*! + +*/ \ No newline at end of file diff --git a/src/gaussian_process_normal.cpp b/src/gaussian_process_normal.cpp new file mode 100644 --- /dev/null +++ b/src/gaussian_process_normal.cpp @@ -0,0 +1,144 @@ +/* +------------------------------------------------------------------------- + This file is part of BayesOpt, an efficient C++ library for + Bayesian optimization. + + Copyright (C) 2011-2013 Ruben Martinez-Cantin + + BayesOpt is free software: you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + BayesOpt is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with BayesOpt. If not, see . +------------------------------------------------------------------------ +*/ + +#include +#include +#include "log.hpp" +#include "cholesky.hpp" +#include "trace_ublas.hpp" +#include "elementwise_ublas.hpp" +#include "gauss_distribution.hpp" +#include "gaussian_process_normal.hpp" + +namespace bayesopt +{ + + namespace ublas = boost::numeric::ublas; + + GaussianProcessNormal::GaussianProcessNormal(size_t dim, + bopt_params params): + HierarchicalGaussianProcess(dim,params), + mSigma(params.sigma_s), + mW0(params.mean.n_coef), mInvVarW(params.mean.n_coef), + mD(params.mean.n_coef,params.mean.n_coef) + { + mW0 = utils::array2vector(params.mean.coef_mean,params.mean.n_coef); + for (size_t ii = 0; ii < params.mean.n_coef; ++ii) + { + double varii = params.mean.coef_std[ii] * params.mean.coef_std[ii]; + mInvVarW(ii) = 1/varii; + } + d_ = new GaussianDistribution(); + } // Constructor + + + + GaussianProcessNormal::~GaussianProcessNormal() + { + delete d_; + } // Default destructor + + + ProbabilityDistribution* + GaussianProcessNormal::prediction(const vectord &query) + { + double kq = (*mKernel)(query, query);; + vectord kn = computeCrossCorrelation(query); + vectord phi = mMean->getFeatures(query); + + vectord v(kn); + inplace_solve(mL,v,ublas::lower_tag()); + + vectord rq = phi - prod(v,mKF); + + vectord rho(rq); + inplace_solve(mD,rho,ublas::lower_tag()); + + double yPred = inner_prod(phi,mWMap) + inner_prod(v,mVf); + double sPred = sqrt( mSigma * (kq - inner_prod(v,v) + + inner_prod(rho,rho))); + + if ((boost::math::isnan(yPred)) || (boost::math::isnan(sPred))) + { + FILE_LOG(logERROR) << "Error in prediction. NaN found."; + throw 1; + } + + + d_->setMeanAndStd(yPred,sPred); + return d_; + } + + + double GaussianProcessNormal::negativeLogLikelihood() + { + matrixd KK = computeCorrMatrix(); + const size_t n = KK.size1(); + const size_t p = mMean->nFeatures(); + + vectord v0 = mGPY - prod(trans(mFeatM),mW0); + matrixd WW = zmatrixd(p,p); //TODO: diagonal matrix + utils::addToDiagonal(WW,mInvVarW); + matrixd FW = prod(trans(mFeatM),WW); + KK += prod(FW,mFeatM); + matrixd BB(n,n); + utils::cholesky_decompose(KK,BB); + inplace_solve(BB,v0,ublas::lower_tag()); + double zz = inner_prod(v0,v0); + + double lik = 1/(2*mSigma) * zz; + lik += utils::log_trace(BB); + return lik; + } + + + + int GaussianProcessNormal::precomputePrediction() + { + size_t n = mGPXX.size(); + size_t p = mMean->nFeatures(); + + mKF = trans(mFeatM); + inplace_solve(mL,mKF,ublas::lower_tag()); + //TODO: make one line + matrixd DD(p,p); + DD = prod(trans(mKF),mKF); + utils::addToDiagonal(DD,mInvVarW); + utils::cholesky_decompose(DD,mD); + + vectord vn = mGPY; + inplace_solve(mL,vn,ublas::lower_tag()); + mWMap = prod(mFeatM,vn) + utils::ublas_elementwise_prod(mInvVarW,mW0); + utils::cholesky_solve(mD,mWMap,ublas::lower()); + + mVf = mGPY - prod(trans(mFeatM),mWMap); + inplace_solve(mL,mVf,ublas::lower_tag()); + + if (boost::math::isnan(mWMap(0))) + { + FILE_LOG(logERROR) << "Error in precomputed prediction. NaN found."; + throw 1; + } + return 0; + } + +} //namespace bayesopt