# HG changeset patch
# User Ruben Martinez-Cantin
# Date 1368548464 -7200
# Node ID bf97fb63a1f04e51e4dd4fed7bcc041df9723756
# Parent 98269d801d75e4e59f12e99d7861486f897c16d5
Missing files
diff --git a/doxygen/introduction.dox b/doxygen/introduction.dox
new file mode 100644
--- /dev/null
+++ b/doxygen/introduction.dox
@@ -0,0 +1,172 @@
+/*! \page bopttheory Bayesian optimization
+\tableofcontents
+
+\section introbopt Introduction to Bayesian Optimization
+
+Many problems in engineering, computer science, economics, etc.,
+require finding the extremum of a real-valued function. These
+functions are typically continuous and sometimes smooth (e.g.:
+Lipschitz continuous). However, those functions do not have a
+closed-form expression or might be multimodal, where some of the local
+extrema might have a bad outcome compared to the global extremum. The
+evaluation of those functions might be costly.
+
+Global optimization is a special case of non-convex optimization where
+we want to find the global extremum of a real valued function, that
+is, the target function. The search is done by some pointwise
+evaluation of the target function.
+
+The objective of a global optimization algorithm is to find the
+sequence of points
+\f[
+x_n \in \mathcal{A} \subset \mathbb{R}^m , \;\;\; n = 1,2,\ldots
+\f]
+which converges to the point \f$x^*\f$, that is, the extremum of the
+target function, when \f$n\f$ is large. The algorithm should be able to
+find that sequence at least for all functions from a given family.
+
+As explained in \cite Mockus94, this search procedure is a sequential
+decision making problem where the point at step \f$n+1\f$ is based on decision
+\f$d_n\f$ which considers all previous data:
+\f[
+x_{n+1} = d_n(x_{1:n},y_{1:n})
+\f]
+where \f$y_i = f(x_i) + \epsilon_i\f$. For simplicity, many works assume
+\f$\epsilon_i = 0\f$, that is, function evaluations are
+deterministic. However, we can easily extend the description to
+include stochastic functions (e.g.: homoscedastic noise \f$\epsilon_i
+\sim \mathcal{N}(0,\sigma)\f$).
+
+The search method is the sequence of decisions \f$d = \{d_0,\ldots,
+d_{n-1}\}\f$, which leads to the final decision \f$x_{n} = x_{n}(d)\f$. In
+most applications, the objective is to optimize the response of the
+final decisions. Then, the criteria relies on the \a optimality
+\a error or \a optimality \a gap, which can be expressed as:
+\f[
+\delta_n(f,d) = f\left(x_n\right) - f(x^*)
+\f]
+In other applications, the objective may require to converge to \f$x^*\f$
+in the input space. Then, we can use for example the *Euclidean
+distance error*:
+\f[
+\delta_n(f,d) = \|x_n - x^*\|_2 \label{eq:dist-error}
+\f]
+The previous equations can also be interpreted as variants of the
+\a loss function for the decision at each step. Thus, the optimal
+decision is defined as the function that minimizes the loss function:
+\f[
+d_n = \arg \min_d \delta_n(f,d)
+\f]
+This requires full knowledge of function \f$f\f$, which is
+unavailable. Instead, let us assume that the target function \f$f = f(x)\f$
+belongs to a family of functions \f$f \in F\f$, e.g.: continuous functions
+in \f$\mathbb{R}^m\f$. Let also assume that the function can be
+represented as sample from a probability distribution over functions
+\f$f \sim P(f)\f$. Then, the best response case analysis for the search
+process is defined as the decision that optimizes the expectation of
+the loss function:
+\f[
+d^{BR}_n = \arg \min_d \mathbb{E}_{P(f)} \left[
+\delta_n(f,d)\right]= \arg \min_d \int_F \delta_n(f,d) \; dP(f)
+\f]
+where \f$P\f$ is a prior distribution over functions.
+
+However, we can improve the equation considering that, at decision
+\f$d_n\f$ we have already \a observed the actual response of the
+function at \f$n-1\f$ points, \f$\{x_{1:n-1},y_{1:n-1}\}\f$. Thus, the prior
+information of the function can be updated with the observations and
+the Bayes rule:
+\f[
+ P(f|x_{1:n-1},y_{1:n-1}) = \frac{P(x_{1:n-1},y_{1:n-1}|f) P(f)}{P(x_{1:n-1},y_{1:n-1})}
+\f]
+In fact, we can actually rewrite the equation to represent the updates
+sequentially:
+\f[
+ P(f|x_{1:i},y_{1:i}) = \frac{P(x_{i},y_{i}|f) P(f|x_{1:i-1},y_{1:i-1})}{P(x_{i},y_{i})}, \qquad \forall \; i=1 \ldots n-1
+\f]
+Thus, the previous equation can be rewritten as:
+\f[
+d^{BO}_n = \arg \min_d \mathbb{E}_{P(f|x_{1:n-1},y_{1:n-1})} \left[ \delta_n(f,d)\right] = \arg \min_d \int_F \delta_n(f,d) \; dP(f|x_{1:n-1},y_{1:n-1})
+\f]
+This equation is the root of *Bayesian optimization*, where the
+Bayesian part comes from the fact that we are computing the
+expectation with respect to the posterior distribution, also called
+\a belief, over functions. Therefore, Bayesian optimization is a
+memory-based optimization algorithm.
+
+As commented before, most of the theory of Bayesian optimization is
+related to deterministic functions. However, we also consider stochastic
+functions, that is, we assume there might be a random error in the
+function output. In fact, evaluations can produce different outputs if
+repeated. In that case, the target function is the expected
+output. Furthermore, in a recent paper by \cite Gramacy2012 it has
+been shown that, even for deterministic functions, it is better to
+assume certain error in the observation. The main reason being that,
+in practice, there might be some mismodelling errors which can lead to
+instability of the recursion if neglected.
+
+\section models Bayesian optimization general model
+
+In order to simplify the description, we are going to use a special
+case of Bayesian optimization model defined previously which
+corresponds to the most common application. In subsequent Sections we
+will introduce some generalizations for different applications.
+
+Without loss of generality, consider the problem of finding the
+minimum of an unknown real valued function \f$f:\mathbb{X} \rightarrow
+\mathbb{R}\f$, where \f$\mathbb{X}\f$ is a compact space, \f$\mathbb{X}
+\subset \mathbb{R}^d, d \geq 1\f$. Let \f$P(f)\f$ be a prior distribution
+over functions represented as a stochastic process, for example, a
+Gaussian process \f$\xi(\cdot)\f$, with inputs \f$x \in \mathbb{X}\f$ and an
+associated kernel or covariance function \f$k(\cdot,\cdot)\f$. Let us also
+assume that the target function is a sample of the stochastic process
+\f$f \sim \xi(\cdot)\f$.
+
+In order to find the minimum, the algorithm has a maximum budget of
+\f$N\f$ evaluations of the target function \f$f\f$. The purpose of the
+algorithm is to find optimal decisions that provide a better
+performance at the end.
+
+One advantage of using Gaussian processes as prior distributions
+over functions is that new observations of the target function
+\f$(x_i,y_i)\f$ can be easily used to update the distribution over
+functions. Furthermore, the posterior distribution is also a Gaussian
+process \f$\xi_i = \left[ \xi(\cdot) | x_{1:i},y_{1:i}
+\right]\f$. Therefore, the posterior can be used as an informative prior
+for the next iteration in a recursive algorithm.
+
+In a more general setting, many authors have suggested to modify the
+standard zero-mean Gaussian process for different variations that
+include semi-parametric models \cite Huang06 \cite Handcock1993 \cite Jones:1998 \cite OHagan1992, use of hyperpriors on the parameters
+\cite MartinezCantin09AR \cite Brochu:2010c \cite Hoffman2011, Student
+t processes \cite Gramacy_Polson_2009 \cite Sacks89SS \cite Williams_Santner_Notz_2000, etc.
+
+We use a generalized linear model of the form:
+\f[
+ f(x) = \phi(\mathbf{x})^T \mathbf{w} + \epsilon(\mathbf{x})
+\f]
+where
+\f[
+ \epsilon(\mathbf{x}) \sim \mathcal{NP} \left( 0, \sigma^2_s (\mathbf{K}(\theta) + \sigma^2_n \mathbf{I}) \right)
+\f]
+The term \f$\mathcal{NP}\f$ means a non-parametric process, which can
+make reference to a Gaussian process \f$\mathcal{GP}\f$ or a Student's
+t process \f$\mathcal{TP}\f$. In both cases, \f$\sigma^2_n\f$ is the
+observation noise variance, sometimes called nugget, and it is problem
+specific. Many authors decide to fix this value \f$\sigma^2_n = 0\f$
+when the function \f$f(x)\f$ is deterministic, for example, a computer
+simulation. However, as cleverly pointed out in \cite Gramacy2012,
+there might be more reasons to include this term apart from being the
+observation noise, for example, to consider model inaccuracies.
+
+This model has been presented in different ways depending on the field
+where it was used:
+\li As a generalized linear model \f$\phi(\mathbf{x})^T\mathbf{w}\f$ with heteroscedastic
+perturbation \f$\epsilon(\mathbf{x})\f$.
+\li As a nonparametric process of the form \f$\mathcal{NP} \left(\phi(\mathbf{x})^T\mathbf{w},
+\sigma^2_s (\mathbf{K}(\theta) + \sigma^2_n \mathbf{I}) \right)\f$.
+\li As a semiparametric model \f$f(\mathbf{x}) = f_{par}(\mathbf{x}) + f_{nonpar}(\mathbf{x}) =
+\phi(\mathbf{x})^T\mathbf{w} + \mathcal{NP}(\cdot)\f$
+
+
+*/
\ No newline at end of file
diff --git a/doxygen/models.dox b/doxygen/models.dox
new file mode 100644
--- /dev/null
+++ b/doxygen/models.dox
@@ -0,0 +1,3 @@
+/*!
+
+*/
\ No newline at end of file
diff --git a/src/gaussian_process_normal.cpp b/src/gaussian_process_normal.cpp
new file mode 100644
--- /dev/null
+++ b/src/gaussian_process_normal.cpp
@@ -0,0 +1,144 @@
+/*
+-------------------------------------------------------------------------
+ This file is part of BayesOpt, an efficient C++ library for
+ Bayesian optimization.
+
+ Copyright (C) 2011-2013 Ruben Martinez-Cantin
+
+ BayesOpt is free software: you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ BayesOpt is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with BayesOpt. If not, see <http://www.gnu.org/licenses/>.
+------------------------------------------------------------------------
+*/
+
+#include <cmath>
+#include <boost/math/special_functions/fpclassify.hpp>
+#include "log.hpp"
+#include "cholesky.hpp"
+#include "trace_ublas.hpp"
+#include "elementwise_ublas.hpp"
+#include "gauss_distribution.hpp"
+#include "gaussian_process_normal.hpp"
+
+namespace bayesopt
+{
+
+ namespace ublas = boost::numeric::ublas;
+
+  /**
+   * Constructor. Sets up a Gaussian process whose parametric mean
+   * function carries a normal (Gaussian) prior on its weights.
+   *
+   * @param dim    dimension of the input space
+   * @param params library parameters; params.mean provides the prior
+   *               mean (coef_mean) and standard deviation (coef_std)
+   *               of each mean-function coefficient, and params.sigma_s
+   *               the signal variance mSigma.
+   */
+  GaussianProcessNormal::GaussianProcessNormal(size_t dim,
+					       bopt_params params):
+    HierarchicalGaussianProcess(dim,params),
+    mSigma(params.sigma_s),
+    mW0(params.mean.n_coef), mInvVarW(params.mean.n_coef),
+    mD(params.mean.n_coef,params.mean.n_coef)
+  {
+    // mW0 holds the prior mean of the weights; mInvVarW stores the
+    // inverse prior variance (1/std^2) of each weight.
+    mW0 = utils::array2vector(params.mean.coef_mean,params.mean.n_coef);
+    for (size_t ii = 0; ii < params.mean.n_coef; ++ii)
+      {
+	double varii = params.mean.coef_std[ii] * params.mean.coef_std[ii];
+	mInvVarW(ii) = 1/varii;
+      }
+    // Owned raw pointer, released in the destructor.
+    // NOTE(review): copy/assignment are not visible in this file;
+    // confirm the class is non-copyable to avoid a double delete of d_.
+    d_ = new GaussianDistribution();
+  } // Constructor
+
+
+
+  /** Destructor. Releases the predictive distribution allocated in
+      the constructor. */
+  GaussianProcessNormal::~GaussianProcessNormal()
+  {
+    delete d_;
+  } // Default destructor
+
+
+  /**
+   * Compute the Gaussian predictive distribution at a query point,
+   * including the extra variance contributed by the normal prior on
+   * the mean-function weights.
+   *
+   * @param query input point to evaluate
+   * @return pointer to the internal predictive distribution; owned by
+   *         this object and overwritten by the next call.
+   * @throws int (value 1) if the prediction evaluates to NaN
+   */
+  ProbabilityDistribution*
+  GaussianProcessNormal::prediction(const vectord &query)
+  {
+    double kq = (*mKernel)(query, query);
+    vectord kn = computeCrossCorrelation(query);
+    vectord phi = mMean->getFeatures(query);
+
+    // v = L^-1 k(X,query): half-solve against the kernel Cholesky factor.
+    vectord v(kn);
+    inplace_solve(mL,v,ublas::lower_tag());
+
+    // Correction of the features by the precomputed mKF = L^-1 Phi^T.
+    vectord rq = phi - prod(v,mKF);
+
+    vectord rho(rq);
+    inplace_solve(mD,rho,ublas::lower_tag());
+
+    // Predictive mean and std; the rho term accounts for the
+    // uncertainty in the mean-function weights.
+    double yPred = inner_prod(phi,mWMap) + inner_prod(v,mVf);
+    double sPred = sqrt( mSigma * (kq - inner_prod(v,v)
+				   + inner_prod(rho,rho)));
+
+    if ((boost::math::isnan(yPred)) || (boost::math::isnan(sPred)))
+      {
+	FILE_LOG(logERROR) << "Error in prediction. NaN found.";
+	throw 1;   // TODO(review): prefer throwing a std::exception subclass
+      }
+
+    d_->setMeanAndStd(yPred,sPred);
+    return d_;
+  }
+
+
+  /**
+   * Negative log-likelihood of the observed data under the model,
+   * up to additive constant terms.
+   *
+   * @return negative log-likelihood value (lower means better fit)
+   */
+  double GaussianProcessNormal::negativeLogLikelihood()
+  {
+    matrixd KK = computeCorrMatrix();
+    const size_t n = KK.size1();
+    const size_t p = mMean->nFeatures();
+
+    // Residual w.r.t. the prior mean of the parametric part.
+    vectord v0 = mGPY - prod(trans(mFeatM),mW0);
+    matrixd WW = zmatrixd(p,p); //TODO: diagonal matrix
+    utils::addToDiagonal(WW,mInvVarW);
+    // NOTE(review): WW is filled with the *inverse* prior variances;
+    // marginalizing the weights would normally add Phi^T Sigma_w Phi
+    // (variances, not their inverses) -- confirm the parameterization.
+    matrixd FW = prod(trans(mFeatM),WW);
+    KK += prod(FW,mFeatM);
+    // Whiten the residual: v0 <- B^-1 v0 with KK = B B^T (Cholesky).
+    matrixd BB(n,n);
+    utils::cholesky_decompose(KK,BB);
+    inplace_solve(BB,v0,ublas::lower_tag());
+    double zz = inner_prod(v0,v0);
+
+    // Quadratic term plus log-determinant contribution from the
+    // Cholesky factor (log_trace; presumably sum of log-diagonal).
+    double lik = 1/(2*mSigma) * zz;
+    lik += utils::log_trace(BB);
+    return lik;
+  }
+
+
+
+  /**
+   * Precompute the quantities reused by prediction() each time the
+   * data set changes: the half-solved feature matrix mKF, the
+   * Cholesky factor mD of the weight posterior precision, the MAP
+   * estimate of the weights mWMap and the whitened residual mVf.
+   *
+   * @return 0 on success
+   * @throws int (value 1) if the weight estimate contains NaN
+   */
+  int GaussianProcessNormal::precomputePrediction()
+  {
+    const size_t p = mMean->nFeatures();
+
+    // mKF = L^-1 Phi^T, where L is the kernel Cholesky factor.
+    mKF = trans(mFeatM);
+    inplace_solve(mL,mKF,ublas::lower_tag());
+
+    // mD = chol( mKF^T mKF + diag(mInvVarW) ).
+    //TODO: make one line
+    matrixd DD(p,p);
+    DD = prod(trans(mKF),mKF);
+    utils::addToDiagonal(DD,mInvVarW);
+    utils::cholesky_decompose(DD,mD);
+
+    // MAP estimate of the mean-function weights.
+    vectord vn = mGPY;
+    inplace_solve(mL,vn,ublas::lower_tag());
+    mWMap = prod(mFeatM,vn) + utils::ublas_elementwise_prod(mInvVarW,mW0);
+    utils::cholesky_solve(mD,mWMap,ublas::lower());
+
+    // Residual of the parametric mean, pre-solved against L.
+    mVf = mGPY - prod(trans(mFeatM),mWMap);
+    inplace_solve(mL,mVf,ublas::lower_tag());
+
+    if (boost::math::isnan(mWMap(0)))
+      {
+	FILE_LOG(logERROR) << "Error in precomputed prediction. NaN found.";
+	throw 1;
+      }
+    return 0;
+  }
+
+} //namespace bayesopt