# vim: foldmethod=marker
import tensorflow as tf
from tensorflow_mcmc.sampling.mcmc_base_classes import BurnInMCMCSampler
from tensorflow_mcmc.tensor_utils import (vectorize, unvectorize,
                                          safe_divide, safe_sqrt)


class SGLDSampler(BurnInMCMCSampler):
""" Stochastic Gradient Langevin Dynamics Sampler that uses a burn-in
procedure to adapt its own hyperparameters during the initial stages
of sampling.
See [1] for more details on this burn-in procedure.
See [2] for more details on Stochastic Gradient Langevin Dynamics.
[1] J. T. Springenberg, A. Klein, S. Falkner, F. Hutter
Bayesian Optimization with Robust Bayesian Neural Networks.
In Advances in Neural Information Processing Systems 29 (2016).
[2] M.Welling, Y. W. Teh
Bayesian Learning via Stochastic Gradient Langevin Dynamics
"""

    def __init__(self, params, cost_fun, seed=None, batch_generator=None,
                 epsilon=0.01, session=tf.get_default_session(),
                 burn_in_steps=3000, scale_grad=1.0, dtype=tf.float64, A=1.0):
""" Initialize the sampler parameters and set up a tensorflow.Graph
for later queries.
Parameters
----------
params : list of tensorflow.Variable objects
Target parameters for which we want to sample new values.
cost_fun : callable
Function that takes `params` as input and returns a
1-d `tensorflow.Tensor` that contains the cost-value.
Frequently denoted with `U` in literature.
seed : int, optional
Random seed to use.
Defaults to `None`.
batch_generator : BatchGenerator, optional
Iterable which returns dictionaries to feed into
tensorflow.Session.run() calls to evaluate the cost function.
Defaults to `None` which indicates that no batches shall be fed.
epsilon : float, optional
Value that is used as learning rate parameter for the sampler,
also denoted as discretization parameter in literature.
Defaults to `0.01`.
session : tensorflow.Session, optional
Session object which knows about the external part of the graph
(which defines `Cost`, and possibly batches).
Used internally to evaluate (burn-in/sample) the sampler.
burn_in_steps: int, optional
Number of burn-in steps to perform. In each burn-in step, this
sampler will adapt its own internal parameters to decrease its error.
For reference see: TODO ADD PAPER REFERENCE HERE
scale_grad : float, optional
Value that is used to scale the magnitude of the noise used
during sampling. In a typical batches-of-data setting this usually
corresponds to the number of examples in the entire dataset.
A : float, optional
TODO XXX Doku
Defaults to `1.0`.

            Examples
            ----------
            Simple, plain example:

            >>> import tensorflow as tf
            >>> session = tf.Session()
            >>> x = tf.Variable(1.0, dtype=tf.float64)
            >>> cost_fun = lambda x: tf.square(x)  # toy cost function
            >>> sampler = SGLDSampler(params=[x], cost_fun=cost_fun,
            ...                       session=session)
            >>> session.run(tf.global_variables_initializer())
            >>> first_sample = next(sampler)

            Simple example that uses batches:
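            (A sketch only: `generate_batches` is a hypothetical helper that
            is assumed to yield minibatches of data; each feed dict maps the
            placeholder `X` to one such minibatch.)

            >>> import tensorflow as tf
            >>> session = tf.Session()
            >>> X = tf.placeholder(tf.float64, shape=(None,))
            >>> w = tf.Variable(0.0, dtype=tf.float64)
            >>> cost_fun = lambda w: tf.reduce_mean(tf.square(X - w))
            >>> batch_generator = ({X: batch} for batch in generate_batches())
            >>> sampler = SGLDSampler(params=[w], cost_fun=cost_fun,
            ...                       batch_generator=batch_generator,
            ...                       session=session)
            >>> session.run(tf.global_variables_initializer())
            >>> first_sample = next(sampler)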

            See Also
            ----------
            tensorflow_mcmc.sampling.mcmc_base_classes.BurnInMCMCSampler:
                Base class for `SGLDSampler` that specifies how actual sampling
                is performed (using the iterator protocol, e.g. `next(sampler)`).
        """
        # Forward `seed` to the base class so that noise sampling is
        # reproducible rather than the argument being silently ignored.
        super().__init__(params=params, batch_generator=batch_generator,
                         burn_in_steps=burn_in_steps, seed=seed,
                         session=session, dtype=dtype)
        n_params = len(params)

        # Initialize graph constants {{{ #

        A = tf.constant(A, name="A", dtype=dtype)
        Noise = tf.constant(0., name="noise", dtype=dtype)
        Epsilon = tf.constant(epsilon, name="epsilon", dtype=dtype)
        Scale_grad = tf.constant(scale_grad, name="scale_grad", dtype=dtype)

        # }}} Initialize graph constants #

        self.Cost = cost_fun(*params)

        grads = [vectorize(gradient) for gradient in
                 tf.gradients(self.Cost, params)]
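
        # Each gradient is flattened to a 1-d tensor by `vectorize`, so that
        # the elementwise burn-in statistics below (Tau, G, V_hat) can be kept
        # as flat vectors matching `self.vectorized_params`.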
        # Initialize internal sampler parameters {{{ #

        Tau = [tf.Variable(tf.ones_like(Param, dtype=dtype),
                           dtype=dtype, name="Tau_{}".format(i),
                           trainable=False)
               for i, Param in enumerate(self.vectorized_params)]

        R = [tf.Variable(1. / (Tau[i].initialized_value() + 1),
                         name="R_{}".format(i), trainable=False)
             for i, Param in enumerate(self.vectorized_params)]

        G = [tf.Variable(tf.ones_like(Param, dtype=dtype),
                         dtype=dtype, name="G_{}".format(i),
                         trainable=False)
             for i, Param in enumerate(self.vectorized_params)]

        V_hat = [tf.Variable(tf.ones_like(Param, dtype=dtype),
                             dtype=dtype, name="V_hat_{}".format(i),
                             trainable=False)
                 for i, Param in enumerate(self.vectorized_params)]

        # Initialize mass matrix inverse {{{ #

        self.Minv = [tf.Variable(tf.divide(tf.constant(1., dtype=dtype),
                                           tf.sqrt(V_hat[i].initialized_value())),
                                 name="Minv_{}".format(i), trainable=False)
                     for i, Param in enumerate(self.vectorized_params)]

        # }}} Initialize mass matrix inverse #

        # }}} Initialize internal sampler parameters #
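
        # During burn-in (see [1] in the class docstring) these per-parameter
        # quantities are adapted:
        #   Tau   - exponential moving average window size ("memory")
        #   R     - adaptation rate, 1 / (Tau + 1)
        #   G     - moving average estimate of the gradient
        #   V_hat - moving average estimate of the squared gradient
        #           (uncentered second moment)
        #   Minv  - preconditioner 1 / sqrt(V_hat), used in place of a
        #           hand-tuned inverse mass matrix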
        self.Minv_t = [None] * n_params  # gets burned-in

        for i, (Param, Grad) in enumerate(zip(params, grads)):
            Vectorized_Param = self.vectorized_params[i]

            # Burn-in logic {{{ #

            R_t = tf.assign(R[i], 1. / (Tau[i] + 1.), name="R_t_{}".format(i))

            # R_t should always use the old value of Tau
            with tf.control_dependencies([R_t]):
                Tau_t = tf.assign_add(
                    Tau[i],
                    safe_divide(-G[i] * G[i] * Tau[i], V_hat[i]) + 1,
                    name="Tau_t_{}".format(i)
                )

                self.Minv_t[i] = tf.assign(
                    self.Minv[i],
                    safe_divide(1., safe_sqrt(V_hat[i])),
                    name="Minv_t_{}".format(i)
                )

                # Tau_t, Minv_t should always use the old values of G, V_hat
                with tf.control_dependencies([Tau_t, self.Minv_t[i]]):
                    G_t = tf.assign_add(
                        G[i],
                        -R_t * G[i] + R_t * Grad,
                        name="G_t_{}".format(i)
                    )

                    V_hat_t = tf.assign_add(
                        V_hat[i],
                        -R_t * V_hat[i] + R_t * Grad ** 2,
                        name="V_hat_t_{}".format(i)
                    )

                    # }}} Burn-in logic #
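
                    # In equations, with g denoting the current (minibatch)
                    # gradient, the burn-in updates above are:
                    #   R_t     = 1 / (Tau + 1)
                    #   Tau_t   = (1 - G**2 / V_hat) * Tau + 1
                    #   G_t     = (1 - R_t) * G + R_t * g
                    #   V_hat_t = (1 - R_t) * V_hat + R_t * g**2
                    #   Minv_t  = 1 / sqrt(V_hat)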

                    with tf.control_dependencies([G_t, V_hat_t]):
                        # Draw random sample {{{ #

                        Sigma = safe_sqrt(
                            2. * Epsilon *
                            safe_divide(
                                (self.Minv_t[i] * (A - Noise)), Scale_grad
                            )
                        )

                        Sample = self._draw_noise_sample(
                            Sigma=Sigma, Shape=Vectorized_Param.shape
                        )

                        # }}} Draw random sample #

                        # SGLD Update {{{ #

                        Vectorized_Theta_t = tf.assign_add(
                            Vectorized_Param,
                            -Epsilon * self.Minv_t[i] * A * Grad + Sample,
                        )

                        self.Theta_t[i] = tf.assign(
                            Param,
                            unvectorize(
                                Vectorized_Theta_t, original_shape=Param.shape
                            ),
                            name="Theta_t_{}".format(i)
                        )

                        # }}} SGLD Update #
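
                        # Written out, the update performed above is:
                        #   Theta_t = Theta
                        #             - Epsilon * Minv_t * A * grad(Cost)
                        #             + N(0, Sigma**2)
                        # with noise variance
                        #   Sigma**2 = 2 * Epsilon * Minv_t * (A - Noise)
                        #              / Scale_grad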