# vim:foldmethod=marker
# XXX: Polish reference to paper in docstring
import numpy as np
import tensorflow as tf
from tensorflow_mcmc.sampling.mcmc_base_classes import MCMCSampler
from tensorflow_mcmc.tensor_utils import (vectorize, unvectorize,
safe_divide, safe_sqrt)
[docs]class RelativisticSGHMCSampler(MCMCSampler):
[docs] def __init__(self, params, Cost, seed=None, epsilon=0.01, m=1.0, c=0.6,
D=1.0, scale_grad=1.0, batch_generator=None,
dtype=tf.float64, session=tf.get_default_session()):
""" Relativistic Stochastic Gradient Hamiltonian Monte-Carlo Sampler.
See [1] for more details on Relativistic SGHMC.
[1] X. Lu, V. Perrone, L. Hasenclever, Y. W. Teh, S. J. Vollmer
Relativistic Monte Carlo
Parameters
----------
params : list of tensorflow.Variable objects
Target parameters for which we want to sample new values.
Cost : tensorflow.Tensor
1-d Cost tensor that depends on `params`.
Frequently denoted as U(theta) in literature.
seed : int, optional
Random seed to use.
Defaults to `None`.
epsilon : float, optional
Value that is used as learning rate parameter for the sampler,
also denoted as discretization parameter in literature.
Defaults to `0.01`.
m : float, optional
mass constant.
Defaults to `1.0`.
c : float, optional
"Speed of light constant"
Defaults to `0.6`.
D : float, optional
Diffusion constant
Defaults to `1.0`.
scale_grad : float, optional
Value that is used to scale the magnitude of the noise used
during sampling. In a typical batches-of-data setting this usually
corresponds to the number of examples in the entire dataset.
Defaults to `1.0` which corresponds to no scaling.
batch_generator : BatchGenerator, optional
Iterable which returns dictionaries to feed into
tensorflow.Session.run() calls to evaluate the cost function.
Defaults to `None` which indicates that no batches shall be fed.
dtype : tensorflow.DType, optional
Type of elements of `tensorflow.Tensor` objects used in this sampler.
Defaults to `tensorflow.float64`.
session : tensorflow.Session, optional
Session object which knows about the external part of the graph
(which defines `Cost`, and possibly batches).
Used internally to evaluate (burn-in/sample) the sampler.
"""
super().__init__(params=params, batch_generator=batch_generator,
dtype=dtype, session=session, seed=seed)
self.c = tf.constant(c, dtype=dtype)
self.m = tf.constant(m, dtype=dtype)
self.Cost = Cost
# Epsilon = tf.constant(epsilon, dtype=dtype)
Epsilon = tf.constant(epsilon / np.sqrt(scale_grad), dtype=dtype)
grads = [vectorize(gradient) for gradient in
tf.gradients(self.Cost, params)]
# Sampler Variables {{{ #
# Noise estimate of stochastic gradient (B_hat) {{{ #
Tau = [tf.Variable(tf.ones_like(Param, dtype=dtype),
dtype=dtype, name="Tau_{}".format(i),
trainable=False)
for i, Param in enumerate(self.vectorized_params)]
R = [tf.Variable(1. / (Tau[i].initialized_value() + 1),
name="R_{}".format(i), trainable=False)
for i, Param in enumerate(self.vectorized_params)]
G = [tf.Variable(tf.ones_like(Param, dtype=dtype),
dtype=dtype, name="G_{}".format(i),
trainable=False)
for i, Param in enumerate(self.vectorized_params)]
V_hat = [tf.Variable(tf.ones_like(Param, dtype=dtype),
dtype=dtype, name="V_hat_{}".format(i),
trainable=False)
for i, Param in enumerate(self.vectorized_params)]
B_hat = [tf.Variable(0.5 * Epsilon * V_hat.initialized_value(),
name="B_hat_{}".format(i), dtype=dtype)
for i, V_hat in enumerate(V_hat)]
# }}} Noise estimate of stochastic gradient (B_hat) #
# Diffusion Terms {{{ #
D = [tf.Variable(tf.ones_like(Param, dtype=dtype),
name="D_{}".format(i),
dtype=dtype, trainable=False)
for i, Param in enumerate(self.vectorized_params)]
# }}} Diffusion Terms #
# Momentum
P = [tf.Variable(tf.ones_like(Param, dtype=dtype),
dtype=dtype, name="P_{}".format(i),
trainable=False)
for i, Param in enumerate(self.vectorized_params)]
# Mass term
M_update = [tf.Variable(self._mass_update(P_0.initialized_value()),
name="Mass_update_{}".format(i),
dtype=dtype)
for i, P_0 in enumerate(P)]
# }}} Sampler Parameters #
self.Theta_t = [None] * len(params)
for i, (Param, Grad) in enumerate(zip(params, grads)):
Vectorized_Param = self.vectorized_params[i]
R_t = tf.assign(R[i], 1. / (Tau[i] + 1), name="R_t_{}".format(i))
# R_t should always use the old value of Tau
with tf.control_dependencies([R_t]):
Tau_t = tf.assign_add(
Tau[i],
safe_divide(-G[i] * G[i] * Tau[i], V_hat[i]) + 1,
name="Tau_t_{}".format(i)
)
# Tau_t should always use the old values of G, V_hat
with tf.control_dependencies([Tau_t]):
G_t = tf.assign_add(
G[i],
-R_t * G[i] + R_t * Grad,
name="G_t_{}".format(i)
)
V_hat_t = tf.assign_add(
V_hat[i],
- R_t * V_hat[i] + R_t * Grad ** 2,
name="V_hat_t_{}".format(i)
)
with tf.control_dependencies([G_t, V_hat_t]):
B_hat_t = tf.assign(
B_hat[i],
0.5 * Epsilon * V_hat_t
)
# Draw random sample {{{ #
Noise_scale = Epsilon * (2 * D[i] - Epsilon * B_hat_t)
Sigma = safe_sqrt(Noise_scale)
Sample = self._draw_noise_sample(
Sigma=Sigma, Shape=Vectorized_Param.shape
)
# }}} Draw random sample #
# Equation (9) upper part
P_t = tf.assign_add(
P[i],
-Epsilon * Grad - Epsilon * D[i] * M_update[i] +
Sample,
name="P_t_{}".format(i)
)
# Update mass term for Equation (9) lower part
M_update_t = tf.assign(
M_update[i],
self._mass_update(P_t),
name="M_update_t_{}".format(i)
)
# Equation (9) lower part
Vectorized_Theta_t = tf.assign_add(
Vectorized_Param,
Epsilon * M_update_t
)
self.Theta_t[i] = tf.assign(
Param,
unvectorize(
Vectorized_Theta_t,
original_shape=Param.shape
),
name="Theta_t_{}".format(i)
)
def _mass_update(self, P):
# Equation (10) in the paper
return safe_divide(
P,
tf.sqrt(
tf.divide(
tf.matmul(P, P, transpose_a=True), tf.square(self.c)
) + tf.square(self.m)
)
)