Source code for relativistic_sghmc

# vim:foldmethod=marker
import numpy as np
import tensorflow as tf
from tensorflow_mcmc.sampling.mcmc_base_classes import MCMCSampler

from tensorflow_mcmc.tensor_utils import (vectorize, unvectorize,
                                          safe_divide, safe_sqrt)


class RelativisticSGHMCSampler(MCMCSampler):
    def __init__(self, params, Cost, seed=None, epsilon=0.01, m=1.0, c=0.6,
                 D=1.0, scale_grad=1.0, batch_generator=None,
                 dtype=tf.float64, session=tf.get_default_session()):
        """ Relativistic Stochastic Gradient Hamiltonian Monte Carlo Sampler.

            See [1] for more details on Relativistic SGHMC.

            [1] X. Lu, V. Perrone, L. Hasenclever, Y. W. Teh, S. J. Vollmer.
                Relativistic Monte Carlo.
                In Proceedings of the 20th International Conference on
                Artificial Intelligence and Statistics (AISTATS), 2017.

            Parameters
            ----------
            params : list of tensorflow.Variable objects
                Target parameters for which we want to sample new values.

            Cost : tensorflow.Tensor
                1-d Cost tensor that depends on `params`.
                Frequently denoted as U(theta) in the literature.

            seed : int, optional
                Random seed to use.
                Defaults to `None`.

            epsilon : float, optional
                Value that is used as learning rate parameter for the
                sampler, also denoted as discretization parameter in the
                literature.
                Defaults to `0.01`.

            m : float, optional
                Mass constant.
                Defaults to `1.0`.

            c : float, optional
                "Speed of light" constant.
                Defaults to `0.6`.

            D : float, optional
                Diffusion constant.
                Defaults to `1.0`.

            scale_grad : float, optional
                Value that is used to scale the magnitude of the noise used
                during sampling. In a typical batches-of-data setting this
                usually corresponds to the number of examples in the entire
                dataset.
                Defaults to `1.0` which corresponds to no scaling.

            batch_generator : BatchGenerator, optional
                Iterable which returns dictionaries to feed into
                tensorflow.Session.run() calls to evaluate the cost function.
                Defaults to `None` which indicates that no batches shall be
                fed.

            dtype : tensorflow.DType, optional
                Type of elements of `tensorflow.Tensor` objects used in this
                sampler.
                Defaults to `tensorflow.float64`.

            session : tensorflow.Session, optional
                Session object which knows about the external part of the
                graph (which defines `Cost`, and possibly batches).
                Used internally to evaluate (burn-in/sample) the sampler.

        """
        super().__init__(params=params, batch_generator=batch_generator,
                         dtype=dtype, session=session, seed=seed)

        self.c = tf.constant(c, dtype=dtype)
        self.m = tf.constant(m, dtype=dtype)
        self.Cost = Cost

        # Epsilon = tf.constant(epsilon, dtype=dtype)
        Epsilon = tf.constant(epsilon / np.sqrt(scale_grad), dtype=dtype)

        grads = [vectorize(gradient)
                 for gradient in tf.gradients(self.Cost, params)]

        # Sampler Variables {{{ #

        # Noise estimate of stochastic gradient (B_hat) {{{ #
        Tau = [tf.Variable(tf.ones_like(Param, dtype=dtype),
                           dtype=dtype, name="Tau_{}".format(i),
                           trainable=False)
               for i, Param in enumerate(self.vectorized_params)]

        R = [tf.Variable(1. / (Tau[i].initialized_value() + 1),
                         name="R_{}".format(i), trainable=False)
             for i, Param in enumerate(self.vectorized_params)]

        G = [tf.Variable(tf.ones_like(Param, dtype=dtype),
                         dtype=dtype, name="G_{}".format(i),
                         trainable=False)
             for i, Param in enumerate(self.vectorized_params)]

        V_hat = [tf.Variable(tf.ones_like(Param, dtype=dtype),
                             dtype=dtype, name="V_hat_{}".format(i),
                             trainable=False)
                 for i, Param in enumerate(self.vectorized_params)]

        # NOTE: loop variable renamed to `V_hat_0` to avoid shadowing
        # the `V_hat` list defined above.
        B_hat = [tf.Variable(0.5 * Epsilon * V_hat_0.initialized_value(),
                             name="B_hat_{}".format(i), dtype=dtype)
                 for i, V_hat_0 in enumerate(V_hat)]
        # }}} Noise estimate of stochastic gradient (B_hat) #

        # Diffusion Terms {{{ #
        # Per-parameter diffusion tensors, initialized to the scalar
        # diffusion constant `D` passed to this constructor.
        D = [tf.Variable(D * tf.ones_like(Param, dtype=dtype),
                         name="D_{}".format(i), dtype=dtype,
                         trainable=False)
             for i, Param in enumerate(self.vectorized_params)]
        # }}} Diffusion Terms #

        # Momentum
        P = [tf.Variable(tf.ones_like(Param, dtype=dtype),
                         dtype=dtype, name="P_{}".format(i),
                         trainable=False)
             for i, Param in enumerate(self.vectorized_params)]

        # Mass term
        M_update = [tf.Variable(self._mass_update(P_0.initialized_value()),
                                name="Mass_update_{}".format(i), dtype=dtype)
                    for i, P_0 in enumerate(P)]

        # }}} Sampler Variables #

        self.Theta_t = [None] * len(params)

        for i, (Param, Grad) in enumerate(zip(params, grads)):
            Vectorized_Param = self.vectorized_params[i]

            R_t = tf.assign(R[i], 1. / (Tau[i] + 1), name="R_t_{}".format(i))

            # R_t should always use the old value of Tau
            with tf.control_dependencies([R_t]):
                Tau_t = tf.assign_add(
                    Tau[i],
                    safe_divide(-G[i] * G[i] * Tau[i], V_hat[i]) + 1,
                    name="Tau_t_{}".format(i)
                )

                # Tau_t should always use the old values of G, V_hat
                with tf.control_dependencies([Tau_t]):
                    G_t = tf.assign_add(
                        G[i],
                        -R_t * G[i] + R_t * Grad,
                        name="G_t_{}".format(i)
                    )

                    V_hat_t = tf.assign_add(
                        V_hat[i],
                        -R_t * V_hat[i] + R_t * Grad ** 2,
                        name="V_hat_t_{}".format(i)
                    )

                    with tf.control_dependencies([G_t, V_hat_t]):
                        B_hat_t = tf.assign(
                            B_hat[i],
                            0.5 * Epsilon * V_hat_t
                        )

                        # Draw random sample {{{ #
                        Noise_scale = Epsilon * (2 * D[i] - Epsilon * B_hat_t)
                        Sigma = safe_sqrt(Noise_scale)
                        Sample = self._draw_noise_sample(
                            Sigma=Sigma, Shape=Vectorized_Param.shape
                        )
                        # }}} Draw random sample #

                        # Equation (9), upper part: momentum update
                        P_t = tf.assign_add(
                            P[i],
                            -Epsilon * Grad - Epsilon * D[i] * M_update[i] +
                            Sample,
                            name="P_t_{}".format(i)
                        )

                        # Update mass term for Equation (9), lower part
                        M_update_t = tf.assign(
                            M_update[i],
                            self._mass_update(P_t),
                            name="M_update_t_{}".format(i)
                        )

                        # Equation (9), lower part: parameter update
                        Vectorized_Theta_t = tf.assign_add(
                            Vectorized_Param,
                            Epsilon * M_update_t
                        )

                        self.Theta_t[i] = tf.assign(
                            Param,
                            unvectorize(
                                Vectorized_Theta_t,
                                original_shape=Param.shape
                            ),
                            name="Theta_t_{}".format(i)
                        )
    def _mass_update(self, P):
        # Equation (10) in the paper:
        #   M(P) = P / sqrt(P^T P / c^2 + m^2)
        return safe_divide(
            P,
            tf.sqrt(
                tf.divide(
                    tf.matmul(P, P, transpose_a=True),
                    tf.square(self.c)
                ) + tf.square(self.m)
            )
        )
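
The `_mass_update` method is the relativistic counterpart of the usual kinetic-energy gradient P / m: because its denominator grows with the momentum norm, the parameter update speed is bounded by the "speed of light" constant c, which is what keeps the sampler stable under large stochastic-gradient noise. A minimal NumPy sketch of Equation (10) (the function name and standalone form are illustrative, not part of this module):

import numpy as np

def relativistic_mass_update(p, m=1.0, c=0.6):
    # Equation (10): M(p) = p / sqrt(p^T p / c^2 + m^2)
    p = np.asarray(p, dtype=np.float64)
    return p / np.sqrt(p @ p / c ** 2 + m ** 2)

# The update speed ||M(p)|| stays below c regardless of the momentum norm:
for scale in (0.1, 1.0, 100.0, 1e6):
    p = scale * np.ones(3) / np.sqrt(3)
    print(scale, np.linalg.norm(relativistic_mass_update(p)))

For momenta much larger than m * c the printed norms saturate just below c = 0.6, whereas the non-relativistic update p / m would grow without bound.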
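
To make the constructor's interface concrete, here is a hypothetical usage sketch against a simple 2-d Gaussian target in TensorFlow 1.x graph mode. The module path and the iteration protocol, next(sampler) returning a pair of (new parameter values, cost), are assumptions about the surrounding tensorflow_mcmc package (inherited from MCMCSampler), not guarantees made by this file:

import tensorflow as tf
# Assumed module path within the surrounding package:
from tensorflow_mcmc.sampling.relativistic_sghmc import RelativisticSGHMCSampler

with tf.Session() as session:
    # Standard 2-d Gaussian target: U(theta) = 0.5 * theta^T theta.
    theta = tf.Variable(tf.zeros([2], dtype=tf.float64))
    cost = tf.reshape(0.5 * tf.reduce_sum(tf.square(theta)), [1])

    sampler = RelativisticSGHMCSampler(
        params=[theta], Cost=cost,
        epsilon=0.01, m=1.0, c=0.6, D=1.0,
        dtype=tf.float64, session=session, seed=1
    )
    session.run(tf.global_variables_initializer())

    # Assumed iterator protocol inherited from MCMCSampler.
    samples = [next(sampler)[0] for _ in range(1000)]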