# vim: foldmethod=marker
import tensorflow as tf
from tensorflow_mcmc.sampling.mcmc_base_classes import BurnInMCMCSampler
from tensorflow_mcmc.tensor_utils import (vectorize, unvectorize,
                                          safe_divide, safe_sqrt)


class SGLDSampler(BurnInMCMCSampler):
""" Stochastic Gradient Langevin Dynamics Sampler that uses a burn-in
procedure to adapt its own hyperparameters during the initial stages
of sampling.
See [1] for more details on this burn-in procedure.
See [2] for more details on Stochastic Gradient Langevin Dynamics.
[1] J. T. Springenberg, A. Klein, S. Falkner, F. Hutter
Bayesian Optimization with Robust Bayesian Neural Networks.
In Advances in Neural Information Processing Systems 29 (2016).
[2] M.Welling, Y. W. Teh
Bayesian Learning via Stochastic Gradient Langevin Dynamics
"""

    def __init__(self, params, cost_fun, seed=None, batch_generator=None,
                 epsilon=0.01, session=tf.get_default_session(),
                 burn_in_steps=3000, scale_grad=1.0, dtype=tf.float64, A=1.0):
""" Initialize the sampler parameters and set up a tensorflow.Graph
for later queries.
Parameters
----------
params : list of tensorflow.Variable objects
Target parameters for which we want to sample new values.
cost_fun : callable
Function that takes `params` as input and returns a
1-d `tensorflow.Tensor` that contains the cost-value.
Frequently denoted with `U` in literature.
seed : int, optional
Random seed to use.
Defaults to `None`.
batch_generator : BatchGenerator, optional
Iterable which returns dictionaries to feed into
tensorflow.Session.run() calls to evaluate the cost function.
Defaults to `None` which indicates that no batches shall be fed.
epsilon : float, optional
Value that is used as learning rate parameter for the sampler,
also denoted as discretization parameter in literature.
Defaults to `0.01`.
session : tensorflow.Session, optional
Session object which knows about the external part of the graph
(which defines `Cost`, and possibly batches).
Used internally to evaluate (burn-in/sample) the sampler.
burn_in_steps: int, optional
Number of burn-in steps to perform. In each burn-in step, this
sampler will adapt its own internal parameters to decrease its error.
For reference see: TODO ADD PAPER REFERENCE HERE
scale_grad : float, optional
Value that is used to scale the magnitude of the noise used
during sampling. In a typical batches-of-data setting this usually
corresponds to the number of examples in the entire dataset.
A : float, optional
TODO XXX Doku
Defaults to `1.0`.

            Examples
            ----------
            Simple, plain example:

            >>> import tensorflow as tf
            >>> session = tf.Session()
            >>> x = tf.Variable(1.0, dtype=tf.float64)
            >>> cost_fun = lambda x: tf.square(x)  # toy cost function
            >>> sampler = SGLDSampler(params=[x], cost_fun=cost_fun,
            ...                       session=session)
            >>> session.run(tf.global_variables_initializer())
            >>> first_sample = next(sampler)

            Simple example that uses batches:
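            (A sketch only: `generate_batches` is a hypothetical helper that
            is assumed to yield minibatches of data; each feed dict maps the
            placeholder `X` to one such minibatch.)

            >>> import tensorflow as tf
            >>> session = tf.Session()
            >>> X = tf.placeholder(tf.float64, shape=(None,))
            >>> w = tf.Variable(0.0, dtype=tf.float64)
            >>> cost_fun = lambda w: tf.reduce_mean(tf.square(X - w))
            >>> batch_generator = ({X: batch} for batch in generate_batches())
            >>> sampler = SGLDSampler(params=[w], cost_fun=cost_fun,
            ...                       batch_generator=batch_generator,
            ...                       session=session)
            >>> session.run(tf.global_variables_initializer())
            >>> first_sample = next(sampler)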

            See Also
            ----------
            tensorflow_mcmc.sampling.mcmc_base_classes.BurnInMCMCSampler:
                Base class for `SGLDSampler` that specifies how actual sampling
                is performed (using the iterator protocol, e.g. `next(sampler)`).
        """
        # Forward `seed` to the base class so that noise sampling is
        # reproducible rather than the argument being silently ignored.
        super().__init__(params=params, batch_generator=batch_generator,
                         burn_in_steps=burn_in_steps, seed=seed,
                         session=session, dtype=dtype)
        n_params = len(params)

        # Initialize graph constants {{{ #

        A = tf.constant(A, name="A", dtype=dtype)
        Noise = tf.constant(0., name="noise", dtype=dtype)
        Epsilon = tf.constant(epsilon, name="epsilon", dtype=dtype)
        Scale_grad = tf.constant(scale_grad, name="scale_grad", dtype=dtype)

        # }}} Initialize graph constants #

        self.Cost = cost_fun(*params)

        grads = [vectorize(gradient) for gradient in
                 tf.gradients(self.Cost, params)]
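
        # Each gradient is flattened to a 1-d tensor by `vectorize`, so that
        # the elementwise burn-in statistics below (Tau, G, V_hat) can be kept
        # as flat vectors matching `self.vectorized_params`.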
        # Initialize internal sampler parameters {{{ #

        Tau = [tf.Variable(tf.ones_like(Param, dtype=dtype),
                           dtype=dtype, name="Tau_{}".format(i),
                           trainable=False)
               for i, Param in enumerate(self.vectorized_params)]

        R = [tf.Variable(1. / (Tau[i].initialized_value() + 1),
                         name="R_{}".format(i), trainable=False)
             for i, Param in enumerate(self.vectorized_params)]

        G = [tf.Variable(tf.ones_like(Param, dtype=dtype),
                         dtype=dtype, name="G_{}".format(i),
                         trainable=False)
             for i, Param in enumerate(self.vectorized_params)]

        V_hat = [tf.Variable(tf.ones_like(Param, dtype=dtype),
                             dtype=dtype, name="V_hat_{}".format(i),
                             trainable=False)
                 for i, Param in enumerate(self.vectorized_params)]

        # Initialize mass matrix inverse {{{ #

        self.Minv = [tf.Variable(tf.divide(tf.constant(1., dtype=dtype),
                                           tf.sqrt(V_hat[i].initialized_value())),
                                 name="Minv_{}".format(i), trainable=False)
                     for i, Param in enumerate(self.vectorized_params)]

        # }}} Initialize mass matrix inverse #

        # }}} Initialize internal sampler parameters #
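
        # During burn-in (see [1] in the class docstring) these per-parameter
        # quantities are adapted:
        #   Tau   - exponential moving average window size ("memory")
        #   R     - adaptation rate, 1 / (Tau + 1)
        #   G     - moving average estimate of the gradient
        #   V_hat - moving average estimate of the squared gradient
        #           (uncentered second moment)
        #   Minv  - preconditioner 1 / sqrt(V_hat), used in place of a
        #           hand-tuned inverse mass matrix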
        self.Minv_t = [None] * n_params  # gets burned-in

        for i, (Param, Grad) in enumerate(zip(params, grads)):
            Vectorized_Param = self.vectorized_params[i]

            # Burn-in logic {{{ #

            R_t = tf.assign(R[i], 1. / (Tau[i] + 1.), name="R_t_{}".format(i))

            # R_t should always use the old value of Tau
            with tf.control_dependencies([R_t]):
                Tau_t = tf.assign_add(
                    Tau[i],
                    safe_divide(-G[i] * G[i] * Tau[i], V_hat[i]) + 1,
                    name="Tau_t_{}".format(i)
                )

                self.Minv_t[i] = tf.assign(
                    self.Minv[i],
                    safe_divide(1., safe_sqrt(V_hat[i])),
                    name="Minv_t_{}".format(i)
                )

                # Tau_t, Minv_t should always use the old values of G, V_hat
                with tf.control_dependencies([Tau_t, self.Minv_t[i]]):
                    G_t = tf.assign_add(
                        G[i],
                        -R_t * G[i] + R_t * Grad,
                        name="G_t_{}".format(i)
                    )

                    V_hat_t = tf.assign_add(
                        V_hat[i],
                        -R_t * V_hat[i] + R_t * Grad ** 2,
                        name="V_hat_t_{}".format(i)
                    )

                    # }}} Burn-in logic #
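
                    # In equations, with g denoting the current (minibatch)
                    # gradient, the burn-in updates above are:
                    #   R_t     = 1 / (Tau + 1)
                    #   Tau_t   = (1 - G**2 / V_hat) * Tau + 1
                    #   G_t     = (1 - R_t) * G + R_t * g
                    #   V_hat_t = (1 - R_t) * V_hat + R_t * g**2
                    #   Minv_t  = 1 / sqrt(V_hat)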

                    with tf.control_dependencies([G_t, V_hat_t]):
                        # Draw random sample {{{ #

                        Sigma = safe_sqrt(
                            2. * Epsilon *
                            safe_divide(
                                (self.Minv_t[i] * (A - Noise)), Scale_grad
                            )
                        )

                        Sample = self._draw_noise_sample(
                            Sigma=Sigma, Shape=Vectorized_Param.shape
                        )

                        # }}} Draw random sample #

                        # SGLD Update {{{ #

                        Vectorized_Theta_t = tf.assign_add(
                            Vectorized_Param,
                            -Epsilon * self.Minv_t[i] * A * Grad + Sample,
                        )

                        self.Theta_t[i] = tf.assign(
                            Param,
                            unvectorize(
                                Vectorized_Theta_t, original_shape=Param.shape
                            ),
                            name="Theta_t_{}".format(i)
                        )

                        # }}} SGLD Update #
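
                        # Written out, the update performed above is:
                        #   Theta_t = Theta
                        #             - Epsilon * Minv_t * A * grad(Cost)
                        #             + N(0, Sigma**2)
                        # with noise variance
                        #   Sigma**2 = 2 * Epsilon * Minv_t * (A - Noise)
                        #              / Scale_grad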