"""
The purpose of this module is mostly to expose the DyNet trainers to YAML serialization,
but may also be extended to customize optimizers / training schedules
"""
import numbers
from typing import Optional

import dynet as dy
import numpy as np

from xnmt import logger, utils
from xnmt.param_collections import ParamManager
from xnmt.persistence import serializable_init, Serializable
class XnmtOptimizer(object):
  """
  A base class for trainers. Trainers are mostly simple wrappers of DyNet trainers but can add extra functionality.
Args:
optimizer: the underlying DyNet optimizer (trainer)
skip_noisy: keep track of a moving average and a moving standard deviation of the log of the gradient norm
values, and abort a step if the norm of the gradient exceeds four standard deviations of the
moving average. Reference: https://arxiv.org/pdf/1804.09849.pdf
"""
def __init__(self, optimizer: dy.Trainer, skip_noisy: bool = False) -> None:
self.optimizer = optimizer
self.skip_noisy = skip_noisy
if skip_noisy:
self.rolling_stats = utils.RollingStatistic()
  def update(self) -> None:
"""
Update the parameters.
"""
try:
if not (self.skip_noisy and self._check_gradients_noisy()):
self.optimizer.update()
else:
logger.info("skipping noisy update")
except RuntimeError:
logger.warning("Failed to perform update. Skipping example and clearing gradients.")
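      # zero out all gradients so the failed step cannot contaminate the next update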
for subcol in ParamManager.param_col.subcols.values():
for param in subcol.parameters_list():
param.scale_gradient(0)
  def status(self) -> None:
    """
    Outputs information about the trainer to stderr
    (number of updates since last call, number of clipped gradients, learning rate, etc.)
    """
return self.optimizer.status()
  def set_clip_threshold(self, thr: numbers.Real) -> None:
    """
    Set clipping threshold
To deactivate clipping, set the threshold to be <=0
Args:
thr: Clipping threshold
"""
return self.optimizer.set_clip_threshold(thr)
  def get_clip_threshold(self) -> numbers.Real:
"""
Get clipping threshold
Returns:
Gradient clipping threshold
"""
return self.optimizer.get_clip_threshold()
  def restart(self) -> None:
    """
    Restarts the optimizer
    Clears all momentum values and similar accumulated state (if applicable)
"""
return self.optimizer.restart()
@property
def learning_rate(self):
return self.optimizer.learning_rate
@learning_rate.setter
def learning_rate(self, value):
self.optimizer.learning_rate = value
def _check_gradients_noisy(self) -> bool:
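    # compute the log of the global gradient L2 norm across all parameter subcollections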
sq_norm = 0
for subcol in ParamManager.param_col.subcols.values():
for param in subcol.parameters_list():
cur_grads = param.grad_as_array()
sq_norm += np.sum(np.square(cur_grads))
log_norm = np.log(np.sqrt(sq_norm))
self.rolling_stats.update(log_norm)
if self.rolling_stats.average is None: # too few statistics
return False
else:
req_min = self.rolling_stats.average - 4*self.rolling_stats.stddev
req_max = self.rolling_stats.average + 4*self.rolling_stats.stddev
return not (req_min < log_norm < req_max)
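
# A self-contained sketch (not part of xnmt) of the skip-noisy test above, using
# a plain list of past values in place of utils.RollingStatistic; the helper name
# and window handling are assumptions for illustration:
def _example_is_noisy(log_norm_history, log_norm, num_stddev=4):
  """Return True if log_norm falls outside num_stddev standard deviations of
  the history's mean, mirroring XnmtOptimizer._check_gradients_noisy."""
  if len(log_norm_history) < 2:
    return False  # too few statistics to judge
  avg, std = np.mean(log_norm_history), np.std(log_norm_history)
  return not (avg - num_stddev * std < log_norm < avg + num_stddev * std)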
class SimpleSGDTrainer(XnmtOptimizer, Serializable):
  """
  Stochastic gradient descent trainer
  This trainer performs stochastic gradient descent, the go-to optimization procedure for neural networks.
Args:
e0: Initial learning rate
skip_noisy: keep track of a moving average and a moving standard deviation of the log of the gradient norm
values, and abort a step if the norm of the gradient exceeds four standard deviations of the
moving average. Reference: https://arxiv.org/pdf/1804.09849.pdf
"""
yaml_tag = '!SimpleSGDTrainer'
@serializable_init
def __init__(self, e0: numbers.Real = 0.1, skip_noisy: bool = False) -> None:
super().__init__(optimizer=dy.SimpleSGDTrainer(ParamManager.global_collection(), e0),
skip_noisy=skip_noisy)
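
# A minimal usage sketch: `model` and `batches` are hypothetical stand-ins, and
# xnmt's own training regimens normally drive this loop (ParamManager must be
# initialized before any trainer is constructed):
def _example_training_loop(model, batches):
  trainer = SimpleSGDTrainer(e0=0.1, skip_noisy=True)
  for batch in batches:
    dy.renew_cg()                    # fresh computation graph per step
    loss = model.calc_loss(batch)    # hypothetical loss computation
    loss.backward()
    trainer.update()                 # may skip the step if gradients look noisy
  trainer.learning_rate *= 0.5       # manual decay via the property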
class MomentumSGDTrainer(XnmtOptimizer, Serializable):
  """
  Stochastic gradient descent with momentum
  This is a modified version of the SGD algorithm with momentum to stabilize the gradient trajectory.
Args:
e0: Initial learning rate
mom: Momentum
skip_noisy: keep track of a moving average and a moving standard deviation of the log of the gradient norm
values, and abort a step if the norm of the gradient exceeds four standard deviations of the
moving average. Reference: https://arxiv.org/pdf/1804.09849.pdf
"""
yaml_tag = '!MomentumSGDTrainer'
@serializable_init
def __init__(self, e0: numbers.Real = 0.01, mom: numbers.Real = 0.9, skip_noisy: bool = False) -> None:
super().__init__(optimizer=dy.MomentumSGDTrainer(ParamManager.global_collection(), e0, mom),
skip_noisy=skip_noisy)
class AdagradTrainer(XnmtOptimizer, Serializable):
  """
  Adagrad optimizer
  The Adagrad algorithm assigns a different learning rate to each parameter.
Args:
e0: Initial learning rate
eps: Epsilon parameter to prevent numerical instability
skip_noisy: keep track of a moving average and a moving standard deviation of the log of the gradient norm
values, and abort a step if the norm of the gradient exceeds four standard deviations of the
moving average. Reference: https://arxiv.org/pdf/1804.09849.pdf
"""
yaml_tag = '!AdagradTrainer'
@serializable_init
def __init__(self, e0: numbers.Real = 0.1, eps: numbers.Real = 1e-20, skip_noisy: bool = False) -> None:
super().__init__(optimizer=dy.AdagradTrainer(ParamManager.global_collection(), e0, eps=eps),
skip_noisy=skip_noisy)
class AdadeltaTrainer(XnmtOptimizer, Serializable):
"""
AdaDelta optimizer
The AdaDelta optimizer is a variant of Adagrad aiming to prevent vanishing learning rates.
Args:
eps: Epsilon parameter to prevent numerical instability
rho: Update parameter for the moving average of updates in the numerator
skip_noisy: keep track of a moving average and a moving standard deviation of the log of the gradient norm
values, and abort a step if the norm of the gradient exceeds four standard deviations of the
moving average. Reference: https://arxiv.org/pdf/1804.09849.pdf
"""
yaml_tag = '!AdadeltaTrainer'
@serializable_init
def __init__(self, eps: numbers.Real = 1e-6, rho: numbers.Real = 0.95, skip_noisy: bool = False) -> None:
super().__init__(optimizer=dy.AdadeltaTrainer(ParamManager.global_collection(), eps, rho),
skip_noisy=skip_noisy)
class AdamTrainer(XnmtOptimizer, Serializable):
  """
  Adam optimizer
  The Adam optimizer is similar to RMSProp but uses unbiased estimates of the first and second moments of the gradient.
Args:
alpha: Initial learning rate
beta_1: Moving average parameter for the mean
beta_2: Moving average parameter for the variance
eps: Epsilon parameter to prevent numerical instability
skip_noisy: keep track of a moving average and a moving standard deviation of the log of the gradient norm
values, and abort a step if the norm of the gradient exceeds four standard deviations of the
moving average. Reference: https://arxiv.org/pdf/1804.09849.pdf
"""
yaml_tag = '!AdamTrainer'
@serializable_init
def __init__(self,
alpha: numbers.Real = 0.001,
beta_1: numbers.Real = 0.9,
beta_2: numbers.Real = 0.999,
eps: numbers.Real = 1e-8,
skip_noisy: bool = False) -> None:
super().__init__(optimizer=dy.AdamTrainer(ParamManager.global_collection(), alpha, beta_1, beta_2, eps),
skip_noisy=skip_noisy)
class NoamTrainer(XnmtOptimizer, Serializable):
  """
  Proposed in the paper "Attention is all you need" (https://papers.nips.cc/paper/7181-attention-is-all-you-need.pdf) [Page 7, Eq. 3]
  Here the learning rate of the Adam optimizer is increased linearly for the first warmup_steps steps, followed by a gradual decay.
  Args:
    alpha: scaling factor applied on top of the schedule
    dim: model dimension, whose inverse square root scales the learning rate
    warmup_steps: number of steps over which the learning rate grows; if None, only the decay phase is used
    beta_1: Moving average parameter for the mean
    beta_2: Moving average parameter for the variance
    eps: Epsilon parameter to prevent numerical instability
skip_noisy: keep track of a moving average and a moving standard deviation of the log of the gradient norm
values, and abort a step if the norm of the gradient exceeds four standard deviations of the
moving average. Reference: https://arxiv.org/pdf/1804.09849.pdf
"""
yaml_tag = '!NoamTrainer'
@serializable_init
def __init__(self,
alpha: numbers.Real = 1.0,
dim: numbers.Integral = 512,
warmup_steps: Optional[numbers.Integral] = 4000,
beta_1: numbers.Real = 0.9,
beta_2: numbers.Real = 0.98,
eps: numbers.Real = 1e-9,
skip_noisy: bool = False) -> None:
super().__init__(optimizer=dy.AdamTrainer(ParamManager.global_collection(),
alpha=alpha,
beta_1=beta_1,
beta_2=beta_2,
eps=eps),
skip_noisy=skip_noisy)
    self.alpha = alpha
    self.dim = dim
    self.warmup_steps = warmup_steps
    self.steps = 0
  def update(self) -> None:
self.steps += 1
if self.warmup_steps:
decay = (self.dim ** (-0.5)) * np.min([self.steps ** (-0.5), self.steps * (self.warmup_steps ** (-1.5))])
else:
decay = (self.dim ** (-0.5)) * self.steps ** (-0.5)
    self.optimizer.learning_rate = self.alpha * decay
super().update()
if self.steps % 200 == 0:
logger.info('> Optimizer Logging')
logger.info(' Steps=%d, learning_rate=%.2e' % (self.steps, self.optimizer.learning_rate))
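
# For reference, the schedule implemented above is Eq. 3 of the paper:
#   lr(step) = alpha * dim**-0.5 * min(step**-0.5, step * warmup_steps**-1.5)
# A standalone sketch for inspecting the schedule (the function name is ours,
# not part of xnmt):
def _example_noam_lr(step, alpha=1.0, dim=512, warmup_steps=4000):
  """Learning rate at a given step under the Noam schedule."""
  return alpha * (dim ** -0.5) * min(step ** -0.5, step * warmup_steps ** -1.5)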
class DummyTrainer(XnmtOptimizer, Serializable):
"""
A dummy trainer that does not perform any parameter updates.
"""
yaml_tag = "!DummyTrainer"
@serializable_init
def __init__(self) -> None:
pass
  def update(self) -> None:
    pass
  def status(self) -> None:
    pass
  def set_clip_threshold(self, thr) -> None:
    pass
  def get_clip_threshold(self) -> None:
    pass
  def restart(self) -> None:
    pass
@property
def learning_rate(self):
return 1.0
@learning_rate.setter
def learning_rate(self, value):
pass