Commit f2120b07 authored by Olga Wichrowska's avatar Olga Wichrowska
Browse files

Added code for Learned Optimizers that Scale and Generalize

parent 6024579b
......@@ -10,6 +10,7 @@ differential_privacy/* @panyx0718
domain_adaptation/* @bousmalis @ddohan
im2txt/* @cshallue
inception/* @shlens @vincentvanhoucke
learned_optimizer/* @olganw @nirum
learning_to_remember_rare_events/* @lukaszkaiser @ofirnachum
lfads/* @jazcollins @susillo
lm_1b/* @oriolvinyals @panyx0718
......
# Learning to Optimize Learning (LOL)
# Bazel build targets for the learned_optimizer package.

package(default_visibility = ["//visibility:public"])

# Libraries
# =========

# Helper utilities for meta-training and testing learned optimizers
# (see metaopt.py).
py_library(
    name = "metaopt",
    srcs = ["metaopt.py"],
    deps = [
        "//learned_optimizer/problems:datasets",
        "//learned_optimizer/problems:problem_generator",
    ],
)

# Binaries
# ========

# Entry point for meta-training; defines the command-line flags and wires
# together the optimizers and the problem sets.
py_binary(
    name = "metarun",
    srcs = ["metarun.py"],
    deps = [
        ":metaopt",
        "//learned_optimizer/optimizer:coordinatewise_rnn",
        "//learned_optimizer/optimizer:global_learning_rate",
        "//learned_optimizer/optimizer:hierarchical_rnn",
        "//learned_optimizer/optimizer:learning_rate_schedule",
        "//learned_optimizer/optimizer:trainable_adam",
        "//learned_optimizer/problems:problem_sets",
        "//learned_optimizer/problems:problem_spec",
    ],
)
# Learned Optimizer
Code for [Learned Optimizers that Scale and Generalize](https://arxiv.org/abs/1703.04813).
## Requirements
* Bazel ([install](https://bazel.build/versions/master/docs/install.html))
* TensorFlow >= v1.3
## Training a Learned Optimizer

Meta-training is run via the ```metarun.py``` binary (built with Bazel as ```//learned_optimizer:metarun```). Testing a trained optimizer is done ad-hoc via ```metaopt.test_optimizer```, passing an optimizer object and a directory with a checkpoint.
## Code Overview
In the top-level directory, ```metaopt.py``` contains the code to train and test a learned optimizer. ```metarun.py``` packages the actual training procedure into a
single file, defining and exposing many flags to tune the procedure, from selecting the optimizer type and problem set to more fine-grained hyperparameter settings.
There is no testing binary; testing can be done ad-hoc via ```metaopt.test_optimizer``` by passing an optimizer object and a directory with a checkpoint.
The ```optimizer``` directory contains a base ```trainable_optimizer.py``` class and a number of extensions, including the ```hierarchical_rnn``` optimizer used in
the paper, a ```coordinatewise_rnn``` optimizer that more closely matches previous work, and a number of simpler optimizers to demonstrate the basic mechanics of
a learnable optimizer.
The ```problems``` directory contains the code to build the problems that were used in the meta-training set.
### Binaries
```metarun.py```: meta-training of a learned optimizer
### Command-Line Flags
The flags most relevant to meta-training are defined in ```metarun.py```. The default values will meta-train a HierarchicalRNN optimizer with the hyperparameter
settings used in the paper.
### Using a Learned Optimizer as a Black Box
The ```trainable_optimizer``` inherits from ```tf.train.Optimizer```, so a properly instantiated version can be used to train any model in any APIs that accept
this class. There are just 2 caveats:
1. If using the Hierarchical RNN optimizer, the apply_gradients return type must be changed (see comments inline for what exactly must be removed)
2. Care must be taken to restore the variables from the optimizer without overriding them. Optimizer variables should be loaded manually using a pretrained checkpoint
and a ```tf.train.Saver``` with only the optimizer variables. Then, when constructing the session, ensure that any automatic variable initialization does not
re-initialize the loaded optimizer variables.
## Contact for Issues
* Olga Wichrowska (@olganw), Niru Maheswaranathan (@nirum)
# Copyright 2017 Google, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Helper utilities for training and testing optimizers."""
from collections import defaultdict
import random
import sys
import time
import numpy as np
import tensorflow as tf
from learned_optimizer.optimizer import trainable_optimizer
from learned_optimizer.optimizer import utils
from learned_optimizer.problems import datasets
from learned_optimizer.problems import problem_generator
tf.app.flags.DEFINE_integer("ps_tasks", 0,
"""Number of tasks in the ps job.
If 0 no ps job is used.""")
tf.app.flags.DEFINE_float("nan_l2_reg", 1e-2,
"""Strength of l2-reg when NaNs are encountered.""")
tf.app.flags.DEFINE_float("l2_reg", 0.,
"""Lambda value for parameter regularization.""")
# Default is 0.9
tf.app.flags.DEFINE_float("rms_decay", 0.9,
"""Decay value for the RMSProp metaoptimizer.""")
# Default is 1e-10
tf.app.flags.DEFINE_float("rms_epsilon", 1e-20,
"""Epsilon value for the RMSProp metaoptimizer.""")
tf.app.flags.DEFINE_boolean("set_profiling", False,
"""Enable memory usage and computation time """
"""tracing for tensorflow nodes (available in """
"""TensorBoard).""")
tf.app.flags.DEFINE_boolean("reset_rnn_params", True,
"""Reset the parameters of the optimizer
from one meta-iteration to the next.""")
FLAGS = tf.app.flags.FLAGS
OPTIMIZER_SCOPE = "LOL"
OPT_SUM_COLLECTION = "LOL_summaries"
def sigmoid_weights(n, slope=0.1, offset=5):
  """Generates a sigmoid, scaled to sum to 1.

  This function is used to generate weights that serve to mask out
  the early objective values of an optimization problem such that
  initial variation in the objective is phased out (hence the sigmoid
  starts at zero and ramps up to the maximum value, and the total
  weight is normalized to sum to one)

  Args:
    n: the number of samples (must be positive; n == 0 would divide by zero
      in the normalization below)
    slope: slope of the sigmoid (Default: 0.1)
    offset: threshold of the sigmoid (Default: 5)

  Returns:
    A numpy array of length n with non-negative, monotonically non-decreasing
    weights that sum to one.
  """
  x = np.arange(n)
  y = 1. / (1. + np.exp(-slope * (x - offset)))
  # Normalize so the weights form a proper weighting over the n samples.
  y_normalized = y / np.sum(y)
  return y_normalized
def sample_numiter(scale, min_steps=50):
  """Samples a number of iterations from an exponential distribution.

  Args:
    scale: parameter for the exponential distribution
    min_steps: minimum number of steps to run (additive)

  Returns:
    num_steps: An integer equal to a rounded sample from the exponential
      distribution + the value of min_steps.
  """
  sampled = np.random.exponential(scale=scale)
  return int(np.round(sampled) + min_steps)
def train_optimizer(logdir,
                    optimizer_spec,
                    problems_and_data,
                    num_problems,
                    num_meta_iterations,
                    num_unroll_func,
                    num_partial_unroll_itrs_func,
                    learning_rate=1e-4,
                    gradient_clip=5.,
                    is_chief=False,
                    select_random_problems=True,
                    callbacks=None,
                    obj_train_max_multiplier=-1,
                    out=sys.stdout):
  """Trains the meta-parameters of this optimizer.

  Args:
    logdir: a directory filepath for storing model checkpoints (must exist)
    optimizer_spec: specification for an Optimizer (see utils.Spec)
    problems_and_data: a list of tuples containing three elements: a problem
      specification (see utils.Spec), a dataset (see datasets.Dataset), and
      a batch_size (int) for generating a problem and corresponding dataset. If
      the problem doesn't have data, set dataset to None.
    num_problems: the number of problems to sample during meta-training
    num_meta_iterations: the number of iterations (steps) to run the
      meta-optimizer for on each subproblem.
    num_unroll_func: called once per meta iteration and returns the number of
      unrolls to do for that meta iteration.
    num_partial_unroll_itrs_func: called once per unroll and returns the number
      of iterations to do for that unroll.
    learning_rate: learning rate of the RMSProp meta-optimizer (Default: 1e-4)
    gradient_clip: value to clip gradients at (Default: 5.0)
    is_chief: whether this is the chief task (Default: False)
    select_random_problems: whether to select training problems randomly
      (Default: True)
    callbacks: a list of callback functions that is run after every random
      problem draw
    obj_train_max_multiplier: the maximum increase in the objective value over
      a single training run. Ignored if < 0.
    out: where to write output to, e.g. a file handle (Default: sys.stdout)

  Raises:
    ValueError: If one of the subproblems has a negative objective value.
  """
  if select_random_problems:
    # iterate over random draws of problem / dataset pairs
    sampler = (random.choice(problems_and_data) for _ in range(num_problems))
  else:
    # iterate over a random shuffle of problems, looping if necessary
    # NOTE: use integer division — true division (Python 3) yields a float,
    # which cannot be used to repeat a list below.
    num_repeats = (num_problems // len(problems_and_data)) + 1
    # NOTE(review): this shuffles the caller's list in place.
    random.shuffle(problems_and_data)
    sampler = (problems_and_data * num_repeats)[:num_problems]

  for problem_itr, (problem_spec, dataset, batch_size) in enumerate(sampler):

    # timer used to time how long it takes to initialize a problem
    problem_start_time = time.time()

    # if dataset is None, use the EMPTY_DATASET
    if dataset is None:
      dataset = datasets.EMPTY_DATASET
      batch_size = dataset.size

    # build a new graph for this problem
    graph = tf.Graph()
    real_device_setter = tf.train.replica_device_setter(FLAGS.ps_tasks)

    def custom_device_setter(op):
      # Places the local variables onto the workers.
      if trainable_optimizer.is_local_state_variable(op):
        return "/job:worker"
      else:
        return real_device_setter(op)

    if real_device_setter:
      device_setter = custom_device_setter
    else:
      device_setter = None

    with graph.as_default(), graph.device(device_setter):

      # initialize a problem
      problem = problem_spec.build()

      # build the optimizer
      opt = optimizer_spec.build()

      # get the meta-objective for training the optimizer
      train_output = opt.train(problem, dataset)

      state_keys = opt.state_keys
      for key, val in zip(state_keys, train_output.output_state[0]):
        finite_val = utils.make_finite(val, replacement=tf.zeros_like(val))
        tf.summary.histogram("State/{}".format(key), finite_val,
                             collections=[OPT_SUM_COLLECTION])

      tf.summary.scalar("MetaObjective", train_output.metaobj,
                        collections=[OPT_SUM_COLLECTION])

      # Per-problem meta-objective
      tf.summary.scalar(problem_spec.callable.__name__ + "_MetaObjective",
                        train_output.metaobj,
                        collections=[OPT_SUM_COLLECTION])

      # create the meta-train_op
      global_step = tf.Variable(0, name="global_step", trainable=False)
      meta_parameters = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                          scope=OPTIMIZER_SCOPE)
      # parameter regularization
      reg_l2 = FLAGS.l2_reg * sum([tf.reduce_sum(param ** 2)
                                   for param in meta_parameters])

      # compute the meta-gradients
      meta_opt = tf.train.RMSPropOptimizer(learning_rate, decay=FLAGS.rms_decay,
                                           use_locking=True,
                                           epsilon=FLAGS.rms_epsilon)
      grads_and_vars = meta_opt.compute_gradients(train_output.metaobj + reg_l2,
                                                  meta_parameters)

      # clip the gradients (NaN/Inf gradients are replaced with zeros first)
      clipped_grads_and_vars = []
      for grad, var in grads_and_vars:
        clipped_grad = tf.clip_by_value(
            utils.make_finite(grad, replacement=tf.zeros_like(var)),
            -gradient_clip, gradient_clip)
        clipped_grads_and_vars.append((clipped_grad, var))

      # histogram summary of grads and vars
      for grad, var in grads_and_vars:
        tf.summary.histogram(
            var.name + "_rawgrad",
            utils.make_finite(
                grad, replacement=tf.zeros_like(grad)),
            collections=[OPT_SUM_COLLECTION])
      for grad, var in clipped_grads_and_vars:
        tf.summary.histogram(var.name + "_var", var,
                             collections=[OPT_SUM_COLLECTION])
        tf.summary.histogram(var.name + "_grad", grad,
                             collections=[OPT_SUM_COLLECTION])

      # builds the train and summary operations
      train_op = meta_opt.apply_gradients(clipped_grads_and_vars,
                                          global_step=global_step)

      # only grab summaries defined for LOL, not inside the problem
      summary_op = tf.summary.merge_all(key=OPT_SUM_COLLECTION)

      # make sure the state gets propagated after the gradients and summaries
      # were computed.
      with tf.control_dependencies([train_op, summary_op]):
        propagate_loop_state_ops = []
        for dest, src in zip(
            train_output.init_loop_vars, train_output.output_loop_vars):
          propagate_loop_state_ops.append(dest.assign(src))
        propagate_loop_state_op = tf.group(*propagate_loop_state_ops)

      # create the supervisor
      sv = tf.train.Supervisor(
          graph=graph,
          is_chief=is_chief,
          logdir=logdir,
          summary_op=None,
          save_model_secs=0,  # we save checkpoints manually
          global_step=global_step,
      )

      with sv.managed_session() as sess:

        init_time = time.time() - problem_start_time
        out.write("--------- Problem #{} ---------\n".format(problem_itr))
        out.write("{callable.__name__}{args}{kwargs}\n".format(
            **problem_spec.__dict__))
        out.write("Took {} seconds to initialize.\n".format(init_time))
        out.flush()

        # For profiling summaries
        if FLAGS.set_profiling:
          summary_writer = tf.summary.FileWriter(logdir, graph=sess.graph)

        # used to store information during training
        metadata = defaultdict(list)

        for k in range(num_meta_iterations):

          if sv.should_stop():
            break

          problem.init_fn(sess)

          # set run options (for profiling)
          full_trace_opt = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
          run_options = full_trace_opt if FLAGS.set_profiling else None
          run_metadata = tf.RunMetadata() if FLAGS.set_profiling else None

          num_unrolls = num_unroll_func()
          # NOTE: `range`, not the Python-2-only `xrange`.
          partial_unroll_iters = [
              num_partial_unroll_itrs_func() for _ in range(num_unrolls)
          ]
          total_num_iter = sum(partial_unroll_iters)

          objective_weights = [np.ones(num) / float(num)
                               for num in partial_unroll_iters]
          db = dataset.batch_indices(total_num_iter, batch_size)
          dataset_batches = []
          last_index = 0
          for num in partial_unroll_iters:
            dataset_batches.append(db[last_index:last_index + num])
            last_index += num

          train_start_time = time.time()

          unroll_itr = 0
          additional_log_info = ""

          for unroll_itr in range(num_unrolls):
            first_unroll = unroll_itr == 0
            if FLAGS.reset_rnn_params:
              reset_state = first_unroll and k == 0
            else:
              reset_state = first_unroll

            feed = {
                train_output.obj_weights: objective_weights[unroll_itr],
                train_output.batches: dataset_batches[unroll_itr],
                train_output.first_unroll: first_unroll,
                train_output.reset_state: reset_state,
            }

            # run the train and summary ops
            # when a "save_diagnostics" flag is turned on
            fetches_list = [
                train_output.metaobj,
                train_output.problem_objectives,
                train_output.initial_obj,
                summary_op,
                clipped_grads_and_vars,
                train_op
            ]
            if unroll_itr + 1 < num_unrolls:
              fetches_list += [propagate_loop_state_op]

            fetched = sess.run(fetches_list, feed_dict=feed,
                               options=run_options, run_metadata=run_metadata)
            meta_obj = fetched[0]
            sub_obj = fetched[1]
            init_obj = fetched[2]
            summ = fetched[3]
            meta_grads_and_params = fetched[4]

            # assert that the subproblem objectives are non-negative
            # (this is so that we can rescale the objective by the initial value
            # and not worry about rescaling by a negative value)
            if np.any(sub_obj < 0):
              raise ValueError(
                  "Training problem objectives must be nonnegative.")

            # If the objective has increased more than we want, exit this
            # training run and start over on another meta iteration.
            if obj_train_max_multiplier > 0 and (
                sub_obj[-1] > (init_obj +
                               abs(init_obj) * (obj_train_max_multiplier - 1))):
              msg = " Broke early at {} out of {} unrolls. ".format(
                  unroll_itr + 1, num_unrolls)
              additional_log_info += msg
              break

            # only the chief task is allowed to write the summary
            if is_chief:
              sv.summary_computed(sess, summ)

            metadata["subproblem_objs"].append(sub_obj)
            # store training metadata to pass to the callback
            metadata["meta_objs"].append(meta_obj)
            metadata["meta_grads_and_params"].append(meta_grads_and_params)

          optimization_time = time.time() - train_start_time

          if FLAGS.set_profiling:
            # NOTE(review): FLAGS.task is defined in metarun.py, not in this
            # module — profiling only works when run through that binary.
            summary_name = "%02d_iter%04d_%02d" % (FLAGS.task, problem_itr, k)
            summary_writer.add_run_metadata(run_metadata, summary_name)

          metadata["global_step"].append(sess.run(global_step))
          metadata["runtimes"].append(optimization_time)

          # write a diagnostic message to the output
          args = (k, meta_obj, optimization_time,
                  sum(partial_unroll_iters[:unroll_itr+1]))
          out.write(" [{:02}] {}, {} seconds, {} iters ".format(*args))
          out.write("(unrolled {} steps)".format(
              ", ".join([str(s) for s in partial_unroll_iters[:unroll_itr+1]])))
          out.write("{}\n".format(additional_log_info))
          out.flush()

        if FLAGS.set_profiling:
          summary_writer.close()

        # force a checkpoint save before we load a new problem
        # only the chief task has the save_path and can write the checkpoint
        if is_chief:
          sv.saver.save(sess, sv.save_path, global_step=global_step)

    # run the callbacks on the chief
    if is_chief and callbacks is not None:
      for callback in callbacks:
        if hasattr(callback, "__call__"):
          problem_name = problem_spec.callable.__name__
          callback(problem_name, problem_itr, logdir, metadata)
def test_optimizer(optimizer,
                   problem,
                   num_iter,
                   dataset=datasets.EMPTY_DATASET,
                   batch_size=None,
                   seed=None,
                   graph=None,
                   logdir=None,
                   record_every=None):
  """Tests an optimization algorithm on a given problem.

  Args:
    optimizer: Either a tf.train.Optimizer instance, or an Optimizer instance
      inheriting from trainable_optimizer.py
    problem: A Problem instance that defines an optimization problem to solve
    num_iter: The number of iterations of the optimizer to run
    dataset: The dataset to train the problem against
    batch_size: The number of samples per batch. If None (default), the
      batch size is set to the full batch (dataset.size)
    seed: A random seed used for drawing the initial parameters, or a list of
      numpy arrays used to explicitly initialize the parameters.
    graph: The tensorflow graph to execute (if None, uses the default graph)
    logdir: A directory containing model checkpoints. If given, then the
      parameters of the optimizer are loaded from the latest checkpoint
      in this folder.
    record_every: if an integer, stores the parameters, objective, and gradient
      every record_every iterations. If None, nothing is stored

  Returns:
    objective_values: A list of the objective values during optimization
    parameters: The parameters obtained after training
    records: A dictionary containing lists of the parameters and gradients
      during optimization saved every record_every iterations (empty if
      record_every is set to None)
  """
  if dataset is None:
    dataset = datasets.EMPTY_DATASET
    batch_size = dataset.size
  else:
    # default batch size is the entire dataset
    batch_size = dataset.size if batch_size is None else batch_size

  graph = tf.get_default_graph() if graph is None else graph
  with graph.as_default():

    # define the parameters of the optimization problem
    if isinstance(seed, (list, tuple)):
      # seed is a list of arrays
      params = problem_generator.init_fixed_variables(seed)
    else:
      # seed is an int or None
      params = problem.init_variables(seed)

    data_placeholder = tf.placeholder(tf.float32)
    labels_placeholder = tf.placeholder(tf.int32)

    # get the problem objective and gradient(s)
    obj = problem.objective(params, data_placeholder, labels_placeholder)
    gradients = problem.gradients(obj, params)

    vars_to_preinitialize = params

    with tf.Session(graph=graph) as sess:
      # initialize the parameter scope variables; necessary for apply_gradients
      sess.run(tf.variables_initializer(vars_to_preinitialize))
      coord = tf.train.Coordinator()
      threads = tf.train.start_queue_runners(sess=sess, coord=coord)

      # create the train operation and training variables
      try:
        train_op, real_params = optimizer.apply_gradients(
            zip(gradients, params))
        obj = problem.objective(real_params, data_placeholder,
                                labels_placeholder)
      except TypeError:
        # If all goes well, this exception should only be thrown when we are
        # using a non-hrnn optimizer.
        train_op = optimizer.apply_gradients(zip(gradients, params))

      vars_to_restore = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                          scope=OPTIMIZER_SCOPE)
      vars_to_initialize = list(
          set(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)) -
          set(vars_to_restore) - set(vars_to_preinitialize))
      # load or initialize optimizer variables
      if logdir is not None:
        # BUGFIX: the Saver class lives in tf.train, not at the top level
        # (tf.Saver raises AttributeError).
        restorer = tf.train.Saver(var_list=vars_to_restore)
        ckpt = tf.train.latest_checkpoint(logdir)
        restorer.restore(sess, ckpt)
      else:
        sess.run(tf.variables_initializer(vars_to_restore))
      # initialize all the other variables
      sess.run(tf.variables_initializer(vars_to_initialize))

      problem.init_fn(sess)

      # generate the minibatch indices
      batch_inds = dataset.batch_indices(num_iter, batch_size)

      # run the train operation for n iterations and save the objectives
      records = defaultdict(list)
      objective_values = []
      for itr, batch in enumerate(batch_inds):

        # data to feed in
        feed = {data_placeholder: dataset.data[batch],
                labels_placeholder: dataset.labels[batch]}
        full_feed = {data_placeholder: dataset.data,
                     labels_placeholder: dataset.labels}

        # record stuff
        if record_every is not None and (itr % record_every) == 0:

          # IndexedSlices gradients (from sparse ops) expose their dense
          # values via .values; plain tensors are used as-is.
          def grad_value(g):
            if isinstance(g, tf.IndexedSlices):
              return g.values
            else:
              return g

          records_fetch = {}
          for p in params:
            for key in optimizer.get_slot_names():
              v = optimizer.get_slot(p, key)
              records_fetch[p.name + "_" + key] = v
          gav_fetch = [(grad_value(g), v) for g, v in zip(gradients, params)]

          _, gav_eval, records_eval = sess.run(
              (obj, gav_fetch, records_fetch), feed_dict=feed)
          full_obj_eval = sess.run([obj], feed_dict=full_feed)

          records["objective"].append(full_obj_eval)
          records["grad_norm"].append([np.linalg.norm(g.ravel())
                                       for g, _ in gav_eval])
          records["param_norm"].append([np.linalg.norm(v.ravel())
                                        for _, v in gav_eval])
          records["grad"].append([g for g, _ in gav_eval])
          records["param"].append([v for _, v in gav_eval])
          records["iter"].append(itr)

          # BUGFIX: dict.iteritems() is Python-2-only; items() works on both.
          for k, v in records_eval.items():
            records[k].append(v)

        # run the optimization train operation
        objective_values.append(sess.run([train_op, obj], feed_dict=feed)[1])

      # final parameters
      parameters = [sess.run(p) for p in params]
      coord.request_stop()
      coord.join(threads)

  return objective_values, parameters, records
def run_wall_clock_test(optimizer,
                        problem,
                        num_steps,
                        dataset=datasets.EMPTY_DATASET,
                        seed=None,
                        logdir=None,
                        batch_size=None):
  """Runs optimization with the given parameters and returns the iter time.

  Args:
    optimizer: The tf.train.Optimizer instance
    problem: The problem to optimize (a problem_generator.Problem)
    num_steps: The number of steps to run optimization for
    dataset: The dataset to train the problem against
    seed: The seed used for drawing the initial parameters, or a list of
      numpy arrays used to explicitly initialize the parameters
    logdir: A directory containing model checkpoints. If given, then the
      parameters of the optimizer are loaded from the latest checkpoint
      in this folder.
    batch_size: The number of samples per batch. If None (default), the
      batch size is set to the full batch (dataset.size)

  Returns:
    The median time in seconds for a single optimization iteration (the
    median is used rather than the mean to be robust to outlier steps).
  """
  if dataset is None:
    dataset = datasets.EMPTY_DATASET
    batch_size = dataset.size
  else:
    # default batch size is the entire dataset
    batch_size = dataset.size if batch_size is None else batch_size

  # define the parameters of the optimization problem
  if isinstance(seed, (list, tuple)):
    # seed is a list of arrays
    params = problem_generator.init_fixed_variables(seed)
  else:
    # seed is an int or None
    params = problem.init_variables(seed)

  data_placeholder = tf.placeholder(tf.float32)
  labels_placeholder = tf.placeholder(tf.int32)
  obj = problem.objective(params, data_placeholder, labels_placeholder)
  gradients = problem.gradients(obj, params)
  vars_to_preinitialize = params

  with tf.Session(graph=tf.get_default_graph()) as sess:
    # initialize the parameter scope variables; necessary for apply_gradients
    sess.run(tf.variables_initializer(vars_to_preinitialize))
    train_op = optimizer.apply_gradients(zip(gradients, params))
    if isinstance(train_op, (tuple, list)):
      # LOL apply_gradients returns a tuple. Regular optimizers do not.
      train_op = train_op[0]
    vars_to_restore = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                        scope=OPTIMIZER_SCOPE)
    vars_to_initialize = list(
        set(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)) -
        set(vars_to_restore) - set(vars_to_preinitialize))
    # load or initialize optimizer variables
    if logdir is not None:
      # BUGFIX: the Saver class lives in tf.train, not at the top level
      # (tf.Saver raises AttributeError).
      restorer = tf.train.Saver(var_list=vars_to_restore)
      ckpt = tf.train.latest_checkpoint(logdir)
      restorer.restore(sess, ckpt)
    else:
      sess.run(tf.variables_initializer(vars_to_restore))
    # initialize all the other variables
    sess.run(tf.variables_initializer(vars_to_initialize))

    problem.init_fn(sess)

    # generate the minibatch indices
    batch_inds = dataset.batch_indices(num_steps, batch_size)

    avg_iter_time = []
    for batch in batch_inds:
      # data to feed in
      feed = {data_placeholder: dataset.data[batch],
              labels_placeholder: dataset.labels[batch]}

      # run the optimization train operation, timing just the sess.run call
      start = time.time()
      sess.run([train_op], feed_dict=feed)
      avg_iter_time.append(time.time() - start)

  return np.median(np.array(avg_iter_time))
# Copyright 2017 Google, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Scripts for meta-optimization."""
from __future__ import print_function
import os
import tensorflow as tf
import metaopt
from learned_optimizer.optimizer import coordinatewise_rnn
from learned_optimizer.optimizer import global_learning_rate
from learned_optimizer.optimizer import hierarchical_rnn
from learned_optimizer.optimizer import learning_rate_schedule
from learned_optimizer.optimizer import trainable_adam
from learned_optimizer.problems import problem_sets as ps
from learned_optimizer.problems import problem_spec
tf.app.flags.DEFINE_string("train_dir", "/tmp/lol/",
"""Directory to store parameters and results.""")
tf.app.flags.DEFINE_integer("task", 0,
"""Task id of the replica running the training.""")
tf.app.flags.DEFINE_integer("worker_tasks", 1,
"""Number of tasks in the worker job.""")
tf.app.flags.DEFINE_integer("num_problems", 1000,
"""Number of sub-problems to run.""")
tf.app.flags.DEFINE_integer("num_meta_iterations", 5,
"""Number of meta-iterations to optimize.""")
tf.app.flags.DEFINE_integer("num_unroll_scale", 40,
"""The scale parameter of the exponential
distribution from which the number of partial
unrolls is drawn""")
tf.app.flags.DEFINE_integer("min_num_unrolls", 1,
"""The minimum number of unrolls per problem.""")
tf.app.flags.DEFINE_integer("num_partial_unroll_itr_scale", 200,
"""The scale parameter of the exponential
distribution from which the number of iterations
per unroll is drawn.""")
tf.app.flags.DEFINE_integer("min_num_itr_partial_unroll", 50,
"""The minimum number of iterations for one
unroll.""")
tf.app.flags.DEFINE_string("optimizer", "HierarchicalRNN",
"""Which meta-optimizer to train.""")
# CoordinatewiseRNN-specific flags
tf.app.flags.DEFINE_integer("cell_size", 20,
"""Size of the RNN hidden state in each layer.""")
tf.app.flags.DEFINE_integer("num_cells", 2,
"""Number of RNN layers.""")
tf.app.flags.DEFINE_string("cell_cls", "GRUCell",
"""Type of RNN cell to use.""")
# Metaoptimization parameters
tf.app.flags.DEFINE_float("meta_learning_rate", 1e-6,
"""The learning rate for the meta-optimizer.""")
tf.app.flags.DEFINE_float("gradient_clip_level", 1e4,
"""The level to clip gradients to.""")
# Training set selection
tf.app.flags.DEFINE_boolean("include_quadratic_problems", False,
"""Include non-noisy quadratic problems.""")
tf.app.flags.DEFINE_boolean("include_noisy_quadratic_problems", True,
"""Include noisy quadratic problems.""")
tf.app.flags.DEFINE_boolean("include_large_quadratic_problems", True,
"""Include very large quadratic problems.""")
tf.app.flags.DEFINE_boolean("include_bowl_problems", True,
"""Include 2D bowl problems.""")
tf.app.flags.DEFINE_boolean("include_softmax_2_class_problems", True,
"""Include 2-class logistic regression problems.""")
tf.app.flags.DEFINE_boolean("include_noisy_softmax_2_class_problems", True,
"""Include noisy 2-class logistic regression
problems.""")
tf.app.flags.DEFINE_boolean("include_optimization_test_problems", True,
"""Include non-noisy versions of classic
optimization test problems, e.g. Rosenbrock.""")
tf.app.flags.DEFINE_boolean("include_noisy_optimization_test_problems", True,
"""Include gradient-noise versions of classic
optimization test problems, e.g. Rosenbrock""")
tf.app.flags.DEFINE_boolean("include_fully_connected_random_2_class_problems",
True, """Include MLP problems for 2 classes.""")
tf.app.flags.DEFINE_boolean("include_matmul_problems", True,
"""Include matrix multiplication problems.""")
tf.app.flags.DEFINE_boolean("include_log_objective_problems", True,
"""Include problems where the objective is the log
objective of another problem, e.g. Bowl.""")
tf.app.flags.DEFINE_boolean("include_rescale_problems", True,
"""Include problems where the parameters are scaled
version of the original parameters.""")
tf.app.flags.DEFINE_boolean("include_norm_problems", True,
"""Include problems where the objective is the
N-norm of another problem, e.g. Quadratic.""")
tf.app.flags.DEFINE_boolean("include_sum_problems", True,
"""Include problems where the objective is the sum
of the objectives of the subproblems that make
up the problem parameters. Per-problem tensors
are still independent of each other.""")
tf.app.flags.DEFINE_boolean("include_sparse_gradient_problems", True,
"""Include problems where the gradient is set to 0
with some high probability.""")
tf.app.flags.DEFINE_boolean("include_sparse_softmax_problems", False,
"""Include sparse softmax problems.""")
tf.app.flags.DEFINE_boolean("include_one_hot_sparse_softmax_problems", False,
"""Include one-hot sparse softmax problems.""")
tf.app.flags.DEFINE_boolean("include_noisy_bowl_problems", True,
"""Include noisy bowl problems.""")
tf.app.flags.DEFINE_boolean("include_noisy_norm_problems", True,
"""Include noisy norm problems.""")
tf.app.flags.DEFINE_boolean("include_noisy_sum_problems", True,
"""Include noisy sum problems.""")
tf.app.flags.DEFINE_boolean("include_sum_of_quadratics_problems", False,
"""Include sum of quadratics problems.""")
tf.app.flags.DEFINE_boolean("include_projection_quadratic_problems", False,
"""Include projection quadratic problems.""")
tf.app.flags.DEFINE_boolean("include_outward_snake_problems", False,
"""Include outward snake problems.""")
tf.app.flags.DEFINE_boolean("include_dependency_chain_problems", False,
"""Include dependency chain problems.""")
tf.app.flags.DEFINE_boolean("include_min_max_well_problems", False,
"""Include min-max well problems.""")
# Optimizer parameters: initialization and scale values
tf.app.flags.DEFINE_float("min_lr", 1e-6,
"""The minimum initial learning rate.""")
tf.app.flags.DEFINE_float("max_lr", 1e-2,
"""The maximum initial learning rate.""")
# Optimizer parameters: small features.
tf.app.flags.DEFINE_boolean("zero_init_lr_weights", True,
"""Whether to initialize the learning rate weights
to 0 rather than the scaled random initialization
used for other RNN variables.""")
tf.app.flags.DEFINE_boolean("use_relative_lr", True,
"""Whether to use the relative learning rate as an
input during training. Can only be used if
learnable_decay is also True.""")
tf.app.flags.DEFINE_boolean("use_extreme_indicator", False,
"""Whether to use the extreme indicator for learning
rates as an input during training. Can only be
used if learnable_decay is also True.""")
tf.app.flags.DEFINE_boolean("use_log_means_squared", True,
"""Whether to track the log of the mean squared
grads instead of the means squared grads.""")
tf.app.flags.DEFINE_boolean("use_problem_lr_mean", True,
"""Whether to use the mean over all learning rates
in the problem when calculating the relative
learning rate.""")
# Optimizer parameters: major features
tf.app.flags.DEFINE_boolean("learnable_decay", True,
"""Whether to learn weights that dynamically
modulate the input scale via RMS decay.""")
tf.app.flags.DEFINE_boolean("dynamic_output_scale", True,
"""Whether to learn weights that dynamically
modulate the output scale.""")
tf.app.flags.DEFINE_boolean("use_log_objective", True,
"""Whether to use the log of the scaled objective
rather than just the scaled obj for training.""")
tf.app.flags.DEFINE_boolean("use_attention", False,
"""Whether to learn where to attend.""")
tf.app.flags.DEFINE_boolean("use_second_derivatives", True,
"""Whether to use second derivatives.""")
tf.app.flags.DEFINE_integer("num_gradient_scales", 4,
"""How many different timescales to keep for
gradient history. If > 1, also learns a scale
factor for gradient history.""")
tf.app.flags.DEFINE_float("max_log_lr", 33,
"""The maximum log learning rate allowed.""")
tf.app.flags.DEFINE_float("objective_training_max_multiplier", -1,
"""How much the objective can grow before training on
this problem / param pair is terminated. Sets a max
on the objective value when multiplied by the
initial objective. If <= 0, not used.""")
tf.app.flags.DEFINE_boolean("use_gradient_shortcut", True,
"""Whether to add a learned affine projection of the
gradient to the update delta in addition to the
gradient function computed by the RNN.""")
tf.app.flags.DEFINE_boolean("use_lr_shortcut", False,
"""Whether to add the difference between the current
learning rate and the desired learning rate to
the RNN input.""")
tf.app.flags.DEFINE_boolean("use_grad_products", True,
"""Whether to use gradient products in the input to
the RNN. Only applicable when num_gradient_scales
> 1.""")
tf.app.flags.DEFINE_boolean("use_multiple_scale_decays", False,
"""Whether to use many-timescale scale decays.""")
tf.app.flags.DEFINE_boolean("use_numerator_epsilon", False,
"""Whether to use epsilon in the numerator of the
log objective.""")
tf.app.flags.DEFINE_boolean("learnable_inp_decay", True,
"""Whether to learn input decay weight and bias.""")
tf.app.flags.DEFINE_boolean("learnable_rnn_init", True,
"""Whether to learn RNN state initialization.""")
FLAGS = tf.app.flags.FLAGS
# The Size of the RNN hidden state in each layer:
# [PerParam, PerTensor, Global]. The length of this list must be 1, 2, or 3.
# If less than 3, the Global and/or PerTensor RNNs will not be created.
HRNN_CELL_SIZES = [10, 20, 20]
def register_optimizers():
  """Returns the registry mapping optimizer names to optimizer classes.

  The keys are the values accepted by the --optimizer flag; each maps to the
  trainable-optimizer class that main() instantiates via problem_spec.Spec.
  """
  return {
      "CoordinatewiseRNN": coordinatewise_rnn.CoordinatewiseRNN,
      "GlobalLearningRate": global_learning_rate.GlobalLearningRate,
      "HierarchicalRNN": hierarchical_rnn.HierarchicalRNN,
      "LearningRateSchedule": learning_rate_schedule.LearningRateSchedule,
      "TrainableAdam": trainable_adam.TrainableAdam,
  }
def _enabled_problem_sets():
  """Returns the problem/data entries enabled by the include_* flags.

  Entries are appended in a fixed order (the table order below), so the
  resulting list matches the original flag-by-flag selection order. This
  matters for distributed runs where the chief iterates problems in order.

  Returns:
    A list of problem/data entries as produced by the problem_sets (ps)
    factory functions.
  """
  # Each entry pairs an include_* flag name with the problem_sets factory it
  # enables. Keep this order stable; it defines the problem ordering.
  flag_factory_pairs = [
      ("include_sparse_softmax_problems",
       ps.sparse_softmax_2_class_sparse_problems),
      ("include_one_hot_sparse_softmax_problems",
       ps.one_hot_sparse_softmax_2_class_sparse_problems),
      ("include_quadratic_problems", ps.quadratic_problems),
      ("include_noisy_quadratic_problems", ps.quadratic_problems_noisy),
      ("include_large_quadratic_problems", ps.quadratic_problems_large),
      ("include_bowl_problems", ps.bowl_problems),
      ("include_noisy_bowl_problems", ps.bowl_problems_noisy),
      ("include_softmax_2_class_problems", ps.softmax_2_class_problems),
      ("include_noisy_softmax_2_class_problems",
       ps.softmax_2_class_problems_noisy),
      ("include_optimization_test_problems", ps.optimization_test_problems),
      ("include_noisy_optimization_test_problems",
       ps.optimization_test_problems_noisy),
      ("include_fully_connected_random_2_class_problems",
       ps.fully_connected_random_2_class_problems),
      ("include_matmul_problems", ps.matmul_problems),
      ("include_log_objective_problems", ps.log_objective_problems),
      ("include_rescale_problems", ps.rescale_problems),
      ("include_norm_problems", ps.norm_problems),
      ("include_noisy_norm_problems", ps.norm_problems_noisy),
      ("include_sum_problems", ps.sum_problems),
      ("include_noisy_sum_problems", ps.sum_problems_noisy),
      ("include_sparse_gradient_problems", ps.sparse_gradient_problems),
      # NOTE(review): the original code gated the sparse-gradient MLP problems
      # on include_fully_connected_random_2_class_problems (a second use of
      # that flag). This looks like a copy-pasted flag name; the behavior is
      # preserved here, but confirm whether a dedicated
      # include_sparse_gradient_problems_mlp flag was intended.
      ("include_fully_connected_random_2_class_problems",
       ps.sparse_gradient_problems_mlp),
      ("include_min_max_well_problems", ps.min_max_well_problems),
      ("include_sum_of_quadratics_problems", ps.sum_of_quadratics_problems),
      ("include_projection_quadratic_problems",
       ps.projection_quadratic_problems),
      ("include_outward_snake_problems", ps.outward_snake_problems),
      ("include_dependency_chain_problems", ps.dependency_chain_problems),
  ]
  problems_and_data = []
  for flag_name, factory in flag_factory_pairs:
    if getattr(FLAGS, flag_name):
      problems_and_data.extend(factory())
  return problems_and_data
def _build_optimizer_spec(opts):
  """Builds the Spec describing the optimizer to meta-train.

  Args:
    opts: Registry mapping optimizer names to classes, as returned by
        register_optimizers().

  Returns:
    A problem_spec.Spec wrapping the class selected by FLAGS.optimizer
    together with its constructor args and kwargs.

  Raises:
    ValueError: If HRNN_CELL_SIZES does not have length 1, 2, or 3.
  """
  optimizer_cls = opts[FLAGS.optimizer]
  # The hierarchical RNN supports at most three levels (PerParam, PerTensor,
  # Global). Raise instead of assert so the check survives `python -O`.
  if len(HRNN_CELL_SIZES) not in (1, 2, 3):
    raise ValueError(
        "HRNN_CELL_SIZES must have length 1, 2, or 3; got {}".format(
            HRNN_CELL_SIZES))
  optimizer_args = (HRNN_CELL_SIZES,)
  optimizer_kwargs = {
      "init_lr_range": (FLAGS.min_lr, FLAGS.max_lr),
      "learnable_decay": FLAGS.learnable_decay,
      "dynamic_output_scale": FLAGS.dynamic_output_scale,
      "cell_cls": getattr(tf.contrib.rnn, FLAGS.cell_cls),
      "use_attention": FLAGS.use_attention,
      "use_log_objective": FLAGS.use_log_objective,
      "num_gradient_scales": FLAGS.num_gradient_scales,
      "zero_init_lr_weights": FLAGS.zero_init_lr_weights,
      "use_log_means_squared": FLAGS.use_log_means_squared,
      "use_relative_lr": FLAGS.use_relative_lr,
      "use_extreme_indicator": FLAGS.use_extreme_indicator,
      "max_log_lr": FLAGS.max_log_lr,
      "obj_train_max_multiplier": FLAGS.objective_training_max_multiplier,
      "use_problem_lr_mean": FLAGS.use_problem_lr_mean,
      "use_gradient_shortcut": FLAGS.use_gradient_shortcut,
      "use_second_derivatives": FLAGS.use_second_derivatives,
      "use_lr_shortcut": FLAGS.use_lr_shortcut,
      "use_grad_products": FLAGS.use_grad_products,
      "use_multiple_scale_decays": FLAGS.use_multiple_scale_decays,
      "use_numerator_epsilon": FLAGS.use_numerator_epsilon,
      "learnable_inp_decay": FLAGS.learnable_inp_decay,
      "learnable_rnn_init": FLAGS.learnable_rnn_init,
  }
  return problem_spec.Spec(optimizer_cls, optimizer_args, optimizer_kwargs)
def main(unused_argv):
  """Runs the main script: meta-trains the selected learned optimizer."""
  opts = register_optimizers()
  # Choose a set of problems to optimize. By default this includes quadratics,
  # 2-dimensional bowls, 2-class softmax problems, and non-noisy optimization
  # test problems (e.g. Rosenbrock, Beale)
  problems_and_data = _enabled_problem_sets()
  # log directory, keyed by the main architecture choices
  logdir = os.path.join(FLAGS.train_dir,
                        "{}_{}_{}_{}".format(FLAGS.optimizer,
                                             FLAGS.cell_cls,
                                             FLAGS.cell_size,
                                             FLAGS.num_cells))
  # get the optimizer class and arguments
  optimizer_spec = _build_optimizer_spec(opts)
  # make log directory
  tf.gfile.MakeDirs(logdir)
  is_chief = FLAGS.task == 0
  # if this is a distributed run, make the chief run through problems in order
  select_random_problems = FLAGS.worker_tasks == 1 or not is_chief
  def num_unrolls():
    """Samples the number of unrolls for one meta-training iteration."""
    return metaopt.sample_numiter(FLAGS.num_unroll_scale,
                                  FLAGS.min_num_unrolls)
  def num_partial_unroll_itrs():
    """Samples the number of iterations inside one partial unroll."""
    return metaopt.sample_numiter(FLAGS.num_partial_unroll_itr_scale,
                                  FLAGS.min_num_itr_partial_unroll)
  # run it
  metaopt.train_optimizer(
      logdir,
      optimizer_spec,
      problems_and_data,
      FLAGS.num_problems,
      FLAGS.num_meta_iterations,
      num_unrolls,
      num_partial_unroll_itrs,
      learning_rate=FLAGS.meta_learning_rate,
      gradient_clip=FLAGS.gradient_clip_level,
      is_chief=is_chief,
      select_random_problems=select_random_problems,
      obj_train_max_multiplier=FLAGS.objective_training_max_multiplier,
      callbacks=[])
  return 0
if __name__ == "__main__":
  # tf.app.run() parses the flags defined above and then invokes main().
  tf.app.run()
# BUILD rules for the trainable-optimizer libraries referenced by the
# top-level metarun binary (targets //learned_optimizer/optimizer:*).
package(default_visibility = ["//visibility:public"])
# Libraries
# =========
# Per-coordinate RNN optimizer.
py_library(
    name = "coordinatewise_rnn",
    srcs = ["coordinatewise_rnn.py"],
    deps = [
        ":trainable_optimizer",
        ":utils",
    ],
)
# Optimizer that learns a single global learning rate.
py_library(
    name = "global_learning_rate",
    srcs = ["global_learning_rate.py"],
    deps = [
        ":trainable_optimizer",
    ],
)
# Hierarchical (per-parameter / per-tensor / global) RNN optimizer.
py_library(
    name = "hierarchical_rnn",
    srcs = ["hierarchical_rnn.py"],
    deps = [
        ":rnn_cells",
        ":trainable_optimizer",
        ":utils",
    ],
)
py_library(
    name = "learning_rate_schedule",
    srcs = ["learning_rate_schedule.py"],
    deps = [
        ":trainable_optimizer",
    ],
)
# Custom RNN cells used by the RNN-based optimizers.
py_library(
    name = "rnn_cells",
    srcs = ["rnn_cells.py"],
    deps = [
        ":utils",
    ],
)
py_library(
    name = "trainable_adam",
    srcs = ["trainable_adam.py"],
    deps = [
        ":trainable_optimizer",
        ":utils",
    ],
)
# Base class shared by all the trainable optimizers above.
py_library(
    name = "trainable_optimizer",
    srcs = ["trainable_optimizer.py"],
    deps = [
    ],
)
py_library(
    name = "utils",
    srcs = ["utils.py"],
    deps = [
    ],
)
# Copyright 2017 Google, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Collection of trainable optimizers for meta-optimization."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import numpy as np
import tensorflow as tf
from learned_optimizer.optimizer import utils
from learned_optimizer.optimizer import trainable_optimizer as opt
# Default was 1e-3
tf.app.flags.DEFINE_float("crnn_rnn_readout_scale", 0.5,
"""The initialization scale for the RNN readouts.""")
tf.app.flags.DEFINE_float("crnn_default_decay_var_init", 2.2,
"""The default initializer value for any decay/
momentum style variables and constants.
sigmoid(2.2) ~ 0.9, sigmoid(-2.2) ~ 0.01.""")
FLAGS = tf.flags.FLAGS
class CoordinatewiseRNN(opt.TrainableOptimizer):
  """RNN that operates on each coordinate of the problem independently.

  Each parameter coordinate has its own slice of optimizer state ("rms",
  "rnn", "learning_rate", "decay"), all stored as [num_coordinates, 1]
  tensors. The RNN readout produces an update direction; a separately
  learned projection produces a multiplicative learning-rate change.
  """
  def __init__(self,
               cell_sizes,
               cell_cls,
               init_lr_range=(1., 1.),
               dynamic_output_scale=True,
               learnable_decay=True,
               zero_init_lr_weights=False,
               **kwargs):
    """Initializes the RNN per-parameter optimizer.
    Args:
      cell_sizes: List of hidden state sizes for each RNN cell in the network
      cell_cls: tf.contrib.rnn class for specifying the RNN cell type
      init_lr_range: the range in which to initialize the learning rates.
      dynamic_output_scale: whether to learn weights that dynamically modulate
        the output scale (default: True)
      learnable_decay: whether to learn weights that dynamically modulate the
        input scale via RMS style decay (default: True)
      zero_init_lr_weights: whether to initialize the lr weights to zero
      **kwargs: args passed to TrainableOptimizer's constructor
    Raises:
      ValueError: If the init lr range is not of length 2.
      ValueError: If the init lr range is not a valid range (min > max).
    """
    if len(init_lr_range) != 2:
      raise ValueError(
          "Initial LR range must be len 2, was {}".format(len(init_lr_range)))
    if init_lr_range[0] > init_lr_range[1]:
      raise ValueError("Initial LR range min is greater than max.")
    self.init_lr_range = init_lr_range
    self.zero_init_lr_weights = zero_init_lr_weights
    # Flips to True after the first _compute_update call so that subsequent
    # calls reuse the same TF variables instead of creating new ones.
    self.reuse_vars = False
    # create the RNN cell
    with tf.variable_scope(opt.OPTIMIZER_SCOPE):
      self.component_cells = [cell_cls(sz) for sz in cell_sizes]
      self.cell = tf.contrib.rnn.MultiRNNCell(self.component_cells)
      # random normal initialization scaled by the output size
      scale_factor = FLAGS.crnn_rnn_readout_scale / math.sqrt(cell_sizes[-1])
      scaled_init = tf.random_normal_initializer(0., scale_factor)
      # weights for projecting the hidden state to a parameter update
      self.update_weights = tf.get_variable("update_weights",
                                            shape=(cell_sizes[-1], 1),
                                            initializer=scaled_init)
      self._initialize_decay(learnable_decay, (cell_sizes[-1], 1), scaled_init)
      self._initialize_lr(dynamic_output_scale, (cell_sizes[-1], 1),
                          scaled_init)
      # Total flattened state width across all stacked cells; used for the
      # learned per-coordinate initial RNN state.
      state_size = sum([sum(state_size) for state_size in self.cell.state_size])
      self._init_vector = tf.get_variable(
          "init_vector", shape=[1, state_size],
          initializer=tf.random_uniform_initializer(-1., 1.))
    state_keys = ["rms", "rnn", "learning_rate", "decay"]
    super(CoordinatewiseRNN, self).__init__("cRNN", state_keys, **kwargs)
  def _initialize_decay(
      self, learnable_decay, weights_tensor_shape, scaled_init):
    """Initializes the decay weights and bias variables or tensors.
    When learnable_decay is False, the projection is all zeros and the bias a
    constant, so the decay is fixed at sigmoid(crnn_default_decay_var_init).
    Args:
      learnable_decay: Whether to use learnable decay.
      weights_tensor_shape: The shape the weight tensor should take.
      scaled_init: The scaled initialization for the weights tensor.
    """
    if learnable_decay:
      # weights for projecting the hidden state to the RMS decay term
      self.decay_weights = tf.get_variable("decay_weights",
                                           shape=weights_tensor_shape,
                                           initializer=scaled_init)
      self.decay_bias = tf.get_variable(
          "decay_bias", shape=(1,),
          initializer=tf.constant_initializer(
              FLAGS.crnn_default_decay_var_init))
    else:
      self.decay_weights = tf.zeros_like(self.update_weights)
      self.decay_bias = tf.constant(FLAGS.crnn_default_decay_var_init)
  def _initialize_lr(
      self, dynamic_output_scale, weights_tensor_shape, scaled_init):
    """Initializes the learning rate weights and bias variables or tensors.
    When dynamic_output_scale is False, the projection is all zeros, so the
    learning rate never changes during optimization.
    Args:
      dynamic_output_scale: Whether to use a dynamic output scale.
      weights_tensor_shape: The shape the weight tensor should take.
      scaled_init: The scaled initialization for the weights tensor.
    """
    if dynamic_output_scale:
      zero_init = tf.constant_initializer(0.)
      wt_init = zero_init if self.zero_init_lr_weights else scaled_init
      self.lr_weights = tf.get_variable("learning_rate_weights",
                                        shape=weights_tensor_shape,
                                        initializer=wt_init)
      self.lr_bias = tf.get_variable("learning_rate_bias", shape=(1,),
                                     initializer=zero_init)
    else:
      self.lr_weights = tf.zeros_like(self.update_weights)
      self.lr_bias = tf.zeros([1, 1])
  def _initialize_state(self, var):
    """Return a dictionary mapping names of state variables to their values."""
    # State is kept per coordinate: every entry is [num_elements(var), 1].
    vectorized_shape = [var.get_shape().num_elements(), 1]
    min_lr = self.init_lr_range[0]
    max_lr = self.init_lr_range[1]
    if min_lr == max_lr:
      init_lr = tf.constant(min_lr, shape=vectorized_shape)
    else:
      # Sample initial learning rates log-uniformly from [min_lr, max_lr].
      actual_vals = tf.random_uniform(vectorized_shape,
                                      np.log(min_lr),
                                      np.log(max_lr))
      init_lr = tf.exp(actual_vals)
    ones = tf.ones(vectorized_shape)
    # Broadcast the learned init vector to every coordinate.
    rnn_init = ones * self._init_vector
    return {
        "rms": tf.ones(vectorized_shape),
        "learning_rate": init_lr,
        "rnn": rnn_init,
        "decay": tf.ones(vectorized_shape),
    }
  def _compute_update(self, param, grad, state):
    """Update parameters given the gradient and state.
    Args:
      param: tensor of parameters
      grad: tensor of gradients with the same shape as param
      state: a dictionary containing any state for the optimizer
    Returns:
      updated_param: updated parameters
      updated_state: updated state variables in a dictionary
    """
    with tf.variable_scope(opt.OPTIMIZER_SCOPE) as scope:
      # Share variables after the first invocation (see __init__).
      if self.reuse_vars:
        scope.reuse_variables()
      else:
        self.reuse_vars = True
      param_shape = tf.shape(param)
      (grad_values, decay_state, rms_state, rnn_state, learning_rate_state,
       grad_indices) = self._extract_gradients_and_internal_state(
           grad, state, param_shape)
      # Vectorize and scale the gradients.
      grad_scaled, rms = utils.rms_scaling(grad_values, decay_state, rms_state)
      # Apply the RNN update.
      rnn_state_tuples = self._unpack_rnn_state_into_tuples(rnn_state)
      rnn_output, rnn_state_tuples = self.cell(grad_scaled, rnn_state_tuples)
      rnn_state = self._pack_tuples_into_rnn_state(rnn_state_tuples)
      # Compute the update direction (a linear projection of the RNN output).
      delta = utils.project(rnn_output, self.update_weights)
      # The updated decay is an affine projection of the hidden state
      decay = utils.project(rnn_output, self.decay_weights,
                            bias=self.decay_bias, activation=tf.nn.sigmoid)
      # Compute the change in learning rate (an affine projection of the RNN
      # state, passed through a 2x sigmoid, so the change is bounded).
      learning_rate_change = 2. * utils.project(rnn_output, self.lr_weights,
                                                bias=self.lr_bias,
                                                activation=tf.nn.sigmoid)
      # Update the learning rate.
      new_learning_rate = learning_rate_change * learning_rate_state
      # Apply the update to the parameters.
      update = tf.reshape(new_learning_rate * delta, tf.shape(grad_values))
      # For sparse gradients, only the touched rows were updated above;
      # scatter them back into the full-size state tensors.
      if isinstance(grad, tf.IndexedSlices):
        update = utils.stack_tensor(update, grad_indices, param,
                                    param_shape[:1])
        rms = utils.update_slices(rms, grad_indices, state["rms"], param_shape)
        new_learning_rate = utils.update_slices(new_learning_rate, grad_indices,
                                                state["learning_rate"],
                                                param_shape)
        rnn_state = utils.update_slices(rnn_state, grad_indices, state["rnn"],
                                        param_shape)
        decay = utils.update_slices(decay, grad_indices, state["decay"],
                                    param_shape)
      new_param = param - update
      # Collect the update and new state.
      new_state = {
          "rms": rms,
          "learning_rate": new_learning_rate,
          "rnn": rnn_state,
          "decay": decay,
      }
    return new_param, new_state
  def _extract_gradients_and_internal_state(self, grad, state, param_shape):
    """Extracts the gradients and relevant internal state.
    If the gradient is sparse, extracts the appropriate slices from the state.
    Args:
      grad: The current gradient.
      state: The current state.
      param_shape: The shape of the parameter (used if gradient is sparse).
    Returns:
      grad_values: The gradient value tensor.
      decay_state: The current decay state.
      rms_state: The current rms state.
      rnn_state: The current state of the internal rnns.
      learning_rate_state: The current learning rate state.
      grad_indices: The indices for the gradient tensor, if sparse.
          None otherwise.
    """
    if isinstance(grad, tf.IndexedSlices):
      grad_indices, grad_values = utils.accumulate_sparse_gradients(grad)
      decay_state = utils.slice_tensor(state["decay"], grad_indices,
                                       param_shape)
      rms_state = utils.slice_tensor(state["rms"], grad_indices, param_shape)
      rnn_state = utils.slice_tensor(state["rnn"], grad_indices, param_shape)
      learning_rate_state = utils.slice_tensor(state["learning_rate"],
                                               grad_indices, param_shape)
      # Pin the static column dimension; the row count stays dynamic.
      decay_state.set_shape([None, 1])
      rms_state.set_shape([None, 1])
    else:
      # Dense gradient: state tensors are used as-is.
      grad_values = grad
      grad_indices = None
      decay_state = state["decay"]
      rms_state = state["rms"]
      rnn_state = state["rnn"]
      learning_rate_state = state["learning_rate"]
    return (grad_values, decay_state, rms_state, rnn_state, learning_rate_state,
            grad_indices)
  def _unpack_rnn_state_into_tuples(self, rnn_state):
    """Creates state tuples from the rnn state vector.
    The flat [rows, total_state] vector is sliced per component cell and each
    slice is split into two equal halves -- assumes each cell's state is an
    even (c, h)-style pair; see _pack_tuples_into_rnn_state for the inverse.
    """
    rnn_state_tuples = []
    cur_state_pos = 0
    for cell in self.component_cells:
      total_state_size = sum(cell.state_size)
      cur_state = tf.slice(rnn_state, [0, cur_state_pos],
                           [-1, total_state_size])
      cur_state_tuple = tf.split(value=cur_state, num_or_size_splits=2,
                                 axis=1)
      rnn_state_tuples.append(cur_state_tuple)
      cur_state_pos += total_state_size
    return rnn_state_tuples
  def _pack_tuples_into_rnn_state(self, rnn_state_tuples):
    """Creates a single state vector concatenated along column axis."""
    rnn_state = None
    for new_state_tuple in rnn_state_tuples:
      new_c, new_h = new_state_tuple
      if rnn_state is None:
        rnn_state = tf.concat([new_c, new_h], axis=1)
      else:
        rnn_state = tf.concat([rnn_state, tf.concat([new_c, new_h], 1)], axis=1)
    return rnn_state
# Copyright 2017 Google, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A trainable optimizer that learns a single global learning rate."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from learned_optimizer.optimizer import trainable_optimizer
class GlobalLearningRate(trainable_optimizer.TrainableOptimizer):
  """Trainable optimizer whose only parameter is one scalar learning rate.

  The learning rate lives in the optimizer variable scope, so meta-training
  adjusts it just like any other optimizer weight.
  """
  def __init__(self, initial_rate=1e-3, **kwargs):
    """Creates the scalar global learning-rate variable.

    Args:
      initial_rate: Initial value for the learning rate.
      **kwargs: Arguments forwarded to TrainableOptimizer's constructor.
    """
    with tf.variable_scope(trainable_optimizer.OPTIMIZER_SCOPE):
      self.learning_rate = tf.get_variable(
          "global_learning_rate",
          shape=(),
          initializer=tf.constant_initializer(initial_rate))
    super(GlobalLearningRate, self).__init__("GLR", [], **kwargs)
  def _compute_update(self, param, grad, state):
    """Applies one vanilla gradient-descent step; state passes through."""
    scaled_grad = tf.scalar_mul(self.learning_rate, grad)
    return param - scaled_grad, state
# Copyright 2017 Google, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Collection of trainable optimizers for meta-optimization."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import numpy as np
import tensorflow as tf
from tensorflow.python.ops import state_ops
from learned_optimizer.optimizer import rnn_cells
from learned_optimizer.optimizer import trainable_optimizer as opt
from learned_optimizer.optimizer import utils
# Default was 0.1
tf.app.flags.DEFINE_float("biasgrucell_scale", 0.5,
"""The scale for the internal BiasGRUCell vars.""")
# Default was 0
tf.app.flags.DEFINE_float("biasgrucell_gate_bias_init", 2.2,
"""The bias for the internal BiasGRUCell reset and
update gate variables.""")
# Default was 1e-3
tf.app.flags.DEFINE_float("hrnn_rnn_readout_scale", 0.5,
"""The initialization scale for the RNN readouts.""")
tf.app.flags.DEFINE_float("hrnn_default_decay_var_init", 2.2,
"""The default initializer value for any decay/
momentum style variables and constants.
sigmoid(2.2) ~ 0.9, sigmoid(-2.2) ~ 0.01.""")
# Default was 2.2
tf.app.flags.DEFINE_float("scale_decay_bias_init", 3.2,
"""The initialization for the scale decay bias. This
is the initial bias for the timescale for the
exponential avg of the mean square gradients.""")
tf.app.flags.DEFINE_float("learning_rate_momentum_logit_init", 3.2,
"""Initialization for the learning rate momentum.""")
# Default was 0.1
tf.app.flags.DEFINE_float("hrnn_affine_scale", 0.5,
"""The initialization scale for the weight matrix of
the bias variables in layer0 and 1 of the hrnn.""")
FLAGS = tf.flags.FLAGS
class HierarchicalRNN(opt.TrainableOptimizer):
"""3 level hierarchical RNN.
Optionally uses second order gradient information and has decoupled evaluation
and update locations.
"""
  def __init__(self, level_sizes, init_lr_range=(1e-6, 1e-2),
               learnable_decay=True, dynamic_output_scale=True,
               use_attention=False, use_log_objective=True,
               num_gradient_scales=4, zero_init_lr_weights=True,
               use_log_means_squared=True, use_relative_lr=True,
               use_extreme_indicator=False, max_log_lr=33,
               obj_train_max_multiplier=-1, use_problem_lr_mean=False,
               use_gradient_shortcut=False, use_lr_shortcut=False,
               use_grad_products=False, use_multiple_scale_decays=False,
               learnable_inp_decay=True, learnable_rnn_init=True,
               random_seed=None, **kwargs):
    """Initializes the RNN per-parameter optimizer.
    The hierarchy consists of up to three levels:
      Level 0: per parameter RNN
      Level 1: per tensor RNN
      Level 2: global RNN
    Args:
      level_sizes: list or tuple with 1, 2, or 3 integers, the number of units
        in each RNN in the hierarchy (level0, level1, level2).
        length 1: only coordinatewise rnn's will be used
        length 2: coordinatewise and tensor-level rnn's will be used
        length 3: a single global-level rnn will be used in addition to
          coordinatewise and tensor-level
      init_lr_range: the range in which to initialize the learning rates
      learnable_decay: whether to learn weights that dynamically modulate the
        input scale via RMS style decay
      dynamic_output_scale: whether to learn weights that dynamically modulate
        the output scale
      use_attention: whether to use attention to train the optimizer
      use_log_objective: whether to train on the log of the objective
      num_gradient_scales: the number of scales to use for gradient history
      zero_init_lr_weights: whether to initialize the lr weights to zero
      use_log_means_squared: whether to track the log of the means_squared,
        used as a measure of signal vs. noise in gradient.
      use_relative_lr: whether to use the relative learning rate as an
        input during training (requires learnable_decay=True)
      use_extreme_indicator: whether to use the extreme indicator for learning
        rates as an input during training (requires learnable_decay=True)
      max_log_lr: the maximum log learning rate allowed during train or test
      obj_train_max_multiplier: max objective increase during a training run
      use_problem_lr_mean: whether to use the mean over all learning rates in
        the problem when calculating the relative learning rate as opposed to
        the per-tensor mean
      use_gradient_shortcut: Whether to add a learned affine projection of the
        gradient to the update delta in addition to the gradient function
        computed by the RNN
      use_lr_shortcut: Whether to add as input the difference between the log lr
        and the desired log lr (1e-3)
      use_grad_products: Whether to use gradient products in the rnn input.
        Only applicable if num_gradient_scales > 1
      use_multiple_scale_decays: Whether to use multiple scales for the scale
        decay, as with input decay
      learnable_inp_decay: Whether to learn the input decay weights and bias.
      learnable_rnn_init: Whether to learn the RNN state initialization.
      random_seed: Random seed for random variable initializers. (Default: None)
      **kwargs: args passed to TrainableOptimizer's constructor
    Raises:
      ValueError: If level_sizes is not a length 1, 2, or 3 list.
      ValueError: If there are any non-integer sizes in level_sizes.
      ValueError: If the init lr range is not of length 2.
      ValueError: If the init lr range is not a valid range (min > max).
    """
    if len(level_sizes) not in [1, 2, 3]:
      raise ValueError("HierarchicalRNN only supports 1, 2, or 3 levels in the "
                       "hierarchy, but {} were requested.".format(
                           len(level_sizes)))
    if any(not isinstance(level, int) for level in level_sizes):
      raise ValueError("Level sizes must be integer values, were {}".format(
          level_sizes))
    if len(init_lr_range) != 2:
      raise ValueError(
          "Initial LR range must be len 2, was {}".format(len(init_lr_range)))
    if init_lr_range[0] > init_lr_range[1]:
      raise ValueError("Initial LR range min is greater than max.")
    # Record the feature switches; these are read throughout the class.
    self.learnable_decay = learnable_decay
    self.dynamic_output_scale = dynamic_output_scale
    self.use_attention = use_attention
    self.use_log_objective = use_log_objective
    self.num_gradient_scales = num_gradient_scales
    self.zero_init_lr_weights = zero_init_lr_weights
    self.use_log_means_squared = use_log_means_squared
    self.use_relative_lr = use_relative_lr
    self.use_extreme_indicator = use_extreme_indicator
    self.max_log_lr = max_log_lr
    self.use_problem_lr_mean = use_problem_lr_mean
    self.use_gradient_shortcut = use_gradient_shortcut
    self.use_lr_shortcut = use_lr_shortcut
    self.use_grad_products = use_grad_products
    self.use_multiple_scale_decays = use_multiple_scale_decays
    self.learnable_inp_decay = learnable_inp_decay
    self.learnable_rnn_init = learnable_rnn_init
    self.random_seed = random_seed
    self.num_layers = len(level_sizes)
    self.init_lr_range = init_lr_range
    # Variable-reuse bookkeeping for _compute_updates and the global state.
    self.reuse_vars = None
    self.reuse_global_state = None
    # Filled in by _initialize_rnn_cells: one cell and one init vector per
    # hierarchy level.
    self.cells = []
    self.init_vectors = []
    with tf.variable_scope(opt.OPTIMIZER_SCOPE):
      self._initialize_rnn_cells(level_sizes)
      # get the cell size for the per-parameter RNN (Level 0)
      cell_size = level_sizes[0]
      # Random normal initialization scaled by the output size. This is the
      # scale for the RNN *readouts*. RNN internal weight scale is set in the
      # BiasGRUCell call.
      scale_factor = FLAGS.hrnn_rnn_readout_scale / math.sqrt(cell_size)
      scaled_init = tf.random_normal_initializer(0., scale_factor,
                                                 seed=self.random_seed)
      # weights for projecting the hidden state to a parameter update
      self.update_weights = tf.get_variable("update_weights",
                                            shape=(cell_size, 1),
                                            initializer=scaled_init)
      if self.use_attention:
        # weights for projecting the hidden state to the location at which the
        # gradient is attended
        self.attention_weights = tf.get_variable(
            "attention_weights",
            initializer=self.update_weights.initialized_value())
      # weights for projecting the hidden state to the RMS decay term
      self._initialize_scale_decay((cell_size, 1), scaled_init)
      self._initialize_input_decay((cell_size, 1), scaled_init)
      self._initialize_lr((cell_size, 1), scaled_init)
    # Keys naming the per-variable optimizer state slots; the grad_accum{i}
    # and ms{i} slots hold the multi-timescale gradient history.
    state_keys = ["parameter", "layer", "scl_decay", "inp_decay", "true_param"]
    if self.dynamic_output_scale:
      state_keys.append("log_learning_rate")
    for i in range(self.num_gradient_scales):
      state_keys.append("grad_accum{}".format(i + 1))
      state_keys.append("ms{}".format(i + 1))
    super(HierarchicalRNN, self).__init__(
        "hRNN", state_keys, use_attention=use_attention,
        use_log_objective=use_log_objective,
        obj_train_max_multiplier=obj_train_max_multiplier, **kwargs)
  def _initialize_rnn_cells(self, level_sizes):
    """Initializes the RNN cells to use in the hierarchical RNN.

    Appends one BiasGRUCell and one initial-state vector per hierarchy level
    to self.cells and self.init_vectors.
    """
    # RNN Cell layers (0 -> lowest, 1 -> middle, 2 -> global)
    for level in range(self.num_layers):
      scope = "Level{}_RNN".format(level)
      with tf.variable_scope(scope):
        hcell = rnn_cells.BiasGRUCell(
            level_sizes[level],
            scale=FLAGS.biasgrucell_scale,
            gate_bias_init=FLAGS.biasgrucell_gate_bias_init,
            random_seed=self.random_seed)
        self.cells.append(hcell)
        # The initial hidden state is a trainable variable when
        # learnable_rnn_init is set; otherwise a fixed random vector.
        if self.learnable_rnn_init:
          self.init_vectors.append(tf.Variable(
              tf.random_uniform([1, hcell.state_size], -1., 1.,
                                seed=self.random_seed),
              name="init_vector"))
        else:
          self.init_vectors.append(
              tf.random_uniform([1, hcell.state_size], -1., 1.,
                                seed=self.random_seed))
  def _initialize_scale_decay(self, weights_tensor_shape, scaled_init):
    """Initializes the scale decay weights and bias variables or tensors.

    When learnable_decay is off, the projection weights are all zeros and the
    bias is the constant logit of 0.93, i.e. a fixed decay of ~0.93.
    Args:
      weights_tensor_shape: The shape the weight tensor should take.
      scaled_init: The scaled initialization for the weights tensor.
    """
    if self.learnable_decay:
      self.scl_decay_weights = tf.get_variable("scl_decay_weights",
                                               shape=weights_tensor_shape,
                                               initializer=scaled_init)
      scl_decay_bias_init = tf.constant_initializer(
          FLAGS.scale_decay_bias_init)
      self.scl_decay_bias = tf.get_variable("scl_decay_bias",
                                            shape=(1,),
                                            initializer=scl_decay_bias_init)
    else:
      self.scl_decay_weights = tf.zeros_like(self.update_weights)
      # logit(0.93): sigmoid of this bias yields the fixed decay rate.
      self.scl_decay_bias = tf.log(0.93 / (1. - 0.93))
  def _initialize_input_decay(self, weights_tensor_shape, scaled_init):
    """Initializes the input scale decay weights and bias variables or tensors.

    Learnable input decay only applies when there is more than one gradient
    timescale; otherwise a fixed decay of ~0.89 (constant logit bias) is used.
    Args:
      weights_tensor_shape: The shape the weight tensor should take.
      scaled_init: The scaled initialization for the weights tensor.
    """
    if (self.learnable_decay and self.num_gradient_scales > 1 and
        self.learnable_inp_decay):
      self.inp_decay_weights = tf.get_variable("inp_decay_weights",
                                               shape=weights_tensor_shape,
                                               initializer=scaled_init)
      inp_decay_bias_init = tf.constant_initializer(
          FLAGS.hrnn_default_decay_var_init)
      self.inp_decay_bias = tf.get_variable("inp_decay_bias",
                                            shape=(1,),
                                            initializer=inp_decay_bias_init)
    else:
      self.inp_decay_weights = tf.zeros_like(self.update_weights)
      # logit(0.89): sigmoid of this bias yields the fixed decay rate.
      self.inp_decay_bias = tf.log(0.89 / (1. - 0.89))
def _initialize_lr(self, weights_tensor_shape, scaled_init):
"""Initializes the learning rate weights and bias variables or tensors.
Args:
weights_tensor_shape: The shape the weight tensor should take.
scaled_init: The scaled initialization for the weights tensor.
"""
if self.dynamic_output_scale:
zero_init = tf.constant_initializer(0.)
wt_init = zero_init if self.zero_init_lr_weights else scaled_init
self.lr_weights = tf.get_variable("learning_rate_weights",
shape=weights_tensor_shape,
initializer=wt_init)
self.lr_bias = tf.get_variable("learning_rate_bias", shape=(1,),
initializer=zero_init)
else:
self.lr_weights = tf.zeros_like(self.update_weights)
self.lr_bias = tf.zeros([1, 1])
  def _initialize_state(self, var):
    """Return a dictionary mapping names of state variables to their values."""
    # Per-coordinate state is kept as [num_elements(var), 1] tensors.
    var_vectorized = tf.reshape(var, [-1, 1])
    ndim = var_vectorized.get_shape().as_list()[0]
    state = {
        # parameter init tensor is [var_ndim x layer0_cell_size]
        "parameter": tf.ones([ndim, 1]) * self.init_vectors[0],
        "scl_decay": tf.zeros_like(var_vectorized),
        "inp_decay": tf.zeros_like(var_vectorized),
        "true_param": var,
    }
    if self.num_layers > 1:
      # layer init tensor is [1 x layer1_cell_size]
      state["layer"] = tf.ones([1, 1]) * self.init_vectors[1]
    if self.dynamic_output_scale:
      min_lr = self.init_lr_range[0]
      max_lr = self.init_lr_range[1]
      if min_lr == max_lr:
        log_init_lr = tf.log(min_lr * tf.ones_like(var_vectorized))
      else:
        # Use a random offset to increase the likelihood that the average of the
        # LRs for this variable is different from the LRs for other variables.
        # Each half-range draw plus the shared offset stays log-uniform within
        # [log(min_lr), log(max_lr)] in expectation across variables.
        actual_vals = tf.random_uniform(var_vectorized.get_shape().as_list(),
                                        np.log(min_lr) / 2.,
                                        np.log(max_lr) / 2.,
                                        seed=self.random_seed)
        offset = tf.random_uniform((), np.log(min_lr) / 2., np.log(max_lr) / 2.,
                                   seed=self.random_seed)
        log_init_lr = actual_vals + offset
      # Clip the log learning rate to the flag at the top end, and to
      # (log(min int32) - 1) at the bottom
      clipped = tf.clip_by_value(log_init_lr, -33, self.max_log_lr)
      state["log_learning_rate"] = clipped
    # One accumulator and one mean-square tracker per gradient timescale.
    for i in range(self.num_gradient_scales):
      state["grad_accum{}".format(i + 1)] = tf.zeros_like(var_vectorized)
      state["ms{}".format(i + 1)] = tf.zeros_like(var_vectorized)
    return state
def _initialize_global_state(self):
if self.num_layers < 3:
return []
rnn_global_init = tf.ones([1, 1]) * self.init_vectors[2]
return [rnn_global_init]
  def _compute_updates(self, params, grads, states, global_state):
    """Computes parameter updates by running the hierarchical RNN.

    Overrides the base implementation so that information is shared across
    parameter tensors: the problem-wide mean log learning rate, the shared
    per-tensor RNN weights, and the per-layer/global RNN states.

    Args:
      params: A list of parameter tensors.
      grads: A list of gradients, one per parameter.
      states: A list of state dictionaries, one per parameter.
      global_state: A list holding the global RNN state (possibly empty).

    Returns:
      A tuple (updated_params, updated_states, [updated_global_state],
      updated_attention).
    """
    # Store the updated parameters and states.
    updated_params = []
    updated_attention = []
    updated_states = []
    with tf.variable_scope(opt.OPTIMIZER_SCOPE):
      mean_log_lr = self._compute_mean_log_lr(states)
      # Iterate over the layers.
      for param, grad_unflat, state in zip(params, grads, states):
        with tf.variable_scope("PerTensor", reuse=self.reuse_vars):
          # All tensors share one set of RNN weights: reuse after the first.
          self.reuse_vars = True
          grad = tf.reshape(grad_unflat, [-1, 1])
          # Create the RNN input. We will optionally extend it with additional
          # features such as curvature and gradient signal vs. noise.
          (grads_scaled, mean_squared_gradients,
           grads_accum) = self._compute_scaled_and_ms_grads(grad, state)
          rnn_input = [g for g in grads_scaled]
          self._extend_rnn_input(rnn_input, state, grads_scaled,
                                 mean_squared_gradients, mean_log_lr)
          # Concatenate any features we've collected.
          rnn_input_tensor = tf.concat(rnn_input, 1)
          layer_state, new_param_state = self._update_rnn_cells(
              state, global_state, rnn_input_tensor,
              len(rnn_input) != len(grads_scaled))
          (scl_decay, inp_decay, new_log_lr, update_step, lr_attend,
           attention_delta) = self._compute_rnn_state_projections(
               state, new_param_state, grads_scaled)
          # Apply updates and store state variables.
          if self.use_attention:
            # Step from the "true" parameters; the attended location (where
            # gradients will be computed) is offset by the attention step.
            truth = state["true_param"]
            updated_param = truth - update_step
            attention_step = tf.reshape(lr_attend * attention_delta,
                                        truth.get_shape())
            updated_attention.append(truth - attention_step)
          else:
            updated_param = param - update_step
            updated_attention.append(updated_param)
          updated_params.append(updated_param)
          # Collect the new state.
          new_state = {
              "parameter": new_param_state,
              "scl_decay": scl_decay,
              "inp_decay": inp_decay,
              "true_param": updated_param,
          }
          if layer_state is not None:
            new_state["layer"] = layer_state
          if self.dynamic_output_scale:
            new_state["log_learning_rate"] = new_log_lr
          for i in range(self.num_gradient_scales):
            new_state["grad_accum{}".format(i + 1)] = grads_accum[i]
            new_state["ms{}".format(i + 1)] = mean_squared_gradients[i]
          updated_states.append(new_state)
      # NOTE(review): layer_state is the per-layer state from the *last* loop
      # iteration — confirm that feeding only the last tensor's layer state
      # into the global RNN is the intended behavior.
      updated_global_state = self._compute_updated_global_state([layer_state],
                                                                global_state)
    return (updated_params, updated_states, [updated_global_state],
            updated_attention)
def _compute_mean_log_lr(self, states):
"""Computes the mean log learning rate across all variables."""
if self.use_problem_lr_mean and self.use_relative_lr:
sum_log_lr = 0.
count_log_lr = 0.
for state in states:
sum_log_lr += tf.reduce_sum(state["log_learning_rate"])
# Note: get_shape().num_elements()=num elements in the original tensor.
count_log_lr += state["log_learning_rate"].get_shape().num_elements()
return sum_log_lr / count_log_lr
def _compute_scaled_and_ms_grads(self, grad, state):
"""Computes the scaled gradient and the mean squared gradients.
Gradients are also accumulated across different timescales if appropriate.
Args:
grad: The gradient tensor for this layer.
state: The optimizer state for this layer.
Returns:
The scaled gradients, mean squared gradients, and accumulated gradients.
"""
input_decays = [state["inp_decay"]]
scale_decays = [state["scl_decay"]]
if self.use_multiple_scale_decays and self.num_gradient_scales > 1:
for i in range(self.num_gradient_scales - 1):
scale_decays.append(tf.sqrt(scale_decays[i]))
for i in range(self.num_gradient_scales - 1):
# Each accumulator on twice the timescale of the one before.
input_decays.append(tf.sqrt(input_decays[i]))
grads_accum = []
grads_scaled = []
mean_squared_gradients = []
# populate the scaled gradients and associated mean_squared values
if self.num_gradient_scales > 0:
for i, decay in enumerate(input_decays):
if self.num_gradient_scales == 1:
# We don't accumulate if no scales, just take the current gradient.
grad_accum = grad
else:
# The state vars are 1-indexed.
old_accum = state["grad_accum{}".format(i + 1)]
grad_accum = grad * (1. - decay) + old_accum * decay
grads_accum.append(grad_accum)
sd = scale_decays[i if self.use_multiple_scale_decays else 0]
grad_scaled, ms = utils.rms_scaling(grad_accum, sd,
state["ms{}".format(i + 1)],
update_ms=True)
grads_scaled.append(grad_scaled)
mean_squared_gradients.append(ms)
return grads_scaled, mean_squared_gradients, grads_accum
  def _extend_rnn_input(self, rnn_input, state, grads_scaled,
                        mean_squared_gradients, mean_log_lr):
    """Computes additional rnn inputs and adds them to the rnn_input list.

    Features are appended to rnn_input *in place* and in a fixed order, so
    the downstream concat always produces the same feature layout.

    Args:
      rnn_input: List of per-parameter feature tensors (modified in place).
      state: The optimizer state for this layer.
      grads_scaled: The scaled gradients, one per timescale.
      mean_squared_gradients: Mean squared gradients, one per timescale.
      mean_log_lr: Problem-wide mean log learning rate (None unless both
          use_problem_lr_mean and use_relative_lr are enabled).

    Raises:
      Exception: If relative-LR or extreme-indicator features are requested
          without dynamic_output_scale.
    """
    if self.num_gradient_scales > 1 and self.use_grad_products:
      # This gives a measure of curvature relative to input averaging
      # lengthscale and to the learning rate
      grad_products = [a * b for a, b in
                       zip(grads_scaled[:-1], grads_scaled[1:])]
      rnn_input.extend([g for g in grad_products])
    if self.use_log_means_squared:
      log_means_squared = [tf.log(ms + 1e-16)
                           for ms in mean_squared_gradients]
      avg = tf.reduce_mean(log_means_squared, axis=0)
      # This gives a measure of the signal vs. noise contribution to the
      # gradient, at the current averaging lengthscale. If all the noise
      # is averaged out, and if updates are small, these will be 0.
      mean_log_means_squared = [m - avg for m in log_means_squared]
      rnn_input.extend([m for m in mean_log_means_squared])
    if self.use_relative_lr or self.use_extreme_indicator:
      if not self.dynamic_output_scale:
        raise Exception("Relative LR and Extreme Indicator features "
                        "require dynamic_output_scale to be set to True.")
      log_lr_vec = tf.reshape(state["log_learning_rate"], [-1, 1])
      if self.use_relative_lr:
        if self.use_problem_lr_mean:
          # Learning rate of this dimension vs. rest of target problem.
          relative_lr = log_lr_vec - mean_log_lr
        else:
          # Learning rate of this dimension vs. rest of tensor.
          relative_lr = log_lr_vec - tf.reduce_mean(log_lr_vec)
        rnn_input.append(relative_lr)
      if self.use_extreme_indicator:
        # Indicator of extremely large or extremely small learning rate.
        extreme_indicator = (tf.nn.relu(log_lr_vec - tf.log(1.)) -
                             tf.nn.relu(tf.log(1e-6) - log_lr_vec))
        rnn_input.append(extreme_indicator)
    if self.use_lr_shortcut:
      # Shortcut feature: distance of the current log LR from log(1e-3).
      log_lr_vec = tf.reshape(state["log_learning_rate"], [-1, 1])
      rnn_input.append(log_lr_vec - tf.log(1e-3))
  def _update_rnn_cells(self, state, global_state, rnn_input_tensor,
                        use_additional_features):
    """Updates the component RNN cells with the given state and tensor.

    Args:
      state: The current state of the optimizer.
      global_state: The current global RNN state.
      rnn_input_tensor: The input tensor to the RNN.
      use_additional_features: Whether the rnn input tensor contains additional
          features beyond the scaled gradients (affects whether the rnn input
          tensor is used as input to the RNN.)

    Returns:
      layer_state: The new state of the per-tensor RNN.
      new_param_state: The new state of the per-parameter RNN.
    """
    # lowest level (per parameter)
    # input -> gradient for this parameter
    # bias -> output from the layer RNN
    with tf.variable_scope("Layer0_RNN"):
      total_bias = None
      if self.num_layers > 1:
        # The layer-0 cell's bias is an affine projection of the higher-level
        # RNN states (per-layer state, plus the global state when present).
        sz = 3 * self.cells[0].state_size  # size of the concatenated bias
        param_bias = utils.affine([state["layer"]], sz,
                                  scope="Param/Affine",
                                  scale=FLAGS.hrnn_affine_scale,
                                  random_seed=self.random_seed)
        total_bias = param_bias
        if self.num_layers == 3:
          global_bias = utils.affine(global_state, sz,
                                     scope="Global/Affine",
                                     scale=FLAGS.hrnn_affine_scale,
                                     random_seed=self.random_seed)
          total_bias += global_bias
      new_param_state, _ = self.cells[0](
          rnn_input_tensor, state["parameter"], bias=total_bias)
    if self.num_layers > 1:
      # middle level (per layer)
      # input -> average hidden state from each parameter in this layer
      # bias -> output from the RNN at the global level
      with tf.variable_scope("Layer1_RNN"):
        if not use_additional_features:
          # Restore old behavior and only add the mean of the new params.
          layer_input = tf.reduce_mean(new_param_state, 0, keep_dims=True)
        else:
          layer_input = tf.reduce_mean(
              tf.concat((new_param_state, rnn_input_tensor), 1), 0,
              keep_dims=True)
        if self.num_layers == 3:
          sz = 3 * self.cells[1].state_size
          layer_bias = utils.affine(global_state, sz,
                                    scale=FLAGS.hrnn_affine_scale,
                                    random_seed=self.random_seed)
          layer_state, _ = self.cells[1](
              layer_input, state["layer"], bias=layer_bias)
        else:
          layer_state, _ = self.cells[1](layer_input, state["layer"])
    else:
      # Single-layer configuration: there is no per-layer RNN.
      layer_state = None
    return layer_state, new_param_state
  def _compute_rnn_state_projections(self, state, new_param_state,
                                     grads_scaled):
    """Computes the RNN state-based updates to parameters and update steps.

    Args:
      state: The optimizer state for this layer.
      new_param_state: The updated per-parameter RNN hidden state.
      grads_scaled: The scaled gradients, one per timescale.

    Returns:
      A tuple (scl_decay, inp_decay, new_log_lr, update_step, lr_attend,
      attention_delta); attention_delta is None when attention is disabled.
    """
    # Compute the update direction (a linear projection of the RNN output).
    update_weights = self.update_weights
    update_delta = utils.project(new_param_state, update_weights)
    if self.use_gradient_shortcut:
      # Include an affine projection of just the direction of the gradient
      # so that RNN hidden states are freed up to store more complex
      # functions of the gradient and other parameters.
      grads_scaled_tensor = tf.concat([g for g in grads_scaled], 1)
      update_delta += utils.affine(grads_scaled_tensor, 1,
                                   scope="GradsToDelta",
                                   include_bias=False,
                                   vec_mean=1. / len(grads_scaled),
                                   random_seed=self.random_seed)
    if self.dynamic_output_scale:
      # Normalize the step direction to unit RMS; its magnitude is supplied
      # separately by the learned learning rate computed below.
      denom = tf.sqrt(tf.reduce_mean(update_delta ** 2) + 1e-16)
      update_delta /= denom
    if self.use_attention:
      # The attention step is computed the same way as the update step, but
      # with its own projection (and optional gradient-shortcut) weights.
      attention_weights = self.attention_weights
      attention_delta = utils.project(new_param_state,
                                      attention_weights)
      if self.use_gradient_shortcut:
        attention_delta += utils.affine(grads_scaled_tensor, 1,
                                        scope="GradsToAttnDelta",
                                        include_bias=False,
                                        vec_mean=1. / len(grads_scaled),
                                        random_seed=self.random_seed)
      if self.dynamic_output_scale:
        attention_delta /= tf.sqrt(
            tf.reduce_mean(attention_delta ** 2) + 1e-16)
    else:
      attention_delta = None
    # The updated decay is an affine projection of the hidden state.
    scl_decay = utils.project(new_param_state, self.scl_decay_weights,
                              bias=self.scl_decay_bias,
                              activation=tf.nn.sigmoid)
    # This is only used if learnable_decay and num_gradient_scales > 1
    inp_decay = utils.project(new_param_state, self.inp_decay_weights,
                              bias=self.inp_decay_bias,
                              activation=tf.nn.sigmoid)
    # Also update the learning rate.
    lr_param, lr_attend, new_log_lr = self._compute_new_learning_rate(
        state, new_param_state)
    update_step = tf.reshape(lr_param * update_delta,
                             state["true_param"].get_shape())
    return (scl_decay, inp_decay, new_log_lr, update_step, lr_attend,
            attention_delta)
  def _compute_new_learning_rate(self, state, new_param_state):
    """Computes learning rates for the parameter and attention steps.

    Args:
      state: The optimizer state for this layer.
      new_param_state: The updated per-parameter RNN hidden state.

    Returns:
      lr_param: Learning rate applied to the parameter update step.
      lr_attend: Learning rate applied to the attention step (equals
          lr_param when attention is disabled).
      new_log_lr: Updated log-learning-rate state tensor, or None when
          dynamic_output_scale is off.
    """
    if self.dynamic_output_scale:
      # Compute the change in learning rate (an affine projection of the
      # RNN state, passed through a sigmoid or log depending on flags).
      # Update the learning rate, w/ momentum.
      lr_change = utils.project(new_param_state, self.lr_weights,
                                bias=self.lr_bias)
      step_log_lr = state["log_learning_rate"] + lr_change
      # Clip the log learning rate to the flag at the top end, and to
      # (log(min int32) - 1) at the bottom
      # Check out this hack: we want to be able to compute the gradient
      # of the downstream result w.r.t lr weights and bias, even if the
      # value of step_log_lr is outside the clip range. So we clip,
      # subtract off step_log_lr, and wrap all that in a stop_gradient so
      # TF never tries to take the gradient of the clip... or the
      # subtraction. Then we add BACK step_log_lr so that downstream still
      # receives the clipped value. But the GRADIENT of step_log_lr will
      # be the gradient of the unclipped value, which we added back in
      # after stop_gradients.
      step_log_lr += tf.stop_gradient(
          tf.clip_by_value(step_log_lr, -33, self.max_log_lr)
          - step_log_lr)
      # NOTE(review): these get_variable calls rely on the enclosing
      # variable scope's reuse setting for sharing across tensors — confirm.
      lr_momentum_logit = tf.get_variable(
          "learning_rate_momentum_logit",
          initializer=FLAGS.learning_rate_momentum_logit_init)
      lrm = tf.nn.sigmoid(lr_momentum_logit)
      new_log_lr = (lrm * state["log_learning_rate"] +
                    (1. - lrm) * step_log_lr)
      param_stepsize_offset = tf.get_variable("param_stepsize_offset",
                                              initializer=-1.)
      lr_param = tf.exp(step_log_lr + param_stepsize_offset)
      lr_attend = tf.exp(step_log_lr) if self.use_attention else lr_param
    else:
      # Dynamic output scale is off, LR param is always 1.
      lr_param = 2. * utils.project(new_param_state, self.lr_weights,
                                    bias=self.lr_bias,
                                    activation=tf.nn.sigmoid)
      new_log_lr = None
      lr_attend = lr_param
    return lr_param, lr_attend, new_log_lr
def _compute_updated_global_state(self, layer_states, global_state):
"""Computes the new global state gives the layers states and old state.
Args:
layer_states: The current layer states.
global_state: The old global state.
Returns:
The updated global state.
"""
updated_global_state = []
if self.num_layers == 3:
# highest (global) layer
# input -> average hidden state from each layer-specific RNN
# bias -> None
with tf.variable_scope("Layer2_RNN", reuse=self.reuse_global_state):
self.reuse_global_state = True
global_input = tf.reduce_mean(tf.concat(layer_states, 0), 0,
keep_dims=True)
updated_global_state, _ = self.cells[2](global_input, global_state[0])
return updated_global_state
  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    """Overwrites the tf.train.Optimizer interface for applying gradients.

    Args:
      grads_and_vars: Iterable of (gradient, variable) pairs.
      global_step: Optional variable incremented by one after the update.
      name: Optional name for the returned operation.

    Returns:
      A tuple (update_op, real_params) where real_params are the "true_param"
      slots for each variable (see NOTE below about this extra return value).

    Raises:
      TypeError: If a gradient is not a Tensor/IndexedSlices/None, or a
          variable is not a tf.Variable.
      ValueError: If no variable has a gradient.
    """
    # Pull out the variables.
    grads_and_vars = tuple(grads_and_vars)  # Make sure repeat iteration works.
    for g, v in grads_and_vars:
      if not isinstance(g, (tf.Tensor, tf.IndexedSlices, type(None))):
        raise TypeError(
            "Gradient must be a Tensor, IndexedSlices, or None: %s" % g)
      if not isinstance(v, tf.Variable):
        raise TypeError(
            "Variable must be a tf.Variable: %s" % v)
      if g is not None:
        self._assert_valid_dtypes([g, v])
    var_list = [v for g, v in grads_and_vars if g is not None]
    if not var_list:
      raise ValueError("No gradients provided for any variable: %s" %
                       (grads_and_vars,))
    # Create slots for the variables.
    with tf.control_dependencies(None):
      self._create_slots(var_list)
    # Store update ops in this list.
    # NOTE(review): tf.op_scope is the deprecated predecessor of
    # tf.name_scope — consider migrating when upgrading TF.
    with tf.op_scope([], name, self._name) as name:
      # Prepare the global state.
      with tf.variable_scope(self._name, reuse=self.reuse_global_state):
        gs = self._initialize_global_state()
        if gs:
          global_state = [tf.get_variable("global_state", initializer=gs[0])]
        else:
          global_state = []
      # Get the states for each variable in the list.
      states = [{key: self.get_slot(var, key) for key in self.get_slot_names()}
                for var in var_list]
      # Compute updated values.
      grads, params = zip(*grads_and_vars)
      args = (params, grads, states, global_state)
      updates = self._compute_updates(*args)
      new_params, new_states, new_global_state, new_attention = updates
      # Assign op for new global state.
      update_ops = [tf.assign(gs, ngs)
                    for gs, ngs in zip(global_state, new_global_state)]
      # Create the assign ops for the params and state variables.
      args = (params, states, new_params, new_attention, new_states)
      for var, state, new_var, new_var_attend, new_state in zip(*args):
        # Assign updates to the state variables.
        state_assign_ops = [tf.assign(state_var, new_state[key])
                            for key, state_var in state.items()]
        # Update the parameter.
        with tf.control_dependencies(state_assign_ops):
          if self.use_attention:
            # Assign to the attended location, rather than the actual location
            # so that the gradients are computed where attention is.
            param_update_op = var.assign(new_var_attend)
          else:
            param_update_op = var.assign(new_var)
        with tf.name_scope("update_" + var.op.name):  #, tf.colocate_with(var):
          update_ops.append(param_update_op)
      # "true_param" slots hold the un-attended parameter values.
      real_params = [self.get_slot(var, "true_param") for var in var_list]
      if global_step is None:
        # NOTE: if using the optimizer in a non-test-optimizer setting (e.g.
        # on Inception), remove the real_params return value. Otherwise
        # the code will throw an error.
        return self._finish(update_ops, name), real_params
      else:
        # state_ops is imported at module level (outside this view).
        with tf.control_dependencies([self._finish(update_ops, "update")]):
          return state_ops.assign_add(global_step, 1, name=name).op, real_params
# Copyright 2017 Google, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A trainable optimizer that learns a learning rate schedule."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from learned_optimizer.optimizer import trainable_optimizer
class LearningRateSchedule(trainable_optimizer.TrainableOptimizer):
  """Learns a learning rate schedule over a fixed number of iterations."""

  def __init__(self, initial_rate=0.0, n_steps=1000, **kwargs):
    """Creates one trainable learning rate per optimization step.

    Args:
      initial_rate: Initial value for every per-step learning rate.
      n_steps: Number of distinct learning rates in the schedule.
      **kwargs: Additional keyword arguments for TrainableOptimizer.
    """
    # Steps past the end of the schedule reuse the final learning rate.
    self.max_index = tf.constant(n_steps-1, dtype=tf.int32)
    with tf.variable_scope(trainable_optimizer.OPTIMIZER_SCOPE):
      init = tf.constant_initializer(initial_rate)
      self.learning_rates = tf.get_variable("learning_rates",
                                            shape=([n_steps,]),
                                            initializer=init)
    super(LearningRateSchedule, self).__init__("LRS", ["itr"], **kwargs)

  def _initialize_state(self, var):
    """Each variable's state is just its iteration counter."""
    return {
        "itr": tf.constant(0, dtype=tf.int32),
    }

  def _compute_update(self, param, grad, state):
    """Takes a gradient step scaled by the scheduled learning rate."""
    # Look up the learning rate for the current iteration; iterations beyond
    # the schedule fall back to the last available learning rate.
    itr = state["itr"]
    index = tf.minimum(itr, self.max_index)
    learning_rate = tf.gather(self.learning_rates, index)
    # Update rule: parameter - learning_rate * gradient.
    updated_param = param - tf.scalar_mul(learning_rate, grad)
    return updated_param, {"itr": itr + 1}
# Copyright 2017 Google, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Custom RNN cells for hierarchical RNNs."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from learned_optimizer.optimizer import utils
class BiasGRUCell(tf.contrib.rnn.RNNCell):
  """GRU cell (cf. http://arxiv.org/abs/1406.1078) with an additional bias."""

  def __init__(self, num_units, activation=tf.tanh, scale=0.1,
               gate_bias_init=0., random_seed=None):
    """Initializes the cell.

    Args:
      num_units: Number of hidden units.
      activation: Activation for the candidate state (default: tanh).
      scale: Scale passed to the affine projections.
      gate_bias_init: Initial bias value for the gate projections.
      random_seed: Seed for weight initialization.
    """
    # NOTE(review): does not call super().__init__(); relies on RNNCell not
    # requiring it in this TF version — confirm when upgrading TF.
    self._num_units = num_units
    self._activation = activation
    self._scale = scale
    self._gate_bias_init = gate_bias_init
    self._random_seed = random_seed

  @property
  def state_size(self):
    """Size of the hidden state."""
    return self._num_units

  @property
  def output_size(self):
    """Size of the cell output (same as the hidden state)."""
    return self._num_units

  def __call__(self, inputs, state, bias=None):
    """Runs one GRU step, with an optional externally injected bias."""
    # The injected bias carries separate biases for the reset gate, the
    # update gate, and the candidate activation; default to zeros if absent.
    if bias is None:
      bias = tf.zeros((1, 3))
    r_bias, u_bias, c_bias = tf.split(bias, 3, 1)
    with tf.variable_scope(type(self).__name__):  # "BiasGRUCell"
      with tf.variable_scope("gates"):  # Reset gate and update gate.
        gate_proj = utils.affine([inputs, state], 2 * self._num_units,
                                 scale=self._scale,
                                 bias_init=self._gate_bias_init,
                                 random_seed=self._random_seed)
        reset_lin, update_lin = tf.split(gate_proj, 2, 1)
        reset = tf.nn.sigmoid(reset_lin + r_bias)
        update = tf.nn.sigmoid(update_lin + u_bias)
      with tf.variable_scope("candidate"):
        cand_proj = utils.affine([inputs, reset * state], self._num_units,
                                 scale=self._scale,
                                 random_seed=self._random_seed)
        candidate = self._activation(cand_proj + c_bias)
      new_h = update * state + (1 - update) * candidate
    return new_h, new_h
# Copyright 2017 Google, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A trainable ADAM optimizer that learns its internal variables."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import tensorflow as tf
from learned_optimizer.optimizer import trainable_optimizer as opt
from learned_optimizer.optimizer import utils
class TrainableAdam(opt.TrainableOptimizer):
  """Adam optimizer with learnable scalar parameters.

  See Kingma et. al., 2014 for algorithm (http://arxiv.org/abs/1412.6980).
  """

  def __init__(self,
               learning_rate=1e-3,
               beta1=0.9,
               beta2=0.999,
               epsilon=1e-8,
               **kwargs):
    """Initializes the TrainableAdam optimizer with the given initial values.

    Args:
      learning_rate: The learning rate (default: 1e-3).
      beta1: The exponential decay rate for the 1st moment estimates.
      beta2: The exponential decay rate for the 2nd moment estimates.
      epsilon: A small constant for numerical stability.
      **kwargs: Any additional keyword arguments for TrainableOptimizer.

    Raises:
      ValueError: if the learning rate or epsilon is not positive
      ValueError: if beta1 or beta2 is not in (0, 1).
    """
    if learning_rate <= 0:
      raise ValueError("Learning rate must be positive.")
    if epsilon <= 0:
      raise ValueError("Epsilon must be positive.")
    if not 0 < beta1 < 1 or not 0 < beta2 < 1:
      raise ValueError("Beta values must be between 0 and 1, exclusive.")
    self._reuse_vars = False
    with tf.variable_scope(opt.OPTIMIZER_SCOPE):
      def inv_sigmoid(x):
        return np.log(x / (1.0 - x))
      # The scalars are stored in unconstrained (log/logit) space so that
      # meta-training by gradient descent cannot push them out of range.
      self.log_learning_rate = tf.get_variable(
          "log_learning_rate",
          shape=[],
          initializer=tf.constant_initializer(np.log(learning_rate)))
      self.beta1_logit = tf.get_variable(
          "beta1_logit",
          shape=[],
          initializer=tf.constant_initializer(inv_sigmoid(beta1)))
      self.beta2_logit = tf.get_variable(
          "beta2_logit",
          shape=[],
          initializer=tf.constant_initializer(inv_sigmoid(beta2)))
      self.log_epsilon = tf.get_variable(
          "log_epsilon",
          shape=[],
          initializer=tf.constant_initializer(np.log(epsilon)))
    # Key names are derived from Algorithm 1 described in
    # https://arxiv.org/pdf/1412.6980.pdf
    state_keys = ["m", "v", "t"]
    super(TrainableAdam, self).__init__("Adam", state_keys, **kwargs)

  def _initialize_state(self, var):
    """Returns a dictionary mapping names of state variables to their values."""
    # All state is stored as flattened [num_elements, 1] column vectors.
    vectorized_shape = var.get_shape().num_elements(), 1
    return {key: tf.zeros(vectorized_shape) for key in self.state_keys}

  def _compute_update(self, param, grad, state):
    """Calculates the new internal state and parameters.

    If the gradient is sparse, updates the appropriate slices in the internal
    state and stacks the update tensor.

    Args:
      param: A tensor of parameters.
      grad: A tensor of gradients with the same shape as param.
      state: A dictionary containing any state for the optimizer.

    Returns:
      updated_param: The updated parameters.
      updated_state: The updated state variables in a dictionary.
    """
    with tf.variable_scope(opt.OPTIMIZER_SCOPE) as scope:
      if self._reuse_vars:
        scope.reuse_variables()
      else:
        self._reuse_vars = True
      (grad_values, first_moment, second_moment, timestep, grad_indices
      ) = self._extract_gradients_and_internal_state(
          grad, state, tf.shape(param))
      # Map the learned scalars back to their constrained ranges.
      beta1 = tf.nn.sigmoid(self.beta1_logit)
      beta2 = tf.nn.sigmoid(self.beta2_logit)
      epsilon = tf.exp(self.log_epsilon) + 1e-10
      learning_rate = tf.exp(self.log_learning_rate)
      old_grad_shape = tf.shape(grad_values)
      grad_values = tf.reshape(grad_values, [-1, 1])
      new_timestep = timestep + 1
      new_first_moment = self._update_adam_estimate(
          first_moment, grad_values, beta1)
      # BUG FIX: the second moment must use the same moving-average update
      # as the first moment; this previously called _debias_adam_estimate,
      # which divides by (1 - beta^t) instead of averaging.
      new_second_moment = self._update_adam_estimate(
          second_moment, tf.square(grad_values), beta2)
      debiased_first_moment = self._debias_adam_estimate(
          new_first_moment, beta1, new_timestep)
      debiased_second_moment = self._debias_adam_estimate(
          new_second_moment, beta2, new_timestep)
      # Propagating through the square root of 0 is very bad for stability.
      update = (learning_rate * debiased_first_moment /
                (tf.sqrt(debiased_second_moment + 1e-10) + epsilon))
      update = tf.reshape(update, old_grad_shape)
      if grad_indices is not None:
        # Sparse gradient: scatter the update and new state back into the
        # full-sized tensors.
        param_shape = tf.shape(param)
        update = utils.stack_tensor(
            update, grad_indices, param, param_shape[:1])
        new_first_moment = utils.update_slices(
            new_first_moment, grad_indices, state["m"], param_shape)
        new_second_moment = utils.update_slices(
            new_second_moment, grad_indices, state["v"], param_shape)
        new_timestep = utils.update_slices(
            new_timestep, grad_indices, state["t"], param_shape)
      new_param = param - update
      # collect the update and new state
      new_state = {
          "m": new_first_moment,
          "v": new_second_moment,
          "t": new_timestep
      }
    return new_param, new_state

  def _update_adam_estimate(self, estimate, value, beta):
    """Returns a beta-weighted average of estimate and value."""
    return (beta * estimate) + ((1 - beta) * value)

  def _debias_adam_estimate(self, estimate, beta, t_step):
    """Returns a debiased estimate based on beta and the timestep."""
    return estimate / (1 - tf.pow(beta, t_step))

  def _extract_gradients_and_internal_state(self, grad, state, param_shape):
    """Extracts the gradients and relevant internal state.

    If the gradient is sparse, extracts the appropriate slices from the state.

    Args:
      grad: The current gradient.
      state: The current state.
      param_shape: The shape of the parameter (used if gradient is sparse).

    Returns:
      grad_values: The gradient value tensor.
      first_moment: The first moment tensor (internal state).
      second_moment: The second moment tensor (internal state).
      timestep: The current timestep (internal state).
      grad_indices: The indices for the gradient tensor, if sparse.
          None otherwise.
    """
    grad_values = grad
    grad_indices = None
    first_moment = state["m"]
    second_moment = state["v"]
    timestep = state["t"]
    if isinstance(grad, tf.IndexedSlices):
      grad_indices, grad_values = utils.accumulate_sparse_gradients(grad)
      first_moment = utils.slice_tensor(
          first_moment, grad_indices, param_shape)
      second_moment = utils.slice_tensor(
          second_moment, grad_indices, param_shape)
      timestep = utils.slice_tensor(timestep, grad_indices, param_shape)
    return grad_values, first_moment, second_moment, timestep, grad_indices
# Copyright 2017 Google, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A base class definition for trainable optimizers."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import itertools
import tensorflow as tf
from tensorflow.python.framework import tensor_shape
OPTIMIZER_SCOPE = "LOL"
_LOCAL_VARIABLE_PREFIX = "local_state_"
_LOCAL_STATE_VARIABLE_COLLECTION = "local_state_collection"
EPSILON = 1e-6
class TrainableOptimizer(tf.train.Optimizer):
"""Base class for trainable optimizers.
A trainable optimizer is an optimizer that has parameters that can themselves
be learned (meta-optimized).
Subclasses must implement:
_compute_update(self, param, grad, state)
"""
  def __init__(self, name, state_keys, use_attention=False,
               use_log_objective=False, obj_train_max_multiplier=-1,
               use_second_derivatives=True, use_numerator_epsilon=False,
               **kwargs):
    """Initializes the optimizer with the given name and settings.

    Args:
      name: The name string for this optimizer.
      state_keys: The names of any required state variables (list)
      use_attention: Whether this optimizer uses attention (Default: False)
      use_log_objective: Whether this optimizer uses the logarithm of the
          objective when computing the loss (Default: False)
      obj_train_max_multiplier: The maximum multiplier for the increase in the
          objective before meta-training is stopped. If <= 0, meta-training is
          not stopped early. (Default: -1)
      use_second_derivatives: Whether this optimizer uses second derivatives in
          meta-training. This should be set to False if some second derivatives
          in the meta-training problem set are not defined in Tensorflow.
          (Default: True)
      use_numerator_epsilon: Whether to use epsilon in the numerator when
          scaling the problem objective during meta-training. (Default: False)
      **kwargs: Any additional keyword arguments. NOTE: these are accepted
          but not used by the base class.
    """
    self.use_second_derivatives = use_second_derivatives
    # Keys are sorted so slot creation order is deterministic across runs.
    self.state_keys = sorted(state_keys)
    self.use_attention = use_attention
    self.use_log_objective = use_log_objective
    self.obj_train_max_multiplier = obj_train_max_multiplier
    self.use_numerator_epsilon = use_numerator_epsilon
    use_locking = False
    super(TrainableOptimizer, self).__init__(use_locking, name)
def _create_slots(self, var_list):
"""Creates all slots needed by the variables.
Args:
var_list: A list of `Variable` objects.
"""
for var in var_list:
init_states = self._initialize_state(var)
for slot_name in sorted(init_states):
slot_var_name = "{}_{}".format(self.get_name(), slot_name)
value = init_states[slot_name]
self._get_or_make_slot(var, value, slot_name, slot_var_name)
def _initialize_state(self, var):
"""Initializes any state required for this variable.
Args:
var: a tensor containing parameters to be optimized
Returns:
state: a dictionary mapping state keys to initial state values (tensors)
"""
return {}
def _initialize_global_state(self):
"""Initializes any global state values."""
return []
def _apply_common(self, grad, var):
"""Applies the optimizer updates to the variables.
Note: this should only get called via _apply_dense or _apply_sparse when
using the optimizer via optimizer.minimize or optimizer.apply_gradients.
During meta-training, the optimizer.train function should be used to
construct an optimization path that is differentiable.
Args:
grad: A tensor representing the gradient.
var: A tf.Variable with the same shape as grad.
Returns:
update_op: A tensorflow op that assigns new values to the variable, and
also defines dependencies that update the state variables for the
optimizer.
"""
state = {key: self.get_slot(var, key) for key in self.get_slot_names()}
new_var, new_state = self._compute_update(var, grad, state)
state_assign_ops = [tf.assign(state_var, new_state[key])
for key, state_var in state.items()]
with tf.control_dependencies(state_assign_ops):
update_op = var.assign(new_var)
return update_op
  def _apply_dense(self, grad, var):
    """Adds ops to apply dense gradients to 'var'.

    Dense and sparse gradients share the same update path via _apply_common.
    """
    return self._apply_common(grad, var)
  def _apply_sparse(self, grad, var):
    """Adds ops to apply sparse gradients to 'var'.

    Dense and sparse gradients share the same update path via _apply_common.
    """
    return self._apply_common(grad, var)
  def _compute_update(self, param, grad, state):
    """Computes the update step for optimization.

    Abstract: every concrete trainable optimizer must implement this.

    Args:
      param: A tensor of parameters to optimize.
      grad: The gradient tensor of the objective with respect to the parameters.
          (It has the same shape as param.)
      state: A dictionary containing any extra state required by the optimizer.

    Returns:
      updated_params: The updated parameters.
      updated_state: The dictionary of updated state variable(s).

    Raises:
      NotImplementedError: Always, in this base class.
    """
    raise NotImplementedError
def _compute_updates(self, params, grads, states, global_state):
"""Maps the compute update functions for each parameter.
This function can be overriden by a subclass if the subclass wants to
combine information across the different parameters in the list.
Args:
params: A list of parameter tensors.
grads: A list of gradients corresponding to each parameter.
states: A list of state variables corresponding to each parameter.
global_state: A list of global state variables for the problem.
Returns:
new_params: The updated parameters.
new_states: The updated states.
new_global_state: The updated global state.
attention_params: A list of attention parameters. This is the same as
new_params if the optimizer does not use attention.
"""
# Zip up the arguments to _compute_update.
args = zip(params, grads, states)
# Call compute_update on each set of parameter/gradient/state args.
new_params, new_states = zip(*list(
itertools.starmap(self._compute_update, args)))
# Global state is unused in the basic case, just pass it through.
return list(new_params), list(new_states), global_state, list(new_params)
def train(self, problem, dataset):
"""Creates graph operations to train the optimizer.
Args:
problem: A problem_generator.Problem instance to train on.
dataset: A datasets.Dataset tuple to use when training.
Returns:
meta_objective: A tensorflow operation for computing the meta-objective
obj_weights: A tensor placeholder for feeding in the objective weights
obj_values: The subproblem objective values during optimization
batches: The batch indexes tensor for overriding with feed_dict
first_unroll: A placeholder signifying if this is a first unroll
(this will propagate the gradients slightly differently).
reset_state: A placeholder signifying that the rnn state should be reset.
output_state: The final state of the optimizer
init_loop_vars_to_override: Local variables that can be assigned to
propagate the optimizer and problem state for unrolling
final_loop_vals: Final values of the loop variables that can be
assigned to init_loop_vars_to_override.
"""
# Placeholder for the objective weights
obj_weights = tf.placeholder(tf.float32)
num_iter = tf.shape(obj_weights)[0]
# Unpack the dataset and generate the minibatches for training
data, labels = dataset
# Convert the ndarrays to tensors so we can pass them back in via feed_dict
data = tf.constant(data)
labels = tf.constant(labels)
batches = tf.placeholder(tf.int32)
first_unroll = tf.placeholder_with_default(False, [])
reset_state = tf.placeholder_with_default(False, [])
training_output = collections.namedtuple("TrainingOutput",
["metaobj",
"obj_weights",
"problem_objectives",
"initial_obj",
"batches",
"first_unroll",
"reset_state",
"output_state",
"init_loop_vars",
"output_loop_vars"])
def loop_body(itr, obj_accum, params, attend_params, flattened_states,
global_state, all_obj, unused_init_obj, data,
labels, batches):
"""Body of the meta-training while loop for optimizing a sub-problem.
Args:
itr: The current meta-training iteration.
obj_accum: The accumulated objective over all training steps so far.
params: The parameters of the sub-problem.
attend_params: The parameters of the sub-problems at the attended
location.
flattened_states: The states of the trainable optimizer, sorted and
flattened into a list (since a while loop can't handle nested lists
or dictionaries).
global_state: The global state of the optimizer.
all_obj: The list of all objective values in the training process.
unused_init_obj: The initial objective (unused here, but needed in the
variable list because it's used in a stopping condition in the
loop_cond.)
data: The data for this problem.
labels: The labels corresponding to the data.
batches: The batch indexes needed for shuffled minibatch creation.
Returns:
itr: The updated meta-training iteration.
obj_accum: The updated accumulated objective.
params: The new parameters of the sub-problem.
attend_params: The new parameters of the sub-problems at the attended
location.
flattened_states: The new states of the trainable optimizer.
global_state: The updated global state.
all_obj: The updates list of all objective values.
unused_init_obj: The initial objective.
data: The data for this problem.
labels: The labels corresponding to the data.
batches: The batch indexes needed for shuffled minibatch creation.
"""
batch_indices = tf.gather(batches, itr)
batch_data = tf.gather(data, batch_indices)
batch_labels = tf.gather(labels, batch_indices)
# Compute the objective over the entire dataset (full batch).
obj = problem.objective(params, data, labels)
# Compute the gradients on just the current batch
if self.use_attention:
current_obj = problem.objective(attend_params, batch_data, batch_labels)
grads = problem.gradients(current_obj, attend_params)
else:
current_obj = problem.objective(params, batch_data, batch_labels)
grads = problem.gradients(current_obj, params)
if not self.use_second_derivatives:
new_grads = []
for grad in grads:
if isinstance(grad, tf.IndexedSlices):
new_grads.append(
tf.IndexedSlices(tf.stop_gradient(grad.values), grad.indices))
else:
new_grads.append(tf.stop_gradient(grad))
grads = new_grads
# store the objective value for the entire problem at each iteration
all_obj = tf.concat([all_obj, tf.reshape(obj, (1,))], 0)
# accumulate the weighted objective for the entire dataset
acc = tf.gather(obj_weights, itr) * obj
obj_accum = tf.add(obj_accum, acc)
# Set the shape to keep the shape invariant for obj_accum. Without this,
# the graph builder thinks the tensor shape is unknown on the 2nd iter.
obj_accum.set_shape([])
# convert flattened_states to dictionaries
dict_states = [dict(zip(self.state_keys, flat_state))
for flat_state in flattened_states]
# compute the new parameters and states
args = (params, grads, dict_states, global_state)
updates = self._compute_updates(*args)
new_params, new_states, new_global_state, new_attend_params = updates
# flatten the states
new_flattened_states = map(flatten_and_sort, new_states)
return [itr + 1, obj_accum, new_params, new_attend_params,
new_flattened_states, new_global_state, all_obj, unused_init_obj,
data, labels, batches]
def loop_cond(itr, obj_accum, unused_params, unused_attend_params,
unused_flattened_states, unused_global_state, all_obj,
init_obj, *args):
"""Termination conditions of the sub-problem optimization loop."""
del args # unused
cond1 = tf.less(itr, num_iter) # We've run < num_iter times
cond2 = tf.is_finite(obj_accum) # The objective is still finite
if self.obj_train_max_multiplier > 0:
current_obj = tf.gather(all_obj, itr)
# Account for negative init_obj too
max_diff = (self.obj_train_max_multiplier - 1) * tf.abs(init_obj)
max_obj = init_obj + max_diff
# The objective is a reasonable multiplier of the original objective
cond3 = tf.less(current_obj, max_obj)
return tf.logical_and(tf.logical_and(cond1, cond2), cond3,
name="training_loop_cond")
else:
return tf.logical_and(cond1, cond2, name="training_loop_cond")
init = self._initialize_training_loop_parameters(
problem, data, labels, batches, first_unroll, reset_state)
loop_vars, invariants, initial_obj, init_loop_vars_to_override = init
loop_output = tf.while_loop(loop_cond, loop_body, loop_vars,
swap_memory=True, shape_invariants=invariants)
meta_obj, problem_objectives = loop_output[1], loop_output[6]
# The meta objective is normalized by the initial objective at the start of
# the series of partial unrolls.
scaled_meta_objective = self.scale_objective(
meta_obj, problem_objectives, initial_obj)
final_loop_vals = (
[initial_obj] + loop_output[2] + loop_output[3] + loop_output[5])
final_loop_vals.extend(itertools.chain(*loop_output[4]))
return training_output(scaled_meta_objective,
obj_weights,
problem_objectives,
initial_obj,
batches,
first_unroll,
reset_state,
loop_output[4],
init_loop_vars_to_override,
final_loop_vals)
  def _initialize_training_loop_parameters(
      self, problem, data, labels, batches, first_unroll, reset_state):
    """Initializes the vars and params needed for the training process.

    Args:
      problem: The problem being optimized.
      data: The data for the problem.
      labels: The corresponding labels for the data.
      batches: The indexes needed to create shuffled batches of the data.
      first_unroll: Whether this is the first unroll in a partial unrolling.
      reset_state: Whether RNN state variables should be reset.

    Returns:
      loop_vars: The while loop variables for training.
      invariants: The corresponding variable shapes (required by while loop).
      initial_obj: The initial objective (used later for scaling).
      init_loop_vars_to_override: The loop vars that can be overridden when
          performing training via partial unrolls.
    """
    # Extract these separately so we don't have to make inter-variable
    # dependencies.
    initial_tensors = problem.init_tensors()

    # On the first unroll the loop starts from the problem's fresh init
    # values; on subsequent unrolls it starts from the local variables that
    # were assigned the previous unroll's final values.
    return_initial_tensor_values = first_unroll
    initial_params_vars, initial_params = local_state_variables(
        initial_tensors, return_initial_tensor_values)
    # Attention parameters get their own (identically initialized) variables.
    initial_attend_params_vars, initial_attend_params = local_state_variables(
        initial_tensors, return_initial_tensor_values)
    # Recalculate the initial objective for the list on each partial unroll with
    # the new initial_params. initial_obj holds the value from the very first
    # unroll.
    initial_obj_init = problem.objective(initial_params, data, labels)
    return_initial_obj_init = first_unroll
    [initial_obj_var], [initial_obj] = local_state_variables(
        [initial_obj_init], return_initial_obj_init)

    # Initialize the loop variables.
    initial_itr = tf.constant(0, dtype=tf.int32)
    initial_meta_obj = tf.constant(0, dtype=tf.float32)
    # N.B. the use of initial_obj_init here rather than initial_obj
    initial_problem_objectives = tf.reshape(initial_obj_init, (1,))

    # Initialize the extra state: one flattened (key-sorted) state list per
    # parameter tensor, each backed by resettable local variables.
    initial_state_vars = []
    initial_state = []
    state_shapes = []
    return_initial_state_values = reset_state
    for param in initial_tensors:
      param_state_vars, param_state = local_state_variables(
          flatten_and_sort(self._initialize_state(param)),
          return_initial_state_values)

      initial_state_vars.append(param_state_vars)
      initial_state.append(param_state)
      state_shapes.append([f.get_shape() for f in param_state])

    # Initialize any global (problem-level) state.
    initial_global_state_vars, initial_global_state = local_state_variables(
        self._initialize_global_state(), return_initial_state_values)

    global_shapes = []
    for item in initial_global_state:
      global_shapes.append(item.get_shape())

    # build the list of loop variables:
    loop_vars = [
        initial_itr,
        initial_meta_obj,
        initial_params,       # Local variables.
        initial_attend_params,  # Local variables.
        initial_state,        # Local variables.
        initial_global_state,   # Local variables.
        initial_problem_objectives,
        initial_obj,          # Local variable.
        data,
        labels,
        batches,
    ]

    # Shape invariants must parallel loop_vars entry-for-entry.
    invariants = [
        initial_itr.get_shape(),
        initial_meta_obj.get_shape(),
        [t.get_shape() for t in initial_params],
        [t.get_shape() for t in initial_attend_params],
        state_shapes,
        global_shapes,
        tensor_shape.TensorShape([None]),  # The problem objectives list grows
        initial_obj.get_shape(),
        tensor_shape.unknown_shape(),  # Placeholder shapes are unknown
        tensor_shape.unknown_shape(),
        tensor_shape.unknown_shape(),
    ]

    # Initialize local variables that we will override with final tensors at the
    # next iter.
    init_loop_vars_to_override = (
        [initial_obj_var] + initial_params_vars + initial_attend_params_vars +
        initial_global_state_vars)
    init_loop_vars_to_override.extend(itertools.chain(*initial_state_vars))

    return loop_vars, invariants, initial_obj, init_loop_vars_to_override
def scale_objective(self, total_obj, all_objs, initial_obj,
obj_scale_eps=1e-6):
"""Normalizes the objective based on the initial objective value.
Args:
total_obj: The total accumulated objective over the training run.
all_objs: A list of all the individual objectives over the training run.
initial_obj: The initial objective value.
obj_scale_eps: The epsilon value to use in computations for stability.
Returns:
The scaled objective as a single value.
"""
if self.use_log_objective:
if self.use_numerator_epsilon:
scaled_problem_obj = ((all_objs + obj_scale_eps) /
(initial_obj + obj_scale_eps))
log_scaled_problem_obj = tf.log(scaled_problem_obj)
else:
scaled_problem_obj = all_objs / (initial_obj + obj_scale_eps)
log_scaled_problem_obj = tf.log(scaled_problem_obj + obj_scale_eps)
return tf.reduce_mean(log_scaled_problem_obj)
else:
return total_obj / (initial_obj + obj_scale_eps)
def local_state_variables(init_values, return_init_values):
  """Create local variables initialized from init_values.

  This will create local variables from a list of init_values. Each variable
  will be named based on the value's shape and dtype.

  As a convenience, a boolean tensor allows you to return the value from
  the created local variable or from the original init value.

  Args:
    init_values: iterable of tensors
    return_init_values: boolean tensor

  Returns:
    local_vars: list of the created local variables.
    vals: if return_init_values is true, then this returns the values of
      init_values. Otherwise it returns the values of the local_vars.
  """
  if not init_values:
    return [], []

  # Per-graph counter of how many variables share each (shape, dtype) name,
  # used below to give each new variable a unique suffix.
  # This generates a harmless warning when saving the metagraph.
  variable_use_count = tf.get_collection_ref(_LOCAL_STATE_VARIABLE_COLLECTION)
  if not variable_use_count:
    variable_use_count.append(collections.defaultdict(int))
  variable_use_count = variable_use_count[0]

  local_vars = []
  with tf.variable_scope(OPTIMIZER_SCOPE):
    # We can't use the init_value as an initializer as init_value may
    # itself depend on some problem variables. This would produce
    # inter-variable initialization order dependence which TensorFlow
    # sucks at making easy.
    for init_value in init_values:
      name = create_local_state_variable_name(init_value)
      unique_name = name + "_" + str(variable_use_count[name])
      variable_use_count[name] += 1
      # The overarching idea here is to be able to reuse variables between
      # different sessions on the same TensorFlow master without errors. By
      # uniquifying based on the type and name we mirror the checks made inside
      # TensorFlow, while still allowing some memory reuse. Ultimately this is a
      # hack due to the broken Session.reset().
      local_vars.append(
          tf.get_local_variable(
              unique_name,
              initializer=tf.zeros(
                  init_value.get_shape(), dtype=init_value.dtype)))

  # It makes things a lot simpler if we use the init_value the first
  # iteration, instead of the variable itself. It allows us to propagate
  # gradients through it as well as simplifying initialization. The variable
  # ends up assigned to after the first iteration.
  vals = tf.cond(return_init_values, lambda: init_values, lambda: local_vars)
  if len(init_values) == 1:
    # tf.cond extracts elements from singleton lists.
    vals = [vals]
  return local_vars, vals
def create_local_state_variable_name(tensor):
  """Create a name of the variable based on its type and shape."""
  shape = tensor.get_shape()
  if not shape.is_fully_defined():
    raise ValueError("Need a fully specified shape to create a local variable.")
  # Name pattern: <prefix><dim0>_<dim1>_..._<dtype>.
  dims = "_".join(str(dim) for dim in shape.as_list())
  return _LOCAL_VARIABLE_PREFIX + dims + "_" + tensor.dtype.name
def is_local_state_variable(op):
  """Returns if this op is a local state variable created for training."""
  # Must be a TF variable op whose name lives under the optimizer's
  # local-variable namespace.
  if op.node_def.op not in ("Variable", "VariableV2"):
    return False
  return op.name.startswith(OPTIMIZER_SCOPE + "/" + _LOCAL_VARIABLE_PREFIX)
def flatten_and_sort(dictionary):
  """Flattens a dictionary into a list of values sorted by the keys."""
  sorted_items = sorted(dictionary.items(), key=lambda item: item[0])
  return [value for _, value in sorted_items]
# Copyright 2017 Google, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities and helper functions."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import tensorflow as tf
def make_finite(t, replacement):
  """Replaces non-finite tensor values with the replacement value."""
  finite_mask = tf.is_finite(t)
  return tf.where(finite_mask, t, replacement)
def asinh(x):
  """Computes the inverse hyperbolic sine function (in tensorflow)."""
  # asinh(x) = log(x + sqrt(x^2 + 1))
  return tf.log(x + tf.sqrt(x ** 2 + 1.))
def affine(inputs, output_size, scope="Affine", scale=0.1, vec_mean=0.,
           include_bias=True, bias_init=0., random_seed=None):
  """Computes an affine function of the inputs.

  Creates or recalls tensorflow variables "Matrix" and "Bias"
  to generate an affine operation on the input.

  If the inputs are a list of tensors, they are concatenated together.

  Initial weights for the matrix are drawn from a Gaussian with zero
  mean and standard deviation that is the given scale divided by the
  square root of the input dimension. Initial weights for the bias are
  set to zero.

  Args:
    inputs: List of tensors with shape (batch_size, input_size)
    output_size: Size (dimension) of the output
    scope: Variable scope for these parameters (default: "Affine")
    scale: Initial weight scale for the matrix parameters (default: 0.1),
        this constant is divided by the sqrt of the input size to get the
        std. deviation of the initial weights
    vec_mean: The mean for the random initializer
    include_bias: Whether to include the bias term
    bias_init: The initializer bias (default 0.)
    random_seed: Random seed for random initializers. (Default: None)

  Returns:
    output: Tensor with shape (batch_size, output_size)
  """
  # Merge the (possibly multiple) input tensors along the feature axis.
  joined = tf.concat(inputs, 1)

  with tf.variable_scope(scope):
    input_size = joined.get_shape().as_list()[1]

    # Scale the initializer's stddev by 1/sqrt(fan-in).
    stddev = scale / np.sqrt(input_size)
    weight_init = tf.random_normal_initializer(mean=vec_mean, stddev=stddev,
                                               seed=random_seed)
    weight_matrix = tf.get_variable("Matrix", [input_size, output_size],
                                    dtype=tf.float32, initializer=weight_init)

    if include_bias:
      bias_term = tf.get_variable(
          "Bias", [output_size], dtype=tf.float32,
          initializer=tf.constant_initializer(bias_init, tf.float32))
    else:
      bias_term = 0.

    return tf.matmul(joined, weight_matrix) + bias_term
def project(inputs, weights, bias=0., activation=tf.identity):
  """Computes an affine or linear projection of the inputs.

  Projects the inputs onto the given weight vector and (optionally)
  adds a bias and passes the result through an activation function.

  Args:
    inputs: matrix of inputs with shape [batch_size, dim]
    weights: weight matrix with shape [dim, output_dim]
    bias: bias vector with shape [output_dim] (default: 0)
    activation: nonlinear activation function (default: tf.identity)

  Returns:
    outputs: an op which computes activation(inputs @ weights + bias)
  """
  projection = tf.matmul(inputs, weights) + bias
  return activation(projection)
def new_mean_squared(grad_vec, decay, ms):
  """Calculates the new accumulated mean squared of the gradient.

  Args:
    grad_vec: the vector for the current gradient
    decay: the decay term
    ms: the previous mean_squared value

  Returns:
    the new mean_squared value
  """
  decay_size = decay.get_shape().num_elements()
  # Runtime sanity checks: every decay entry must lie in [0, 1].
  decay_check_ops = [
      tf.assert_less_equal(decay, 1., summarize=decay_size),
      tf.assert_greater_equal(decay, 0., summarize=decay_size)]

  # Tying the checks to grad_squared ensures they execute before anything
  # downstream that consumes the squared gradient.
  with tf.control_dependencies(decay_check_ops):
    grad_squared = tf.square(grad_vec)

  # If the previous mean_squared is the 0 vector, don't use the decay and just
  # return the full grad_squared. This should only happen on the first timestep.
  decay = tf.cond(tf.reduce_all(tf.equal(ms, 0.)),
                  lambda: tf.zeros_like(decay, dtype=tf.float32), lambda: decay)

  # Update the running average of squared gradients.
  # The epsilon keeps the result strictly positive so later sqrt/division by
  # the RMS is safe even for exactly-zero gradients.
  epsilon = 1e-12
  return (1. - decay) * (grad_squared + epsilon) + decay * ms
def rms_scaling(gradient, decay, ms, update_ms=True):
  """Vectorizes and scales a tensor of gradients.

  Args:
    gradient: the current gradient
    decay: the current decay value.
    ms: the previous mean squared value
    update_ms: Whether to update the mean squared value (default: True)

  Returns:
    The scaled gradient and the new ms value if update_ms is True,
    the old ms value otherwise.
  """
  # Flatten the gradient into a column vector.
  grad_vec = tf.reshape(gradient, [-1, 1])
  if update_ms:
    ms = new_mean_squared(grad_vec, decay, ms)
  # Normalize by the RMS, then squash with asinh so very large gradient
  # magnitudes are compressed.
  scaled = asinh(grad_vec / tf.sqrt(ms + 1e-16))
  return scaled, ms
def accumulate_sparse_gradients(grad):
  """Accumulates repeated indices of a sparse gradient update.

  Args:
    grad: a tf.IndexedSlices gradient

  Returns:
    grad_indices: unique indices
    grad_values: gradient values corresponding to the indices
  """
  # tf.unique returns the distinct indices plus, for every original entry,
  # the position of its distinct index -- exactly the segment ids needed to
  # sum duplicate entries together.
  unique_indices, segment_ids = tf.unique(grad.indices)
  summed_values = tf.unsorted_segment_sum(
      grad.values, segment_ids, tf.shape(unique_indices)[0])
  return unique_indices, summed_values
def slice_tensor(dense_tensor, indices, head_dims):
  """Extracts slices from a partially flattened dense tensor.

  indices is assumed to index into the first dimension of head_dims.
  dense_tensor is assumed to have a shape [D_0, D_1, ...] such that
  prod(head_dims) == D_0. This function will extract slices along the
  first_dimension of head_dims.

  Example:

  Consider a tensor with shape head_dims = [100, 2] and a dense_tensor with
  shape [200, 3]. Note that the first dimension of dense_tensor equals the
  product of head_dims. This function will reshape dense_tensor such that
  its shape is now [100, 2, 3] (i.e. the first dimension became head-dims)
  and then slice it along the first dimension. After slicing, the slices will
  have their initial dimensions flattened just as they were in dense_tensor
  (e.g. if there are 4 indices, the return value will have a shape of [4, 3]).

  Args:
    dense_tensor: a N-D dense tensor. Shape: [D_0, D_1, ...]
    indices: a 1-D integer tensor. Shape: [K]
    head_dims: True dimensions of the dense_tensor's first dimension.

  Returns:
    Extracted slices. Shape [K, D_1, ...]
  """
  tail_dims = tf.shape(dense_tensor)[1:]
  # Un-flatten the leading dimension so gather addresses head_dims[0] rows.
  unflattened = tf.reshape(dense_tensor,
                           tf.concat([head_dims, tail_dims], 0))
  gathered = tf.gather(unflattened, indices)
  # NOTE(siege): This kills the shape annotation.
  return tf.reshape(gathered, tf.concat([[-1], tail_dims], 0))
def stack_tensor(slices, indices, dense_tensor, head_dims):
  """Reconstitutes a tensor from slices and corresponding indices.

  This is an inverse operation to slice_tensor. Missing slices are set to 0.

  Args:
    slices: a tensor. Shape [K, D_1, ...]
    indices: a 1-D integer tensor. Shape: [K]
    dense_tensor: the original tensor the slices were taken
      from. Shape: [D_0, D_1, ...]
    head_dims: True dimensions of the dense_tensor's first dimension.

  Returns:
    Reconstituted tensor. Shape: [D_0, D_1, ...]
  """
  # NOTE(siege): This cast shouldn't be necessary.
  row_indices = tf.cast(indices, tf.int32)
  tail_dims = tf.shape(dense_tensor)[1:]
  dense_shape = tf.concat([head_dims, tail_dims], 0)

  # Reshape the slices to line up with the un-flattened dense shape, then
  # scatter them into a zero tensor of that shape.
  reshaped_slices = tf.reshape(slices, tf.concat([[-1], dense_shape[1:]], 0))
  scatter_indices = tf.expand_dims(row_indices, -1)
  stacked = tf.scatter_nd(scatter_indices, reshaped_slices, dense_shape)
  return tf.reshape(stacked, tf.shape(dense_tensor))
def update_slices(slices, indices, dense_tensor, head_dims):
  """Reconstitutes a tensor from slices and corresponding indices.

  Like stack_tensor, but instead of setting missing slices to 0, sets them to
  what they were in the original tensor. The return value is reshaped to be
  the same as dense_tensor.

  Args:
    slices: a tensor. Shape [K, D_1, ...]
    indices: a 1-D integer tensor. Shape: [K]
    dense_tensor: the original tensor the slices were taken
      from. Shape: [D_0, D_1, ...]
    head_dims: True dimensions of the dense_tensor's first dimension.

  Returns:
    Reconstituted tensor. Shape: [D_0, D_1, ...]
  """
  # NOTE(siege): This cast shouldn't be necessary.
  indices = tf.cast(indices, tf.int32)

  tail_dims = tf.shape(dense_tensor)[1:]
  dense_shape = tf.concat([head_dims, tail_dims], 0)

  # Build a boolean mask over the first head dimension marking which rows
  # were updated: scatter 1s at the updated indices, then compare to 1.
  update_mask_vals = tf.fill(tf.shape(indices), 1)
  reshaped_indices = tf.expand_dims(indices, -1)
  update_mask = tf.equal(
      tf.scatter_nd(reshaped_indices, update_mask_vals, head_dims[:1]), 1)

  # Zero-filled reconstruction of the slices, then a per-row select between
  # the updated rows and the original tensor's rows.
  reshaped_dense_slices = tf.reshape(
      stack_tensor(slices, indices, dense_tensor, head_dims), dense_shape)
  reshaped_dense_tensor = tf.reshape(dense_tensor, dense_shape)

  return tf.reshape(
      tf.where(update_mask, reshaped_dense_slices, reshaped_dense_tensor),
      tf.shape(dense_tensor))
# BUILD rules for the problem and dataset libraries used to meta-train
# learned optimizers.

package(default_visibility = ["//visibility:public"])

# Libraries
# =========

py_library(
    name = "datasets",
    srcs = ["datasets.py"],
    deps = [
    ],
)

py_library(
    name = "model_adapter",
    srcs = ["model_adapter.py"],
    deps = [
        ":problem_generator",
    ],
)

py_library(
    name = "problem_generator",
    srcs = ["problem_generator.py"],
    deps = [
        ":problem_spec",
    ],
)

py_library(
    name = "problem_sets",
    srcs = ["problem_sets.py"],
    deps = [
        ":datasets",
        ":model_adapter",
        ":problem_generator",
    ],
)

py_library(
    name = "problem_spec",
    srcs = ["problem_spec.py"],
    deps = [],
)
# Copyright 2017 Google, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Functions to generate or load datasets for supervised learning."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import namedtuple
import numpy as np
from sklearn.datasets import make_classification
# Largest value accepted by np.random.seed (2**32 - 1).
MAX_SEED = 4294967295
class Dataset(namedtuple("Dataset", "data labels")):
  """Helper class for managing a supervised learning dataset.

  Args:
    data: an array of type float32 with N samples, each of which is the set
      of features for that sample. (Shape (N, D_i), where N is the number of
      samples and D_i is the number of features for that sample.)
    labels: an array of type int32 or int64 with N elements, indicating the
      class label for the corresponding set of features in data.
  """
  # Since this is an immutable object, we don't need to reserve slots.
  __slots__ = ()

  @property
  def size(self):
    """Dataset size (number of samples)."""
    return len(self.data)

  def batch_indices(self, num_batches, batch_size):
    """Creates indices of shuffled minibatches.

    Args:
      num_batches: the number of batches to generate
      batch_size: the size of each batch

    Returns:
      batch_indices: a list of minibatch indices, arranged so that the dataset
          is randomly shuffled.

    Raises:
      ValueError: if the data and labels have different lengths
    """
    num_samples = len(self.data)
    if num_samples != len(self.labels):
      raise ValueError("Labels and data must have the same number of samples.")

    # Walk through shuffled sample positions, reshuffling whenever an epoch
    # boundary is crossed. Follows the logic in mnist.py so the whole dataset
    # is covered before any sample repeats.
    shuffled = np.arange(num_samples)
    np.random.shuffle(shuffled)

    all_batches = []
    cursor = 0
    for _ in range(num_batches):
      start = cursor
      cursor += batch_size
      if cursor > num_samples:
        # Finished epoch: reshuffle and start the next epoch from the top.
        np.random.shuffle(shuffled)
        start = 0
        cursor = batch_size
      all_batches.append(shuffled[start:cursor].tolist())
    return all_batches
def noisy_parity_class(n_samples,
                       n_classes=2,
                       n_context_ids=5,
                       noise_prob=0.25,
                       random_seed=None):
  """Returns a randomly generated sparse-to-sparse dataset.

  The label is a parity class of a set of context classes.

  Args:
    n_samples: number of samples (data points)
    n_classes: number of class labels (default: 2)
    n_context_ids: how many classes to take the parity of (default: 5).
    noise_prob: how often to corrupt the label (default: 0.25)
    random_seed: seed used for drawing the random data (default: None)

  Returns:
    dataset: A Dataset namedtuple containing the generated data and labels
  """
  np.random.seed(random_seed)
  # Each sample is a vector of context class ids; its label is the (noisy)
  # sum of those ids modulo the number of classes.
  features = np.random.randint(0, n_classes, [n_samples, n_context_ids])
  label_noise = np.random.binomial(1, noise_prob, [n_samples])
  labels = (np.sum(features, 1) + label_noise) % n_classes
  return Dataset(features.astype("float32"), labels.astype("int32"))
def random(n_features, n_samples, n_classes=2, sep=1.0, random_seed=None):
  """Returns a randomly generated classification dataset.

  Args:
    n_features: number of features (dependent variables)
    n_samples: number of samples (data points)
    n_classes: number of class labels (default: 2)
    sep: separation of the two classes, a higher value corresponds to
        an easier classification problem (default: 1.0)
    random_seed: seed used for drawing the random data (default: None)

  Returns:
    dataset: A Dataset namedtuple containing the generated data and labels
  """
  # Delegate the generation to sklearn; every feature is informative.
  features, targets = make_classification(n_samples=n_samples,
                                          n_features=n_features,
                                          n_informative=n_features,
                                          n_redundant=0,
                                          n_classes=n_classes,
                                          class_sep=sep,
                                          random_state=random_seed)
  return Dataset(features.astype("float32"), targets.astype("int32"))
def random_binary(n_features, n_samples, random_seed=None):
  """Returns a randomly generated dataset of binary values.

  Args:
    n_features: number of features (dependent variables)
    n_samples: number of samples (data points)
    random_seed: seed used for drawing the random data (default: None)

  Returns:
    dataset: A Dataset namedtuple containing the generated data and labels
  """
  if random_seed is None:
    random_seed = np.random.randint(MAX_SEED)
  np.random.seed(random_seed)

  features = np.random.randint(2, size=(n_samples, n_features))
  # Labels are unused placeholders for this dataset; fill with zeros.
  labels = np.zeros((n_samples, 1))
  return Dataset(features.astype("float32"), labels.astype("int32"))
def random_symmetric(n_features, n_samples, random_seed=None):
  """Returns a randomly generated dataset of values and their negatives.

  The data consists of n_samples // 2 random rows concatenated with their
  negations, so every sample's mirror image is also present. Note that for
  odd n_samples the dataset therefore contains n_samples - 1 samples.

  Args:
    n_features: number of features (dependent variables)
    n_samples: number of samples (data points)
    random_seed: seed used for drawing the random data (default: None)

  Returns:
    dataset: A Dataset namedtuple containing the generated data and labels
  """
  random_seed = (np.random.randint(MAX_SEED) if random_seed is None
                 else random_seed)
  np.random.seed(random_seed)
  half = np.random.normal(size=(int(n_samples / 2), n_features))
  x = np.concatenate((half, -half), axis=0)
  # Labels are unused placeholders. BUGFIX: size them from the actual data
  # rows -- the original np.zeros((n_samples, 1)) produced one extra label
  # when n_samples was odd (data has 2 * (n_samples // 2) rows), which makes
  # Dataset.batch_indices raise on the length mismatch. Even n_samples is
  # unchanged.
  y = np.zeros((x.shape[0], 1))
  return Dataset(x.astype("float32"), y.astype("int32"))
def random_mlp(n_features, n_samples, random_seed=None, n_layers=6, width=20):
  """Returns a generated output of an MLP with random weights.

  Args:
    n_features: number of features (dependent variables)
    n_samples: number of samples (data points)
    random_seed: seed used for drawing the random data (default: None)
    n_layers: number of layers in random MLP
    width: width of the layers in random MLP

  Returns:
    dataset: A Dataset namedtuple containing the generated data and labels
  """
  if random_seed is None:
    random_seed = np.random.randint(MAX_SEED)
  np.random.seed(random_seed)

  x = np.random.normal(size=(n_samples, n_features))

  # Push the inputs through a randomly weighted ReLU MLP.
  activations = x
  fan_in = n_features
  scale_factor = np.sqrt(2.) / np.sqrt(n_features)
  for _ in range(n_layers):
    weights = np.random.normal(size=(fan_in, width)) * scale_factor
    activations = np.dot(activations, weights).clip(min=0)
    fan_in = width

  # Binarize the first output unit to produce the labels.
  y = activations[:, 0]
  y[y > 0] = 1

  return Dataset(x.astype("float32"), y.astype("int32"))
# A zero-length dataset, used as a sentinel for problems that take no data.
EMPTY_DATASET = Dataset(np.array([], dtype="float32"),
                        np.array([], dtype="int32"))
# Copyright 2017 Google, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Implementation of the ModelAdapter class."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import mock
import tensorflow as tf
from learned_optimizer.problems import problem_generator as pg
class ModelAdapter(pg.Problem):
  """Adapts Tensorflow models/graphs into a form suitable for meta-training.

  This class adapts an existing TensorFlow graph into a form suitable for
  meta-training a learned optimizer.
  """

  def __init__(self, make_loss_and_init_fn):
    """Wraps a model in the Problem interface.

    make_loss_and_init argument is a callable that returns a tuple of
    two other callables as follows.

    The first will construct most of the graph and return the problem loss. It
    is essential that this graph contains the totality of the model's variables,
    but none of its queues.

    The second will construct the model initialization graph given a list
    of parameters and return a callable that is passed an instance of
    tf.Session, and should initialize the models' parameters.

    An argument value function would look like this:

    ```python
    def make_loss_and_init_fn():
      inputs = queued_reader()

      def make_loss():
        return create_model_with_variables(inputs)

      def make_init_fn(parameters):
        saver = tf.train.Saver(parameters)
        def init_fn(sess):
          saver.restore(sess, ...)
        return init_fn

      return make_loss, make_init_fn
    ```

    Args:
      make_loss_and_init_fn: a callable, as described above
    """
    make_loss_fn, make_init_fn = make_loss_and_init_fn()

    self.make_loss_fn = make_loss_fn
    # Build the graph once (under a mock) purely to discover its variables.
    self.parameters, self.constants = _get_variables(make_loss_fn)

    if make_init_fn is not None:
      init_fn = make_init_fn(self.parameters + self.constants)
    else:
      # tf.initialize_variables is deprecated; tf.variables_initializer is the
      # supported equivalent (and matches usage elsewhere in this package).
      init_op = tf.variables_initializer(self.parameters + self.constants)
      init_fn = lambda sess: sess.run(init_op)

    tf.logging.info("ModelAdapter parameters: %s",
                    [op.name for op in self.parameters])
    tf.logging.info("ModelAdapter constants: %s",
                    [op.name for op in self.constants])

    super(ModelAdapter, self).__init__(
        [], random_seed=None, noise_stdev=0.0, init_fn=init_fn)

  def init_tensors(self, seed=None):
    """Returns the model's parameter tensors (seed is unused here)."""
    return self.parameters

  def init_variables(self, seed=None):
    """Returns the model's parameter variables (seed is unused here)."""
    # NOTE(siege): This is awkward, as these are not set as trainable.
    return self.parameters

  def objective(self, parameters, data=None, labels=None):
    """Computes the objective given a list of parameters.

    Args:
      parameters: The parameters to optimize (as a list of tensors)
      data: An optional batch of data for calculating objectives
      labels: An optional batch of corresponding labels

    Returns:
      A scalar tensor representing the objective value
    """
    # We need to set up a mapping based on the original parameter names, because
    # the parameters passed can be arbitrary tensors.
    parameter_mapping = {
        old_p.name: p
        for old_p, p in zip(self.parameters, parameters)
    }

    with tf.variable_scope(tf.get_variable_scope(), reuse=True):
      return _make_with_custom_variables(self.make_loss_fn, parameter_mapping)
def _get_variables(func):
  """Calls func, returning any variables created.

  The created variables are modified to not be trainable, and are placed into
  the LOCAL_VARIABLES collection.

  Args:
    func: Function to be called.

  Returns:
    A tuple (variables, constants) where the first element is the list of
    variables that were requested as trainable and the second is the list of
    those that were not. (All returned variables are actually created
    non-trainable; the split only records the caller's intent.)
  """
  variables = []
  constants = []

  # We need to create these variables like normal, so grab the original
  # constructor before we mock it.
  original_init = tf.Variable.__init__

  def custom_init(self, *args, **kwargs):
    # tf.Variable defaults trainable to True, so treat a missing kwarg as
    # trainable instead of raising a KeyError.
    trainable = kwargs.get("trainable", True)
    kwargs["trainable"] = False

    # Making these variables local keeps them out of the optimizer's checkpoints
    # somehow.
    kwargs["collections"] = [tf.GraphKeys.LOCAL_VARIABLES]
    original_init(self, *args, **kwargs)

    # Record the variable under the caller's requested trainability.
    if trainable:
      variables.append(self)
    else:
      constants.append(self)

  # This name-scope is just a nicety for TensorBoard.
  with tf.name_scope("unused_graph"):
    with mock.patch.object(tf.Variable, "__init__", custom_init):
      func()

  return variables, constants
def _make_with_custom_variables(func, variable_mapping):
  """Calls func and replaces the value of some variables created in it.

  Args:
    func: Function to be called.
    variable_mapping: A mapping of variable name to the replacement tensor or
      tf.Variable.

  Returns:
    The return value of func is returned.
  """
  # Keep a handle on the unpatched accessor so replacements (and unmapped
  # variables) can still be read normally.
  original_value = tf.Variable.value

  def custom_value(self):
    if self.name in variable_mapping:
      replacement = variable_mapping[self.name]
      tf.logging.info("Replaced %s with %s" % (self.name, replacement))

      # value() method needs to return a tensor, we need to call value on it.
      # This has to be done manually like this otherwise we'll get an infinite
      # loop.
      if isinstance(replacement, tf.Variable):
        replacement = original_value(replacement)

      return replacement
    else:
      return original_value(self)

  # Patch both the public value() accessor and the internal _AsTensor hook so
  # that reads of a mapped variable inside func see the replacement tensor.
  with mock.patch.object(tf.Variable, "value", custom_value):
    with mock.patch.object(tf.Variable, "_AsTensor", custom_value):
      return func()
# Copyright 2017 Google, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Generates toy optimization problems.
This module contains a base class, Problem, that defines a minimal interface
for optimization problems, and a few specific problem types that subclass it.
Test functions for optimization: http://www.sfu.ca/~ssurjano/optimization.html
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import tensorflow as tf
from learned_optimizer.problems import problem_spec as prob_spec
tf.app.flags.DEFINE_float("l2_reg_scale", 1e-3,
"""Scaling factor for parameter value regularization
in softmax classifier problems.""")
FLAGS = tf.app.flags.FLAGS
EPSILON = 1e-6
MAX_SEED = 4294967295
PARAMETER_SCOPE = "parameters"
_Spec = prob_spec.Spec
class Problem(object):
  """Base class for optimization problems.

  This defines an interface for optimization problems, including objective and
  gradients functions and a feed_generator function that yields data to pass to
  feed_dict in tensorflow.

  Subclasses of Problem must (at the minimum) override the objective method,
  which computes the objective/loss/cost to minimize, and specify the desired
  shape of the parameters in a list in the param_shapes attribute.
  """

  def __init__(self, param_shapes, random_seed, noise_stdev, init_fn=None):
    """Initializes a global random seed for the problem.

    Args:
      param_shapes: A list of tuples defining the expected shapes of the
        parameters for this problem
      random_seed: Either an integer (or None, in which case the seed is
        randomly drawn)
      noise_stdev: Strength (standard deviation) of added gradient noise
      init_fn: A function taking a tf.Session object that is used to
        initialize the problem's variables.

    Raises:
      ValueError: If the random_seed is not an integer and not None
    """
    if random_seed is not None and not isinstance(random_seed, int):
      raise ValueError("random_seed must be an integer or None")

    # Pick a random seed.
    self.random_seed = (np.random.randint(MAX_SEED) if random_seed is None
                        else random_seed)

    # Store the noise level.
    self.noise_stdev = noise_stdev

    # Set the random seed to ensure any random data in the problem is the same.
    # NOTE: this reseeds numpy's *global* RNG, so subclasses drawing random
    # problem data after calling this constructor get reproducible draws.
    np.random.seed(self.random_seed)

    # Store the parameter shapes.
    self.param_shapes = param_shapes

    if init_fn is not None:
      self.init_fn = init_fn
    else:
      # Default initializer is a no-op.
      self.init_fn = lambda _: None

  def init_tensors(self, seed=None):
    """Returns a list of tensors with the given shape."""
    return [tf.random_normal(shape, seed=seed) for shape in self.param_shapes]

  def init_variables(self, seed=None):
    """Returns a list of variables with the given shape."""
    with tf.variable_scope(PARAMETER_SCOPE):
      params = [tf.Variable(param) for param in self.init_tensors(seed)]
    return params

  def objective(self, parameters, data=None, labels=None):
    """Computes the objective given a list of parameters.

    Args:
      parameters: The parameters to optimize (as a list of tensors)
      data: An optional batch of data for calculating objectives
      labels: An optional batch of corresponding labels

    Returns:
      A scalar tensor representing the objective value
    """
    raise NotImplementedError

  def gradients(self, objective, parameters):
    """Compute gradients of the objective with respect to the parameters.

    Args:
      objective: The objective op (e.g. output of self.objective())
      parameters: A list of tensors (the parameters to optimize)

    Returns:
      A list of tensors representing the gradient for each parameter,
        returned in the same order as the given list
    """
    grads = tf.gradients(objective, list(parameters))
    noisy_grads = []

    for grad in grads:
      if isinstance(grad, tf.IndexedSlices):
        # Sparse gradient: only perturb the slices that are present.
        noise = self.noise_stdev * tf.random_normal(tf.shape(grad.values))
        new_grad = tf.IndexedSlices(grad.values + noise, grad.indices)
      else:
        # NOTE(review): tf.gradients returns None for parameters the objective
        # does not depend on, which would fail here — confirm all parameters
        # feed into the objective.
        new_grad = grad + self.noise_stdev * tf.random_normal(grad.get_shape())
      noisy_grads.append(new_grad)

    return noisy_grads
class Quadratic(Problem):
  """Optimizes a random quadratic function.

  The objective is: f(x) = (1/2) ||Wx - y||_2^2
  where W is a random Gaussian matrix and y is a random Gaussian vector.
  """

  def __init__(self, ndim, random_seed=None, noise_stdev=0.0):
    """Initializes a random quadratic problem."""
    super(Quadratic, self).__init__([(ndim, 1)], random_seed, noise_stdev)

    # Draw the problem instance (W, y) after the base class has seeded numpy,
    # so a fixed seed yields a reproducible problem.
    self.w = np.random.randn(ndim, ndim).astype("float32")
    self.y = np.random.randn(ndim, 1).astype("float32")

  def objective(self, params, data=None, labels=None):
    """Returns (1/2) ||W x - y||_2^2 for the single parameter tensor."""
    residual = tf.matmul(self.w, params[0]) - self.y
    return tf.nn.l2_loss(residual)
class SoftmaxClassifier(Problem):
  """Helper functions for supervised softmax classification problems."""

  def init_tensors(self, seed=None):
    """Returns a list of tensors with the given shape."""
    # Initial weights are scaled by 1.2 / sqrt(fan_in), where fan_in is the
    # first dimension of each parameter's shape.
    return [tf.random_normal(shape, seed=seed) * 1.2 / np.sqrt(shape[0])
            for shape in self.param_shapes]

  def inference(self, params, data):
    """Computes logits given parameters and data.

    Args:
      params: List of parameter tensors or variables
      data: Batch of features with samples along the first dimension

    Returns:
      logits: Un-normalized logits with shape (num_samples, num_classes)
    """
    raise NotImplementedError

  def objective(self, params, data, labels):
    """Computes the softmax cross entropy.

    Args:
      params: List of parameter tensors or variables
      data: Batch of features with samples along the first dimension
      labels: Vector of labels with the same number of samples as the data

    Returns:
      loss: Softmax cross entropy loss averaged over the samples in the batch

    Raises:
      ValueError: If the objective is to be computed over >2 classes, because
        this operation is broken in tensorflow at the moment.
    """
    # Forward pass.
    logits = self.inference(params, data)

    # Compute the loss.
    # One summed-squares penalty per parameter tensor; the mean over these is
    # added below, scaled by the l2_reg_scale flag.
    l2reg = [tf.reduce_sum(param ** 2) for param in params]
    if int(logits.get_shape()[1]) == 2:
      # Binary case: apply a sigmoid loss to the first logit column.
      labels = tf.cast(labels, tf.float32)
      losses = tf.nn.sigmoid_cross_entropy_with_logits(
          labels=labels, logits=logits[:, 0])
    else:
      raise ValueError("Unable to compute softmax cross entropy for more than"
                       " 2 classes.")

    return tf.reduce_mean(losses) + tf.reduce_mean(l2reg) * FLAGS.l2_reg_scale

  def argmax(self, logits):
    """Samples the most likely class label given the logits.

    Args:
      logits: Un-normalized logits with shape (num_samples, num_classes)

    Returns:
      predictions: Predicted class labels, has shape (num_samples,)
    """
    # softmax is monotonic, so this matches the argmax of the raw logits.
    return tf.cast(tf.argmax(tf.nn.softmax(logits), 1), tf.int32)

  def accuracy(self, params, data, labels):
    """Computes the accuracy (fraction of correct classifications).

    Args:
      params: List of parameter tensors or variables
      data: Batch of features with samples along the first dimension
      labels: Vector of labels with the same number of samples as the data

    Returns:
      accuracy: Fraction of correct classifications across the batch
    """
    predictions = self.argmax(self.inference(params, data))
    return tf.contrib.metrics.accuracy(predictions, tf.cast(labels, tf.int32))
class SoftmaxRegression(SoftmaxClassifier):
  """Builds a softmax regression problem."""

  def __init__(self, n_features, n_classes, activation=tf.identity,
               random_seed=None, noise_stdev=0.0):
    self.activation = activation
    self.n_features = n_features
    # Parameters are a weight matrix plus a per-class bias.
    super(SoftmaxRegression, self).__init__(
        [(n_features, n_classes), (n_classes,)], random_seed, noise_stdev)

  def inference(self, params, data):
    """Returns linear logits for a flattened batch of features."""
    weights, bias = params
    flat_features = tf.reshape(data, (-1, self.n_features))
    return tf.matmul(flat_features, weights) + bias
class SparseSoftmaxRegression(SoftmaxClassifier):
  """Builds a sparse input softmax regression problem."""

  def __init__(self,
               n_features,
               n_classes,
               activation=tf.identity,
               random_seed=None,
               noise_stdev=0.0):
    self.activation = activation
    self.n_features = n_features
    # Parameters: embedding table, softmax weights, softmax bias.
    param_shapes = [(n_classes, n_features), (n_features, n_classes), (
        n_classes,)]
    super(SparseSoftmaxRegression, self).__init__(param_shapes, random_seed,
                                                  noise_stdev)

  def inference(self, params, data):
    """Computes logits for a batch of integer-id data."""
    all_embeddings, softmax_weights, softmax_bias = params
    # data holds integer ids; look up one embedding row per id.
    embeddings = tf.nn.embedding_lookup(all_embeddings, tf.cast(data, tf.int32))
    # Bag-of-ids: sum the embeddings over the id dimension.
    embeddings = tf.reduce_sum(embeddings, 1)
    return tf.matmul(embeddings, softmax_weights) + softmax_bias
class OneHotSparseSoftmaxRegression(SoftmaxClassifier):
  """Builds a sparse input softmax regression problem.

  This is identical to SparseSoftmaxRegression, but without using embedding
  ops.
  """

  def __init__(self,
               n_features,
               n_classes,
               activation=tf.identity,
               random_seed=None,
               noise_stdev=0.0):
    self.activation = activation
    self.n_features = n_features
    self.n_classes = n_classes
    # Parameters: embedding table, softmax weights, softmax bias.
    param_shapes = [(n_classes, n_features), (n_features, n_classes), (
        n_classes,)]
    super(OneHotSparseSoftmaxRegression, self).__init__(param_shapes,
                                                        random_seed,
                                                        noise_stdev)

  def inference(self, params, data):
    """Computes logits; emulates embedding_lookup via a one-hot matmul."""
    all_embeddings, softmax_weights, softmax_bias = params
    num_ids = tf.shape(data)[1]
    # Multiplying a one-hot matrix by the embedding table selects the same
    # rows that embedding_lookup would, without embedding ops.
    one_hot_embeddings = tf.one_hot(tf.cast(data, tf.int32), self.n_classes)
    one_hot_embeddings = tf.reshape(one_hot_embeddings, [-1, self.n_classes])
    embeddings = tf.matmul(one_hot_embeddings, all_embeddings)
    embeddings = tf.reshape(embeddings, [-1, num_ids, self.n_features])
    # Bag-of-ids: sum the embeddings over the id dimension.
    embeddings = tf.reduce_sum(embeddings, 1)
    return tf.matmul(embeddings, softmax_weights) + softmax_bias
class FullyConnected(SoftmaxClassifier):
  """Builds a multi-layer perceptron classifier."""

  def __init__(self, n_features, n_classes, hidden_sizes=(32, 64),
               activation=tf.nn.sigmoid, random_seed=None, noise_stdev=0.0):
    """Initializes an multi-layer perceptron classification problem."""
    # Store the number of features and activation function.
    self.n_features = n_features
    self.activation = activation

    # Define the network as a list of weight + bias shapes for each layer.
    param_shapes = []
    for ix, sz in enumerate(hidden_sizes + (n_classes,)):

      # The previous layer's size (n_features if input).
      prev_size = n_features if ix == 0 else hidden_sizes[ix - 1]

      # Weight shape for this layer.
      param_shapes.append((prev_size, sz))

      # Bias shape for this layer.
      param_shapes.append((sz,))

    super(FullyConnected, self).__init__(param_shapes, random_seed, noise_stdev)

  def inference(self, params, data):
    """Computes logits; params alternate [weights, bias] per layer."""
    # Flatten the features into a vector.
    features = tf.reshape(data, (-1, self.n_features))

    # Pass the data through the network.
    preactivations = tf.matmul(features, params[0]) + params[1]

    for layer in range(2, len(self.param_shapes), 2):
      net = self.activation(preactivations)
      preactivations = tf.matmul(net, params[layer]) + params[layer + 1]

    # The final layer's preactivations serve as the logits.
    return preactivations

  def accuracy(self, params, data, labels):
    """Computes the accuracy (fraction of correct classifications).

    Args:
      params: List of parameter tensors or variables
      data: Batch of features with samples along the first dimension
      labels: Vector of labels with the same number of samples as the data

    Returns:
      accuracy: Fraction of correct classifications across the batch
    """
    # Unlike the base class, the activation is applied to the final logits
    # before the argmax is taken.
    predictions = self.argmax(self.activation(self.inference(params, data)))
    return tf.contrib.metrics.accuracy(predictions, tf.cast(labels, tf.int32))
class ConvNet(SoftmaxClassifier):
  """Builds an N-layer convnet for image classification."""

  def __init__(self,
               image_shape,
               n_classes,
               filter_list,
               activation=tf.nn.relu,
               random_seed=None,
               noise_stdev=0.0):
    # Number of channels, number of pixels in x- and y- dimensions.
    n_channels, px, py = image_shape

    # Store the activation.
    self.activation = activation

    param_shapes = []
    input_size = n_channels
    for fltr in filter_list:
      # Add conv2d filters; fltr[0], fltr[1] are the kernel height/width and
      # fltr[2] is the number of output channels.
      param_shapes.append((fltr[0], fltr[1], input_size, fltr[2]))
      input_size = fltr[2]

    # Number of units in the final (dense) layer. The convolutions below use
    # stride 1 and SAME padding, so the spatial size stays (px, py).
    self.affine_size = input_size * px * py
    param_shapes.append((self.affine_size, n_classes))  # affine weights
    param_shapes.append((n_classes,))  # affine bias

    super(ConvNet, self).__init__(param_shapes, random_seed, noise_stdev)

  def init_tensors(self, seed=None):
    """Returns a list of tensors with the given shape."""
    # Smaller initial scale (stddev 0.01) than the SoftmaxClassifier default.
    return [tf.random_normal(shape, mean=0., stddev=0.01, seed=seed)
            for shape in self.param_shapes]

  def inference(self, params, data):
    """Computes logits for a batch of images.

    NOTE(review): data appears to be NHWC (the tf.nn.conv2d default layout)
    even though image_shape is given channels-first — confirm with callers.
    """
    # Unpack.
    w_conv_list = params[:-2]
    output_w, output_b = params[-2:]

    conv_input = data
    for w_conv in w_conv_list:
      layer = tf.nn.conv2d(conv_input, w_conv, strides=[1] * 4, padding="SAME")
      output = self.activation(layer)
      conv_input = output

    # Flatten.
    flattened = tf.reshape(conv_input, (-1, self.affine_size))

    # Fully connected layer.
    return tf.matmul(flattened, output_w) + output_b
class Bowl(Problem):
  """A 2D quadratic bowl."""

  def __init__(self, condition_number, angle=0.0,
               random_seed=None, noise_stdev=0.0):
    assert condition_number > 0, "Condition number must be positive."

    # Define parameter shapes.
    param_shapes = [(2, 1)]
    super(Bowl, self).__init__(param_shapes, random_seed, noise_stdev)

    self.condition_number = condition_number
    self.angle = angle
    self._build_matrix(condition_number, angle)

  def _build_matrix(self, condition_number, angle):
    """Builds the Hessian matrix."""
    # Diagonal Hessian whose eigenvalue ratio is the condition number.
    hessian = np.array([[condition_number, 0.], [0., 1.]], dtype="float32")

    # Build the rotation matrix.
    rotation_matrix = np.array([
        [np.cos(angle), -np.sin(angle)],
        [np.sin(angle), np.cos(angle)]
    ])

    # The objective is 0.5 * || Ax ||_2^2
    # where the data matrix (A) is: sqrt(Hessian).dot(rotation_matrix).
    self.matrix = np.sqrt(hessian).dot(rotation_matrix)

  def objective(self, params, data=None, labels=None):
    """Returns 0.5 * ||A x||_2^2 for the single (2, 1) parameter."""
    mtx = tf.constant(self.matrix, dtype=tf.float32)
    return tf.nn.l2_loss(tf.matmul(mtx, params[0]))

  def surface(self, xlim=5, ylim=5, n=50):
    """Evaluates the objective over a 2D mesh (in numpy, without tf)."""
    xm, ym = _mesh(xlim, ylim, n)
    pts = np.vstack([xm.ravel(), ym.ravel()])
    zm = 0.5 * np.linalg.norm(self.matrix.dot(pts), axis=0) ** 2
    return xm, ym, zm.reshape(n, n)
class Problem2D(Problem):
  """Base class for 2D test problems with a plottable objective surface."""

  def __init__(self, random_seed=None, noise_stdev=0.0):
    # All 2D problems share a single parameter of shape (2,).
    param_shapes = [(2,)]
    super(Problem2D, self).__init__(param_shapes, random_seed, noise_stdev)

  def surface(self, n=50, xlim=5, ylim=5):
    """Computes the objective surface over a 2d mesh."""
    # Create a mesh over the given coordinate ranges.
    xm, ym = _mesh(xlim, ylim, n)

    with tf.Graph().as_default(), tf.Session() as sess:
      # Ops to compute the objective at every (x, y) point.
      x = tf.placeholder(tf.float32, shape=xm.shape)
      y = tf.placeholder(tf.float32, shape=ym.shape)
      obj = self.objective([[x, y]])

      # Run the computation.
      zm = sess.run(obj, feed_dict={x: xm, y: ym})

    return xm, ym, zm
class Rosenbrock(Problem2D):
  """See https://en.wikipedia.org/wiki/Rosenbrock_function.

  This function has a single global minima at [1, 1]
  The objective value at this point is zero.
  """

  def init_tensors(self, seed=None):
    """Draws the initial iterate uniformly from [-5, 10] per dimension."""
    return [tf.random_uniform(s, minval=-5., maxval=10., seed=seed)
            for s in self.param_shapes]

  def objective(self, params, data=None, labels=None):
    """Evaluates the Rosenbrock function at the given 2-vector."""
    first, second = tf.split(params[0], 2, axis=0)
    linear_term = (1 - first) ** 2
    valley_term = 100 * (second - first ** 2) ** 2
    return tf.squeeze(linear_term + valley_term)
def make_rosenbrock_loss_and_init(device=None):
  """A variable-backed version of Rosenbrock problem.

  See the Rosenbrock class for details.

  Args:
    device: Where to place the ops of this problem.

  Returns:
    A tuple of two callables, first of which creates the loss and the second
    creates the parameter initializer function.
  """
  def make_rosenbrock_loss():
    with tf.name_scope("optimizee"):
      with tf.device(device):
        x = tf.get_variable("x", [1])
        y = tf.get_variable("y", [1])

        # c is created non-trainable, so it acts as a fixed constant rather
        # than an optimizable parameter.
        c = tf.get_variable(
            "c", [1],
            initializer=tf.constant_initializer(100.0),
            trainable=False)

        obj = (1 - x)**2 + c * (y - x**2)**2
        return tf.squeeze(obj)

  def make_init_fn(parameters):
    with tf.device(device):
      init_op = tf.variables_initializer(parameters)
    def init_fn(sess):
      tf.logging.info("Initializing model parameters.")
      sess.run(init_op)
    return init_fn

  return make_rosenbrock_loss, make_init_fn
class Saddle(Problem2D):
  """Loss surface around a saddle point."""

  def objective(self, params, data=None, labels=None):
    """Returns x^2 - y^2, a canonical saddle centered at the origin."""
    u, v = tf.split(params[0], 2, axis=0)
    return tf.squeeze(u ** 2 - v ** 2)
class LogSumExp(Problem2D):
  """2D function defined by the log of the sum of exponentials."""

  def objective(self, params, data=None, labels=None):
    """Evaluates log(e^(x+3y-0.1) + e^(x-3y-0.1) + e^(-x-0.1) + 1)."""
    x, y = tf.split(params[0], 2, axis=0)
    exp_sum = (tf.exp(x + 3. * y - 0.1) +
               tf.exp(x - 3. * y - 0.1) +
               tf.exp(-x - 0.1) + 1.0)
    return tf.squeeze(tf.log(exp_sum))
class Ackley(Problem2D):
  """Ackley's function (contains many local minima)."""

  def init_tensors(self, seed=None):
    """Returns a list of tensors with the given shape."""
    # [-32.768, 32.768] is the conventional Ackley evaluation domain.
    return [tf.random_uniform(shape, minval=-32.768, maxval=32.768, seed=seed)
            for shape in self.param_shapes]

  def objective(self, params, data=None, labels=None):
    """Evaluates the 2D Ackley function (constants a=20, b=0.2, c=2*pi)."""
    x, y = tf.split(params[0], 2, axis=0)
    obj = (-20 * tf.exp(-0.2 * tf.sqrt(0.5 * (x ** 2 + y ** 2))) -
           tf.exp(0.5 * (tf.cos(2 * np.pi * x) + tf.cos(2 * np.pi * y))) +
           tf.exp(1.0) + 20.)
    return tf.squeeze(obj)
class Beale(Problem2D):
  """Beale function (a multimodal function with sharp peaks)."""

  def init_tensors(self, seed=None):
    """Draws the initial iterate uniformly from [-4.5, 4.5]^2."""
    return [tf.random_uniform(s, minval=-4.5, maxval=4.5, seed=seed)
            for s in self.param_shapes]

  def objective(self, params, data=None, labels=None):
    """Evaluates the Beale function at the given 2-vector."""
    x, y = tf.split(params[0], 2, axis=0)
    term_one = (1.5 - x + x * y) ** 2
    term_two = (2.25 - x + x * y ** 2) ** 2
    term_three = (2.625 - x + x * y ** 3) ** 2
    return tf.squeeze(term_one + term_two + term_three)
class Booth(Problem2D):
  """Booth's function (has a long valley along one dimension)."""

  def init_tensors(self, seed=None):
    """Draws the initial iterate uniformly from [-10, 10]^2."""
    return [tf.random_uniform(s, minval=-10., maxval=10., seed=seed)
            for s in self.param_shapes]

  def objective(self, params, data=None, labels=None):
    """Evaluates (x + 2y - 7)^2 + (2x + y - 5)^2."""
    u, v = tf.split(params[0], 2, axis=0)
    first = (u + 2 * v - 7) ** 2
    second = (2 * u + v - 5) ** 2
    return tf.squeeze(first + second)
class StyblinskiTang(Problem2D):
  """Styblinski-Tang function (a bumpy function in two dimensions)."""

  def init_tensors(self, seed=None):
    """Returns a list of tensors with the given shape."""
    # [-5, 5] is the conventional Styblinski-Tang evaluation domain.
    return [tf.random_uniform(shape, minval=-5., maxval=5., seed=seed)
            for shape in self.param_shapes]

  def objective(self, params, data=None, labels=None):
    """Evaluates 0.5 * sum_i(x_i^4 - 16 x_i^2 + 5 x_i) + 80."""
    params = tf.split(params[0], 2, axis=0)
    # The +80. shifts the objective upward (presumably so the minimum value
    # is close to zero).
    obj = 0.5 * tf.reduce_sum([x ** 4 - 16 * x ** 2 + 5 * x
                               for x in params], 0) + 80.
    return tf.squeeze(obj)
class Matyas(Problem2D):
  """Matyas function (a function with a single global minimum in a valley)."""

  def init_tensors(self, seed=None):
    """Draws the initial iterate uniformly from [-10, 10]^2."""
    return [tf.random_uniform(s, minval=-10, maxval=10, seed=seed)
            for s in self.param_shapes]

  def objective(self, params, data=None, labels=None):
    """Evaluates 0.26 (x^2 + y^2) - 0.48 x y."""
    a, b = tf.split(params[0], 2, axis=0)
    quadratic_part = 0.26 * (a ** 2 + b ** 2)
    cross_part = 0.48 * a * b
    return tf.squeeze(quadratic_part - cross_part)
class Branin(Problem2D):
  """Branin function (a function with three global minima)."""

  def init_tensors(self, seed=None):
    """Returns a list of tensors with the given shape."""
    # Standard Branin evaluation domain: x1 in [-5, 10], x2 in [0, 15].
    x1 = tf.random_uniform((1,), minval=-5., maxval=10.,
                           seed=seed)
    x2 = tf.random_uniform((1,), minval=0., maxval=15.,
                           seed=seed)
    return [tf.concat([x1, x2], 0)]

  def objective(self, params, data=None, labels=None):
    """Evaluates the Branin function with its conventional constants."""
    x, y = tf.split(params[0], 2, axis=0)

    # Define some constants.
    a = 1.
    b = 5.1 / (4. * np.pi ** 2)
    c = 5 / np.pi
    r = 6.
    s = 10.
    t = 1 / (8. * np.pi)

    # Evaluate the function.
    obj = a * (y - b * x ** 2 + c * x - r) ** 2 + s * (1 - t) * tf.cos(x) + s
    return tf.squeeze(obj)
class Michalewicz(Problem2D):
  """Michalewicz function (has steep ridges and valleys)."""

  def init_tensors(self, seed=None):
    """Returns a list of tensors with the given shape."""
    # [0, pi] is the conventional Michalewicz evaluation domain.
    return [tf.random_uniform(shape, minval=0., maxval=np.pi, seed=seed)
            for shape in self.param_shapes]

  def objective(self, params, data=None, labels=None):
    """Evaluates the 2D Michalewicz function, shifted upward by 2."""
    x, y = tf.split(params[0], 2, axis=0)
    m = 5  # Defines how steep the ridges are (larger m => steeper ridges).
    obj = 2. - (tf.sin(x) * tf.sin(x ** 2 / np.pi) ** (2 * m) +
                tf.sin(y) * tf.sin(2 * y ** 2 / np.pi) ** (2 * m))
    return tf.squeeze(obj)
class Rescale(Problem):
  """Takes an existing problem, and rescales all the parameters."""

  def __init__(self, problem_spec, scale=10., noise_stdev=0.0):
    self.problem = problem_spec.build()
    self.param_shapes = self.problem.param_shapes
    self.scale = scale

    super(Rescale, self).__init__(self.param_shapes, random_seed=None,
                                  noise_stdev=noise_stdev)

  def init_tensors(self, seed=None):
    """Initializes the wrapped problem's tensors, scaled up by self.scale."""
    return [self.scale * t for t in self.problem.init_tensors(seed=seed)]

  def objective(self, params, data=None, labels=None):
    """Evaluates the wrapped objective on the down-scaled parameters."""
    rescaled = [p / self.scale for p in params]
    return self.problem.objective(rescaled, data, labels)
class SumTask(Problem):
  """Takes a list of problems and modifies the objective to be their sum."""

  def __init__(self, problem_specs, noise_stdev=0.0):
    self.problems = [ps.build() for ps in problem_specs]
    # Concatenate all sub-problem parameter shapes; objective() slices the
    # flat parameter list back apart in the same order.
    self.param_shapes = []
    for prob in self.problems:
      self.param_shapes += prob.param_shapes

    super(SumTask, self).__init__(self.param_shapes, random_seed=None,
                                  noise_stdev=noise_stdev)

  def init_tensors(self, seed=None):
    """Concatenates each sub-problem's initial tensors, in problem order."""
    tensors = []
    for prob in self.problems:
      tensors += prob.init_tensors(seed=seed)
    return tensors

  def objective(self, params, data=None, labels=None):
    """Sums the sub-problem objectives over their parameter slices."""
    obj = 0.
    index = 0
    for prob in self.problems:
      num_params = len(prob.param_shapes)
      # NOTE(review): data/labels are not forwarded to the sub-problems, so
      # this wrapper only supports data-independent objectives — confirm.
      obj += prob.objective(params[index:index + num_params])
      index += num_params
    return obj
class IsotropicQuadratic(Problem):
  """An isotropic quadratic problem."""

  def objective(self, params, data=None, labels=None):
    """Returns the total sum of squares across all parameter tensors."""
    total = 0.
    for p in params:
      total += tf.reduce_sum(p ** 2)
    return total
class Norm(Problem):
  """Takes an existing problem and modifies the objective to be its N-norm."""

  def __init__(self, ndim, random_seed=None, noise_stdev=0.0, norm_power=2.):
    param_shapes = [(ndim, 1)]
    super(Norm, self).__init__(param_shapes, random_seed, noise_stdev)

    # Generate a random problem instance.
    self.w = np.random.randn(ndim, ndim).astype("float32")
    self.y = np.random.randn(ndim, 1).astype("float32")
    self.norm_power = norm_power

  def objective(self, params, data=None, labels=None):
    """Returns the norm_power-norm of the residual W x - y."""
    diff = tf.matmul(self.w, params[0]) - self.y
    exp = 1. / self.norm_power
    # EPSILON keeps the |.| ** norm_power term (and its gradient) finite when
    # a residual entry is exactly zero.
    loss = tf.reduce_sum((tf.abs(diff) + EPSILON) ** self.norm_power) ** exp
    return loss
class LogObjective(Problem):
  """Takes an existing problem and modifies the objective to be its log."""

  def __init__(self, problem_spec):
    self.problem = problem_spec.build()
    self.param_shapes = self.problem.param_shapes
    super(LogObjective, self).__init__(self.param_shapes,
                                       random_seed=None,
                                       noise_stdev=0.0)

  def objective(self, params, data=None, labels=None):
    """Returns log(obj + eps) - log(eps) for the wrapped objective obj."""
    inner_obj = self.problem.objective(params, data, labels)
    return tf.log(inner_obj + EPSILON) - tf.log(EPSILON)
class SparseProblem(Problem):
  """Takes a problem and sets gradients to 0 with the given probability."""

  def __init__(self,
               problem_spec,
               zero_probability=0.99,
               random_seed=None,
               noise_stdev=0.0):
    self.problem = problem_spec.build()
    self.param_shapes = self.problem.param_shapes
    self.zero_prob = zero_probability

    super(SparseProblem, self).__init__(self.param_shapes,
                                        random_seed=random_seed,
                                        noise_stdev=noise_stdev)

  def objective(self, parameters, data=None, labels=None):
    """Delegates to the wrapped problem's objective."""
    return self.problem.objective(parameters, data, labels)

  def gradients(self, objective, parameters):
    """Computes gradients, independently zeroing each entry w.p. zero_prob.

    Unlike the base class, there is no special handling for IndexedSlices
    gradients here.
    """
    grads = tf.gradients(objective, list(parameters))

    new_grads = []
    for grad in grads:
      # Each mask entry is True with probability zero_prob; True selects the
      # zero gradient and False selects the noisy gradient below.
      mask = tf.greater(self.zero_prob, tf.random_uniform(grad.get_shape()))
      zero_grad = tf.zeros_like(grad, dtype=tf.float32)
      noisy_grad = grad + self.noise_stdev * tf.random_normal(grad.get_shape())
      new_grads.append(tf.where(mask, zero_grad, noisy_grad))
    return new_grads
class DependencyChain(Problem):
  """A problem in which parameters must be optimized in order.

  A sequence of parameters which all need to be brought to 0, but where each
  parameter in the sequence can't be brought to 0 until the preceding one
  has been. This should take a long time to optimize, with steady
  (or accelerating) progress throughout the entire process.
  """

  def __init__(self, ndim, random_seed=None, noise_stdev=0.):
    param_shapes = [(ndim + 1,)]
    self.ndim = ndim
    super(DependencyChain, self).__init__(
        param_shapes, random_seed, noise_stdev)

  def objective(self, params, data=None, labels=None):
    """Penalizes the first entry directly and each later one vs its predecessor.

    Each term x_i^2 / (x_{i-1}^2 + EPSILON) is small only once the preceding
    entry is already near zero; EPSILON avoids division by zero.
    """
    terms = params[0][0]**2 + params[0][1:]**2 / (params[0][:-1]**2 + EPSILON)
    return tf.reduce_sum(terms)
class MinMaxWell(Problem):
  """Problem with global min when both the min and max (absolute) params are 1.

  The gradient for all but two parameters (the min and max) is zero. This
  should therefore encourage the optimizer to behave sensibly even when
  parameters have zero gradients, as is common eg for some deep neural nets.
  """

  def __init__(self, ndim, random_seed=None, noise_stdev=0.):
    param_shapes = [(ndim,)]
    self.ndim = ndim
    super(MinMaxWell, self).__init__(param_shapes, random_seed, noise_stdev)

  def objective(self, params, data=None, labels=None):
    """Returns max(p^2) + 1/min(p^2) - 2 (+ a tiny epsilon offset)."""
    params_sqr = params[0]**2
    min_sqr = tf.reduce_min(params_sqr)
    max_sqr = tf.reduce_max(params_sqr)
    epsilon = 1e-12

    # NOTE(review): if any parameter is exactly zero, min_sqr is 0 and the
    # objective becomes inf — confirm this is acceptable for the intended
    # initializations.
    return max_sqr + 1./min_sqr - 2. + epsilon
class OutwardSnake(Problem):
  """A winding path out to infinity.

  Ideal step length stays constant along the entire path.
  """

  def __init__(self, ndim, random_seed=None, noise_stdev=0.):
    param_shapes = [(ndim,)]
    self.ndim = ndim
    super(OutwardSnake, self).__init__(param_shapes, random_seed, noise_stdev)

  def objective(self, params, data, labels=None):
    """Radial term rewards moving outward; sine term keeps params on the path.

    data weights the two terms: column 0 scales the radial loss, the
    remaining columns scale the path-following loss.
    """
    # 1/radius decreases as the parameters move away from the origin; the
    # 1e-6 guards against division by zero at the origin.
    radius = tf.sqrt(tf.reduce_sum(params[0]**2))
    rad_loss = tf.reduce_sum(1. / (radius + 1e-6) * data[:, 0])

    # Each entry is pulled toward pi * cos(previous entry).
    sin_dist = params[0][1:] - tf.cos(params[0][:-1]) * np.pi
    sin_loss = tf.reduce_sum((sin_dist * data[:, 1:])**2)

    return rad_loss + sin_loss
class ProjectionQuadratic(Problem):
  """Dataset consists of different directions to probe. Global min is at 0."""

  def __init__(self, ndim, random_seed=None, noise_stdev=0.):
    super(ProjectionQuadratic, self).__init__(
        [(1, ndim)], random_seed, noise_stdev)

  def objective(self, params, data, labels=None):
    """Sum of squared elementwise products of the parameters with each datum."""
    projections = params[0] * data
    return tf.reduce_sum(projections ** 2)
class SumOfQuadratics(Problem):
  """Sum of quadratics centered at the data points; global min at params=0."""

  def __init__(self, ndim, random_seed=None, noise_stdev=0.):
    param_shapes = [(1, ndim)]
    super(SumOfQuadratics, self).__init__(
        param_shapes, random_seed, noise_stdev)

  def objective(self, params, data, labels=None):
    """Returns sum((params - data)^2) minus its value at params=0."""
    epsilon = 1e-12
    # Assume dataset is designed so that the global minimum is at params=0.
    # Subtract loss at params=0, so that global minimum has objective value
    # epsilon (added to avoid floating point issues).
    return (tf.reduce_sum((params[0] - data)**2) - tf.reduce_sum(data**2) +
            epsilon)
class MatMulAlgorithm(Problem):
  """A 6-th order polynomial optimization problem.

  This problem is parametrized by n and k. A solution to this problem with
  objective value exactly zero defines a matrix multiplication algorithm of
  n x n matrices using k multiplications between matrices. When applied
  recursively, such an algorithm has complexity O(n^(log_n(k))).

  Given n, it is not known in general which values of k in [n^2, n^3] have a
  solution. There is always a solution with k = n^3 (this is the naive
  algorithm).

  In the special case n = 2, it is known that there are solutions for k = {7, 8}
  but not for k <= 6. For n = 3, it is known that there are exact solutions for
  23 <= k <= 27, and there are asymptotic solutions for k = {21, 22}, but the
  other cases are unknown.

  For a given n and k, if one solution exists then infinitely many solutions
  exist due to permutation and scaling symmetries in the parameters.

  This is a very hard problem for some values of n and k (e.g. n = 3, k = 21),
  but very easy for other values (e.g. n = 2, k = 7).

  For a given n and k, the specific formulation of this problem is as follows.
  Let theta_a, theta_b, theta_c be parameter matrices with respective dimensions
  [n**2, k], [n**2, k], [k, n**2]. Then for any matrices a, b with shape [n, n],
  we can form the matrix c with shape [n, n] via the operation:

    ((vec(a) * theta_a) .* (vec(b) * theta_b)) * theta_c = vec(c),    (#)

  where vec(x) is the operator that flattens a matrix with shape [n, n] into a
  row vector with shape [1, n**2], * denotes matrix multiplication and .*
  denotes elementwise multiplication.

  This operation, parameterized by theta_a, theta_b, theta_c, is a matrix
  multiplication algorithm iff c = a*b for all [n, n] matrices a and b. But
  actually it suffices to verify all combinations of one-hot matrices a and b,
  of which there are n**4 such combinations. This gives a batch of n**4 matrix
  triplets (a, b, c) such that equation (#) must hold for each triplet. We solve
  for theta_a, theta_b, theta_c by minimizing the sum of squares of errors
  across this batch.

  Finally, theta_c can be computed from theta_a and theta_b. Therefore it
  suffices to learn theta_a and theta_b, from which theta_c and therefore the
  objective value can be computed.
  """

  def __init__(self, n, k):
    """Constructs the problem for matrix size n and multiplication count k.

    Args:
      n: int, side length of the square matrices being multiplied. Must
        satisfy n >= 2.
      k: int, number of scalar multiplications the candidate algorithm may
        use. Must satisfy n**2 <= k <= n**3.
    """
    assert isinstance(n, int), "n must be an integer"
    assert isinstance(k, int), "k must be an integer"
    assert n >= 2, "Must have n >= 2"
    assert k >= n**2 and k <= n**3, "Must have n**2 <= k <= n**3"

    # Only theta_a and theta_b are learned; theta_c is solved for in
    # closed form inside objective().
    param_shapes = [(n**2, k), (n**2, k)]  # theta_a, theta_b
    super(MatMulAlgorithm, self).__init__(
        param_shapes, random_seed=None, noise_stdev=0.0)
    self.n = n
    self.k = k

    # Build a batch of all combinations of one-hot matrices a, b, and their
    # respective products c. Correctness on this batch is a necessary and
    # sufficient condition for the algorithm to be valid. The number of matrices
    # in {a, b, c}_3d is n**4 and each matrix is n x n.
    onehots = np.identity(n**2).reshape(n**2, n, n)
    a_3d = np.repeat(onehots, n**2, axis=0)
    b_3d = np.tile(onehots, [n**2, 1, 1])
    c_3d = np.matmul(a_3d, b_3d)

    # Convert the batch to 2D Tensors (each matrix flattened to a row).
    self.a = tf.constant(a_3d.reshape(n**4, n**2), tf.float32, name="a")
    self.b = tf.constant(b_3d.reshape(n**4, n**2), tf.float32, name="b")
    self.c = tf.constant(c_3d.reshape(n**4, n**2), tf.float32, name="c")

  def init_tensors(self, seed=None):
    """Returns initial parameter values whose columns have unit L2 norm.

    Args:
      seed: optional int random seed for reproducible initialization.

    Returns:
      A list of Tensors, one per shape in self.param_shapes.
    """
    # Initialize params such that the columns of theta_a and theta_b have L2
    # norm 1.
    def _param_initializer(shape, seed=None):
      x = tf.random_normal(shape, dtype=tf.float32, seed=seed)
      # l2_normalize operates on rows, so normalize columns via transposes.
      return tf.transpose(tf.nn.l2_normalize(tf.transpose(x), 1))

    return [_param_initializer(shape, seed) for shape in self.param_shapes]

  def objective(self, parameters, data=None, labels=None):
    """Sum-of-squares error of the candidate algorithm over all triplets.

    Args:
      parameters: list [theta_a, theta_b] of Tensors with shape [n**2, k].
      data: unused.
      labels: unused.

    Returns:
      Scalar Tensor: sum of squared errors across the one-hot batch; exactly
      zero iff the parameters define a valid multiplication algorithm.
    """
    theta_a = parameters[0]
    theta_b = parameters[1]

    # Compute theta_c from theta_a and theta_b in closed form (least squares):
    # theta_c = (p^T p)^-1 p^T c, where p = (a*theta_a) .* (b*theta_b).
    p = tf.matmul(self.a, theta_a) * tf.matmul(self.b, theta_b)
    p_trans = tf.transpose(p, name="p_trans")
    p_inv = tf.matmul(
        tf.matrix_inverse(tf.matmul(p_trans, p)), p_trans, name="p_inv")
    theta_c = tf.matmul(p_inv, self.c, name="theta_c")

    # Compute the "predicted" value of c.
    c_hat = tf.matmul(p, theta_c, name="c_hat")

    # Compute the loss (sum of squared errors).
    loss = tf.reduce_sum((c_hat - self.c)**2, name="loss")
    return loss
def matmul_problem_sequence(n, k_min, k_max):
  """Helper to generate a sequence of matrix multiplication problems.

  Args:
    n: matrix side length, passed through to MatMulAlgorithm.
    k_min: smallest multiplication count (inclusive).
    k_max: largest multiplication count (inclusive).

  Returns:
    A list of (spec, dataset, batch_size) tuples with no dataset.
  """
  sequence = []
  for k in range(k_min, k_max + 1):
    sequence.append((_Spec(MatMulAlgorithm, (n, k), {}), None, None))
  return sequence
def init_fixed_variables(arrays):
  """Wraps each numpy array in a float32 tf.Variable in the parameter scope.

  Args:
    arrays: iterable of numpy arrays holding the initial values.

  Returns:
    A list of tf.Variables, one per input array.
  """
  with tf.variable_scope(PARAMETER_SCOPE):
    variables = [tf.Variable(arr.astype("float32")) for arr in arrays]
  return variables
def _mesh(xlim, ylim, n):
"""Creates a 2D meshgrid covering the given ranges.
Args:
xlim: int that defines the desired x-range (-xlim, xlim)
ylim: int that defines the desired y-range (-ylim, ylim)
n: number of points in each dimension of the mesh
Returns:
xm: 2D array of x-values in the mesh
ym: 2D array of y-values in the mesh
"""
return np.meshgrid(np.linspace(-xlim, xlim, n),
np.linspace(-ylim, ylim, n))
# Copyright 2017 Google, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Groups of problems of different types for optimizer training."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import tensorflow as tf
from learned_optimizer.problems import datasets
from learned_optimizer.problems import model_adapter
from learned_optimizer.problems import problem_generator as pg
from learned_optimizer.problems import problem_spec
_Spec = problem_spec.Spec
def quadratic_problems():
  """Noise-free quadratic problems of increasing dimension."""
  return [(_Spec(pg.Quadratic, (dim,), {}), None, None)
          for dim in (20, 25, 50, 100)]
# Note: this group contains one non-noisy problem for historical reasons. The
# original training set before the refactor included this set of quadratics.
def quadratic_problems_noisy():
  """Quadratic problems with per-problem gradient noise levels."""
  dims_and_stdevs = [(20, 0.5), (25, 0.0), (50, 1.0), (100, 2.0)]
  return [(_Spec(pg.Quadratic, (dim,), {"noise_stdev": stdev}), None, None)
          for dim, stdev in dims_and_stdevs]
def quadratic_problems_large():
  """High-dimensional noise-free quadratic problems."""
  return [(_Spec(pg.Quadratic, (dim,), {}), None, None)
          for dim in (784, 1024, 2048)]
def bowl_problems():
  """Noise-free 2D bowl problems, plus one rotated bowl."""
  problems = [(_Spec(pg.Bowl, (cond,), {"noise_stdev": 0.0}), None, None)
              for cond in (0.1, 1.0, 5.0)]
  problems.append(
      (_Spec(pg.Bowl, (5.0,), {"noise_stdev": 0.0, "angle": np.pi / 4.}),
       None, None))
  return problems
def bowl_problems_noisy():
  """Noisy 2D bowl problems, plus one rotated bowl."""
  problems = [(_Spec(pg.Bowl, (cond,), {"noise_stdev": 0.1}), None, None)
              for cond in (0.1, 1.0, 5.0)]
  problems.append(
      (_Spec(pg.Bowl, (5.0,), {"noise_stdev": 0.1, "angle": np.pi / 4.}),
       None, None))
  return problems
def sparse_softmax_2_class_sparse_problems():
  """Sparse softmax regression on a noisy 5-bit parity task."""
  spec = _Spec(pg.SparseSoftmaxRegression, (5, 2), {"noise_stdev": 0.0})
  dataset = datasets.noisy_parity_class(5, random_seed=123)
  return [(spec, dataset, 23)]
def one_hot_sparse_softmax_2_class_sparse_problems():
  """One-hot sparse softmax regression on a noisy 5-bit parity task."""
  spec = _Spec(pg.OneHotSparseSoftmaxRegression, (5, 2), {"noise_stdev": 0.0})
  dataset = datasets.noisy_parity_class(5, random_seed=123)
  return [(spec, dataset, 23)]
def softmax_2_class_problems():
  """Two-class softmax regression at several sizes and batch sizes."""
  # (n_features, class separation or None for the dataset default, batch size)
  configs = [
      (10, 2.0, 100),
      (100, None, 50),
      (200, 1.5, 20),
      (256, 1.5, 100),
  ]
  problems = []
  for n_features, sep, batch_size in configs:
    if sep is None:
      data = datasets.random(n_features, 1000, random_seed=123)
    else:
      data = datasets.random(n_features, 1000, random_seed=123, sep=sep)
    problems.append(
        (_Spec(pg.SoftmaxRegression, (n_features, 2), {}), data, batch_size))
  return problems
def softmax_2_class_problems_noisy():
  """Noisy two-class softmax regression at several sizes and batch sizes."""
  # (n_features, noise_stdev, class separation or None for default, batch)
  configs = [
      (10, 0.5, 2.0, 100),
      (100, 0.1, None, 50),
      (200, 0.1, 1.5, 20),
      (256, 0.5, 1.5, 100),
  ]
  problems = []
  for n_features, stdev, sep, batch_size in configs:
    if sep is None:
      data = datasets.random(n_features, 1000, random_seed=123)
    else:
      data = datasets.random(n_features, 1000, random_seed=123, sep=sep)
    problems.append(
        (_Spec(pg.SoftmaxRegression, (n_features, 2),
               {"noise_stdev": stdev}), data, batch_size))
  return problems
def optimization_test_problems():
  """Classic low-dimensional optimization test functions, noise-free."""
  test_fns = (pg.Ackley, pg.Beale, pg.Booth, pg.Branin, pg.LogSumExp,
              pg.Matyas, pg.Michalewicz, pg.Rosenbrock, pg.StyblinskiTang)
  return [(_Spec(fn, (), {}), None, None) for fn in test_fns]
def optimization_test_problems_noisy():
  """Classic optimization test functions with unit gradient noise."""
  test_fns = (pg.Ackley, pg.Beale, pg.Booth, pg.Branin, pg.LogSumExp,
              pg.Matyas, pg.Michalewicz, pg.Rosenbrock, pg.StyblinskiTang)
  return [(_Spec(fn, (), {"noise_stdev": 1.}), None, None) for fn in test_fns]
def fully_connected_random_2_class_problems():
  """Sigmoid MLPs of various shapes on random two-class MLP-generated data."""
  # (n_features, hidden layer sizes, batch size)
  configs = [
      (8, (8, 5,), 10),
      (12, (8, 5, 3), 200),
      (5, (4, 4, 4, 4,), 100),
      (11, (4, 5, 6,), 64),
      (9, (8,), 128),
      (7, (8, 5,), 16),
      (8, (32, 64,), 10),
      (12, (16, 8, 3), 200),
      (5, (8, 8, 8, 8,), 100),
      (11, (10, 12, 12,), 64),
      (9, (32,), 128),
      (7, (32, 64,), 16),
  ]
  return [(_Spec(pg.FullyConnected, (n_features, 2),
                 {"hidden_sizes": hidden, "activation": tf.nn.sigmoid}),
           datasets.random_mlp(n_features, 1000), batch_size)
          for n_features, hidden, batch_size in configs]
def matmul_problems():
  """Matrix multiplication algorithm problems for n=2 and n=3."""
  return (pg.matmul_problem_sequence(2, 5, 8) +
          pg.matmul_problem_sequence(3, 19, 24))
def log_objective_problems():
  """Log-scaled variants of quadratic and bowl problems."""
  inner_specs = [
      _Spec(pg.Quadratic, (20,), {}),
      _Spec(pg.Quadratic, (50,), {}),
      _Spec(pg.Quadratic, (100,), {}),
      _Spec(pg.Bowl, (0.1,), {}),
      _Spec(pg.Bowl, (1.0,), {}),
      _Spec(pg.Bowl, (5.0,), {}),
  ]
  return [(_Spec(pg.LogObjective, [inner], {}), None, None)
          for inner in inner_specs]
def sparse_gradient_problems():
  """Sparse-gradient wrappers around quadratic and bowl problems."""
  inner_specs = [
      _Spec(pg.Quadratic, (20,), {}),
      _Spec(pg.Quadratic, (50,), {}),
      _Spec(pg.Quadratic, (100,), {}),
      _Spec(pg.Bowl, (0.1,), {}),
      _Spec(pg.Bowl, (1.0,), {}),
      _Spec(pg.Bowl, (5.0,), {}),
  ]
  return [(_Spec(pg.SparseProblem, [inner], {}), None, None)
          for inner in inner_specs]
def sparse_gradient_problems_mlp():
  """Sparse-gradient wrappers around small sigmoid MLPs."""
  # (n_features, hidden layer sizes, batch size)
  configs = [
      (8, (8, 5,), 10),
      (12, (8, 5, 3), 200),
      (5, (4, 4, 4, 4,), 100),
  ]
  problems = []
  for n_features, hidden, batch_size in configs:
    mlp_spec = _Spec(pg.FullyConnected, (n_features, 2), {
        "hidden_sizes": hidden,
        "activation": tf.nn.sigmoid
    })
    problems.append((_Spec(pg.SparseProblem, [mlp_spec], {}),
                     datasets.random_mlp(n_features, 1000), batch_size))
  return problems
def rescale_problems():
  """Rescaled norm and quadratic problems over a wide range of scales."""
  # (norm power, scale factor)
  norm_configs = [(2.5, 0.123), (1.5, 8), (2., 50), (3., 200), (1., 1000)]
  problems = [
      (_Spec(pg.Rescale, [_Spec(pg.Norm, (18,), {"norm_power": power})],
             {"scale": scale}), None, None)
      for power, scale in norm_configs
  ]
  # (quadratic dimension, scale factor)
  quad_configs = [(20, 0.1), (25, 10.), (50, 350.), (100, 132)]
  problems += [
      (_Spec(pg.Rescale, [_Spec(pg.Quadratic, (dim,), {})],
             {"scale": scale}), None, None)
      for dim, scale in quad_configs
  ]
  return problems
def norm_problems():
  """p-norm problems at several dimensions and powers.

  Powers < 1 cause NaN gradients early in training, so are excluded.
  """
  dims_and_powers = [(27, 1.), (25, 2.), (22, 3.)]
  return [(_Spec(pg.Norm, (dim,), {"norm_power": power}), None, None)
          for dim, power in dims_and_powers]
def norm_problems_noisy():
  """Noisy p-norm problems at several dimensions and powers.

  Powers < 1 cause NaN gradients early in training, so are excluded.
  """
  dims_and_powers = [(19, 1.), (26, 2.), (23, 3.)]
  return [(_Spec(pg.Norm, (dim,), {"noise_stdev": .1, "norm_power": power}),
           None, None)
          for dim, power in dims_and_powers]
def sum_problems():
  """Sums of multiple sub-problems optimized jointly as one task."""

  def _test_fn_specs():
    # Fresh specs for the nine classic optimization test functions.
    return [
        _Spec(pg.Rosenbrock, (), {}),
        _Spec(pg.LogSumExp, (), {}),
        _Spec(pg.Ackley, (), {}),
        _Spec(pg.Beale, (), {}),
        _Spec(pg.Booth, (), {}),
        _Spec(pg.StyblinskiTang, (), {}),
        _Spec(pg.Matyas, (), {}),
        _Spec(pg.Branin, (), {}),
        _Spec(pg.Michalewicz, (), {})
    ]

  many_quads = [_Spec(pg.Quadratic, (dim,), {})
                for dim in (11, 3, 9, 7, 5, 13, 12)]
  norm_quad_rosenbrock = [
      _Spec(pg.Norm, (18,), {"norm_power": 3}),
      _Spec(pg.Quadratic, (25,), {}),
      _Spec(pg.Rosenbrock, (), {})
  ]
  test_fns_plus_quads = _test_fn_specs() + [
      _Spec(pg.Quadratic, (5,), {}),
      _Spec(pg.Quadratic, (13,), {})
  ]
  two_quads = [_Spec(pg.Quadratic, (dim,), {}) for dim in (11, 3)]
  three_test_fns = [
      _Spec(pg.Rosenbrock, (), {}),
      _Spec(pg.LogSumExp, (), {}),
      _Spec(pg.Ackley, (), {})
  ]
  groups = [many_quads, norm_quad_rosenbrock, _test_fn_specs(),
            test_fns_plus_quads, two_quads, three_test_fns]
  return [(_Spec(pg.SumTask, [group], {}), None, None) for group in groups]
def sum_problems_noisy():
  """Noisy variants of summed sub-problem tasks."""
  noisy_quads = [_Spec(pg.Quadratic, (dim,), {"noise_stdev": 0.1})
                 for dim in (11, 3, 9, 7, 5, 13, 12)]
  test_fns_and_quads = [
      _Spec(pg.Rosenbrock, (), {}),
      _Spec(pg.LogSumExp, (), {}),
      _Spec(pg.Ackley, (), {}),
      _Spec(pg.Beale, (), {}),
      _Spec(pg.Booth, (), {}),
      _Spec(pg.StyblinskiTang, (), {}),
      _Spec(pg.Matyas, (), {}),
      _Spec(pg.Branin, (), {}),
      _Spec(pg.Michalewicz, (), {}),
      _Spec(pg.Quadratic, (5,), {}),
      _Spec(pg.Quadratic, (13,), {"noise_stdev": 0.5})
  ]
  return [(_Spec(pg.SumTask, [group], {}), None, None)
          for group in (noisy_quads, test_fns_and_quads)]
def dependency_chain_problems():
  """Dependency chain problems on random binary data."""
  # (n_features, dataset size, batch size)
  configs = [(20, 1000, 100), (12, 200, 10), (56, 5000, 100), (64, 1000, 50),
             (13, 10000, 50), (20, 1000, 128), (12, 300, 16), (56, 5000, 128),
             (64, 1000, 64), (13, 10000, 32)]
  return [(_Spec(pg.DependencyChain, (n_features,), {}),
           datasets.random_binary(n_features, n_samples), batch_size)
          for n_features, n_samples, batch_size in configs]
def outward_snake_problems():
  """Outward snake problems on random binary data."""
  # (n_features, dataset size, batch size)
  configs = [(20, 1000, 100), (12, 200, 10), (56, 5000, 100), (64, 1000, 50),
             (13, 10000, 50), (20, 1000, 128), (12, 300, 16), (56, 5000, 128),
             (64, 1000, 64), (13, 10000, 32)]
  return [(_Spec(pg.OutwardSnake, (n_features,), {}),
           datasets.random_binary(n_features, n_samples), batch_size)
          for n_features, n_samples, batch_size in configs]
def min_max_well_problems():
  """MinMaxWell problems at several dimensions."""
  return [(_Spec(pg.MinMaxWell, (dim,), {}), None, None)
          for dim in (20, 12, 56, 64, 13)]
def sum_of_quadratics_problems():
  """Sum-of-quadratics problems on random symmetric data."""
  # (n_features, dataset size, batch size)
  configs = [(20, 1000, 100), (12, 100, 10), (56, 5000, 100), (64, 1000, 50),
             (13, 10000, 50), (20, 1000, 128), (12, 100, 16), (56, 5000, 128),
             (64, 1000, 64), (13, 10000, 32)]
  return [(_Spec(pg.SumOfQuadratics, (n_features,), {}),
           datasets.random_symmetric(n_features, n_samples), batch_size)
          for n_features, n_samples, batch_size in configs]
def projection_quadratic_problems():
  """Projection quadratic problems on random symmetric data."""
  # (n_features, dataset size, batch size)
  configs = [(20, 1000, 100), (12, 100, 10), (56, 5000, 100), (64, 1000, 50),
             (13, 10000, 50), (20, 1000, 128), (12, 100, 16), (56, 5000, 128),
             (64, 1000, 64), (13, 10000, 32)]
  return [(_Spec(pg.ProjectionQuadratic, (n_features,), {}),
           datasets.random_symmetric(n_features, n_samples), batch_size)
          for n_features, n_samples, batch_size in configs]
def adapter_rosenbrock_local():
  """Rosenbrock problem built through the ModelAdapter on the local device."""
  spec = _Spec(model_adapter.ModelAdapter,
               (pg.make_rosenbrock_loss_and_init,), {})
  return [(spec, None, None)]
def adapter_rosenbrock_worker():
  """Rosenbrock problem through the ModelAdapter, placed on a worker job."""
  spec = _Spec(model_adapter.ModelAdapter,
               (pg.make_rosenbrock_loss_and_init,),
               {"device": "/job:worker"})
  return [(spec, None, None)]
def _test_problem_mlp_scaled_init_small():
return [
np.random.randn(10, 32) * np.sqrt(2./10),
np.random.randn(32,) * 0.1,
np.random.randn(32, 64) * np.sqrt(2./32.),
np.random.randn(64,) * 0.1,
np.random.randn(64, 2) * np.sqrt(2./64.),
np.random.randn(2,) * 0.1
]
def _test_problem_mlp_scaled_init_large():
return [
np.random.randn(20, 32) * np.sqrt(2./20),
np.random.randn(32,) * 0.1,
np.random.randn(32, 64) * np.sqrt(2./32.),
np.random.randn(64,) * 0.1,
np.random.randn(64, 10) * np.sqrt(2./64.),
np.random.randn(10,) * 0.1
]
def _test_problem_mlp_scaled_init_mnist():
return [
np.random.randn(784, 64) * np.sqrt(2./784.),
np.random.randn(64,) * 0.1,
np.random.randn(64, 10) * np.sqrt(2./ 64.),
np.random.randn(10,) * 0.1,
]
# Wrap this construction in a function to avoid UnparsedFlagAccessError
def test_problems():
  """Test problems for visualizations."""
  # Unlike the training problem sets, these test problems are made up of
  # length-5 tuples. The final items in the tuple are the name of the problem
  # and the initialization random_seed for testing consistency.
  tp = [
      # Quadratics, with and without gradient noise.
      (_Spec(pg.Quadratic, (20,), {"random_seed": 1234}), None, None,
       "quad_problem", 5678),
      (_Spec(pg.Quadratic, (20,), {"noise_stdev": 1.0, "random_seed": 1234}),
       None, None, "quad_problem_noise", 5678),
      # Rosenbrock, with and without gradient noise.
      (_Spec(pg.Rosenbrock, (), {"random_seed": 1234}), None, None,
       "rosenbrock", 5678),
      (_Spec(pg.Rosenbrock, (), {"random_seed": 1234, "noise_stdev": 1.0}),
       None, None, "rosenbrock_noise", 5678),
      # Softmax regression, with and without gradient noise.
      (_Spec(pg.SoftmaxRegression, (10, 2), {}), datasets.random(
          10, 10000, random_seed=1234), 100, "softmax", 5678),
      (_Spec(pg.SoftmaxRegression, (10, 2), {"noise_stdev": 1.0}),
       datasets.random(10, 10000, random_seed=1234), 100, "softmax_noise",
       5678),
      # MLPs with fixed sqrt(2/fan_in)-scaled initializations (the final tuple
      # item is the concrete initial parameter arrays, not a seed).
      (_Spec(pg.FullyConnected, (10, 2), {}), datasets.random(
          10, 10000, random_seed=1234), 100, "mlp_small",
       _test_problem_mlp_scaled_init_small()),
      (_Spec(pg.FullyConnected, (20, 10), {}), datasets.random(
          20, 10000, n_classes=10, random_seed=1234), 100, "mlp_large",
       _test_problem_mlp_scaled_init_large()),
      (_Spec(pg.FullyConnected, (784, 10),
             {"hidden_sizes": (64,), "activation": tf.nn.sigmoid}),
       datasets.mnist(), 64, "mlp_mnist_sigmoid",
       _test_problem_mlp_scaled_init_mnist()),
      (_Spec(pg.FullyConnected, (784, 10),
             {"hidden_sizes": (64,), "activation": tf.nn.relu}),
       datasets.mnist(), 64, "mlp_mnist_relu",
       _test_problem_mlp_scaled_init_mnist()),
      # ConvNets on MNIST; no fixed initialization is supplied.
      (_Spec(pg.ConvNet, ((1, 28, 28), 10, [(3, 3, 8), (5, 5, 8)]),
             {"activation": tf.nn.sigmoid}), datasets.mnist(), 64,
       "convnet_mnist_sigmoid", None),
      (_Spec(pg.ConvNet, ((1, 28, 28), 10, [(3, 3, 8), (5, 5, 8)]),
             {"activation": tf.nn.relu}), datasets.mnist(), 64,
       "convnet_mnist_relu", None),
  ]
  return tp
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment