Unverified Commit f16a7b5b authored by vedanshu's avatar vedanshu Committed by GitHub
Browse files

Merge pull request #1 from tensorflow/master

new pull
parents 8e9296ff 8f58f396
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes Transformer w/Keras benchmark and accuracy tests."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
from absl import flags
import tensorflow as tf
from official.benchmark import benchmark_wrappers
from official.benchmark import owner_utils
from official.benchmark.perfzero_benchmark import PerfZeroBenchmark
from official.nlp.transformer import misc
from official.nlp.transformer import transformer_main as transformer_main
from official.utils.flags import core as flags_core
# Sub-directory of the root data dir that holds the en-de training data and
# the 'vocab.ende.32768' vocabulary file.
TRANSFORMER_EN2DE_DATA_DIR_NAME = 'wmt32k-en2de-official'
# Sub-directory of the root data dir that holds the newstest2014 source and
# reference files used for BLEU scoring.
EN2DE_2014_BLEU_DATA_DIR_NAME = 'newstest2014'
FLAGS = flags.FLAGS
# Default output/data directory of the *Real benchmark classes; this is None
# when the TMPDIR environment variable is not set.
TMP_DIR = os.getenv('TMPDIR')
class TransformerBenchmark(PerfZeroBenchmark):
  """Methods common to executing transformer w/keras tests.

  Code under test for the Transformer Keras models reports the same data and
  requires the same FLAG setup.
  """

  def __init__(self, output_dir=None, default_flags=None, root_data_dir=None,
               flag_methods=None, tpu=None):
    """Derives dataset paths and forwards default flags to the base class.

    Args:
      output_dir: forwarded unchanged to the base-class constructor.
      default_flags: optional dict of default flag values. It is copied before
        the `data_dir` and `vocab_file` entries are written, so the caller's
        dict is never modified.
      root_data_dir: directory that contains the training-data and BLEU-data
        sub-directories. A falsy value is treated as ''.
      flag_methods: forwarded unchanged to the base-class constructor.
      tpu: forwarded unchanged to the base-class constructor.
    """
    root_data_dir = root_data_dir if root_data_dir else ''

    self.train_data_dir = os.path.join(root_data_dir,
                                       TRANSFORMER_EN2DE_DATA_DIR_NAME)
    self.vocab_file = os.path.join(root_data_dir,
                                   TRANSFORMER_EN2DE_DATA_DIR_NAME,
                                   'vocab.ende.32768')
    self.bleu_source = os.path.join(root_data_dir,
                                    EN2DE_2014_BLEU_DATA_DIR_NAME,
                                    'newstest2014.en')
    self.bleu_ref = os.path.join(root_data_dir,
                                 EN2DE_2014_BLEU_DATA_DIR_NAME,
                                 'newstest2014.de')

    # Work on a private copy: writing into the argument would leak the two
    # path entries back into a dict the caller may reuse.
    default_flags = dict(default_flags) if default_flags else {}
    default_flags['data_dir'] = self.train_data_dir
    default_flags['vocab_file'] = self.vocab_file

    super(TransformerBenchmark, self).__init__(
        output_dir=output_dir,
        default_flags=default_flags,
        flag_methods=flag_methods,
        tpu=tpu)

  @benchmark_wrappers.enable_runtime_flags
  def _run_and_report_benchmark(self,
                                bleu_max=None,
                                bleu_min=None,
                                log_steps=None,
                                total_batch_size=None,
                                warmup=1):
    """Trains with the current FLAGS and reports the collected metrics.

    The `exp_per_second` metric is only emitted when `warmup` is truthy, both
    `total_batch_size` and `log_steps` were supplied, the timestamp log has
    more than `warmup + 1` entries and the measured interval is positive.
    The `startup_time` metric is only emitted when the timestamp log is
    non-empty.

    Args:
      bleu_max: highest passing level for bleu score.
      bleu_min: lowest passing level for bleu score.
      log_steps: How often the log was created for stats['step_timestamp_log'].
      total_batch_size: Global batch-size.
      warmup: number of entries in stats['step_timestamp_log'] to ignore.
    """
    start_time_sec = time.time()
    task = transformer_main.TransformerTask(FLAGS)
    stats = task.train()
    wall_time_sec = time.time() - start_time_sec

    metrics = []
    if 'bleu_uncased' in stats:
      if 'bleu_uncased_history' in stats:
        # History entries are indexed as (iteration, score); report the entry
        # with the best score and keep the final score as a separate metric.
        bleu_uncased_best = max(stats['bleu_uncased_history'],
                                key=lambda x: x[1])
        metrics.append({'name': 'bleu_uncased',
                        'value': bleu_uncased_best[1],
                        'min_value': bleu_min,
                        'max_value': bleu_max})
        metrics.append({'name': 'bleu_best_score_iteration',
                        'value': bleu_uncased_best[0]})
        metrics.append({'name': 'bleu_uncased_last',
                        'value': stats['bleu_uncased']})
      else:
        metrics.append({'name': 'bleu_uncased',
                        'value': stats['bleu_uncased'],
                        'min_value': bleu_min,
                        'max_value': bleu_max})

    time_log = []
    if 'step_timestamp_log' in stats:
      time_log = stats['step_timestamp_log']

    # Both factors default to None; without them the example count cannot be
    # computed, so the metric is skipped instead of raising after training.
    can_count_examples = total_batch_size is not None and log_steps is not None
    if warmup and can_count_examples and len(time_log) > warmup + 1:
      # first entry in the time_log is start of step 1. The rest of the
      # entries are the end of each step recorded
      elapsed = time_log[-1].timestamp - time_log[warmup].timestamp
      # A zero-length interval would divide by zero; drop the metric instead.
      if elapsed > 0:
        num_examples = (
            total_batch_size * log_steps * (len(time_log) - warmup - 1))
        examples_per_sec = num_examples / elapsed
        metrics.append({'name': 'exp_per_second',
                        'value': examples_per_sec})

    if 'avg_exp_per_second' in stats:
      metrics.append({'name': 'avg_exp_per_second',
                      'value': stats['avg_exp_per_second']})

    # An empty log has no first timestamp to measure the startup against.
    if len(time_log) > 0:
      metrics.append({'name': 'startup_time',
                      'value': time_log[0].timestamp - start_time_sec})

    flags_str = flags_core.get_nondefault_flags_as_str()
    self.report_benchmark(iters=-1, wall_time=wall_time_sec, metrics=metrics,
                          extras={'flags': flags_str})
class TransformerBaseKerasAccuracy(TransformerBenchmark):
  """Benchmark accuracy tests for Transformer Base model w/ Keras."""

  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
    """Sets up the accuracy benchmarks for the Transformer Base model.

    Args:
      output_dir: directory where to output e.g. log files
      root_data_dir: directory under which to look for dataset
      **kwargs: arbitrary named arguments. They are accepted and ignored so
        the constructor stays forward compatible in case PerfZero provides
        more named arguments before the constructor is updated.
    """
    super(TransformerBaseKerasAccuracy, self).__init__(
        output_dir=output_dir,
        root_data_dir=root_data_dir,
        flag_methods=[misc.define_transformer_flags])

  def benchmark_1_gpu(self):
    """Benchmark 1 gpu.

    The paper uses 8 GPUs and a much larger effective batch size, so this
    will not converge to the 27.3 BLEU (uncased) SOTA.
    """
    self._setup()
    FLAGS.num_gpus = 1
    FLAGS.data_dir = self.train_data_dir
    FLAGS.vocab_file = self.vocab_file
    # Sets values directly to avoid validation check.
    FLAGS['bleu_source'].value = self.bleu_source
    FLAGS['bleu_ref'].value = self.bleu_ref
    FLAGS.param_set = 'base'
    FLAGS.batch_size = 2048
    FLAGS.train_steps = 1000
    FLAGS.steps_between_evals = 500
    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
    # These bleu scores are based on test runs at this limited number of
    # steps and batch size, after verifying SOTA at 8xV100s.
    self._run_and_report_benchmark(
        total_batch_size=FLAGS.batch_size,
        log_steps=FLAGS.log_steps,
        bleu_min=25.3,
        bleu_max=26)

  def benchmark_1_gpu_static_batch(self):
    """Benchmark 1 gpu with static_batch.

    The paper uses 8 GPUs and a much larger effective batch size, so this
    will not converge to the 27.3 BLEU (uncased) SOTA.
    """
    self._setup()
    FLAGS.num_gpus = 1
    FLAGS.data_dir = self.train_data_dir
    FLAGS.vocab_file = self.vocab_file
    # Sets values directly to avoid validation check.
    FLAGS['bleu_source'].value = self.bleu_source
    FLAGS['bleu_ref'].value = self.bleu_ref
    FLAGS.param_set = 'base'
    FLAGS.batch_size = 4096
    FLAGS.train_steps = 100000
    FLAGS.steps_between_evals = 5000
    FLAGS.static_batch = True
    FLAGS.max_length = 64
    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_static_batch')
    # These bleu scores are based on test runs at this limited number of
    # steps and batch size, after verifying SOTA at 8xV100s.
    self._run_and_report_benchmark(
        total_batch_size=FLAGS.batch_size,
        log_steps=FLAGS.log_steps,
        bleu_min=25.3,
        bleu_max=26)

  def benchmark_8_gpu(self):
    """Benchmark 8 gpu.

    Should converge to 27.3 BLEU (uncased). This has not been confirmed yet.
    """
    self._setup()
    FLAGS.num_gpus = 8
    FLAGS.data_dir = self.train_data_dir
    FLAGS.vocab_file = self.vocab_file
    # Sets values directly to avoid validation check.
    FLAGS['bleu_source'].value = self.bleu_source
    FLAGS['bleu_ref'].value = self.bleu_ref
    FLAGS.param_set = 'base'
    FLAGS.batch_size = 4096 * 8
    FLAGS.train_steps = 100000
    FLAGS.steps_between_evals = 20000
    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
    self._run_and_report_benchmark(
        total_batch_size=FLAGS.batch_size,
        log_steps=FLAGS.log_steps,
        bleu_min=27,
        bleu_max=28)

  def benchmark_8_gpu_static_batch(self):
    """Benchmark 8 gpu with static batch.

    Should converge to 27.3 BLEU (uncased). This has not been confirmed yet.
    """
    self._setup()
    FLAGS.num_gpus = 8
    FLAGS.data_dir = self.train_data_dir
    FLAGS.vocab_file = self.vocab_file
    # Sets values directly to avoid validation check.
    FLAGS['bleu_source'].value = self.bleu_source
    FLAGS['bleu_ref'].value = self.bleu_ref
    FLAGS.param_set = 'base'
    FLAGS.batch_size = 4096 * 8
    FLAGS.train_steps = 100000
    FLAGS.static_batch = True
    FLAGS.max_length = 64
    FLAGS.steps_between_evals = 5000
    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_static_batch')
    self._run_and_report_benchmark(
        total_batch_size=FLAGS.batch_size,
        log_steps=FLAGS.log_steps,
        bleu_min=27,
        bleu_max=28)
class TransformerBigKerasAccuracy(TransformerBenchmark):
  """Benchmark accuracy tests for Transformer Big model w/ Keras."""

  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
    """Sets up the accuracy benchmarks for the Transformer Big model.

    Args:
      output_dir: directory where to output e.g. log files
      root_data_dir: directory under which to look for dataset
      **kwargs: arbitrary named arguments. They are accepted and ignored so
        the constructor stays forward compatible in case PerfZero provides
        more named arguments before the constructor is updated.
    """
    super(TransformerBigKerasAccuracy, self).__init__(
        output_dir=output_dir,
        root_data_dir=root_data_dir,
        flag_methods=[misc.define_transformer_flags])

  def benchmark_8_gpu(self):
    """Benchmark 8 gpu.

    Over 6 runs with eval every 20K steps the average highest value was 28.195
    (bleu uncased). 28.424 was the highest and 27.96 the lowest. The values are
    the highest value seen during a run and occurred at a median of iteration 9.
    Iterations are not epochs, an iteration is a number of steps between evals.
    """
    self._setup()
    FLAGS.num_gpus = 8
    FLAGS.data_dir = self.train_data_dir
    FLAGS.vocab_file = self.vocab_file
    # Sets values directly to avoid validation check.
    FLAGS['bleu_source'].value = self.bleu_source
    FLAGS['bleu_ref'].value = self.bleu_ref
    FLAGS.param_set = 'big'
    FLAGS.batch_size = 3072 * 8
    FLAGS.train_steps = 20000 * 12
    FLAGS.steps_between_evals = 20000
    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
    self._run_and_report_benchmark(
        total_batch_size=FLAGS.batch_size,
        log_steps=FLAGS.log_steps,
        bleu_min=27.9,
        bleu_max=29.2)

  def benchmark_8_gpu_static_batch(self):
    """Benchmark 8 gpu with static batch.

    Should converge to 28.4 BLEU (uncased). This has not been verified yet.
    """
    self._setup()
    FLAGS.num_gpus = 8
    FLAGS.data_dir = self.train_data_dir
    FLAGS.vocab_file = self.vocab_file
    # Sets values directly to avoid validation check.
    FLAGS['bleu_source'].value = self.bleu_source
    FLAGS['bleu_ref'].value = self.bleu_ref
    FLAGS.param_set = 'big'
    FLAGS.batch_size = 3072 * 8
    FLAGS.static_batch = True
    FLAGS.max_length = 64
    FLAGS.train_steps = 20000 * 12
    FLAGS.steps_between_evals = 20000
    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_static_batch')
    self._run_and_report_benchmark(
        total_batch_size=FLAGS.batch_size,
        log_steps=FLAGS.log_steps,
        bleu_min=28,
        bleu_max=29.2)

  def benchmark_8_gpu_fp16(self):
    """Benchmark 8 gpu with dynamic batch and fp16.

    Over 6 runs with eval every 20K steps the average highest value was 28.247
    (bleu uncased). 28.424 was the highest and 28.09 the lowest. The values are
    the highest value seen during a run and occurred at a median of iteration
    11. While this could be interpreted as worse than FP32, if looking at the
    first iteration at which 28 is passed FP16 performs equal and possibly
    better. Although not part of the initial test runs, the highest value
    recorded with the arguments below was 28.9 at iteration 12. Iterations are
    not epochs, an iteration is a number of steps between evals.
    """
    self._setup()
    FLAGS.num_gpus = 8
    FLAGS.dtype = 'fp16'
    FLAGS.data_dir = self.train_data_dir
    FLAGS.vocab_file = self.vocab_file
    # Sets values directly to avoid validation check.
    FLAGS['bleu_source'].value = self.bleu_source
    FLAGS['bleu_ref'].value = self.bleu_ref
    FLAGS.param_set = 'big'
    FLAGS.batch_size = 3072 * 8
    FLAGS.train_steps = 20000 * 12
    FLAGS.steps_between_evals = 20000
    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_fp16')
    self._run_and_report_benchmark(
        total_batch_size=FLAGS.batch_size,
        log_steps=FLAGS.log_steps,
        bleu_min=28,
        bleu_max=29.2)

  def benchmark_8_gpu_fp16_amp(self):
    """Benchmark 8 gpu with dynamic batch, fp16 and automatic mixed precision.

    Should converge to 28.4 BLEU (uncased). This has not been verified yet.
    """
    self._setup()
    FLAGS.num_gpus = 8
    FLAGS.dtype = 'fp16'
    FLAGS.fp16_implementation = 'graph_rewrite'
    FLAGS.data_dir = self.train_data_dir
    FLAGS.vocab_file = self.vocab_file
    # Sets values directly to avoid validation check.
    FLAGS['bleu_source'].value = self.bleu_source
    FLAGS['bleu_ref'].value = self.bleu_ref
    FLAGS.param_set = 'big'
    FLAGS.batch_size = 3072 * 8
    FLAGS.train_steps = 20000 * 12
    FLAGS.steps_between_evals = 20000
    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_fp16_amp')
    self._run_and_report_benchmark(
        total_batch_size=FLAGS.batch_size,
        log_steps=FLAGS.log_steps,
        bleu_min=28,
        bleu_max=29)

  def benchmark_8_gpu_static_batch_fp16(self):
    """Benchmark 8 gpu with static batch and fp16.

    Should converge to 28.4 BLEU (uncased). This has not been verified yet.
    """
    self._setup()
    FLAGS.num_gpus = 8
    FLAGS.dtype = 'fp16'
    FLAGS.data_dir = self.train_data_dir
    FLAGS.vocab_file = self.vocab_file
    # Sets values directly to avoid validation check.
    FLAGS['bleu_source'].value = self.bleu_source
    FLAGS['bleu_ref'].value = self.bleu_ref
    FLAGS.param_set = 'big'
    FLAGS.batch_size = 3072 * 8
    FLAGS.static_batch = True
    FLAGS.max_length = 64
    FLAGS.train_steps = 400000
    FLAGS.steps_between_evals = 20000
    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_static_batch_fp16')
    self._run_and_report_benchmark(
        total_batch_size=FLAGS.batch_size,
        log_steps=FLAGS.log_steps,
        bleu_min=28,
        bleu_max=29.2)

  def benchmark_xla_8_gpu_static_batch_fp16(self):
    """Benchmark 8 gpu with static batch, XLA, and FP16.

    Should converge to 28.4 BLEU (uncased). This has not been verified yet.
    """
    self._setup()
    FLAGS.num_gpus = 8
    FLAGS.dtype = 'fp16'
    FLAGS.enable_xla = True
    FLAGS.data_dir = self.train_data_dir
    FLAGS.vocab_file = self.vocab_file
    # Sets values directly to avoid validation check.
    FLAGS['bleu_source'].value = self.bleu_source
    FLAGS['bleu_ref'].value = self.bleu_ref
    FLAGS.param_set = 'big'
    FLAGS.batch_size = 3072 * 8
    FLAGS.static_batch = True
    FLAGS.max_length = 64
    FLAGS.train_steps = 400000
    FLAGS.steps_between_evals = 20000
    FLAGS.model_dir = self._get_model_dir(
        'benchmark_xla_8_gpu_static_batch_fp16')
    self._run_and_report_benchmark(
        total_batch_size=FLAGS.batch_size,
        log_steps=FLAGS.log_steps,
        bleu_min=28,
        bleu_max=29.2)
class TransformerKerasBenchmark(TransformerBenchmark):
  """Benchmarks for Transformer (Base and Big) using Keras."""

  def __init__(self, output_dir=None, default_flags=None,
               root_data_dir=None, batch_per_gpu=4096, tpu=None):
    """Stores the per-GPU batch size and initializes the base benchmark.

    Args:
      output_dir: Base directory for saving artifacts, e.g. checkpoints.
      default_flags: default flags to use for all tests.
      root_data_dir: root directory for data, e.g. training.
      batch_per_gpu: batch size to use per gpu.
      tpu: Target TPU to use.
    """
    self.batch_per_gpu = batch_per_gpu
    super(TransformerKerasBenchmark, self).__init__(
        output_dir=output_dir,
        default_flags=default_flags,
        root_data_dir=root_data_dir,
        flag_methods=[misc.define_transformer_flags],
        tpu=tpu)

  def benchmark_1_gpu_no_dist_strat(self):
    """Benchmark 1 gpu without distribution strategy."""
    self._setup()
    FLAGS.num_gpus = 1
    FLAGS.distribution_strategy = 'off'
    FLAGS.batch_size = self.batch_per_gpu
    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_dist_strat')
    self._run_and_report_benchmark(
        total_batch_size=FLAGS.batch_size, log_steps=FLAGS.log_steps)

  def benchmark_1_gpu_no_dist_strat_static_batch(self):
    """Benchmark 1 gpu without distribution strategy with static batch."""
    self._setup()
    FLAGS.num_gpus = 1
    FLAGS.distribution_strategy = 'off'
    FLAGS.batch_size = self.batch_per_gpu
    # The directory name is an abbreviation of the method name.
    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_ds_sb')
    FLAGS.static_batch = True
    FLAGS.max_length = 64
    self._run_and_report_benchmark(
        total_batch_size=FLAGS.batch_size, log_steps=FLAGS.log_steps)

  def benchmark_1_gpu(self):
    """Benchmark 1 gpu."""
    self._setup()
    FLAGS.num_gpus = 1
    FLAGS.batch_size = self.batch_per_gpu
    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
    self._run_and_report_benchmark(
        total_batch_size=FLAGS.batch_size, log_steps=FLAGS.log_steps)

  def benchmark_1_gpu_fp16(self):
    """Benchmark 1 gpu FP16."""
    self._setup()
    FLAGS.num_gpus = 1
    FLAGS.batch_size = self.batch_per_gpu
    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_fp16')
    FLAGS.dtype = 'fp16'
    self._run_and_report_benchmark(
        total_batch_size=FLAGS.batch_size, log_steps=FLAGS.log_steps)

  def benchmark_xla_1_gpu(self):
    """Benchmark 1 gpu w/xla."""
    self._setup()
    FLAGS.num_gpus = 1
    FLAGS.batch_size = self.batch_per_gpu
    FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu')
    FLAGS.enable_xla = True
    self._run_and_report_benchmark(
        total_batch_size=FLAGS.batch_size, log_steps=FLAGS.log_steps)

  def benchmark_xla_1_gpu_fp16(self):
    """Benchmark 1 gpu w/xla and FP16."""
    self._setup()
    FLAGS.num_gpus = 1
    FLAGS.batch_size = self.batch_per_gpu
    FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_fp16')
    FLAGS.enable_xla = True
    FLAGS.dtype = 'fp16'
    self._run_and_report_benchmark(
        total_batch_size=FLAGS.batch_size, log_steps=FLAGS.log_steps)

  def benchmark_1_gpu_static_batch(self):
    """Benchmark 1 gpu with static batch."""
    self._setup()
    FLAGS.num_gpus = 1
    FLAGS.batch_size = self.batch_per_gpu
    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_static_batch')
    FLAGS.static_batch = True
    FLAGS.max_length = 64
    self._run_and_report_benchmark(
        total_batch_size=FLAGS.batch_size, log_steps=FLAGS.log_steps)

  def benchmark_xla_1_gpu_static_batch(self):
    """Benchmark 1 gpu with static batch w/xla."""
    self._setup()
    FLAGS.num_gpus = 1
    FLAGS.batch_size = self.batch_per_gpu
    FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_static_batch')
    FLAGS.static_batch = True
    FLAGS.max_length = 64
    FLAGS.enable_xla = True
    self._run_and_report_benchmark(
        total_batch_size=FLAGS.batch_size, log_steps=FLAGS.log_steps)

  def benchmark_1_gpu_static_batch_fp16(self):
    """Benchmark 1 gpu with static batch FP16."""
    self._setup()
    FLAGS.num_gpus = 1
    FLAGS.batch_size = self.batch_per_gpu
    FLAGS.model_dir = self._get_model_dir(
        'benchmark_1_gpu_static_batch_fp16')
    FLAGS.static_batch = True
    FLAGS.max_length = 64
    FLAGS.dtype = 'fp16'
    self._run_and_report_benchmark(
        total_batch_size=FLAGS.batch_size, log_steps=FLAGS.log_steps)

  def benchmark_xla_1_gpu_static_batch_fp16(self):
    """Benchmark 1 gpu with static batch w/xla and FP16."""
    self._setup()
    FLAGS.num_gpus = 1
    FLAGS.batch_size = self.batch_per_gpu
    FLAGS.model_dir = self._get_model_dir(
        'benchmark_xla_1_gpu_static_batch_fp16')
    FLAGS.static_batch = True
    FLAGS.max_length = 64
    FLAGS.enable_xla = True
    FLAGS.dtype = 'fp16'
    self._run_and_report_benchmark(
        total_batch_size=FLAGS.batch_size, log_steps=FLAGS.log_steps)

  def benchmark_8_gpu(self):
    """Benchmark 8 gpu."""
    self._setup()
    FLAGS.num_gpus = 8
    FLAGS.batch_size = self.batch_per_gpu * 8
    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
    self._run_and_report_benchmark(
        total_batch_size=FLAGS.batch_size, log_steps=FLAGS.log_steps)

  def benchmark_8_gpu_fp16(self):
    """Benchmark 8 gpu FP16."""
    self._setup()
    FLAGS.num_gpus = 8
    FLAGS.dtype = 'fp16'
    FLAGS.batch_size = self.batch_per_gpu * 8
    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_fp16')
    self._run_and_report_benchmark(
        total_batch_size=FLAGS.batch_size, log_steps=FLAGS.log_steps)

  def benchmark_xla_8_gpu(self):
    """Benchmark 8 gpu w/xla."""
    self._setup()
    FLAGS.num_gpus = 8
    FLAGS.enable_xla = True
    FLAGS.batch_size = self.batch_per_gpu * 8
    FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu')
    self._run_and_report_benchmark(
        total_batch_size=FLAGS.batch_size, log_steps=FLAGS.log_steps)

  def benchmark_xla_8_gpu_fp16(self):
    """Benchmark 8 gpu w/xla and FP16."""
    self._setup()
    FLAGS.num_gpus = 8
    FLAGS.enable_xla = True
    FLAGS.dtype = 'fp16'
    FLAGS.batch_size = self.batch_per_gpu * 8
    FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu_fp16')
    self._run_and_report_benchmark(
        total_batch_size=FLAGS.batch_size, log_steps=FLAGS.log_steps)

  def benchmark_8_gpu_static_batch(self):
    """Benchmark 8 gpu with static batch."""
    self._setup()
    FLAGS.num_gpus = 8
    FLAGS.batch_size = self.batch_per_gpu * 8
    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_static_batch')
    FLAGS.static_batch = True
    FLAGS.max_length = 64
    self._run_and_report_benchmark(
        total_batch_size=FLAGS.batch_size, log_steps=FLAGS.log_steps)

  def benchmark_8_gpu_static_batch_fp16(self):
    """Benchmark 8 gpu with static batch FP16."""
    self._setup()
    FLAGS.num_gpus = 8
    FLAGS.dtype = 'fp16'
    FLAGS.batch_size = self.batch_per_gpu * 8
    FLAGS.model_dir = self._get_model_dir(
        'benchmark_8_gpu_static_batch_fp16')
    FLAGS.static_batch = True
    FLAGS.max_length = 64
    self._run_and_report_benchmark(
        total_batch_size=FLAGS.batch_size, log_steps=FLAGS.log_steps)

  def benchmark_xla_8_gpu_static_batch(self):
    """Benchmark 8 gpu with static batch w/xla."""
    self._setup()
    FLAGS.num_gpus = 8
    FLAGS.enable_xla = True
    FLAGS.batch_size = self.batch_per_gpu * 8
    FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu_static_batch')
    FLAGS.static_batch = True
    FLAGS.max_length = 64
    self._run_and_report_benchmark(
        total_batch_size=FLAGS.batch_size, log_steps=FLAGS.log_steps)

  def benchmark_xla_8_gpu_static_batch_fp16(self):
    """Benchmark 8 gpu with static batch w/xla and FP16."""
    self._setup()
    FLAGS.num_gpus = 8
    FLAGS.enable_xla = True
    FLAGS.dtype = 'fp16'
    FLAGS.batch_size = self.batch_per_gpu * 8
    FLAGS.model_dir = self._get_model_dir(
        'benchmark_xla_8_gpu_static_batch_fp16')
    FLAGS.static_batch = True
    FLAGS.max_length = 64
    self._run_and_report_benchmark(
        total_batch_size=FLAGS.batch_size, log_steps=FLAGS.log_steps)
class TransformerBaseKerasBenchmarkReal(TransformerKerasBenchmark):
  """Transformer base version real data benchmark tests."""

  def __init__(self, output_dir=TMP_DIR, root_data_dir=TMP_DIR, **kwargs):
    """Configures short runs of the 'base' parameter set.

    Args:
      output_dir: directory for benchmark artifacts; defaults to TMP_DIR.
      root_data_dir: root directory of the datasets; defaults to TMP_DIR.
      **kwargs: extra named arguments; accepted and not used.
    """
    base_defaults = {
        'param_set': 'base',
        'train_steps': 50,
        'log_steps': 10,
    }
    super(TransformerBaseKerasBenchmarkReal, self).__init__(
        output_dir=output_dir,
        default_flags=base_defaults,
        root_data_dir=root_data_dir,
        batch_per_gpu=4096)
class TransformerBigKerasBenchmarkReal(TransformerKerasBenchmark):
  """Transformer big version real data benchmark tests."""

  def __init__(self, output_dir=TMP_DIR, root_data_dir=TMP_DIR,
               tpu=None, **kwargs):
    """Configures short runs of the 'big' parameter set.

    Args:
      output_dir: directory for benchmark artifacts; defaults to TMP_DIR.
      root_data_dir: root directory of the datasets; defaults to TMP_DIR.
      tpu: Target TPU to use; forwarded to the parent constructor.
      **kwargs: extra named arguments; accepted and not used.
    """
    def_flags = {}
    def_flags['param_set'] = 'big'
    def_flags['train_steps'] = 50
    def_flags['log_steps'] = 10

    super(TransformerBigKerasBenchmarkReal, self).__init__(
        output_dir=output_dir, default_flags=def_flags,
        root_data_dir=root_data_dir, batch_per_gpu=3072,
        tpu=tpu)

  def _configure_tpu_flags(self, benchmark_name, batch_size):
    """Calls `_setup()` and applies the flags shared by the TPU benchmarks.

    Args:
      benchmark_name: name of the benchmark; used as the model sub-directory so
        every benchmark writes to its own location.
      batch_size: value assigned to FLAGS.batch_size.
    """
    self._setup()
    FLAGS.model_dir = self._get_model_dir(benchmark_name)
    FLAGS.train_steps = 300
    FLAGS.log_steps = 150
    FLAGS.steps_between_evals = 150
    FLAGS.distribution_strategy = 'tpu'
    FLAGS.static_batch = True
    FLAGS.use_ctl = True
    FLAGS.batch_size = batch_size
    FLAGS.max_length = 64
    FLAGS.decode_batch_size = 32
    FLAGS.decode_max_length = 97
    FLAGS.padded_decode = True
    FLAGS.enable_checkpointing = False

  def benchmark_2x2_tpu(self):
    """Port of former snaggletooth transformer_big model on 2x2."""
    self._configure_tpu_flags('benchmark_2x2_tpu', batch_size=6144)
    self._run_and_report_benchmark(
        total_batch_size=FLAGS.batch_size,
        log_steps=FLAGS.log_steps)

  def benchmark_4x4_tpu(self):
    """Port of former GCP transformer_big model on 4x4."""
    self._configure_tpu_flags('benchmark_4x4_tpu', batch_size=24576)
    self._run_and_report_benchmark(
        total_batch_size=FLAGS.batch_size,
        log_steps=FLAGS.log_steps)

  @owner_utils.Owner('tf-graph-compiler')
  def benchmark_4x4_tpu_mlir(self):
    """Run transformer_big model on 4x4 with the MLIR Bridge enabled."""
    # Uses its own model dir; it previously reused 'benchmark_4x4_tpu' and so
    # shared a directory with the non-MLIR benchmark.
    self._configure_tpu_flags('benchmark_4x4_tpu_mlir', batch_size=24576)
    tf.config.experimental.enable_mlir_bridge()
    self._run_and_report_benchmark(
        total_batch_size=FLAGS.batch_size,
        log_steps=FLAGS.log_steps)
# Script entry point: hand control to the TensorFlow test runner.
if __name__ == '__main__':
  tf.test.main()
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes benchmark testing for 3D Unet model."""
# pylint: disable=line-too-long
from __future__ import print_function
import functools
import os
import time
from typing import Optional
from absl import flags
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.benchmark import benchmark_wrappers
from official.benchmark import keras_benchmark
from official.benchmark import owner_utils
from official.vision.segmentation import unet_main as unet_training_lib
from official.vision.segmentation import unet_model as unet_model_lib
# Default lower/upper accuracy bounds; they are forwarded to the reporting
# helper as top_1_min / top_1_max.
UNET3D_MIN_ACCURACY = 0.90
UNET3D_MAX_ACCURACY = 0.98
# File patterns and model config location that the benchmark copies into
# FLAGS before each run.
UNET_TRAINING_FILES = 'gs://mlcompass-data/unet3d/train_data/*'
UNET_EVAL_FILES = 'gs://mlcompass-data/unet3d/eval_data/*'
UNET_MODEL_CONFIG_FILE = 'gs://mlcompass-data/unet3d/config/unet_config.yaml'
FLAGS = flags.FLAGS
class Unet3DAccuracyBenchmark(keras_benchmark.KerasBenchmark):
  """Benchmark accuracy tests for UNet3D model in Keras."""

  def __init__(self,
               output_dir: Optional[str] = None,
               root_data_dir: Optional[str] = None,
               **kwargs):
    """Records the fixed input locations and initializes the base benchmark.

    Args:
      output_dir: directory where to output e.g. log files
      root_data_dir: directory under which to look for dataset. It is accepted
        but not read; input locations come from the module constants.
      **kwargs: arbitrary named arguments. This is needed to make the
        constructor forward compatible in case PerfZero provides more named
        arguments before updating the constructor.
    """
    self.training_file_pattern = UNET_TRAINING_FILES
    self.eval_file_pattern = UNET_EVAL_FILES
    # TODO(hongjunchoi): Create and use shared config file instead.
    self.config_file = UNET_MODEL_CONFIG_FILE
    super(Unet3DAccuracyBenchmark, self).__init__(
        output_dir=output_dir,
        flag_methods=[unet_training_lib.define_unet3d_flags])

  def _set_benchmark_parameters(self, experiment_name):
    """Overrides training parameters for benchmark tests.

    Args:
      experiment_name: sub-directory name used for FLAGS.model_dir.
    """
    FLAGS.model_dir = self._get_model_dir(experiment_name)
    FLAGS.mode = 'train'
    FLAGS.training_file_pattern = self.training_file_pattern
    FLAGS.eval_file_pattern = self.eval_file_pattern
    FLAGS.config_file = self.config_file
    FLAGS.lr_init_value = 0.00005
    FLAGS.lr_decay_rate = 0.5
    FLAGS.epochs = 3

  @benchmark_wrappers.enable_runtime_flags
  def _run_and_report_benchmark(self,
                                experiment_name: str,
                                min_accuracy: float = UNET3D_MIN_ACCURACY,
                                max_accuracy: float = UNET3D_MAX_ACCURACY,
                                distribution_strategy: str = 'tpu',
                                epochs: int = 10,
                                steps: int = 0,
                                epochs_between_evals: int = 1,
                                dtype: str = 'float32',
                                enable_xla: bool = False,
                                run_eagerly: bool = False):
    """Runs and reports the benchmark given the provided configuration.

    The run configuration is taken from the params extracted from FLAGS. Of
    the arguments below only `min_accuracy` and `max_accuracy` are read by
    this method; the others are accepted but not referenced in its body.

    Args:
      experiment_name: name of the experiment (not read here).
      min_accuracy: lower bound reported as `top_1_min`.
      max_accuracy: upper bound reported as `top_1_max`.
      distribution_strategy: not read here.
      epochs: not read here.
      steps: not read here.
      epochs_between_evals: not read here.
      dtype: not read here; the precision policy follows `params.dtype`.
      enable_xla: not read here.
      run_eagerly: not read here.
    """
    params = unet_training_lib.extract_params(FLAGS)
    strategy = unet_training_lib.create_distribution_strategy(params)

    # Install a mixed-precision policy only for the two reduced-precision
    # dtypes; any other dtype leaves the policy untouched.
    precision = params.dtype
    if precision == 'bfloat16':
      policy_name = 'mixed_bfloat16'
    elif precision == 'float16':
      policy_name = 'mixed_float16'
    else:
      policy_name = None
    if policy_name is not None:
      policy = tf.keras.mixed_precision.experimental.Policy(policy_name)
      tf.keras.mixed_precision.experimental.set_policy(policy)

    start_time_sec = time.time()
    # NOTE(review): SOURCE lost its indentation; building and training inside
    # the strategy scope is the reconstructed nesting - confirm upstream.
    with strategy.scope():
      unet_model = unet_model_lib.build_unet_model(params)
      train_input_fn = functools.partial(
          unet_training_lib.get_train_dataset, params)
      eval_input_fn = functools.partial(
          unet_training_lib.get_eval_dataset, params)
      history = unet_training_lib.train(
          params, strategy, unet_model, train_input_fn, eval_input_fn)

      # Report the last recorded value of each accuracy series.
      stats = {
          'accuracy_top_1': history.history['val_metric_accuracy'][-1],
          'training_accuracy_top_1': history.history['metric_accuracy'][-1],
      }
      wall_time_sec = time.time() - start_time_sec

    super(Unet3DAccuracyBenchmark, self)._report_benchmark(
        stats,
        wall_time_sec,
        top_1_min=min_accuracy,
        top_1_max=max_accuracy,
        total_batch_size=params.train_batch_size)

  def _get_model_dir(self, folder_name):
    """Returns `folder_name` joined onto this benchmark's output directory."""
    return os.path.join(self.output_dir, folder_name)

  @owner_utils.Owner('tf-model-garden')
  def benchmark_4x4_tpu_bf16(self):
    """Test Keras model with 4x4 TPU, passing dtype='bfloat16'.

    NOTE(review): the experiment directory is named '..._fp16' although the
    method name and the dtype argument say bfloat16.
    """
    run_name = 'benchmark_4x4_tpu_fp16'
    self._setup()
    self._set_benchmark_parameters(run_name)
    self._run_and_report_benchmark(
        experiment_name=run_name,
        dtype='bfloat16',
        distribution_strategy='tpu')

  @owner_utils.Owner('tf-graph-compiler')
  def benchmark_4x4_tpu_bf16_mlir(self):
    """Test Keras model with 4x4 TPU, dtype='bfloat16' and MLIR enabled.

    NOTE(review): the experiment directory is named '..._fp16_mlir' although
    the method name and the dtype argument say bfloat16.
    """
    run_name = 'benchmark_4x4_tpu_fp16_mlir'
    # The MLIR bridge is switched on before the flags are set up.
    tf.config.experimental.enable_mlir_bridge()
    self._setup()
    self._set_benchmark_parameters(run_name)
    self._run_and_report_benchmark(
        experiment_name=run_name,
        dtype='bfloat16',
        distribution_strategy='tpu')
# Script entry point: hand control to the TensorFlow test runner.
if __name__ == '__main__':
  tf.test.main()
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes XLNet benchmarks and accuracy tests."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
import os
import time
# pylint: disable=g-bad-import-order
from absl import flags
from absl.testing import flagsaver
import tensorflow as tf
# pylint: enable=g-bad-import-order
from official.benchmark import bert_benchmark_utils as benchmark_utils
from official.benchmark import owner_utils
from official.nlp.xlnet import run_classifier
from official.nlp.xlnet import run_squad
from official.benchmark import benchmark_wrappers
# pylint: disable=line-too-long
# Checkpoint that the classifier accuracy benchmark assigns to
# FLAGS.init_checkpoint.
PRETRAINED_CHECKPOINT_PATH = 'gs://cloud-tpu-checkpoints/xlnet/large/xlnet_model-1'
# TFRecord files that the classifier accuracy benchmark assigns to
# FLAGS.train_tfrecord_path and FLAGS.test_tfrecord_path respectively.
CLASSIFIER_TRAIN_DATA_PATH = 'gs://tf-perfzero-data/xlnet/imdb/spiece.model.len-512.train.tf_record'
CLASSIFIER_EVAL_DATA_PATH = 'gs://tf-perfzero-data/xlnet/imdb/spiece.model.len-512.dev.eval.tf_record'
# Location of the SQuAD data; presumably used by the SQuAD benchmarks, which
# are outside this view - TODO confirm.
SQUAD_DATA_PATH = 'gs://tf-perfzero-data/xlnet/squadv2_cased/'
# pylint: enable=line-too-long
FLAGS = flags.FLAGS
class XLNetBenchmarkBase(benchmark_utils.BertBenchmarkBase):
  """Base class to hold methods common to test classes in the module."""

  def __init__(self, output_dir=None, tpu=None):
    """Initializes the base benchmark and clears the epoch bookkeeping.

    Args:
      output_dir: forwarded unchanged to the base-class constructor.
      tpu: forwarded unchanged to the base-class constructor.
    """
    super(XLNetBenchmarkBase, self).__init__(output_dir=output_dir, tpu=tpu)
    self.num_epochs = self.num_steps_per_epoch = None

  @flagsaver.flagsaver
  def _run_xlnet_classifier(self):
    """Starts XLNet classification task."""
    run_classifier.main(unused_argv=None)

  @flagsaver.flagsaver
  def _run_xlnet_squad(self):
    """Starts XLNet SQuAD task."""
    run_squad.main(unused_argv=None)
class XLNetClassifyAccuracy(XLNetBenchmarkBase):
  """Short accuracy test for the XLNet classifier model.

  Runs the XLNet classification task on the IMDB TFRecords and reports the
  resulting training summary. Test cases are named
  `benchmark_(number of gpus)_gpu_(dataset type)` for GPU runs and
  `benchmark_(topology)_tpu_(dataset type)` for TPU runs.
  """

  def __init__(self, output_dir=None, tpu=None, **kwargs):
    """Records the data/checkpoint locations and initializes the base class.

    Args:
      output_dir: Forwarded to the base-class constructor.
      tpu: Forwarded to the base-class constructor.
      **kwargs: Accepted for call compatibility; not used by this class.
    """
    self.train_data_path = CLASSIFIER_TRAIN_DATA_PATH
    self.eval_data_path = CLASSIFIER_EVAL_DATA_PATH
    self.pretrained_checkpoint_path = PRETRAINED_CHECKPOINT_PATH
    super(XLNetClassifyAccuracy, self).__init__(output_dir=output_dir, tpu=tpu)

  @benchmark_wrappers.enable_runtime_flags
  def _run_and_report_benchmark(self,
                                training_summary_path,
                                min_accuracy=0.95,
                                max_accuracy=0.97):
    """Runs the classifier task, then reports the summary it wrote.

    Args:
      training_summary_path: Path of the JSON training summary to read once
        the run has finished.
      min_accuracy: Lower bound handed to `_report_benchmark`.
      max_accuracy: Upper bound handed to `_report_benchmark`.
    """
    began_at = time.time()
    self._run_xlnet_classifier()
    elapsed_sec = time.time() - began_at

    # The summary is read as bytes and decoded explicitly as UTF-8 JSON.
    with tf.io.gfile.GFile(training_summary_path, 'rb') as summary_file:
      raw_summary = summary_file.read()
    training_stats = json.loads(raw_summary.decode('utf-8'))

    super(XLNetClassifyAccuracy, self)._report_benchmark(
        stats=training_stats,
        wall_time_sec=elapsed_sec,
        min_accuracy=min_accuracy,
        max_accuracy=max_accuracy)

  def _setup(self):
    """Runs the base setup, then sets the flags for the IMDB classifier run."""
    super(XLNetClassifyAccuracy, self)._setup()

    # Data and sequence sizing.
    FLAGS.test_data_size = 25024
    FLAGS.train_batch_size = 16
    FLAGS.seq_len = 512
    FLAGS.mem_len = 0

    # Model dimensions.
    FLAGS.n_layer = 24
    FLAGS.d_model = 1024
    FLAGS.d_embed = 1024
    FLAGS.n_head = 16
    FLAGS.d_head = 64
    FLAGS.d_inner = 4096
    FLAGS.untie_r = True
    FLAGS.n_class = 2
    FLAGS.ff_activation = 'gelu'

    # Distribution strategy and training schedule.
    FLAGS.strategy_type = 'mirror'
    FLAGS.learning_rate = 2e-5
    FLAGS.train_steps = 4000
    FLAGS.warmup_steps = 500
    FLAGS.iterations = 200
    FLAGS.bi_data = False

    # Checkpoint and dataset locations recorded by __init__.
    FLAGS.init_checkpoint = self.pretrained_checkpoint_path
    FLAGS.train_tfrecord_path = self.train_data_path
    FLAGS.test_tfrecord_path = self.eval_data_path

  @owner_utils.Owner('tf-model-garden')
  def benchmark_8_gpu_imdb(self):
    """Run XLNet model accuracy test with 8 GPUs."""
    self._setup()
    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_imdb')
    # Sets timer_callback to None as we do not use it now.
    self.timer_callback = None
    training_summary = os.path.join(
        FLAGS.model_dir, 'summaries/training_summary.txt')
    self._run_and_report_benchmark(training_summary)

  @owner_utils.Owner('tf-model-garden')
  def benchmark_2x2_tpu_imdb(self):
    """Run XLNet model accuracy test on 2x2 tpu."""
    self._setup()
    # Override the 'mirror' strategy chosen in _setup.
    FLAGS.strategy_type = 'tpu'
    FLAGS.model_dir = self._get_model_dir('benchmark_2x2_tpu_imdb')
    # Sets timer_callback to None as we do not use it now.
    self.timer_callback = None
    training_summary = os.path.join(
        FLAGS.model_dir, 'summaries/training_summary.txt')
    self._run_and_report_benchmark(training_summary)
class XLNetSquadAccuracy(XLNetBenchmarkBase):
  """Short accuracy test for the XLNet SQuAD model.

  Runs the XLNet SQuAD task on the SQuAD v2 data and reports the resulting
  training summary. Test cases are named
  `benchmark_(number of gpus)_gpu_(dataset type)` for GPU runs and
  `benchmark_(topology)_tpu_(dataset type)` for TPU runs.
  """

  def __init__(self, output_dir=None, tpu=None, **kwargs):
    """Records the data/checkpoint locations and initializes the base class.

    Args:
      output_dir: Forwarded to the base-class constructor.
      tpu: Forwarded to the base-class constructor.
      **kwargs: Accepted for call compatibility; not used by this class.
    """
    squad_root = SQUAD_DATA_PATH
    self.train_data_path = squad_root
    self.predict_file = os.path.join(squad_root, 'dev-v2.0.json')
    self.test_data_path = os.path.join(squad_root, '12048.eval.tf_record')
    self.spiece_model_file = os.path.join(squad_root, 'spiece.cased.model')
    self.pretrained_checkpoint_path = PRETRAINED_CHECKPOINT_PATH
    super(XLNetSquadAccuracy, self).__init__(output_dir=output_dir, tpu=tpu)

  @benchmark_wrappers.enable_runtime_flags
  def _run_and_report_benchmark(self,
                                training_summary_path,
                                min_accuracy=87.0,
                                max_accuracy=89.0):
    """Runs the SQuAD task, then reports the summary it wrote.

    Args:
      training_summary_path: Path of the JSON training summary to read once
        the run has finished.
      min_accuracy: Lower bound handed to `_report_benchmark`.
      max_accuracy: Upper bound handed to `_report_benchmark`.
    """
    began_at = time.time()
    self._run_xlnet_squad()
    elapsed_sec = time.time() - began_at

    # The summary is read as bytes and decoded explicitly as UTF-8 JSON.
    with tf.io.gfile.GFile(training_summary_path, 'rb') as summary_file:
      raw_summary = summary_file.read()
    training_stats = json.loads(raw_summary.decode('utf-8'))

    super(XLNetSquadAccuracy, self)._report_benchmark(
        stats=training_stats,
        wall_time_sec=elapsed_sec,
        min_accuracy=min_accuracy,
        max_accuracy=max_accuracy)

  def _setup(self):
    """Runs the base setup, then sets the flags for the SQuAD v2 run."""
    super(XLNetSquadAccuracy, self)._setup()

    # Batch and sequence sizing.
    FLAGS.train_batch_size = 16
    FLAGS.seq_len = 512
    FLAGS.mem_len = 0

    # Model dimensions.
    FLAGS.n_layer = 24
    FLAGS.d_model = 1024
    FLAGS.d_embed = 1024
    FLAGS.n_head = 16
    FLAGS.d_head = 64
    FLAGS.d_inner = 4096
    FLAGS.untie_r = True
    FLAGS.ff_activation = 'gelu'

    # Distribution strategy and training schedule.
    FLAGS.strategy_type = 'mirror'
    FLAGS.learning_rate = 3e-5
    FLAGS.train_steps = 8000
    FLAGS.warmup_steps = 1000
    FLAGS.iterations = 1000
    FLAGS.bi_data = False

    # Checkpoint and dataset locations recorded by __init__.
    FLAGS.init_checkpoint = self.pretrained_checkpoint_path
    FLAGS.train_tfrecord_path = self.train_data_path
    FLAGS.test_tfrecord_path = self.test_data_path
    FLAGS.spiece_model_file = self.spiece_model_file
    FLAGS.predict_file = self.predict_file

    # Optimizer settings.
    FLAGS.adam_epsilon = 1e-6
    FLAGS.lr_layer_decay_rate = 0.75

  @owner_utils.Owner('tf-model-garden')
  def benchmark_8_gpu_squadv2(self):
    """Run XLNet model squad v2 accuracy test with 8 GPUs."""
    self._setup()
    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_squadv2')
    # Predictions are written alongside the model artifacts.
    FLAGS.predict_dir = FLAGS.model_dir
    # Sets timer_callback to None as we do not use it now.
    self.timer_callback = None
    training_summary = os.path.join(
        FLAGS.model_dir, 'summaries/training_summary.txt')
    self._run_and_report_benchmark(training_summary)

  @owner_utils.Owner('tf-model-garden')
  def benchmark_2x2_tpu_squadv2(self):
    """Run XLNet model squad v2 accuracy test on 2x2 tpu."""
    self._setup()
    # Override the 'mirror' strategy chosen in _setup.
    FLAGS.strategy_type = 'tpu'
    FLAGS.model_dir = self._get_model_dir('benchmark_2x2_tpu_squadv2')
    # Predictions are written alongside the model artifacts.
    FLAGS.predict_dir = FLAGS.model_dir
    # Sets timer_callback to None as we do not use it now.
    self.timer_callback = None
    training_summary = os.path.join(
        FLAGS.model_dir, 'summaries/training_summary.txt')
    self._run_and_report_benchmark(training_summary)
# When executed as a script, hand control to tf.test.main().
if __name__ == '__main__':
  tf.test.main()
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "vXLA5InzXydn"
},
"source": [
"##### Copyright 2021 The TensorFlow Authors."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "RuRlpLL-X0R_"
},
"outputs": [],
"source": [
"#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"# you may not use this file except in compliance with the License.\n",
"# You may obtain a copy of the License at\n",
"#\n",
"# https://www.apache.org/licenses/LICENSE-2.0\n",
"#\n",
"# Unless required by applicable law or agreed to in writing, software\n",
"# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"# See the License for the specific language governing permissions and\n",
"# limitations under the License."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "fsACVQpVSifi"
},
"source": [
"### Install the TensorFlow Model Garden pip package\n",
"\n",
"* `tf-models-official` is the stable Model Garden package. Note that it may not include the latest changes in the `tensorflow_models` github repo. To include latest changes, you may install `tf-models-nightly`,\n",
"which is the nightly Model Garden package created daily automatically.\n",
"* pip will install all models and dependencies automatically."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "hYEwGTeCXnnX"
},
"source": [
"\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n",
" \u003ctd\u003e\n",
" \u003ca target=\"_blank\" href=\"https://www.tensorflow.org/official_models/tutorials/decoding_api_in_tf_nlp.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\n",
" \u003c/td\u003e\n",
" \u003ctd\u003e\n",
" \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/models/blob/master/official/colab/decoding_api_in_tf_nlp.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n",
" \u003c/td\u003e\n",
" \u003ctd\u003e\n",
" \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/models/blob/master/official/colab/decoding_api_in_tf_nlp.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n",
" \u003c/td\u003e\n",
" \u003ctd\u003e\n",
" \u003ca href=\"https://storage.googleapis.com/tensorflow_docs/models/official/colab/decoding_api_in_tf_nlp.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\n",
" \u003c/td\u003e\n",
"\u003c/table\u003e"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "2j-xhrsVQOQT"
},
"outputs": [],
"source": [
"pip install tf-models-nightly"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "BjP7zwxmskpY"
},
"outputs": [],
"source": [
"import os\n",
"\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"\n",
"import tensorflow as tf\n",
"\n",
"from official import nlp\n",
"from official.nlp.modeling.ops import sampling_module\n",
"from official.nlp.modeling.ops import beam_search"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "0AWgyo-IQ5sP"
},
"source": [
"# Decoding API\n",
"This API provides an interface to experiment with different decoding strategies used for auto-regressive models.\n",
"\n",
"1. The following sampling strategies are provided in sampling_module.py, which inherits from the base Decoding class:\n",
" * [top_p](https://arxiv.org/abs/1904.09751) : [github](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/ops/sampling_module.py#L65) \n",
"\n",
" This implementation chooses most probable logits with cumulative probabilities upto top_p.\n",
"\n",
" * [top_k](https://arxiv.org/pdf/1805.04833.pdf) : [github](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/ops/sampling_module.py#L48)\n",
"\n",
" At each timestep, this implementation samples from top-k logits based on their probability distribution\n",
"\n",
" * Greedy : [github](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/ops/sampling_module.py#L26)\n",
"\n",
" This implementation returns the top logits based on probabilities.\n",
"\n",
"2. Beam search is provided in beam_search.py. [github](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/ops/beam_search.py)\n",
"\n",
" This implementation reduces the risk of missing hidden high probability logits by keeping the most likely num_beams of logits at each time step and eventually choosing the logits that has the overall highest probability."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "MfOj7oaBRQnS"
},
"source": [
"## Initialize Sampling Module in TF-NLP.\n",
"\n",
"\n",
"\u003e **symbols_to_logits_fn** : This is a closure implemented by the users of the API. The input to this closure will be \n",
"```\n",
"Args:\n",
" 1] ids [batch_size, .. (index + 1 or 1 if padded_decode is True)],\n",
" 2] index [scalar] : current decoded step,\n",
" 3] cache [nested dictionary of tensors].\n",
"Returns:\n",
" 1] tensor for next-step logits [batch_size, vocab]\n",
" 2] the updated_cache [nested dictionary of tensors].\n",
"```\n",
"This closure calls the model to predict the logits for the 'index+1' step. The cache is used for faster decoding.\n",
"Here is a [reference](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/ops/beam_search_test.py#L88) implementation for the above closure.\n",
"\n",
"\n",
"\u003e **length_normalization_fn** : Closure for returning length normalization parameter.\n",
"```\n",
"Args: \n",
" 1] length : scalar for decoded step index.\n",
" 2] dtype : data-type of output tensor\n",
"Returns:\n",
" 1] value of length normalization factor.\n",
"Example :\n",
" def _length_norm(length, dtype):\n",
" return tf.pow(((5. + tf.cast(length, dtype)) / 6.), 0.0)\n",
"```\n",
"\n",
"\u003e **vocab_size** : Output vocabulary size.\n",
"\n",
"\u003e **max_decode_length** : Scalar for total number of decoding steps.\n",
"\n",
"\u003e **eos_id** : Decoding will stop if all output decoded ids in the batch have this ID.\n",
"\n",
"\u003e **padded_decode** : Set this to True if running on TPU. Tensors are padded to max_decoding_length if this is True.\n",
"\n",
"\u003e **top_k** : top_k is enabled if this value is \u003e 1.\n",
"\n",
"\u003e **top_p** : top_p is enabled if this value is \u003e 0 and \u003c 1.0\n",
"\n",
"\u003e **sampling_temperature** : This is used to re-estimate the softmax output. Temperature skews the distribution towards high probability tokens and lowers the mass in tail distribution. Value has to be positive. Low temperature is equivalent to greedy and makes the distribution sharper, while high temperature makes it more flat.\n",
"\n",
"\u003e **enable_greedy** : By default, this is true and greedy decoding is enabled.\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "lV1RRp6ihnGX"
},
"source": [
"# Initialize the Model Hyper-parameters"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "eTsGp2gaKLdE"
},
"outputs": [],
"source": [
"params = {}\n",
"params['num_heads'] = 2\n",
"params['num_layers'] = 2\n",
"params['batch_size'] = 2\n",
"params['n_dims'] = 256\n",
"params['max_decode_length'] = 4"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "UGvmd0_dRFYI"
},
"source": [
"## What is a Cache?\n",
"In auto-regressive architectures like Transformer based [Encoder-Decoder](https://arxiv.org/abs/1706.03762) models, \n",
"Cache is used for fast sequential decoding.\n",
"It is a nested dictionary storing pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention blocks) for every layer.\n",
"\n",
"```\n",
"{\n",
" 'layer_%d' % layer: {\n",
" 'k': tf.zeros([params['batch_size'], params['max_decode_length'], params['num_heads'], params['n_dims']/params['num_heads']], dtype=tf.float32),\n",
" 'v': tf.zeros([params['batch_size'], params['max_decode_length'], params['num_heads'], params['n_dims']/params['num_heads']], dtype=tf.float32)\n",
" } for layer in range(params['num_layers']),\n",
" 'model_specific_item' : Model specific tensor shape,\n",
"}\n",
"\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "CYXkoplAij01"
},
"source": [
"# Initialize cache. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "D6kfZOOKgkm1"
},
"outputs": [],
"source": [
"cache = {\n",
" 'layer_%d' % layer: {\n",
" 'k': tf.zeros([params['batch_size'], params['max_decode_length'], params['num_heads'], params['n_dims']/params['num_heads']], dtype=tf.float32),\n",
" 'v': tf.zeros([params['batch_size'], params['max_decode_length'], params['num_heads'], params['n_dims']/params['num_heads']], dtype=tf.float32)\n",
" } for layer in range(params['num_layers'])\n",
" }\n",
"print(\"cache key shape for layer 1 :\", cache['layer_1']['k'].shape)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "nNY3Xn8SiblP"
},
"source": [
"# Define closure for length normalization. **optional.**\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "T92ccAzlnGqh"
},
"outputs": [],
"source": [
"def length_norm(length, dtype):\n",
" \"\"\"Return length normalization factor.\"\"\"\n",
" return tf.pow(((5. + tf.cast(length, dtype)) / 6.), 0.0)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "syl7I5nURPgW"
},
"source": [
"# Create model_fn\n",
" In practice, this will be replaced by an actual model implementation such as [here](https://github.com/tensorflow/models/blob/master/official/nlp/transformer/transformer.py#L236)\n",
"```\n",
"Args:\n",
"i : Step that is being decoded.\n",
"Returns:\n",
" logit probabilities of size [batch_size, 1, vocab_size]\n",
"```\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "AhzSkRisRdB6"
},
"outputs": [],
"source": [
"probabilities = tf.constant([[[0.3, 0.4, 0.3], [0.3, 0.3, 0.4],\n",
" [0.1, 0.1, 0.8], [0.1, 0.1, 0.8]],\n",
" [[0.2, 0.5, 0.3], [0.2, 0.7, 0.1],\n",
" [0.1, 0.1, 0.8], [0.1, 0.1, 0.8]]])\n",
"def model_fn(i):\n",
" return probabilities[:, i, :]"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "DBMUkaVmVZBg"
},
"source": [
"# Initialize symbols_to_logits_fn\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "FAJ4CpbfVdjr"
},
"outputs": [],
"source": [
"def _symbols_to_logits_fn():\n",
" \"\"\"Calculates logits of the next tokens.\"\"\"\n",
" def symbols_to_logits_fn(ids, i, temp_cache):\n",
" del ids\n",
" logits = tf.cast(tf.math.log(model_fn(i)), tf.float32)\n",
" return logits, temp_cache\n",
" return symbols_to_logits_fn"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "R_tV3jyWVL47"
},
"source": [
"# Greedy \n",
"Greedy decoding selects the token id with the highest probability as its next id: $id_t = argmax_{w}P(id | id_{1:t-1})$ at each timestep $t$. The following sketch shows greedy decoding. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "aGt9idSkVQEJ"
},
"outputs": [],
"source": [
"greedy_obj = sampling_module.SamplingModule(\n",
" length_normalization_fn=None,\n",
" dtype=tf.float32,\n",
" symbols_to_logits_fn=_symbols_to_logits_fn(),\n",
" vocab_size=3,\n",
" max_decode_length=params['max_decode_length'],\n",
" eos_id=10,\n",
" padded_decode=False)\n",
"ids, _ = greedy_obj.generate(\n",
" initial_ids=tf.constant([9, 1]), initial_cache=cache)\n",
"print(\"Greedy Decoded Ids:\", ids)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "s4pTTsQXVz5O"
},
"source": [
"# top_k sampling\n",
"In *Top-K* sampling, the *K* most likely next token ids are filtered and the probability mass is redistributed among only those *K* ids. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "pCLWIn6GV5_G"
},
"outputs": [],
"source": [
"top_k_obj = sampling_module.SamplingModule(\n",
" length_normalization_fn=length_norm,\n",
" dtype=tf.float32,\n",
" symbols_to_logits_fn=_symbols_to_logits_fn(),\n",
" vocab_size=3,\n",
" max_decode_length=params['max_decode_length'],\n",
" eos_id=10,\n",
" sample_temperature=tf.constant(1.0),\n",
" top_k=tf.constant(3),\n",
" padded_decode=False,\n",
" enable_greedy=False)\n",
"ids, _ = top_k_obj.generate(\n",
" initial_ids=tf.constant([9, 1]), initial_cache=cache)\n",
"print(\"top-k sampled Ids:\", ids)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Jp3G-eE_WI4Y"
},
"source": [
"# top_p sampling\n",
"Instead of sampling only from the most likely *K* token ids, in *Top-p* sampling chooses from the smallest possible set of ids whose cumulative probability exceeds the probability *p*."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "rEGdIWcuWILO"
},
"outputs": [],
"source": [
"top_p_obj = sampling_module.SamplingModule(\n",
" length_normalization_fn=length_norm,\n",
" dtype=tf.float32,\n",
" symbols_to_logits_fn=_symbols_to_logits_fn(),\n",
" vocab_size=3,\n",
" max_decode_length=params['max_decode_length'],\n",
" eos_id=10,\n",
" sample_temperature=tf.constant(1.0),\n",
" top_p=tf.constant(0.9),\n",
" padded_decode=False,\n",
" enable_greedy=False)\n",
"ids, _ = top_p_obj.generate(\n",
" initial_ids=tf.constant([9, 1]), initial_cache=cache)\n",
"print(\"top-p sampled Ids:\", ids)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "2hcuyJ2VWjDz"
},
"source": [
"# Beam search decoding\n",
"Beam search reduces the risk of missing hidden high probability token ids by keeping the most likely num_beams of hypotheses at each time step and eventually choosing the hypothesis that has the overall highest probability. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "cJ3WzvSrWmSA"
},
"outputs": [],
"source": [
"beam_size = 2\n",
"params['batch_size'] = 1\n",
"beam_cache = {\n",
" 'layer_%d' % layer: {\n",
" 'k': tf.zeros([params['batch_size'], params['max_decode_length'], params['num_heads'], params['n_dims']], dtype=tf.float32),\n",
" 'v': tf.zeros([params['batch_size'], params['max_decode_length'], params['num_heads'], params['n_dims']], dtype=tf.float32)\n",
" } for layer in range(params['num_layers'])\n",
" }\n",
"print(\"cache key shape for layer 1 :\", beam_cache['layer_1']['k'].shape)\n",
"ids, _ = beam_search.sequence_beam_search(\n",
" symbols_to_logits_fn=_symbols_to_logits_fn(),\n",
" initial_ids=tf.constant([9], tf.int32),\n",
" initial_cache=beam_cache,\n",
" vocab_size=3,\n",
" beam_size=beam_size,\n",
" alpha=0.6,\n",
" max_decode_length=params['max_decode_length'],\n",
" eos_id=10,\n",
" padded_decode=False,\n",
" dtype=tf.float32)\n",
"print(\"Beam search ids:\", ids)"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [],
"name": "decoding_api_in_tf_nlp.ipynb",
"provenance": [],
"toc_visible": true
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
......@@ -3,7 +3,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "vXLA5InzXydn"
},
"source": [
......@@ -12,11 +11,9 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"cellView": "form",
"colab": {},
"colab_type": "code",
"id": "RuRlpLL-X0R_"
},
"outputs": [],
......@@ -37,7 +34,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "1mLJmVotXs64"
},
"source": [
......@@ -47,7 +43,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "hYEwGTeCXnnX"
},
"source": [
......@@ -64,13 +59,15 @@
" \u003ctd\u003e\n",
" \u003ca href=\"https://storage.googleapis.com/tensorflow_docs/models/official/colab/fine_tuning_bert.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\n",
" \u003c/td\u003e\n",
" \u003ctd\u003e\n",
" \u003ca href=\"https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/hub_logo_32px.png\" /\u003eSee TF Hub model\u003c/a\u003e\n",
" \u003c/td\u003e\n",
"\u003c/table\u003e"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "YN2ACivEPxgD"
},
"source": [
......@@ -82,7 +79,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "s2d9S2CSSO1z"
},
"source": [
......@@ -92,34 +88,30 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "fsACVQpVSifi"
},
"source": [
"### Install the TensorFlow Model Garden pip package\n",
"\n",
"* `tf-models-nightly` is the nightly Model Garden package created daily automatically.\n",
"* `tf-models-official` is the stable Model Garden package. Note that it may not include the latest changes in the `tensorflow_models` github repo. To include latest changes, you may install `tf-models-nightly`,\n",
"which is the nightly Model Garden package created daily automatically.\n",
"* pip will install all models and dependencies automatically."
]
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "NvNr2svBM-p3"
},
"outputs": [],
"source": [
"!pip install -q tf-nightly\n",
"!pip install -q tf-models-nightly"
"!pip install -q tf-models-official==2.4.0"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "U-7qPCjWUAyy"
},
"source": [
......@@ -128,10 +120,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "lXsXev5MNr20"
},
"outputs": [],
......@@ -160,13 +150,12 @@
"import official.nlp.data.classifier_data_lib\n",
"import official.nlp.modeling.losses\n",
"import official.nlp.modeling.models\n",
"import official.nlp.modeling.networks"
"import official.nlp.modeling.networks\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "mbanlzTvJBsz"
},
"source": [
......@@ -176,7 +165,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "PpW0x8TpR8DT"
},
"source": [
......@@ -185,45 +173,39 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "vzRHOLciR8eq"
},
"outputs": [],
"source": [
"gs_folder_bert = \"gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-12_H-768_A-12\"\n",
"gs_folder_bert = \"gs://cloud-tpu-checkpoints/bert/v3/uncased_L-12_H-768_A-12\"\n",
"tf.io.gfile.listdir(gs_folder_bert)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "9uFskufsR2LT"
},
"source": [
"You can get a pre-trained BERT encoder from TensorFlow Hub here:"
"You can get a pre-trained BERT encoder from [TensorFlow Hub](https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2):"
]
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "e0dAkUttJAzj"
},
"outputs": [],
"source": [
"hub_url_bert = \"https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2\""
"hub_url_bert = \"https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3\""
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Qv6abtRvH4xO"
},
"source": [
......@@ -236,7 +218,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "28DvUhC1YUiB"
},
"source": [
......@@ -252,10 +233,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "Ijikx5OsH9AT"
},
"outputs": [],
......@@ -267,10 +246,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "xf9zz4vLYXjr"
},
"outputs": [],
......@@ -281,7 +258,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "ZgBg2r2nYT-K"
},
"source": [
......@@ -290,10 +266,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "IQrHxv7W7jH5"
},
"outputs": [],
......@@ -304,7 +278,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "vhsVWYNxazz5"
},
"source": [
......@@ -313,10 +286,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "n0gfc_VTayfQ"
},
"outputs": [],
......@@ -327,7 +298,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "38zJcap6xkbC"
},
"source": [
......@@ -336,10 +306,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "xON_i6SkwApW"
},
"outputs": [],
......@@ -353,7 +321,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "9fbTyfJpNr7x"
},
"source": [
......@@ -363,7 +330,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "wqeN54S61ZKQ"
},
"source": [
......@@ -376,10 +342,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "idxyhmrCQcw5"
},
"outputs": [],
......@@ -395,7 +359,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "zYHDSquU2lDU"
},
"source": [
......@@ -404,10 +367,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "L_OfOYPg853R"
},
"outputs": [],
......@@ -421,7 +382,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "kkAXLtuyWWDI"
},
"source": [
......@@ -435,7 +395,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "62UTWLQd9-LB"
},
"source": [
......@@ -446,10 +405,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "bdL-dRNRBRJT"
},
"outputs": [],
......@@ -460,7 +417,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "UrPktnqpwqie"
},
"source": [
......@@ -469,10 +425,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "BR7BmtU498Bh"
},
"outputs": [],
......@@ -490,10 +444,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "has42aUdfky-"
},
"outputs": [],
......@@ -505,7 +457,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "MU9lTWy_xXbb"
},
"source": [
......@@ -514,10 +465,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "USD8uihw-g4J"
},
"outputs": [],
......@@ -530,7 +479,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "xmNv4l4k-dBZ"
},
"source": [
......@@ -540,7 +488,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "DIWjNIKq-ldh"
},
"source": [
......@@ -553,7 +500,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "ulNZ4U96-8JZ"
},
"source": [
......@@ -562,10 +508,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "EezOO9qj91kP"
},
"outputs": [],
......@@ -578,7 +522,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "rxLenwAvCkBf"
},
"source": [
......@@ -587,10 +530,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "2CetH_5C9P2m"
},
"outputs": [],
......@@ -606,7 +547,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "P5UBnCn8Ii6s"
},
"source": [
......@@ -617,10 +557,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "sDGiWYPLEd5a"
},
"outputs": [],
......@@ -661,10 +599,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "yuLKxf6zHxw-"
},
"outputs": [],
......@@ -682,7 +618,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "7FC5aLVxKVKK"
},
"source": [
......@@ -691,10 +626,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "jyjTdGpFhO_1"
},
"outputs": [],
......@@ -708,7 +641,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "FSwymsbkbLDA"
},
"source": [
......@@ -718,7 +650,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Efrj3Cn1kLAp"
},
"source": [
......@@ -728,7 +659,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "xxpOY5r2Ayq6"
},
"source": [
......@@ -737,10 +667,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "ujapVfZ_AKW7"
},
"outputs": [],
......@@ -758,7 +686,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "96ldxDSwkVkj"
},
"source": [
......@@ -769,10 +696,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "cH682__U0FBv"
},
"outputs": [],
......@@ -784,7 +709,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "XqKp3-5GIZlw"
},
"source": [
......@@ -793,10 +717,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "bAQblMIjwkvx"
},
"outputs": [],
......@@ -807,7 +729,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "sFmVG4SKZAw8"
},
"source": [
......@@ -816,10 +737,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "VTjgPbp4ZDKo"
},
"outputs": [],
......@@ -834,7 +753,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Q0NTdwZsQK8n"
},
"source": [
......@@ -845,10 +763,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "8L__-erBwLIQ"
},
"outputs": [],
......@@ -859,7 +775,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "mKAvkQc3heSy"
},
"source": [
......@@ -870,23 +785,20 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "97Ll2Gichd_Y"
},
"outputs": [],
"source": [
"checkpoint = tf.train.Checkpoint(model=bert_encoder)\n",
"checkpoint.restore(\n",
"checkpoint = tf.train.Checkpoint(encoder=bert_encoder)\n",
"checkpoint.read(\n",
" os.path.join(gs_folder_bert, 'bert_model.ckpt')).assert_consumed()"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "2oHOql35k3Dd"
},
"source": [
......@@ -896,7 +808,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "115caFLMk-_l"
},
"source": [
......@@ -908,10 +819,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "w8qXKRZuCwW4"
},
"outputs": [],
......@@ -934,7 +843,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "pXRGxiRNEHS2"
},
"source": [
......@@ -943,10 +851,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "eQNA16bhDpky"
},
"outputs": [],
......@@ -957,7 +863,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "xqu_K71fJQB8"
},
"source": [
......@@ -967,7 +872,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "78FEUOOEkoP0"
},
"source": [
......@@ -977,7 +881,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "OTNcA0O0nSq9"
},
"source": [
......@@ -986,10 +889,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "nzi8hjeTQTRs"
},
"outputs": [],
......@@ -1012,7 +913,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "IFtKFWbNKb0u"
},
"source": [
......@@ -1023,10 +923,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "9ZoUgDUNJPz3"
},
"outputs": [],
......@@ -1046,7 +944,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "7ynJibkBRTJF"
},
"source": [
......@@ -1055,26 +952,22 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "umo0ttrgRYIM"
},
"outputs": [],
"source": [
"result = bert_classifier(my_examples, training=False)\n",
"\n",
"result = tf.argmax(result).numpy()\n",
"result = tf.argmax(result, axis=-1).numpy()\n",
"result"
]
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "utGl0M3aZCE4"
},
"outputs": [],
......@@ -1085,7 +978,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "fVo_AnT0l26j"
},
"source": [
......@@ -1096,10 +988,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "Nl5x6nElZqkP"
},
"outputs": [],
......@@ -1110,10 +1000,9 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"collapsed": true,
"id": "y_ACvKPsVUXC"
},
"outputs": [],
......@@ -1134,7 +1023,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "eQceYqRFT_Eg"
},
"source": [
......@@ -1144,7 +1032,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "SaC1RlFawUpc"
},
"source": [
......@@ -1155,7 +1042,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "CwUdjFBkzUgh"
},
"source": [
......@@ -1167,7 +1053,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "2UTQrkyOT5wD"
},
"source": [
......@@ -1176,10 +1061,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "XQeDFOzYR9Z9"
},
"outputs": [],
......@@ -1192,7 +1075,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "XrFQbfErUWxa"
},
"source": [
......@@ -1201,10 +1083,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "ymw7GOHpSHKU"
},
"outputs": [],
......@@ -1231,7 +1111,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "uX_Sp-wTUoRm"
},
"source": [
......@@ -1240,10 +1119,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "rkHxIK57SQ_r"
},
"outputs": [],
......@@ -1264,7 +1141,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "stbaVouogvzS"
},
"source": [
......@@ -1273,10 +1149,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "gwhrlQl4gxVF"
},
"outputs": [],
......@@ -1287,7 +1161,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "dbJ76vSJj77j"
},
"source": [
......@@ -1297,7 +1170,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "9J95LFRohiYw"
},
"source": [
......@@ -1306,10 +1178,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "gCvaLLAxPuMc"
},
"outputs": [],
......@@ -1351,10 +1221,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "rutkBadrhzdR"
},
"outputs": [],
......@@ -1379,10 +1247,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "59TVgt4Z7fuU"
},
"outputs": [],
......@@ -1393,7 +1259,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "QbklKt-w_CiI"
},
"source": [
......@@ -1406,17 +1271,38 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "lo6479At4sP1"
"id": "GDWrHm0BGpbX"
},
"outputs": [],
"source": [
"# Note: 350MB download.\n",
"import tensorflow_hub as hub\n",
"hub_encoder = hub.KerasLayer(hub_url_bert, trainable=True)\n",
"import tensorflow_hub as hub"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "Y29meH0qGq_5"
},
"outputs": [],
"source": [
"hub_model_name = \"bert_en_uncased_L-12_H-768_A-12\" #@param [\"bert_en_uncased_L-24_H-1024_A-16\", \"bert_en_wwm_cased_L-24_H-1024_A-16\", \"bert_en_uncased_L-12_H-768_A-12\", \"bert_en_wwm_uncased_L-24_H-1024_A-16\", \"bert_en_cased_L-24_H-1024_A-16\", \"bert_en_cased_L-12_H-768_A-12\", \"bert_zh_L-12_H-768_A-12\", \"bert_multi_cased_L-12_H-768_A-12\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "lo6479At4sP1"
},
"outputs": [],
"source": [
"hub_encoder = hub.KerasLayer(f\"https://tfhub.dev/tensorflow/{hub_model_name}/3\",\n",
" trainable=True)\n",
"\n",
"print(f\"The Hub encoder has {len(hub_encoder.trainable_variables)} trainable variables\")"
]
......@@ -1424,7 +1310,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "iTzF574wivQv"
},
"source": [
......@@ -1433,29 +1318,27 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "XEcYrCR45Uwo"
},
"outputs": [],
"source": [
"result = hub_encoder(\n",
" inputs=[glue_train['input_word_ids'][:10],\n",
" glue_train['input_mask'][:10],\n",
" glue_train['input_type_ids'][:10],],\n",
" inputs=dict(\n",
" input_word_ids=glue_train['input_word_ids'][:10],\n",
" input_mask=glue_train['input_mask'][:10],\n",
" input_type_ids=glue_train['input_type_ids'][:10],),\n",
" training=False,\n",
")\n",
"\n",
"print(\"Pooled output shape:\", result[0].shape)\n",
"print(\"Sequence output shape:\", result[1].shape)"
"print(\"Pooled output shape:\", result['pooled_output'].shape)\n",
"print(\"Sequence output shape:\", result['sequence_output'].shape)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "cjojn8SmLSRI"
},
"source": [
......@@ -1466,35 +1349,33 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "9nTDaApyLR70"
},
"outputs": [],
"source": [
"hub_classifier, hub_encoder = bert.bert_models.classifier_model(\n",
" # Caution: Most of `bert_config` is ignored if you pass a hub url.\n",
" bert_config=bert_config, hub_module_url=hub_url_bert, num_labels=2)"
"hub_classifier = nlp.modeling.models.BertClassifier(\n",
" bert_encoder,\n",
" num_classes=2,\n",
" dropout_rate=0.1,\n",
" initializer=tf.keras.initializers.TruncatedNormal(\n",
" stddev=0.02))"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "xMJX3wV0_v7I"
},
"source": [
"The one downside to loading this model from TFHub is that the structure of internal Keras layers is not restored, so it's more difficult to inspect or modify the model. The `TransformerEncoder` model is now a single layer:"
"The one downside to loading this model from TFHub is that the structure of internal Keras layers is not restored, so it's more difficult to inspect or modify the model. The `BertEncoder` model is now a single layer:"
]
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "pD71dnvhM2QS"
},
"outputs": [],
......@@ -1504,10 +1385,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "nLZD-isBzNKi"
},
"outputs": [],
......@@ -1522,7 +1401,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "ZxSqH0dNAgXV"
},
"source": [
......@@ -1530,13 +1408,12 @@
"\n",
"### Low level model building\n",
"\n",
"If you need more control over the construction of the model, it's worth noting that the `classifier_model` function used earlier is really just a thin wrapper over the `nlp.modeling.networks.TransformerEncoder` and `nlp.modeling.models.BertClassifier` classes. Just remember that if you start modifying the architecture, it may not be correct or possible to reload the pre-trained checkpoint, so you'll need to retrain from scratch."
"If you need more control over the construction of the model, it's worth noting that the `classifier_model` function used earlier is really just a thin wrapper over the `nlp.modeling.networks.BertEncoder` and `nlp.modeling.models.BertClassifier` classes. Just remember that if you start modifying the architecture, it may not be correct or possible to reload the pre-trained checkpoint, so you'll need to retrain from scratch."
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "0cgABEwDj06P"
},
"source": [
......@@ -1545,45 +1422,40 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "5r_yqhBFSVEM"
},
"outputs": [],
"source": [
"transformer_config = config_dict.copy()\n",
"bert_encoder_config = config_dict.copy()\n",
"\n",
"# You need to rename a few fields to make this work:\n",
"transformer_config['attention_dropout_rate'] = transformer_config.pop('attention_probs_dropout_prob')\n",
"transformer_config['activation'] = tf_utils.get_activation(transformer_config.pop('hidden_act'))\n",
"transformer_config['dropout_rate'] = transformer_config.pop('hidden_dropout_prob')\n",
"transformer_config['initializer'] = tf.keras.initializers.TruncatedNormal(\n",
" stddev=transformer_config.pop('initializer_range'))\n",
"transformer_config['max_sequence_length'] = transformer_config.pop('max_position_embeddings')\n",
"transformer_config['num_layers'] = transformer_config.pop('num_hidden_layers')\n",
"bert_encoder_config['attention_dropout_rate'] = bert_encoder_config.pop('attention_probs_dropout_prob')\n",
"bert_encoder_config['activation'] = tf_utils.get_activation(bert_encoder_config.pop('hidden_act'))\n",
"bert_encoder_config['dropout_rate'] = bert_encoder_config.pop('hidden_dropout_prob')\n",
"bert_encoder_config['initializer'] = tf.keras.initializers.TruncatedNormal(\n",
" stddev=bert_encoder_config.pop('initializer_range'))\n",
"bert_encoder_config['max_sequence_length'] = bert_encoder_config.pop('max_position_embeddings')\n",
"bert_encoder_config['num_layers'] = bert_encoder_config.pop('num_hidden_layers')\n",
"\n",
"transformer_config"
"bert_encoder_config"
]
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "rIO8MI7LLijh"
},
"outputs": [],
"source": [
"manual_encoder = nlp.modeling.networks.TransformerEncoder(**transformer_config)"
"manual_encoder = nlp.modeling.networks.BertEncoder(**bert_encoder_config)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "4a4tFSg9krRi"
},
"source": [
......@@ -1592,23 +1464,20 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "X6N9NEqfXJCx"
},
"outputs": [],
"source": [
"checkpoint = tf.train.Checkpoint(model=manual_encoder)\n",
"checkpoint.restore(\n",
"checkpoint = tf.train.Checkpoint(encoder=manual_encoder)\n",
"checkpoint.read(\n",
" os.path.join(gs_folder_bert, 'bert_model.ckpt')).assert_consumed()"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "1BPiPO4ykuwM"
},
"source": [
......@@ -1617,10 +1486,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "hlVdgJKmj389"
},
"outputs": [],
......@@ -1634,7 +1501,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "nJMXvVgJkyBv"
},
"source": [
......@@ -1643,10 +1509,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "tQX57GJ6wkAb"
},
"outputs": [],
......@@ -1654,17 +1518,14 @@
"manual_classifier = nlp.modeling.models.BertClassifier(\n",
" bert_encoder,\n",
" num_classes=2,\n",
" dropout_rate=transformer_config['dropout_rate'],\n",
" initializer=tf.keras.initializers.TruncatedNormal(\n",
" stddev=bert_config.initializer_range))"
" dropout_rate=bert_encoder_config['dropout_rate'],\n",
" initializer=bert_encoder_config['initializer'])"
]
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "kB-nBWhQk0dS"
},
"outputs": [],
......@@ -1675,7 +1536,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "E6AJlOSyIO1L"
},
"source": [
......@@ -1688,10 +1548,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "28Dv3BPRlFTD"
},
"outputs": [],
......@@ -1703,7 +1561,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "LRjcHr0UlT8c"
},
"source": [
......@@ -1714,10 +1571,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "MHY8K6kDngQn"
},
"outputs": [],
......@@ -1733,10 +1588,9 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"collapsed": true,
"id": "wKIcSprulu3P"
},
"outputs": [],
......@@ -1752,7 +1606,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "IMTC_gfAl_PZ"
},
"source": [
......@@ -1761,10 +1614,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "YRt3VTmBmCBY"
},
"outputs": [],
......@@ -1786,7 +1637,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "l8D9Lv3Bn740"
},
"source": [
......@@ -1795,10 +1645,8 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "2Hf2rpRXk89N"
},
"outputs": [],
......
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Customizing a Transformer Encoder",
"private_outputs": true,
"provenance": [],
"collapsed_sections": [],
"toc_visible": true
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Bp8t2AI8i7uP"
},
"source": [
......@@ -12,14 +26,10 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"colab": {},
"colab_type": "code",
"id": "rxPj2Lsni9O4"
},
"outputs": [],
"source": [
"#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"# you may not use this file except in compliance with the License.\n",
......@@ -32,12 +42,13 @@
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"# See the License for the specific language governing permissions and\n",
"# limitations under the License."
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "6xS-9i5DrRvO"
},
"source": [
......@@ -47,30 +58,28 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Mwb9uw1cDXsa"
},
"source": [
"\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n",
" \u003ctd\u003e\n",
" \u003ca target=\"_blank\" href=\"https://www.tensorflow.org/official_models/nlp/customize_encoder\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\n",
" \u003c/td\u003e\n",
" \u003ctd\u003e\n",
" \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/models/blob/master/official/colab/nlp/customize_encoder.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n",
" \u003c/td\u003e\n",
" \u003ctd\u003e\n",
" \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/models/blob/master/official/colab/nlp/customize_encoder.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n",
" \u003c/td\u003e\n",
" \u003ctd\u003e\n",
" \u003ca href=\"https://storage.googleapis.com/tensorflow_docs/models/official/colab/nlp/customize_encoder.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\n",
" \u003c/td\u003e\n",
"\u003c/table\u003e"
"<table class=\"tfo-notebook-buttons\" align=\"left\">\n",
" <td>\n",
" <a target=\"_blank\" href=\"https://www.tensorflow.org/official_models/nlp/customize_encoder\"><img src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" />View on TensorFlow.org</a>\n",
" </td>\n",
" <td>\n",
" <a target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/models/blob/master/official/colab/nlp/customize_encoder.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>\n",
" </td>\n",
" <td>\n",
" <a target=\"_blank\" href=\"https://github.com/tensorflow/models/blob/master/official/colab/nlp/customize_encoder.ipynb\"><img src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a>\n",
" </td>\n",
" <td>\n",
" <a href=\"https://storage.googleapis.com/tensorflow_docs/models/official/colab/nlp/customize_encoder.ipynb\"><img src=\"https://www.tensorflow.org/images/download_logo_32px.png\" />Download notebook</a>\n",
" </td>\n",
"</table>"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "iLrcV4IyrcGX"
},
"source": [
......@@ -84,7 +93,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "YYxdyoWgsl8t"
},
"source": [
......@@ -94,34 +102,30 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "fEJSFutUsn_h"
},
"source": [
"### Install the TensorFlow Model Garden pip package\n",
"\n",
"* `tf-models-nightly` is the nightly Model Garden package created daily automatically.\n",
"* `tf-models-official` is the stable Model Garden package. Note that it may not include the latest changes in the `tensorflow_models` GitHub repo. To include the latest changes, you may install `tf-models-nightly`,\n",
"which is the nightly Model Garden package, built automatically every day.\n",
"* `pip` will install all models and dependencies automatically."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "thsKZDjhswhR"
},
"outputs": [],
"source": [
"!pip install -q tf-nightly\n",
"!pip install -q tf-models-nightly"
]
"!pip install -q tf-models-official==2.4.0"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "hpf7JPCVsqtv"
},
"source": [
......@@ -130,13 +134,9 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "my4dp-RMssQe"
},
"outputs": [],
"source": [
"import numpy as np\n",
"import tensorflow as tf\n",
......@@ -144,12 +144,13 @@
"from official.modeling import activations\n",
"from official.nlp import modeling\n",
"from official.nlp.modeling import layers, losses, models, networks"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "vjDmVsFfs85n"
},
"source": [
......@@ -160,13 +161,9 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "Oav8sbgstWc-"
},
"outputs": [],
"source": [
"cfg = {\n",
" \"vocab_size\": 100,\n",
......@@ -177,22 +174,23 @@
" \"activation\": activations.gelu,\n",
" \"dropout_rate\": 0.1,\n",
" \"attention_dropout_rate\": 0.1,\n",
" \"sequence_length\": 16,\n",
" \"max_sequence_length\": 16,\n",
" \"type_vocab_size\": 2,\n",
" \"initializer\": tf.keras.initializers.TruncatedNormal(stddev=0.02),\n",
"}\n",
"bert_encoder = modeling.networks.TransformerEncoder(**cfg)\n",
"bert_encoder = modeling.networks.BertEncoder(**cfg)\n",
"\n",
"def build_classifier(bert_encoder):\n",
" return modeling.models.BertClassifier(bert_encoder, num_classes=2)\n",
"\n",
"canonical_classifier_model = build_classifier(bert_encoder)"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Qe2UWI6_tsHo"
},
"source": [
......@@ -203,31 +201,28 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "csED2d-Yt5h6"
},
"outputs": [],
"source": [
"def predict(model):\n",
" batch_size = 3\n",
" np.random.seed(0)\n",
" word_ids = np.random.randint(\n",
" cfg[\"vocab_size\"], size=(batch_size, cfg[\"sequence_length\"]))\n",
" mask = np.random.randint(2, size=(batch_size, cfg[\"sequence_length\"]))\n",
" cfg[\"vocab_size\"], size=(batch_size, cfg[\"max_sequence_length\"]))\n",
" mask = np.random.randint(2, size=(batch_size, cfg[\"max_sequence_length\"]))\n",
" type_ids = np.random.randint(\n",
" cfg[\"type_vocab_size\"], size=(batch_size, cfg[\"sequence_length\"]))\n",
" cfg[\"type_vocab_size\"], size=(batch_size, cfg[\"max_sequence_length\"]))\n",
" print(model([word_ids, mask, type_ids], training=False))\n",
"\n",
"predict(canonical_classifier_model)"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "PzKStEK9t_Pb"
},
"source": [
......@@ -239,7 +234,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "rmwQfhj6fmKz"
},
"source": [
......@@ -250,7 +244,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "xsMgEVHAui11"
},
"source": [
......@@ -263,26 +256,21 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "-JBabpa2AOz8"
},
"source": [
"#### Without Customization\n",
"\n",
"Without any customization, `EncoderScaffold` behaves the same as the canonical `TransformerEncoder`.\n",
"Without any customization, `EncoderScaffold` behaves the same as the canonical `BertEncoder`.\n",
"\n",
"As shown in the following example, `EncoderScaffold` can load `TransformerEncoder`'s weights and output the same values:"
"As shown in the following example, `EncoderScaffold` can load `BertEncoder`'s weights and output the same values:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "ktNzKuVByZQf"
},
"outputs": [],
"source": [
"default_hidden_cfg = dict(\n",
" num_attention_heads=cfg[\"num_attention_heads\"],\n",
......@@ -296,10 +284,9 @@
" vocab_size=cfg[\"vocab_size\"],\n",
" type_vocab_size=cfg[\"type_vocab_size\"],\n",
" hidden_size=cfg[\"hidden_size\"],\n",
" seq_length=cfg[\"sequence_length\"],\n",
" initializer=tf.keras.initializers.TruncatedNormal(0.02),\n",
" dropout_rate=cfg[\"dropout_rate\"],\n",
" max_seq_length=cfg[\"sequence_length\"],\n",
" max_seq_length=cfg[\"max_sequence_length\"]\n",
")\n",
"default_kwargs = dict(\n",
" hidden_cfg=default_hidden_cfg,\n",
......@@ -309,17 +296,19 @@
" return_all_layer_outputs=True,\n",
" pooler_layer_initializer=tf.keras.initializers.TruncatedNormal(0.02),\n",
")\n",
"\n",
"encoder_scaffold = modeling.networks.EncoderScaffold(**default_kwargs)\n",
"classifier_model_from_encoder_scaffold = build_classifier(encoder_scaffold)\n",
"classifier_model_from_encoder_scaffold.set_weights(\n",
" canonical_classifier_model.get_weights())\n",
"predict(classifier_model_from_encoder_scaffold)"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "sMaUmLyIuwcs"
},
"source": [
......@@ -332,18 +321,14 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "LTinnaG6vcsw"
},
"outputs": [],
"source": [
"word_ids = tf.keras.layers.Input(\n",
" shape=(cfg['sequence_length'],), dtype=tf.int32, name=\"input_word_ids\")\n",
" shape=(cfg['max_sequence_length'],), dtype=tf.int32, name=\"input_word_ids\")\n",
"mask = tf.keras.layers.Input(\n",
" shape=(cfg['sequence_length'],), dtype=tf.int32, name=\"input_mask\")\n",
" shape=(cfg['max_sequence_length'],), dtype=tf.int32, name=\"input_mask\")\n",
"embedding_layer = modeling.layers.OnDeviceEmbedding(\n",
" vocab_size=cfg['vocab_size'],\n",
" embedding_width=cfg['hidden_size'],\n",
......@@ -353,12 +338,13 @@
"attention_mask = layers.SelfAttentionMask()([word_embeddings, mask])\n",
"new_embedding_network = tf.keras.Model([word_ids, mask],\n",
" [word_embeddings, attention_mask])"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "HN7_yu-6O3qI"
},
"source": [
......@@ -368,21 +354,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "fO9zKFE4OpHp"
},
"outputs": [],
"source": [
"tf.keras.utils.plot_model(new_embedding_network, show_shapes=True, dpi=48)"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "9cOaGQHLv12W"
},
"source": [
......@@ -391,13 +374,9 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "mtFDMNf2vIl9"
},
"outputs": [],
"source": [
"kwargs = dict(default_kwargs)\n",
"\n",
......@@ -412,12 +391,13 @@
"\n",
"# Assert that there are only two inputs.\n",
"assert len(classifier_model.inputs) == 2"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Z73ZQDtmwg9K"
},
"source": [
......@@ -432,13 +412,9 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "uAIarLZgw6pA"
},
"outputs": [],
"source": [
"kwargs = dict(default_kwargs)\n",
"\n",
......@@ -452,12 +428,13 @@
"\n",
"# Assert that the variable `rezero_alpha` from ReZeroTransformer exists.\n",
"assert 'rezero_alpha' in ''.join([x.name for x in classifier_model.trainable_weights])"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "6PMHFdvnxvR0"
},
"source": [
......@@ -470,7 +447,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "D6FejlgwyAy_"
},
"source": [
......@@ -485,13 +461,9 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "nFrSMrZuyNeQ"
},
"outputs": [],
"source": [
"# Use TalkingHeadsAttention\n",
"hidden_cfg = dict(default_hidden_cfg)\n",
......@@ -508,12 +480,13 @@
"\n",
"# Assert that the variable `pre_softmax_weight` from TalkingHeadsAttention exists.\n",
"assert 'pre_softmax_weight' in ''.join([x.name for x in classifier_model.trainable_weights])"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "kuEJcTyByVvI"
},
"source": [
......@@ -528,13 +501,9 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "XAbKy_l4y_-i"
},
"outputs": [],
"source": [
"# Use GatedFeedforward\n",
"hidden_cfg = dict(default_hidden_cfg)\n",
......@@ -551,12 +520,13 @@
"\n",
"# Assert that the variable `gate` from GatedFeedforward exists.\n",
"assert 'gate' in ''.join([x.name for x in classifier_model.trainable_weights])"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "a_8NWUhkzeAq"
},
"source": [
......@@ -564,29 +534,26 @@
"\n",
"Finally, you could also build a new encoder using building blocks in the modeling library.\n",
"\n",
"See [AlbertTransformerEncoder](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/networks/albert_transformer_encoder.py) as an example:\n"
"See [AlbertEncoder](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/networks/albert_encoder.py) as an example:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "xsiA3RzUzmUM"
},
"outputs": [],
"source": [
"albert_encoder = modeling.networks.AlbertTransformerEncoder(**cfg)\n",
"albert_encoder = modeling.networks.AlbertEncoder(**cfg)\n",
"classifier_model = build_classifier(albert_encoder)\n",
"# ... Train the model ...\n",
"predict(classifier_model)"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "MeidDfhlHKSO"
},
"source": [
......@@ -595,31 +562,14 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "Uv_juT22HERW"
},
"outputs": [],
"source": [
"tf.keras.utils.plot_model(albert_encoder, show_shapes=True, dpi=48)"
]
}
],
"metadata": {
"colab": {
"collapsed_sections": [],
"name": "Customizing a Transformer Encoder",
"private_outputs": true,
"provenance": [],
"toc_visible": true
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
],
"execution_count": null,
"outputs": []
}
},
"nbformat": 4,
"nbformat_minor": 0
}
]
}
\ No newline at end of file
......@@ -3,7 +3,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "80xnUmoI7fBX"
},
"source": [
......@@ -15,8 +14,6 @@
"execution_count": null,
"metadata": {
"cellView": "form",
"colab": {},
"colab_type": "code",
"id": "8nvTnfs6Q692"
},
"outputs": [],
......@@ -37,7 +34,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "WmfcMK5P5C1G"
},
"source": [
......@@ -47,7 +43,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "cH-oJ8R6AHMK"
},
"source": [
......@@ -70,7 +65,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "0H_EFIhq4-MJ"
},
"source": [
......@@ -82,7 +76,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "2N97-dps_nUk"
},
"source": [
......@@ -92,13 +85,13 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "459ygAVl_rg0"
},
"source": [
"### Install the TensorFlow Model Garden pip package\n",
"\n",
"* `tf-models-nightly` is the nightly Model Garden package created daily automatically.\n",
"* `tf-models-official` is the stable Model Garden package. Note that it may not include the latest changes in the `tensorflow_models` GitHub repo. To include the latest changes, you may install `tf-models-nightly`,\n",
"which is the nightly Model Garden package, built automatically every day.\n",
"* `pip` will install all models and dependencies automatically."
]
},
......@@ -106,20 +99,16 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "Y-qGkdh6_sZc"
},
"outputs": [],
"source": [
"!pip install -q tf-nightly\n",
"!pip install -q tf-models-nightly"
"!pip install -q tf-models-official==2.4.0"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "e4huSSwyAG_5"
},
"source": [
......@@ -130,8 +119,6 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "jqYXqtjBAJd9"
},
"outputs": [],
......@@ -146,7 +133,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "djBQWjvy-60Y"
},
"source": [
......@@ -160,13 +146,12 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "MKuHVlsCHmiq"
},
"source": [
"### Build a `BertPretrainer` model wrapping `TransformerEncoder`\n",
"### Build a `BertPretrainer` model wrapping `BertEncoder`\n",
"\n",
"The [TransformerEncoder](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/networks/transformer_encoder.py) implements the Transformer-based encoder as described in [BERT paper](https://arxiv.org/abs/1810.04805). It includes the embedding lookups and transformer layers, but not the masked language model or classification task networks.\n",
"The [BertEncoder](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/networks/bert_encoder.py) implements the Transformer-based encoder as described in [BERT paper](https://arxiv.org/abs/1810.04805). It includes the embedding lookups and transformer layers, but not the masked language model or classification task networks.\n",
"\n",
"The [BertPretrainer](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/models/bert_pretrainer.py) allows a user to pass in a transformer stack, and instantiates the masked language model and classification networks that are used to create the training objectives."
]
......@@ -175,8 +160,6 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "EXkcXz-9BwB3"
},
"outputs": [],
......@@ -184,14 +167,13 @@
"# Build a small transformer network.\n",
"vocab_size = 100\n",
"sequence_length = 16\n",
"network = modeling.networks.TransformerEncoder(\n",
"network = modeling.networks.BertEncoder(\n",
" vocab_size=vocab_size, num_layers=2, sequence_length=16)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "0NH5irV5KTMS"
},
"source": [
......@@ -204,8 +186,6 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "lZNoZkBrIoff"
},
"outputs": [],
......@@ -217,8 +197,6 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "o7eFOZXiIl-b"
},
"outputs": [],
......@@ -232,7 +210,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "d5h5HT7gNHx_"
},
"source": [
......@@ -243,8 +220,6 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "2tcNfm03IBF7"
},
"outputs": [],
......@@ -256,8 +231,6 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "F2oHrXGUIS0M"
},
"outputs": [],
......@@ -280,7 +253,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "bnx3UCHniCS5"
},
"source": [
......@@ -292,8 +264,6 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "k30H4Q86f52x"
},
"outputs": [],
......@@ -316,7 +286,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "wrmSs8GjHxVw"
},
"source": [
......@@ -328,7 +297,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "k8cQVFvBCV4s"
},
"source": [
......@@ -342,28 +310,25 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "xrLLEWpfknUW"
},
"source": [
"### Build a BertSpanLabeler wrapping TransformerEncoder\n",
"### Build a BertSpanLabeler wrapping BertEncoder\n",
"\n",
"[BertSpanLabeler](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/models/bert_span_labeler.py) implements a simple single-span start-end predictor (that is, a model that predicts two values: a start token index and an end token index), suitable for SQuAD-style tasks.\n",
"\n",
"Note that `BertSpanLabeler` wraps a `TransformerEncoder`, the weights of which can be restored from the above pretraining model.\n"
"Note that `BertSpanLabeler` wraps a `BertEncoder`, the weights of which can be restored from the above pretraining model.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "B941M4iUCejO"
},
"outputs": [],
"source": [
"network = modeling.networks.TransformerEncoder(\n",
"network = modeling.networks.BertEncoder(\n",
" vocab_size=vocab_size, num_layers=2, sequence_length=sequence_length)\n",
"\n",
"# Create a BERT trainer with the created network.\n",
......@@ -373,7 +338,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "QpB9pgj4PpMg"
},
"source": [
......@@ -384,8 +348,6 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "RbqRNJCLJu4H"
},
"outputs": [],
......@@ -397,8 +359,6 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "fUf1vRxZJwio"
},
"outputs": [],
......@@ -417,7 +377,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "WqhgQaN1lt-G"
},
"source": [
......@@ -429,8 +388,6 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "waqs6azNl3Nn"
},
"outputs": [],
......@@ -450,7 +407,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Zdf03YtZmd_d"
},
"source": [
......@@ -460,7 +416,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "0A1XnGSTChg9"
},
"source": [
......@@ -472,26 +427,23 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "MSK8OpZgnQa9"
},
"source": [
"### Build a BertClassifier model wrapping TransformerEncoder\n",
"### Build a BertClassifier model wrapping BertEncoder\n",
"\n",
"[BertClassifier](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/models/bert_classifier.py) implements a simple token classification model containing a single classification head using the `TokenClassification` network."
"[BertClassifier](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/models/bert_classifier.py) implements a [CLS] token classification model containing a single classification head."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "cXXCsffkCphk"
},
"outputs": [],
"source": [
"network = modeling.networks.TransformerEncoder(\n",
"network = modeling.networks.BertEncoder(\n",
" vocab_size=vocab_size, num_layers=2, sequence_length=sequence_length)\n",
"\n",
"# Create a BERT trainer with the created network.\n",
......@@ -503,7 +455,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "8tZKueKYP4bB"
},
"source": [
......@@ -514,8 +465,6 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "snlutm9ZJgEZ"
},
"outputs": [],
......@@ -527,8 +476,6 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "yyHPHsqBJkCz"
},
"outputs": [],
......@@ -546,7 +493,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "w--a2mg4nzKm"
},
"source": [
......@@ -559,23 +505,20 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "9X0S1DoFn_5Q"
},
"outputs": [],
"source": [
"labels = np.random.randint(num_classes, size=(batch_size))\n",
"\n",
"loss = modeling.losses.weighted_sparse_categorical_crossentropy_loss(\n",
" labels=labels, predictions=tf.nn.log_softmax(logits, axis=-1))\n",
"loss = tf.keras.losses.sparse_categorical_crossentropy(\n",
" labels, logits, from_logits=True)\n",
"print(loss)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "mzBqOylZo3og"
},
"source": [
......
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "vs3a5tGVAWGI"
},
"source": [
"##### Copyright 2021 The TensorFlow Authors."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "HYfsarcYBJQp"
},
"outputs": [],
"source": [
"#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"# you may not use this file except in compliance with the License.\n",
"# You may obtain a copy of the License at\n",
"#\n",
"# https://www.apache.org/licenses/LICENSE-2.0\n",
"#\n",
"# Unless required by applicable law or agreed to in writing, software\n",
"# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"# See the License for the specific language governing permissions and\n",
"# limitations under the License."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "aOpqCFEyBQDd"
},
"source": [
"# Uncertainty-aware Deep Language Learning with BERT-SNGP"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "6MlSYP6cBT61"
},
"source": [
"\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n",
" \u003ctd\u003e\n",
" \u003ca target=\"_blank\" href=\"https://www.tensorflow.org/official_models/tutorials/uncertainty_quantification_with_sngp_bert.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\n",
" \u003c/td\u003e\n",
" \u003ctd\u003e\n",
" \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/models/blob/master/official/colab/uncertainty_quantification_with_sngp_bert.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n",
" \u003c/td\u003e\n",
" \u003ctd\u003e\n",
" \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/models/blob/master/official/colab/uncertainty_quantification_with_sngp_bert.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView on GitHub\u003c/a\u003e\n",
" \u003c/td\u003e\n",
" \u003ctd\u003e\n",
" \u003ca href=\"https://storage.googleapis.com/tensorflow_docs/docs/models/official/colab/fine_tuning_bert.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\n",
" \u003c/td\u003e\n",
" \u003ctd\u003e\n",
" \u003ca href=\"https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/hub_logo_32px.png\" /\u003eSee TF Hub model\u003c/a\u003e\n",
" \u003c/td\u003e\n",
"\u003c/table\u003e"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "-IM5IzM26GBh"
},
"source": [
"In the [SNGP tutorial](https://www.tensorflow.org/tutorials/uncertainty/sngp), you learned how to build SNGP model on top of a deep residual network to improve its ability to quantify its uncertainty. In this tutorial, you will apply SNGP to a natural language understanding (NLU) task by building it on top of a deep BERT encoder to improve deep NLU model's ability in detecting out-of-scope queries. \n",
"\n",
"Specifically, you will:\n",
"* Build BERT-SNGP, a SNGP-augmented [BERT](https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2) model.\n",
"* Load the [CLINC Out-of-scope (OOS)](https://www.tensorflow.org/datasets/catalog/clinc_oos) intent detection dataset.\n",
"* Train the BERT-SNGP model.\n",
"* Evaluate the BERT-SNGP model's performance in uncertainty calibration and out-of-domain detection.\n",
"\n",
"Beyond CLINC OOS, the SNGP model has been applied to large-scale datasets such as [Jigsaw toxicity detection](https://www.tensorflow.org/datasets/catalog/wikipedia_toxicity_subtypes), and to the image datasets such as [CIFAR-100](https://www.tensorflow.org/datasets/catalog/cifar100) and [ImageNet](https://www.tensorflow.org/datasets/catalog/imagenet2012). \n",
"For benchmark results of SNGP and other uncertainty methods, as well as high-quality implementation with end-to-end training / evaluation scripts, you can check out the [Uncertainty Baselines](https://github.com/google/uncertainty-baselines) benchmark."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "-bsids4eAYYI"
},
"source": [
"## Setup"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "3sgnLBKk7iuR"
},
"outputs": [],
"source": [
"!pip install tf-models-nightly"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "M42dnVSk7dVy"
},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"import sklearn.metrics\n",
"import sklearn.calibration\n",
"\n",
"import tensorflow_hub as hub\n",
"import tensorflow_datasets as tfds\n",
"\n",
"import numpy as np\n",
"import tensorflow as tf\n",
"\n",
"import official.nlp.modeling.layers as layers\n",
"import official.nlp.optimization as optimization"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "cnRQfguq6GZj"
},
"source": [
"First implement a standard BERT classifier following the [classify text with BERT](https://www.tensorflow.org/tutorials/text/classify_text_with_bert) tutorial. We will use the [BERT-base](https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3) encoder, and the built-in [`ClassificationHead`](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/layers/cls_head.py) as the classifier."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "bNBEGs7s6NHB"
},
"outputs": [],
"source": [
"#@title Standard BERT model\n",
"\n",
"PREPROCESS_HANDLE = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'\n",
"MODEL_HANDLE = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3'\n",
"\n",
"class BertClassifier(tf.keras.Model):\n",
" def __init__(self, \n",
" num_classes=150, inner_dim=768, dropout_rate=0.1,\n",
" **classifier_kwargs):\n",
" \n",
" super().__init__()\n",
" self.classifier_kwargs = classifier_kwargs\n",
"\n",
" # Initiate the BERT encoder components.\n",
" self.bert_preprocessor = hub.KerasLayer(PREPROCESS_HANDLE, name='preprocessing')\n",
" self.bert_hidden_layer = hub.KerasLayer(MODEL_HANDLE, trainable=True, name='bert_encoder')\n",
"\n",
" # Defines the encoder and classification layers.\n",
" self.bert_encoder = self.make_bert_encoder()\n",
" self.classifier = self.make_classification_head(num_classes, inner_dim, dropout_rate)\n",
"\n",
" def make_bert_encoder(self):\n",
" text_inputs = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')\n",
" encoder_inputs = self.bert_preprocessor(text_inputs)\n",
" encoder_outputs = self.bert_hidden_layer(encoder_inputs)\n",
" return tf.keras.Model(text_inputs, encoder_outputs)\n",
"\n",
" def make_classification_head(self, num_classes, inner_dim, dropout_rate):\n",
" return layers.ClassificationHead(\n",
" num_classes=num_classes, \n",
" inner_dim=inner_dim,\n",
" dropout_rate=dropout_rate,\n",
" **self.classifier_kwargs)\n",
"\n",
" def call(self, inputs, **kwargs):\n",
" encoder_outputs = self.bert_encoder(inputs)\n",
" classifier_inputs = encoder_outputs['sequence_output']\n",
" return self.classifier(classifier_inputs, **kwargs)\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "SbhbNbKk6WNR"
},
"source": [
"### Build SNGP model"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "p7YakN0V6Oif"
},
"source": [
"To implement a BERT-SNGP model, you only need to replace the `ClassificationHead` with the built-in [`GaussianProcessClassificationHead`](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/layers/cls_head.py). Spectral normalization is already pre-packaged into this classification head. Like in the [SNGP tutorial](https://www.tensorflow.org/tutorials/uncertainty/sngp), add a covariance reset callback to the model, so the model automatically reset the covariance estimator at the begining of a new epoch to avoid counting the same data twice."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "QCaJy85y8WeE"
},
"outputs": [],
"source": [
"class ResetCovarianceCallback(tf.keras.callbacks.Callback):\n",
"\n",
" def on_epoch_begin(self, epoch, logs=None):\n",
" \"\"\"Resets covariance matrix at the begining of the epoch.\"\"\"\n",
" if epoch \u003e 0:\n",
" self.model.classifier.reset_covariance_matrix()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "YoHgOuiZ6Q4y"
},
"outputs": [],
"source": [
"class SNGPBertClassifier(BertClassifier):\n",
"\n",
" def make_classification_head(self, num_classes, inner_dim, dropout_rate):\n",
" return layers.GaussianProcessClassificationHead(\n",
" num_classes=num_classes, \n",
" inner_dim=inner_dim,\n",
" dropout_rate=dropout_rate,\n",
" gp_cov_momentum=-1,\n",
" temperature=30.,\n",
" **self.classifier_kwargs)\n",
"\n",
" def fit(self, *args, **kwargs):\n",
" \"\"\"Adds ResetCovarianceCallback to model callbacks.\"\"\"\n",
" kwargs['callbacks'] = list(kwargs.get('callbacks', []))\n",
" kwargs['callbacks'].append(ResetCovarianceCallback())\n",
"\n",
" return super().fit(*args, **kwargs)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "UOj5YWTt6dCe"
},
"source": [
"Note: The `GaussianProcessClassificationHead` takes a new argument `temperature`. It corresponds to the $\\lambda$ parameter in the __mean-field approximation__ introduced in the [SNGP tutorial](https://www.tensorflow.org/tutorials/uncertainty/sngp). In practice, this value is usually treated as a hyperparamter, and is finetuned to optimize the model's calibration performance."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "qdU90uDT6hFq"
},
"source": [
"### Load CLINC OOS dataset"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "AnuNeyHw6kH7"
},
"source": [
"Now load the [CLINC OOS](https://www.tensorflow.org/datasets/catalog/clinc_oos) intent detection dataset. This dataset contains 15000 user's spoken queries collected over 150 intent classes, it also contains 1000 out-of-domain (OOD) sentences that are not covered by any of the known classes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "mkMZN2iA6hhg"
},
"outputs": [],
"source": [
"(clinc_train, clinc_test, clinc_test_oos), ds_info = tfds.load(\n",
" 'clinc_oos', split=['train', 'test', 'test_oos'], with_info=True, batch_size=-1)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "UJSL2nm8Bo02"
},
"source": [
"Make the train and test data."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "cgkOOZOq6fQL"
},
"outputs": [],
"source": [
"train_examples = clinc_train['text']\n",
"train_labels = clinc_train['intent']\n",
"\n",
"# Makes the in-domain (IND) evaluation data.\n",
"ind_eval_data = (clinc_test['text'], clinc_test['intent'])"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Kw76f6caBq_E"
},
"source": [
"Create a OOD evaluation dataset. For this, combine the in-domain test data `clinc_test` and the out-of-domain data `clinc_test_oos`. We will also assign label 0 to the in-domain examples, and label 1 to the out-of-domain examples. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "uVFuzecR64FJ"
},
"outputs": [],
"source": [
"test_data_size = ds_info.splits['test'].num_examples\n",
"oos_data_size = ds_info.splits['test_oos'].num_examples\n",
"\n",
"# Combines the in-domain and out-of-domain test examples.\n",
"oos_texts = tf.concat([clinc_test['text'], clinc_test_oos['text']], axis=0)\n",
"oos_labels = tf.constant([0] * test_data_size + [1] * oos_data_size)\n",
"\n",
"# Converts into a TF dataset.\n",
"ood_eval_dataset = tf.data.Dataset.from_tensor_slices(\n",
" {\"text\": oos_texts, \"label\": oos_labels})"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ZcHwfwfU6qCE"
},
"source": [
"### Train and evaluate"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "_VTY6KYc6sBB"
},
"source": [
"First set up the basic training configurations."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "_-uUkUtk6qWC"
},
"outputs": [],
"source": [
"TRAIN_EPOCHS = 3\n",
"TRAIN_BATCH_SIZE = 32\n",
"EVAL_BATCH_SIZE = 256"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "tiEjMdFV6wXQ"
},
"outputs": [],
"source": [
"#@title\n",
"\n",
"def bert_optimizer(learning_rate, \n",
" batch_size=TRAIN_BATCH_SIZE, epochs=TRAIN_EPOCHS, \n",
" warmup_rate=0.1):\n",
" \"\"\"Creates an AdamWeightDecay optimizer with learning rate schedule.\"\"\"\n",
" train_data_size = ds_info.splits['train'].num_examples\n",
" \n",
" steps_per_epoch = int(train_data_size / batch_size)\n",
" num_train_steps = steps_per_epoch * epochs\n",
" num_warmup_steps = int(warmup_rate * num_train_steps) \n",
"\n",
" # Creates learning schedule.\n",
" lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(\n",
" initial_learning_rate=learning_rate,\n",
" decay_steps=num_train_steps,\n",
" end_learning_rate=0.0) \n",
" \n",
" return optimization.AdamWeightDecay(\n",
" learning_rate=lr_schedule,\n",
" weight_decay_rate=0.01,\n",
" epsilon=1e-6,\n",
" exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "KX_Hzl3l6w-H"
},
"outputs": [],
"source": [
"optimizer = bert_optimizer(learning_rate=1e-4)\n",
"loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)\n",
"metrics = tf.metrics.SparseCategoricalAccuracy()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "ptn9Cupe6z7o"
},
"outputs": [],
"source": [
"fit_configs = dict(batch_size=TRAIN_BATCH_SIZE,\n",
" epochs=TRAIN_EPOCHS,\n",
" validation_batch_size=EVAL_BATCH_SIZE, \n",
" validation_data=ind_eval_data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "0ZK5PBwW61jd"
},
"outputs": [],
"source": [
"sngp_model = SNGPBertClassifier()\n",
"sngp_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)\n",
"sngp_model.fit(train_examples, train_labels, **fit_configs)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "cpDsgTYx63tO"
},
"source": [
"### Evaluate OOD performance"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "d5NGVe7L67bB"
},
"source": [
"Evaluate how well the model can detect the unfamiliar out-of-domain queries. For rigorous evaluation, use the OOD evaluation dataset `ood_eval_dataset` built earlier."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "yyLgt_lL7APo"
},
"outputs": [],
"source": [
"#@title\n",
"\n",
"def oos_predict(model, ood_eval_dataset, **model_kwargs):\n",
" oos_labels = []\n",
" oos_probs = []\n",
"\n",
" ood_eval_dataset = ood_eval_dataset.batch(EVAL_BATCH_SIZE)\n",
" for oos_batch in ood_eval_dataset:\n",
" oos_text_batch = oos_batch[\"text\"]\n",
" oos_label_batch = oos_batch[\"label\"] \n",
"\n",
" pred_logits = model(oos_text_batch, **model_kwargs)\n",
" pred_probs_all = tf.nn.softmax(pred_logits, axis=-1)\n",
" pred_probs = tf.reduce_max(pred_probs_all, axis=-1)\n",
"\n",
" oos_labels.append(oos_label_batch)\n",
" oos_probs.append(pred_probs)\n",
"\n",
" oos_probs = tf.concat(oos_probs, axis=0)\n",
" oos_labels = tf.concat(oos_labels, axis=0) \n",
"\n",
" return oos_probs, oos_labels"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Dmc2tVXs6_uo"
},
"source": [
"Computes the OOD probabilities as $1 - p(x)$, where $p(x)=softmax(logit(x))$ is the predictive probability."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "_9aFVVDO7C7o"
},
"outputs": [],
"source": [
"sngp_probs, ood_labels = oos_predict(sngp_model, ood_eval_dataset)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "_PC0wwZp7GJD"
},
"outputs": [],
"source": [
"ood_probs = 1 - sngp_probs"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "AsandMTX7HjX"
},
"source": [
"Now evaluate how well the model's uncertainty score `ood_probs` predicts the out-of-domain label. First compute the Area under precision-recall curve (AUPRC) for OOD probability v.s. OOD detection accuracy."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "0u5Wx8AP7Mdx"
},
"outputs": [],
"source": [
"precision, recall, _ = sklearn.metrics.precision_recall_curve(ood_labels, ood_probs)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "axcctOsh7N5A"
},
"outputs": [],
"source": [
"auprc = sklearn.metrics.auc(recall, precision)\n",
"print(f'SNGP AUPRC: {auprc:.4f}')"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "U_GEqxq-7Q1Y"
},
"source": [
"This matches the SNGP performance reported at the CLINC OOS benchmark under the [Uncertainty Baselines](https://github.com/google/uncertainty-baselines)."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "8H4vYcyd7Ux2"
},
"source": [
"Next, examine the model's quality in [uncertainty calibration](https://scikit-learn.org/stable/modules/calibration.html), i.e., whether the model's predictive probability corresponds to its predictive accuracy. A well-calibrated model is considered trust-worthy, since, for example, its predictive probability $p(x)=0.8$ means that the model is correct 80% of the time."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "x5GxrSWJ7SYn"
},
"outputs": [],
"source": [
"prob_true, prob_pred = sklearn.calibration.calibration_curve(\n",
" ood_labels, ood_probs, n_bins=10, strategy='quantile')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "ozzJM-D-7XVq"
},
"outputs": [],
"source": [
"plt.plot(prob_pred, prob_true)\n",
"\n",
"plt.plot([0., 1.], [0., 1.], c='k', linestyle=\"--\")\n",
"plt.xlabel('Predictive Probability')\n",
"plt.ylabel('Predictive Accuracy')\n",
"plt.title('Calibration Plots, SNGP')\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "36M6HeHx7ZI4"
},
"source": [
"## Resources and further reading"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "xdFTpyaP0A-N"
},
"source": [
"* See the [SNGP tutorial](https://www.tensorflow.org/tutorials/uncertainty/sngp) for an detailed walkthrough of implementing SNGP from scratch. \n",
"* See [Uncertainty Baselines](https://github.com/google/uncertainty-baselines) for the implementation of SNGP model (and many other uncertainty methods) on a wide variety of benchmark datasets (e.g., [CIFAR](https://www.tensorflow.org/datasets/catalog/cifar100), [ImageNet](https://www.tensorflow.org/datasets/catalog/imagenet2012), [Jigsaw toxicity detection](https://www.tensorflow.org/datasets/catalog/wikipedia_toxicity_subtypes), etc).\n",
"* For a deeper understanding of the SNGP method, check out the paper [Simple and Principled Uncertainty Estimation with Deterministic Deep Learning via Distance Awareness](https://arxiv.org/abs/2006.10108).\n"
]
}
],
"metadata": {
"colab": {
"collapsed_sections": [],
"name": "uncertainty_quantification_with_sngp_bert.ipynb",
"toc_visible": true
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utility library for picking an appropriate dataset function."""
from typing import Any, Callable, Union, Type
import tensorflow as tf
# Return type of `pick_dataset_fn`: either a `tf.data.Dataset` class or a
# callable that accepts a `tf.Tensor`.
PossibleDatasetType = Union[Type[tf.data.Dataset], Callable[[tf.Tensor], Any]]
def pick_dataset_fn(file_type: str) -> PossibleDatasetType:
  """Returns the dataset constructor that reads files of `file_type`.

  Args:
    file_type: name of the on-disk format; only 'tfrecord' is recognized.

  Returns:
    `tf.data.TFRecordDataset` when `file_type` is 'tfrecord'.

  Raises:
    ValueError: if `file_type` is anything other than 'tfrecord'.
  """
  # Guard clause: reject every unsupported format up front.
  if file_type != 'tfrecord':
    raise ValueError('Unrecognized file_type: {}'.format(file_type))
  return tf.data.TFRecordDataset
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Helper functions for running models in a distributed setting."""
import json
import os
import tensorflow as tf
def _collective_communication(all_reduce_alg):
  """Maps an all-reduce algorithm name to a CollectiveCommunication value.

  Args:
    all_reduce_alg: None, "ring" or "nccl". None selects AUTO.

  Returns:
    The matching tf.distribute.experimental.CollectiveCommunication member.

  Raises:
    ValueError: if `all_reduce_alg` is not one of None, "ring", "nccl".
  """
  communication = tf.distribute.experimental.CollectiveCommunication
  options_by_name = {
      None: communication.AUTO,
      "ring": communication.RING,
      "nccl": communication.NCCL,
  }
  if all_reduce_alg in options_by_name:
    return options_by_name[all_reduce_alg]

  raise ValueError(
      "When used with `multi_worker_mirrored`, valid values for "
      "all_reduce_alg are [`ring`, `nccl`]. Supplied value: {}".format(
          all_reduce_alg))
def _mirrored_cross_device_ops(all_reduce_alg, num_packs):
"""Return a CrossDeviceOps based on all_reduce_alg and num_packs.
Args:
all_reduce_alg: a string specifying which cross device op to pick, or None.
num_packs: an integer specifying number of packs for the cross device op.
Returns:
tf.distribute.CrossDeviceOps object or None.
Raises:
ValueError: if `all_reduce_alg` not in [None, "nccl", "hierarchical_copy"].
"""
if all_reduce_alg is None:
return None
mirrored_all_reduce_options = {
"nccl": tf.distribute.NcclAllReduce,
"hierarchical_copy": tf.distribute.HierarchicalCopyAllReduce
}
if all_reduce_alg not in mirrored_all_reduce_options:
raise ValueError(
"When used with `mirrored`, valid values for all_reduce_alg are "
"[`nccl`, `hierarchical_copy`]. Supplied value: {}".format(
all_reduce_alg))
cross_device_ops_class = mirrored_all_reduce_options[all_reduce_alg]
return cross_device_ops_class(num_packs=num_packs)
def tpu_initialize(tpu_address):
  """Initializes TPU for TF 2.x training.

  Args:
    tpu_address: string, bns address of master TPU worker.

  Returns:
    The TPUClusterResolver created for `tpu_address`.
  """
  resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
      tpu=tpu_address)

  # An empty or "local" address skips the connect-to-cluster step.
  is_local = tpu_address in ("", "local")
  if not is_local:
    tf.config.experimental_connect_to_cluster(resolver)

  tf.tpu.experimental.initialize_tpu_system(resolver)
  return resolver
def get_distribution_strategy(distribution_strategy="mirrored",
                              num_gpus=0,
                              all_reduce_alg=None,
                              num_packs=1,
                              tpu_address=None,
                              **kwargs):
  """Return a DistributionStrategy for running the model.

  Args:
    distribution_strategy: a string specifying which distribution strategy to
      use. Accepted values are "off", "one_device", "mirrored",
      "parameter_server", "multi_worker_mirrored", and "tpu" -- case
      insensitive. "off" means not to use Distribution Strategy; "tpu" means to
      use TPUStrategy using `tpu_address`.
    num_gpus: Number of GPUs to run this model.
    all_reduce_alg: Optional. Specifies which algorithm to use when performing
      all-reduce. For `MirroredStrategy`, valid values are "nccl" and
      "hierarchical_copy". For `MultiWorkerMirroredStrategy`, valid values are
      "ring" and "nccl". If None, DistributionStrategy will choose based on
      device topology.
    num_packs: Optional. Sets the `num_packs` in `tf.distribute.NcclAllReduce`
      or `tf.distribute.HierarchicalCopyAllReduce` for `MirroredStrategy`.
    tpu_address: Optional. String that represents the TPU to connect to. Only
      used when `distribution_strategy` is "tpu", where it is forwarded to
      `tpu_initialize`.
      NOTE(review): no check in this function rejects None -- confirm whether
      callers rely on that.
    **kwargs: Additional kwargs for internal usages. They are discarded.

  Returns:
    A tf.distribute strategy object, or None when `distribution_strategy` is
    "off".

  Raises:
    ValueError: if `num_gpus` is negative; if `distribution_strategy` is not a
      string or is not one of the accepted values; if `distribution_strategy`
      is "off" or "one_device" and `num_gpus` is larger than 1; or if
      `all_reduce_alg` is not valid for the chosen strategy.
  """
  del kwargs
  if num_gpus < 0:
    raise ValueError("`num_gpus` can not be negative.")

  if not isinstance(distribution_strategy, str):
    msg = ("distribution_strategy must be a string but got: %s." %
           (distribution_strategy,))
    # Identity check on purpose: `0 == False` is True in Python, so an equality
    # test would attach the yaml hint to numeric zeros as well. Only a real
    # boolean False can come from yaml parsing an unquoted `off`.
    if distribution_strategy is False:
      msg += (" If you meant to pass the string 'off', make sure you add "
              "quotes around 'off' so that yaml interprets it as a string "
              "instead of a bool.")
    raise ValueError(msg)

  distribution_strategy = distribution_strategy.lower()
  if distribution_strategy == "off":
    if num_gpus > 1:
      raise ValueError("When {} GPUs are specified, distribution_strategy "
                       "flag cannot be set to `off`.".format(num_gpus))
    return None

  if distribution_strategy == "tpu":
    # When tpu_address is an empty string, we communicate with local TPUs.
    cluster_resolver = tpu_initialize(tpu_address)
    return tf.distribute.TPUStrategy(cluster_resolver)

  if distribution_strategy == "multi_worker_mirrored":
    return tf.distribute.experimental.MultiWorkerMirroredStrategy(
        communication=_collective_communication(all_reduce_alg))

  if distribution_strategy == "one_device":
    if num_gpus == 0:
      return tf.distribute.OneDeviceStrategy("device:CPU:0")
    if num_gpus > 1:
      raise ValueError("`OneDeviceStrategy` can not be used for more than "
                       "one device.")
    return tf.distribute.OneDeviceStrategy("device:GPU:0")

  if distribution_strategy == "mirrored":
    # With no GPUs the strategy mirrors onto the single CPU device.
    if num_gpus == 0:
      devices = ["device:CPU:0"]
    else:
      devices = ["device:GPU:%d" % i for i in range(num_gpus)]
    return tf.distribute.MirroredStrategy(
        devices=devices,
        cross_device_ops=_mirrored_cross_device_ops(all_reduce_alg, num_packs))

  if distribution_strategy == "parameter_server":
    cluster_resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver()
    return tf.distribute.experimental.ParameterServerStrategy(cluster_resolver)

  raise ValueError("Unrecognized Distribution Strategy: %r" %
                   distribution_strategy)
def configure_cluster(worker_hosts=None, task_index=-1):
  """Set multi-worker cluster spec in TF_CONFIG environment variable.

  An existing, non-empty TF_CONFIG takes precedence and is left untouched;
  otherwise TF_CONFIG is written from `worker_hosts` when that is given.

  Args:
    worker_hosts: comma-separated list of worker ip:port pairs.
    task_index: index of the worker.

  Returns:
    Number of workers in the cluster.

  Raises:
    ValueError: if more than one worker host is given and `task_index` is
      negative.
  """
  existing = json.loads(os.environ.get("TF_CONFIG", "{}"))
  if existing:
    # Chief and worker entries both count towards the cluster size.
    chiefs = existing["cluster"].get("chief", [])
    members = existing["cluster"].get("worker", [])
    return len(chiefs) + len(members)

  if not worker_hosts:
    return 1

  hosts = worker_hosts.split(",")
  host_count = len(hosts)
  if host_count == 1:
    # A lone worker is always task 0, whatever index was passed.
    task_index = 0
  elif task_index < 0:
    raise ValueError("Must specify task_index when number of workers > 1")

  cluster_spec = {
      "cluster": {
          "worker": hosts
      },
      "task": {
          "type": "worker",
          "index": task_index
      }
  }
  os.environ["TF_CONFIG"] = json.dumps(cluster_spec)
  return host_count
def get_strategy_scope(strategy):
  """Return the scope to build a model under.

  Args:
    strategy: a distribution strategy object, or a falsy value (e.g. None)
      when no strategy is in use.

  Returns:
    `strategy.scope()` when `strategy` is truthy, otherwise a
    `DummyContextManager` instance.
  """
  return strategy.scope() if strategy else DummyContextManager()
class DummyContextManager:
  """Context manager that does nothing on entry or exit.

  `__exit__` returns None, so exceptions raised inside the `with` body are
  not suppressed.
  """

  def __enter__(self):
    return None

  def __exit__(self, *args):
    return None
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tests for distribution util functions."""
import tensorflow as tf
from official.common import distribute_utils
class GetDistributionStrategyTest(tf.test.TestCase):
  """Tests for get_distribution_strategy.

  The assertions use `assertEqual` / `assertRaisesRegex`; the older
  `assertEquals` / `assertRaisesRegexp` spellings are deprecated unittest
  aliases that newer Python versions no longer provide.
  """

  def test_one_device_strategy_cpu(self):
    """With zero GPUs the default strategy has a single CPU replica."""
    ds = distribute_utils.get_distribution_strategy(num_gpus=0)
    self.assertEqual(ds.num_replicas_in_sync, 1)
    self.assertEqual(len(ds.extended.worker_devices), 1)
    self.assertIn('CPU', ds.extended.worker_devices[0])

  def test_one_device_strategy_gpu(self):
    """With one GPU the default strategy has a single GPU replica."""
    ds = distribute_utils.get_distribution_strategy(num_gpus=1)
    self.assertEqual(ds.num_replicas_in_sync, 1)
    self.assertEqual(len(ds.extended.worker_devices), 1)
    self.assertIn('GPU', ds.extended.worker_devices[0])

  def test_mirrored_strategy(self):
    """Five GPUs yield five replicas, each placed on a GPU device."""
    ds = distribute_utils.get_distribution_strategy(num_gpus=5)
    self.assertEqual(ds.num_replicas_in_sync, 5)
    self.assertEqual(len(ds.extended.worker_devices), 5)
    for device in ds.extended.worker_devices:
      self.assertIn('GPU', device)

  def test_no_strategy(self):
    """The string 'off' disables distribution and returns None."""
    ds = distribute_utils.get_distribution_strategy('off')
    self.assertIsNone(ds)

  def test_invalid_strategy(self):
    """Non-string strategies are rejected; False gets the extra hint."""
    # The trailing ' If' checks that the yaml hint is appended for False.
    with self.assertRaisesRegex(
        ValueError,
        'distribution_strategy must be a string but got: False. If'):
      distribute_utils.get_distribution_strategy(False)
    with self.assertRaisesRegex(
        ValueError, 'distribution_strategy must be a string but got: 1'):
      distribute_utils.get_distribution_strategy(1)
# Run the test cases in this module when the file is executed as a script.
if __name__ == '__main__':
  tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The central place to define flags."""
from absl import flags
def define_flags():
  """Defines the command-line flags shared by the training binaries.

  Registers `experiment`, `mode`, `model_dir`, `config_file`,
  `params_override`, `gin_file`, `gin_params`, `tpu` and `tf_data_service`
  with absl. `gin_file` and `gin_params` are skipped silently if another
  module has already defined them.
  """
  flags.DEFINE_string(
      'experiment', default=None, help='The experiment type registered.')

  flags.DEFINE_enum(
      'mode',
      default=None,
      enum_values=[
          'train', 'eval', 'train_and_eval', 'continuous_eval',
          'continuous_train_and_eval', 'train_and_validate'
      ],
      help='Mode to run: `train`, `eval`, `train_and_eval`, '
      '`continuous_eval`, `continuous_train_and_eval` and '
      '`train_and_validate` (which is not implemented in '
      'the open source version).')

  # Note the trailing space before the continuation: adjacent string literals
  # are joined with nothing in between.
  flags.DEFINE_string(
      'model_dir',
      default=None,
      help='The directory where the model and training/evaluation summaries '
      'are stored.')

  flags.DEFINE_multi_string(
      'config_file',
      default=None,
      help='YAML/JSON files which specifies overrides. The override order '
      'follows the order of args. Note that each file '
      'can be used as an override template to override the default parameters '
      'specified in Python. If the same parameter is specified in both '
      '`--config_file` and `--params_override`, `config_file` will be used '
      'first, followed by params_override.')

  flags.DEFINE_string(
      'params_override',
      default=None,
      help='a YAML/JSON string or a YAML file which specifies additional '
      'overrides over the default parameters and those specified in '
      '`--config_file`. Note that this is supposed to be used only to override '
      'the model parameters, but not the parameters like TPU specific flags. '
      'One canonical use case of `--config_file` and `--params_override` is '
      'users first define a template config file using `--config_file`, then '
      'use `--params_override` to adjust the minimal set of tuning parameters, '
      'for example setting up different `train_batch_size`. The final override '
      'order of parameters: default_model_params --> params from config_file '
      '--> params in params_override. See also the help message of '
      '`--config_file`.')

  # Libraries that rely on gin often define these flags inside their own
  # library files, which causes conflicts; tolerate an existing definition.
  try:
    flags.DEFINE_multi_string(
        'gin_file', default=None, help='List of paths to the config files.')
  except flags.DuplicateFlagError:
    pass

  try:
    flags.DEFINE_multi_string(
        'gin_params',
        default=None,
        help='Newline separated list of Gin parameter bindings.')
  except flags.DuplicateFlagError:
    pass

  flags.DEFINE_string(
      'tpu',
      default=None,
      help='The Cloud TPU to use for training. This should be either the name '
      'used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 '
      'url.')

  flags.DEFINE_string(
      'tf_data_service', default=None, help='The tf.data service address')
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""All necessary imports for registration."""
# pylint: disable=unused-import
from official.nlp import tasks
from official.nlp.configs import experiment_configs
from official.utils.testing import mock_task
from official.vision import beta
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -12,52 +11,81 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Defines the base task abstraction."""
import abc
import functools
from typing import Any, Callable, Optional
from typing import Optional
import six
from absl import logging
import tensorflow as tf
from official.modeling.hyperparams import config_definitions as cfg
from official.utils import registry
from official.core import config_definitions
from official.modeling import optimization
from official.modeling import performance
OptimizationConfig = optimization.OptimizationConfig
RuntimeConfig = config_definitions.RuntimeConfig
@six.add_metaclass(abc.ABCMeta)
class Task(tf.Module):
class Task(tf.Module, metaclass=abc.ABCMeta):
"""A single-replica view of training procedure.
Tasks provide artifacts for training/evalution procedures, including
loading/iterating over Datasets, initializing the model, calculating the loss
and customized metrics with reduction.
Tasks provide artifacts for training/validation procedures, including
loading/iterating over Datasets, training/validation steps, calculating the
loss and customized metrics with reduction.
"""
# Special keys in train/validate step returned logs.
loss = "loss"
def __init__(self, params: cfg.TaskConfig, logging_dir: str = None):
def __init__(self, params, logging_dir: str = None, name: str = None):
"""Task initialization.
Args:
params: cfg.TaskConfig instance.
params: the task configuration instance, which can be any of dataclass,
ConfigDict, namedtuple, etc.
logging_dir: a string pointing to where the model, summaries etc. will be
saved. You can also write additional stuff in this directory.
name: the task name.
"""
super().__init__(name=name)
self._task_config = params
self._logging_dir = logging_dir
@property
def task_config(self) -> cfg.TaskConfig:
def task_config(self):
return self._task_config
@property
def logging_dir(self) -> str:
  """Returns the value stored in `self._logging_dir`."""
  return self._logging_dir
@classmethod
def create_optimizer(cls, optimizer_config: OptimizationConfig,
                     runtime_config: Optional[RuntimeConfig] = None):
  """Creates a TF optimizer from configurations.

  Args:
    optimizer_config: the parameters of the Optimization settings.
    runtime_config: the parameters of the runtime.

  Returns:
    The optimizer built by `optimization.OptimizerFactory`, passed through
    `performance.configure_optimizer` when `runtime_config.loss_scale` is set.
  """
  factory = optimization.OptimizerFactory(optimizer_config)
  learning_rate = factory.build_learning_rate()
  optimizer = factory.build_optimizer(learning_rate)

  # Without a loss scale in the runtime config the optimizer is used as built.
  wants_loss_scaling = runtime_config and runtime_config.loss_scale
  if not wants_loss_scaling:
    return optimizer

  # Configuring the optimizer with a loss scale helps avoid overflow/underflow
  # for float16 computations.
  return performance.configure_optimizer(
      optimizer,
      use_float16=runtime_config.mixed_precision_dtype == "float16",
      loss_scale=runtime_config.loss_scale)
def initialize(self, model: tf.keras.Model):
"""A callback function used as CheckpointManager's init_fn.
"""[Optional] A callback function used as CheckpointManager's init_fn.
This function will be called when no checkpoint is found for the model.
If there is a checkpoint, the checkpoint will be loaded and this function
......@@ -67,54 +95,34 @@ class Task(tf.Module):
Args:
model: The keras.Model built or used by this task.
"""
pass
ckpt_dir_or_file = self.task_config.init_checkpoint
logging.info("Trying to load pretrained checkpoint from %s",
ckpt_dir_or_file)
if tf.io.gfile.isdir(ckpt_dir_or_file):
ckpt_dir_or_file = tf.train.latest_checkpoint(ckpt_dir_or_file)
if not ckpt_dir_or_file:
return
if hasattr(model, "checkpoint_items"):
checkpoint_items = model.checkpoint_items
else:
checkpoint_items = dict(model=model)
ckpt = tf.train.Checkpoint(**checkpoint_items)
status = ckpt.read(ckpt_dir_or_file)
status.expect_partial().assert_existing_objects_matched()
logging.info("Finished loading pretrained checkpoint from %s",
ckpt_dir_or_file)
@abc.abstractmethod
def build_model(self) -> tf.keras.Model:
"""Creates model architecture.
"""[Optional] Creates model architecture.
Returns:
A model instance.
"""
def compile_model(self,
                  model: tf.keras.Model,
                  optimizer: tf.keras.optimizers.Optimizer,
                  loss=None,
                  train_step: Optional[Callable[..., Any]] = None,
                  validation_step: Optional[Callable[..., Any]] = None,
                  **kwargs) -> tf.keras.Model:
  """Compiles the model with objects created by the task.

  The method should not be used in any customized training implementation.

  Args:
    model: a keras.Model.
    optimizer: the keras optimizer.
    loss: a callable/list of losses.
    train_step: optional train step function defined by the task.
    validation_step: optional validation_step step function defined by the
      task.
    **kwargs: other kwargs consumed by keras.Model compile().

  Returns:
    a compiled keras.Model.

  Raises:
    ValueError: if `loss` and `train_step` are both given or both omitted.
  """
  has_loss = loss is not None
  has_train_step = train_step is not None
  # Exactly one of the two must be supplied.
  if has_loss == has_train_step:
    raise ValueError("`loss` and `train_step` should be exclusive to "
                     "each other.")

  model.compile(optimizer=optimizer, loss=loss, **kwargs)

  # Task-defined steps replace the model's own, bound to this model.
  if train_step:
    bound_train_step = functools.partial(
        train_step, model=model, optimizer=model.optimizer)
    model.train_step = bound_train_step
  if validation_step:
    bound_test_step = functools.partial(validation_step, model=model)
    model.test_step = bound_test_step
  return model
@abc.abstractmethod
def build_inputs(self,
params: cfg.DataConfig,
params,
input_context: Optional[tf.distribute.InputContext] = None):
"""Returns a dataset or a nested structure of dataset functions.
......@@ -122,7 +130,8 @@ class Task(tf.Module):
With distributed training, this method runs on remote hosts.
Args:
params: hyperparams to create input pipelines.
params: hyperparams to create input pipelines, which can be any of
dataclass, ConfigDict, namedtuple, etc.
input_context: optional distribution input pipeline context.
Returns:
......@@ -155,26 +164,30 @@ class Task(tf.Module):
return []
def process_metrics(self, metrics, labels, model_outputs):
"""Process and update metrics. Called when using custom training loop API.
"""Process and update metrics.
Called when using custom training loop API.
Args:
metrics: a nested structure of metrics objects.
The return of function self.build_metrics.
metrics: a nested structure of metrics objects. The return of function
self.build_metrics.
labels: a tensor or a nested structure of tensors.
model_outputs: a tensor or a nested structure of tensors.
For example, output of the keras model built by self.build_model.
model_outputs: a tensor or a nested structure of tensors. For example,
output of the keras model built by self.build_model.
"""
for metric in metrics:
metric.update_state(labels, model_outputs)
def process_compiled_metrics(self, compiled_metrics, labels, model_outputs):
"""Process and update compiled_metrics. call when using compile/fit API.
"""Process and update compiled_metrics.
call when using compile/fit API.
Args:
compiled_metrics: the compiled metrics (model.compiled_metrics).
labels: a tensor or a nested structure of tensors.
model_outputs: a tensor or a nested structure of tensors.
For example, output of the keras model built by self.build_model.
model_outputs: a tensor or a nested structure of tensors. For example,
output of the keras model built by self.build_model.
"""
compiled_metrics.update_state(labels, model_outputs)
......@@ -203,8 +216,14 @@ class Task(tf.Module):
with tf.GradientTape() as tape:
outputs = model(features, training=True)
# Computes per-replica loss.
loss = self.build_losses(
labels=labels, model_outputs=outputs, aux_losses=model.losses)
if model.compiled_loss:
loss = model.compiled_loss(
labels, outputs, regularization_losses=model.losses)
loss += self.build_losses(
labels=labels, model_outputs=outputs, aux_losses=None)
else:
loss = self.build_losses(
labels=labels, model_outputs=outputs, aux_losses=model.losses)
# Scales loss as the default gradients allreduce performs sum inside the
# optimizer.
scaled_loss = loss / tf.distribute.get_strategy().num_replicas_in_sync
......@@ -212,22 +231,22 @@ class Task(tf.Module):
# For mixed precision, when a LossScaleOptimizer is used, the loss is
# scaled to avoid numeric underflow.
if isinstance(optimizer,
tf.keras.mixed_precision.experimental.LossScaleOptimizer):
tf.keras.mixed_precision.LossScaleOptimizer):
scaled_loss = optimizer.get_scaled_loss(scaled_loss)
tvars = model.trainable_variables
grads = tape.gradient(scaled_loss, tvars)
if isinstance(optimizer,
tf.keras.mixed_precision.experimental.LossScaleOptimizer):
tf.keras.mixed_precision.LossScaleOptimizer):
grads = optimizer.get_unscaled_gradients(grads)
optimizer.apply_gradients(list(zip(grads, tvars)))
logs = {self.loss: loss}
if metrics:
self.process_metrics(metrics, labels, outputs)
logs.update({m.name: m.result() for m in metrics})
elif model.compiled_metrics:
if model.compiled_metrics:
self.process_compiled_metrics(model.compiled_metrics, labels, outputs)
logs.update({m.name: m.result() for m in metrics or []})
logs.update({m.name: m.result() for m in model.metrics})
return logs
......@@ -254,9 +273,9 @@ class Task(tf.Module):
logs = {self.loss: loss}
if metrics:
self.process_metrics(metrics, labels, outputs)
logs.update({m.name: m.result() for m in metrics})
elif model.compiled_metrics:
if model.compiled_metrics:
self.process_compiled_metrics(model.compiled_metrics, labels, outputs)
logs.update({m.name: m.result() for m in metrics or []})
logs.update({m.name: m.result() for m in model.metrics})
return logs
......@@ -278,53 +297,8 @@ class Task(tf.Module):
"""Optional aggregation over logs returned from a validation step."""
pass
def reduce_aggregated_logs(self, aggregated_logs):
def reduce_aggregated_logs(self,
aggregated_logs,
global_step: Optional[tf.Tensor] = None):
"""Optional reduce of aggregated logs over validation steps."""
return {}
_REGISTERED_TASK_CLS = {}
# TODO(b/158268740): Move these outside the base class file.
# TODO(b/158741360): Add type annotations once pytype checks across modules.
def register_task_cls(task_config_cls):
  """Decorates a factory of Tasks for lookup by a subclass of TaskConfig.

  This decorator supports registration of tasks as follows:

  ```
  @dataclasses.dataclass
  class MyTaskConfig(TaskConfig):
    # Add fields here.
    pass

  @register_task_cls(MyTaskConfig)
  class MyTask(Task):
    # Inherits def __init__(self, task_config).
    pass

  my_task_config = MyTaskConfig()
  my_task = get_task(my_task_config)  # Returns MyTask(my_task_config).
  ```

  Besides a class itself, other callables that create a Task from a TaskConfig
  can be decorated by the result of this function, as long as there is at most
  one registration for each config class.

  Args:
    task_config_cls: a subclass of TaskConfig (*not* an instance of TaskConfig).
      Each task_config_cls can only be used for a single registration.

  Returns:
    A callable for use as class decorator that registers the decorated class
    for creation from an instance of task_config_cls.
  """
  # Registrations are stored in the module-level _REGISTERED_TASK_CLS dict.
  return registry.register(_REGISTERED_TASK_CLS, task_config_cls)
# The user-visible get_task() is defined after classes have been registered.
# TODO(b/158741360): Add type annotations once pytype checks across modules.
def get_task_cls(task_config_cls):
  """Looks up `task_config_cls` in the task registry.

  Args:
    task_config_cls: the config class used as the registry key.

  Returns:
    Whatever `registry.lookup` returns for `task_config_cls` in
    `_REGISTERED_TASK_CLS`.
  """
  return registry.lookup(_REGISTERED_TASK_CLS, task_config_cls)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Standard Trainer implementation.
The base trainer implements the Orbit `StandardTrainable` and
`StandardEvaluable` interfaces. Trainers inside this project should be
interchangeable and independent of model architectures and tasks.
"""
import functools
from absl import logging
import gin
import orbit
import tensorflow as tf
from official.core import base_task
from official.core import config_definitions
from official.modeling import optimization
# Short aliases for the config classes used in type annotations in this module.
ExperimentConfig = config_definitions.ExperimentConfig
TrainerConfig = config_definitions.TrainerConfig
class Recovery:
  """Built-in model blowup recovery module.

  Checks the loss value against the given threshold. If applicable, recovers
  the model by restoring the checkpoint on disk.
  """

  def __init__(self,
               loss_upper_bound: float,
               checkpoint_manager: tf.train.CheckpointManager,
               recovery_begin_steps: int = 0,
               recovery_max_trials: int = 3):
    """Initializes the recovery module.

    Args:
      loss_upper_bound: a loss strictly greater than this value triggers
        recovery once `recovery_begin_steps` has been reached.
      checkpoint_manager: the manager whose `restore_or_initialize()` is
        called to recover the model.
      recovery_begin_steps: the global step from which the upper-bound check
        applies. The NaN check applies at every step.
      recovery_max_trials: the number of recoveries allowed; the next one
        raises instead.
    """
    self.recover_counter = 0
    self.recovery_begin_steps = recovery_begin_steps
    self.recovery_max_trials = recovery_max_trials
    self.loss_upper_bound = loss_upper_bound
    self.checkpoint_manager = checkpoint_manager

  def should_recover(self, loss_value, global_step):
    """Returns True if the loss is NaN or exceeds the upper bound.

    Args:
      loss_value: the loss value to check.
      global_step: the number of global training steps.

    Returns:
      True when `loss_value` is NaN, or when `global_step` has reached
      `recovery_begin_steps` and `loss_value` is above `loss_upper_bound`;
      False otherwise.
    """
    if tf.math.is_nan(loss_value):
      return True
    if (global_step >= self.recovery_begin_steps and
        loss_value > self.loss_upper_bound):
      return True
    return False

  def maybe_recover(self, loss_value, global_step):
    """Conditionally recovers the training by triggering checkpoint restoration.

    Args:
      loss_value: the loss value as a float.
      global_step: the number of global training steps.

    Raises:
      RuntimeError: when recovery happens more than the max number of trials,
      the job should crash.
    """
    if not self.should_recover(loss_value, global_step):
      return

    self.recover_counter += 1
    if self.recover_counter > self.recovery_max_trials:
      # Recovery is triggered by NaN *or* by exceeding the upper bound, so the
      # message names both rather than claiming the loss is NaN.
      raise RuntimeError(
          "The loss value is NaN or out of range after training loop and "
          "this happens %d times." % self.recover_counter)

    # Loads the previous good checkpoint.
    checkpoint_path = self.checkpoint_manager.restore_or_initialize()
    logging.warning(
        "Recovering the model from checkpoint: %s. The loss value becomes "
        "%f at step %d.", checkpoint_path, loss_value, global_step)
class _AsyncTrainer(orbit.StandardTrainer, orbit.StandardEvaluator):
  """Trainer class for both sync and async Strategy."""

  def init_async(self):
    """Initializes the Async Trainer base class.

    Sets `_is_async` according to whether `self._strategy` is a
    ParameterServerStrategy, and creates a ClusterCoordinator in that case.
    """
    assert isinstance(self._strategy, tf.distribute.Strategy)
    ps_strategy_cls = tf.distribute.experimental.ParameterServerStrategy
    self._is_async = isinstance(self._strategy, ps_strategy_cls)
    self._coordinator = None
    if not self._is_async:
      return
    coordinator_cls = tf.distribute.experimental.coordinator.ClusterCoordinator
    self._coordinator = coordinator_cls(self._strategy)

  def join(self):
    """Join all async steps. Only useful in async training."""
    if not getattr(self, "_is_async", False):
      return
    self._coordinator.join()

  def create_train_loop_fn(self):
    """Creates a training loop from the given step function and options."""
    base_loop_fn = super().create_train_loop_fn()
    if not getattr(self, "_is_async", False):
      return base_loop_fn

    # In async mode the loop is scheduled on the coordinator rather than run.
    def _async_loop_fn(iterator, num_steps):
      self._coordinator.schedule(base_loop_fn, args=(iterator, num_steps))

    return _async_loop_fn

  def create_eval_loop_fn(self, has_state: bool):
    """Creates an eval loop from the given step function and options."""
    base_loop_fn = super().create_eval_loop_fn(has_state)
    if not getattr(self, "_is_async", False):
      return base_loop_fn

    if has_state:
      raise ValueError(
          "Stateful eval loop is not supported in async training.")

    # Same signature as the sync loop, but state and reduce_fn must be unset.
    def _async_loop_fn(iterator, num_steps, state=None, reduce_fn=None):
      assert state is None
      assert reduce_fn is None
      self._coordinator.schedule(base_loop_fn, args=(iterator, num_steps))

    return _async_loop_fn

  def distribute_dataset(self, dataset_or_fn, *args, **kwargs):
    """A utility function to help create a `tf.distribute.DistributedDataset`.

    Args:
      dataset_or_fn: A instance of `tf.data.Dataset`, or a "dataset function"
        returning a `tf.data.Dataset`. If it is a function, it may optionally
        have an argument named `input_context` which will be passed a
        `tf.distribute.InputContext` instance.
      *args: Any positional arguments to pass through to `dataset_or_fn`.
      **kwargs: Any keyword arguments to pass through to `dataset_or_fn`.

    Returns:
      A distributed Dataset.
    """
    if not getattr(self, "_is_async", False):
      return orbit.utils.make_distributed_dataset(self._strategy, dataset_or_fn,
                                                  *args, **kwargs)

    # Async: wrap dataset creation in a tf.function and hand it to the
    # coordinator, which creates the per-worker dataset.
    make_dataset = functools.partial(
        orbit.utils.make_distributed_dataset, self._strategy, dataset_or_fn,
        *args, **kwargs)
    return self._coordinator.create_per_worker_dataset(
        tf.function(make_dataset))
def get_runtime_options(config: ExperimentConfig):
  """Builds `tf.distribute.RunOptions` from the runtime section of `config`.

  Args:
    config: the experiment config; only
      `config.runtime.tpu_enable_xla_dynamic_padder` is read.

  Returns:
    A `tf.distribute.RunOptions` whose XLA options carry
    `enable_xla_dynamic_padder` only when the config value is not None.
  """
  dynamic_padder = config.runtime.tpu_enable_xla_dynamic_padder
  xla_kwargs = ({} if dynamic_padder is None else
                {"enable_xla_dynamic_padder": dynamic_padder})
  xla_options = tf.tpu.XLAOptions(**xla_kwargs)
  return tf.distribute.RunOptions(experimental_xla_options=xla_options)
@gin.configurable
class Trainer(_AsyncTrainer):
"""Implements the common trainer shared for TensorFlow models."""
# pylint: disable=super-init-not-called
def __init__(self,
             config: ExperimentConfig,
             task: base_task.Task,
             model: tf.keras.Model,
             optimizer: tf.optimizers.Optimizer,
             train: bool = True,
             evaluate: bool = True,
             checkpoint_exporter=None):
  """Initialize common trainer for TensorFlow models.

  Besides storing its arguments, this creates the global step, the training
  checkpoint, the loss and metric objects, and the distributed datasets for
  whichever of training and evaluation is enabled.

  Args:
    config: An `ExperimentConfig` instance specifying experiment config.
    task: A base_task.Task instance.
    model: The model instance, e.g. a tf.keras.Model instance.
    optimizer: tf.optimizers.Optimizer instance.
    train: bool, whether or not this trainer will be used for training.
      default to True.
    evaluate: bool, whether or not this trainer will be used for evaluation.
      default to True.
    checkpoint_exporter: an object that has the `maybe_export_checkpoint`
      interface.
  """
  # Gets the current distribution strategy. If not inside any strategy scope,
  # it gets a single-replica no-op strategy.
  self._strategy = tf.distribute.get_strategy()
  self._validate_params(config)
  self._config = config
  self._task = task
  self._model = model
  self._optimizer = optimizer
  self._checkpoint_exporter = checkpoint_exporter
  # Stays None until `add_recovery` installs a `Recovery` instance.
  self._recovery = None
  # Runtime options are only applied to train_step.
  # We use default for eval_step.
  self._runtime_options = get_runtime_options(config)

  # Creates a shadow copy of the weights to store weights moving average.
  if isinstance(self._optimizer, optimization.ExponentialMovingAverage):
    self._optimizer.shadow_copy(self._model)

  # global_step increases by 1 after each training iteration.
  # We should have global_step.numpy() == self.optimizer.iterations.numpy()
  # when there is only 1 optimizer.
  self._global_step = orbit.utils.create_global_step()
  # Models may expose extra objects to checkpoint via `checkpoint_items`.
  if hasattr(self.model, "checkpoint_items"):
    checkpoint_items = self.model.checkpoint_items
  else:
    checkpoint_items = {}
  self._checkpoint = tf.train.Checkpoint(
      global_step=self.global_step,
      model=self.model,
      optimizer=self.optimizer,
      **checkpoint_items)

  self._train_loss = tf.keras.metrics.Mean("training_loss", dtype=tf.float32)
  self._validation_loss = tf.keras.metrics.Mean(
      "validation_loss", dtype=tf.float32)
  # Task-built metrics are combined with the model's own metrics.
  self._train_metrics = self.task.build_metrics(
      training=True) + self.model.metrics
  self._validation_metrics = self.task.build_metrics(
      training=False) + self.model.metrics

  # Sets `_is_async` and `_coordinator`, which `distribute_dataset` below
  # consults when building the datasets.
  self.init_async()

  # The orbit base-class initializers are called explicitly, and only for the
  # roles requested, instead of through super().__init__().
  if train:
    train_dataset = self.distribute_dataset(
        self.task.build_inputs, self.config.task.train_data)
    orbit.StandardTrainer.__init__(
        self,
        train_dataset,
        options=orbit.StandardTrainerOptions(
            use_tf_while_loop=config.trainer.train_tf_while_loop,
            use_tf_function=config.trainer.train_tf_function,
            use_tpu_summary_optimization=config.trainer.allow_tpu_summary))

  if evaluate:
    eval_dataset = self.distribute_dataset(
        self.task.build_inputs, self.config.task.validation_data)
    orbit.StandardEvaluator.__init__(
        self,
        eval_dataset,
        options=orbit.StandardEvaluatorOptions(
            use_tf_function=config.trainer.eval_tf_function,
            use_tf_while_loop=config.trainer.eval_tf_while_loop))
def _validate_params(self, config):
  r"""Checks that the configuration has the attributes the Trainer reads.

  The experiment configuration should be structured as:
  \trainer
  \task
    \train_data
    \validation_data

  Args:
    config: a namedtuple, dataclass, ConfigDict, etc.

  Raises:
    AttributeError: if `trainer`, `task`, `task.train_data` or
      `task.validation_data` is missing, checked in that order.
  """
  message = ("The trainer requires the configuration contains an"
             " attribute `{}`.")

  # Top-level sections first, so `config.task` is known to exist below.
  for section in ("trainer", "task"):
    if not hasattr(config, section):
      raise AttributeError(message.format(section))

  for field in ("train_data", "validation_data"):
    if not hasattr(config.task, field):
      raise AttributeError(message.format("task." + field))
@property
def strategy(self):
  """The distribution strategy whose `run` executes the step functions."""
  return self._strategy
@property
def config(self):
  """The experiment configuration stored on this trainer."""
  return self._config
@property
def task(self):
  """The task object that supplies the train and validation steps."""
  return self._task
@property
def model(self):
  """The model passed to the task's train and validation steps."""
  return self._model
@property
def optimizer(self):
  """The `_optimizer` attribute, or `None` when it was never set."""
  return getattr(self, "_optimizer", None)
@property
def global_step(self):
  """The step counter that `train_step` increments by one per step."""
  return self._global_step
@property
def train_loss(self):
  """The metric object that accumulates the training loss."""
  return self._train_loss
@property
def validation_loss(self):
  """The metric object that accumulates the validation loss."""
  return self._validation_loss
@property
def train_metrics(self):
  """The list of metric objects updated during training."""
  return self._train_metrics
@property
def validation_metrics(self):
  """The list of metric objects updated during validation."""
  return self._validation_metrics
def initialize(self):
  """Callback that lets the task initialize the model.

  This function will be called when no checkpoint is found for the model.
  If there is a checkpoint, the checkpoint will be loaded and this function
  will not be called. Tasks may use this callback function to load a
  pretrained checkpoint, saved under a directory other than the model_dir.
  """
  task_initializer = self.task.initialize
  task_initializer(self.model)
@property
def checkpoint(self):
  """The `tf.train.Checkpoint` object held by this trainer."""
  return self._checkpoint
def add_recovery(self, params: TrainerConfig,
                 checkpoint_manager: tf.train.CheckpointManager):
  """Attaches a `Recovery` helper unless recovery is disabled.

  Args:
    params: trainer configuration; `loss_upper_bound`, `recovery_begin_steps`
      and `recovery_max_trials` are forwarded to `Recovery`.
    checkpoint_manager: checkpoint manager handed to `Recovery`.
  """
  recovery_enabled = params.recovery_max_trials >= 0
  if not recovery_enabled:
    # A negative trial count leaves `self._recovery` untouched.
    return
  self._recovery = Recovery(
      loss_upper_bound=params.loss_upper_bound,
      recovery_begin_steps=params.recovery_begin_steps,
      recovery_max_trials=params.recovery_max_trials,
      checkpoint_manager=checkpoint_manager)
def train_loop_end(self):
  """See base class."""
  self.join()
  # Checks if the model numeric status is stable and conducts the checkpoint
  # recovery accordingly.
  recovery = self._recovery
  if recovery:
    recovery.maybe_recover(self.train_loss.result().numpy(),
                           self.global_step.numpy())

  logs = {}
  for tracked in self.train_metrics + [self.train_loss]:
    # Read the value first: resetting clears the accumulated state.
    logs[tracked.name] = tracked.result()
    tracked.reset_states()

  # A callable learning rate is evaluated at the current step; any other
  # value is logged as is.
  learning_rate = self.optimizer.learning_rate
  logs["learning_rate"] = (
      learning_rate(self.global_step)
      if callable(learning_rate) else learning_rate)
  return logs
def train_step(self, iterator):
  """See base class."""

  def step_fn(inputs):
    runtime = self.config.runtime
    # The task's train step is XLA-compiled only when XLA is enabled and at
    # least one GPU is configured.
    task_train_step = (
        tf.function(self.task.train_step, jit_compile=True)
        if runtime.enable_xla and (runtime.num_gpus > 0) else
        self.task.train_step)
    step_logs = task_train_step(
        inputs,
        model=self.model,
        optimizer=self.optimizer,
        metrics=self.train_metrics)
    self._train_loss.update_state(step_logs[self.task.loss])
    self.global_step.assign_add(1)

  self.strategy.run(
      step_fn, args=(next(iterator),), options=self._runtime_options)
def eval_begin(self):
  """Resets the validation metrics and, under EMA, swaps in average weights."""
  for tracked in self.validation_metrics + [self.validation_loss]:
    tracked.reset_states()
  # Swaps weights to test on weights moving average.
  optimizer = self.optimizer
  if optimizer and isinstance(optimizer,
                              optimization.ExponentialMovingAverage):
    optimizer.swap_weights()
def eval_step(self, iterator):
  """See base class."""

  def step_fn(inputs):
    step_logs = self.task.validation_step(
        inputs, model=self.model, metrics=self.validation_metrics)
    loss_key = self.task.loss
    # Tasks may omit the loss from their validation logs; only record it
    # when it is present.
    if loss_key in step_logs:
      self._validation_loss.update_state(step_logs[loss_key])
    return step_logs

  per_replica_logs = self.strategy.run(step_fn, args=(next(iterator),))
  return tf.nest.map_structure(self.strategy.experimental_local_results,
                               per_replica_logs)
def eval_end(self, aggregated_logs=None):
  """Collects evaluation results into a single logs dict.

  Args:
    aggregated_logs: optional logs aggregated over eval steps; when truthy
      they are reduced by the task and merged into the result.

  Returns:
    A dict with the validation metric results, the validation loss when the
    task reported one, the task-reduced logs, and a `best_<metric>` entry
    when a checkpoint exporter is configured.
  """
  self.join()
  logs = {tracked.name: tracked.result()
          for tracked in self.validation_metrics}

  loss_metric = self.validation_loss
  if loss_metric.count.numpy() != 0:
    logs[loss_metric.name] = loss_metric.result()
  else:
    # `self.validation_loss` metric was not updated, because the validation
    # loss was not returned from the task's `validation_step` method.
    logging.info("The task did not report validation loss.")

  if aggregated_logs:
    reduced = self.task.reduce_aggregated_logs(
        aggregated_logs, global_step=self.global_step)
    logs.update(reduced)

  exporter = self._checkpoint_exporter
  if exporter:
    exporter.maybe_export_checkpoint(self.checkpoint, logs,
                                     self.global_step.numpy())
    metric_name = self.config.trainer.best_checkpoint_eval_metric
    logs["best_" + metric_name] = exporter.best_ckpt_logs[metric_name]

  # Swaps back weights after testing when EMA is used.
  # This happens after best checkpoint export so that average weights used for
  # eval are exported instead of regular weights.
  optimizer = self.optimizer
  if optimizer and isinstance(optimizer,
                              optimization.ExponentialMovingAverage):
    optimizer.swap_weights()
  return logs
def eval_reduce(self, state=None, step_outputs=None):
  """Forwards `state` and `step_outputs` to the task's `aggregate_logs`."""
  aggregate = self.task.aggregate_logs
  return aggregate(state, step_outputs)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for tensorflow_models.core.trainers.trainer."""
# pylint: disable=g-direct-tensorflow-import
import multiprocessing
import os
import sys
from absl.testing import parameterized
import numpy as np
import portpicker
import tensorflow as tf
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from official.core import base_trainer as trainer_lib
from official.core import config_definitions as cfg
from official.core import train_lib
from official.utils.testing import mock_task
# True when the test binary's name (sys.argv[0]) contains 'test_tpu' /
# 'test_gpu'. The parameter-server tests below skip themselves when either
# flag is set.
TPU_TEST = 'test_tpu' in sys.argv[0]
GPU_TEST = 'test_gpu' in sys.argv[0]
def all_strategy_combinations():
  """Returns test combinations over default, cloud TPU and one-GPU strategies."""
  strategies = [
      strategy_combinations.default_strategy,
      strategy_combinations.cloud_tpu_strategy,
      strategy_combinations.one_device_strategy_gpu,
  ]
  return combinations.combine(distribution=strategies)
def create_in_process_cluster(num_workers, num_ps):
  """Creates and starts local servers and returns the cluster_resolver.

  Args:
    num_workers: number of in-process worker servers to create.
    num_ps: number of in-process parameter servers to create; the `ps` job is
      left out of the cluster spec when this is 0.

  Returns:
    A `SimpleClusterResolver` over the created cluster, using gRPC.
  """

  def _pick_addresses(count):
    # One unused local port per task, picked in task order.
    ports = [portpicker.pick_unused_port() for _ in range(count)]
    return ['localhost:%s' % port for port in ports]

  worker_addresses = _pick_addresses(num_workers)
  ps_addresses = _pick_addresses(num_ps)

  cluster_dict = {'worker': worker_addresses}
  if num_ps > 0:
    cluster_dict['ps'] = ps_addresses
  cluster_spec = tf.train.ClusterSpec(cluster_dict)

  # Workers need some inter_ops threads to work properly.
  worker_config = tf.compat.v1.ConfigProto()
  min_inter_op_threads = num_workers + 1
  if multiprocessing.cpu_count() < min_inter_op_threads:
    worker_config.inter_op_parallelism_threads = min_inter_op_threads

  for worker_index in range(num_workers):
    tf.distribute.Server(
        cluster_spec,
        job_name='worker',
        task_index=worker_index,
        config=worker_config,
        protocol='grpc')

  for ps_index in range(num_ps):
    tf.distribute.Server(
        cluster_spec, job_name='ps', task_index=ps_index, protocol='grpc')

  return tf.distribute.cluster_resolver.SimpleClusterResolver(
      cluster_spec, rpc_layer='grpc')
def dataset_fn(input_context=None):
  """Builds an endlessly repeating dataset of (1, 1) float32 zero tensors.

  Args:
    input_context: accepted for signature compatibility and discarded.

  Returns:
    A `tf.data.Dataset` whose every element is `tf.zeros((1, 1))`.
  """
  del input_context

  def _zeros(_):
    return tf.zeros((1, 1), dtype=tf.float32)

  return tf.data.Dataset.range(1).repeat().map(
      _zeros, num_parallel_calls=tf.data.experimental.AUTOTUNE)
class MockAsyncTrainer(trainer_lib._AsyncTrainer):
  """Mock AsyncTrainer to test the _AsyncTrainer class.

  Each train or eval step only increments a step counter, so the value
  returned by `train` / `evaluate` equals the number of steps that ran.
  """

  def __init__(self):
    self._strategy = tf.distribute.get_strategy()
    self.init_async()

    def _step_counter(name):
      # int64 counter, non-trainable, aggregated from the first replica only.
      return tf.Variable(
          0,
          dtype=tf.int64,
          name=name,
          trainable=False,
          aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA)

    self.global_step = _step_counter('global_step')
    self.eval_global_step = _step_counter('eval_global_step')

    orbit = trainer_lib.orbit
    train_dataset = self.distribute_dataset(dataset_fn)
    orbit.StandardTrainer.__init__(
        self, train_dataset, options=orbit.StandardTrainerOptions())

    eval_dataset = self.distribute_dataset(dataset_fn)
    orbit.StandardEvaluator.__init__(
        self,
        eval_dataset,
        options=orbit.StandardEvaluatorOptions(use_tf_while_loop=True))

  def train_loop_begin(self):
    """Resets the train step counter at the start of every train loop."""
    self.global_step.assign(0)

  def train_step(self, iterator):
    """Consumes one batch and bumps the train step counter."""

    def _count_step(_):
      self.global_step.assign_add(1)

    self._strategy.run(_count_step, args=(next(iterator),))

  def train_loop_end(self):
    """Waits for scheduled work, then returns the train step count."""
    self.join()
    return self.global_step.numpy()

  def eval_begin(self):
    """Resets the eval step counter at the start of every evaluation."""
    self.eval_global_step.assign(0)

  def eval_step(self, iterator):
    """Consumes one batch and bumps the eval step counter."""

    def _count_step(_):
      self.eval_global_step.assign_add(1)

    self._strategy.run(_count_step, args=(next(iterator),))

  def eval_end(self):
    """Waits for scheduled work, then returns the eval step count."""
    self.join()
    return self.eval_global_step.numpy()
class TrainerTest(tf.test.TestCase, parameterized.TestCase):
  """Tests `trainer_lib.Trainer` and `_AsyncTrainer` against a mock task."""

  def setUp(self):
    super().setUp()
    self._config = cfg.ExperimentConfig(
        trainer=cfg.TrainerConfig(
            optimizer_config=self._sgd_optimizer_config()))

  def _sgd_optimizer_config(self):
    """Returns a fresh SGD / constant-learning-rate optimization config."""
    return cfg.OptimizationConfig({
        'optimizer': {
            'type': 'sgd'
        },
        'learning_rate': {
            'type': 'constant'
        }
    })

  def _create_ps_strategy(self, num_workers=3, num_ps=2):
    """Returns a ParameterServerStrategy over a fresh in-process cluster.

    Skips the calling test when it runs as a TPU or GPU test, because
    parameter-server (async) training is not exercised there.

    Args:
      num_workers: number of in-process workers.
      num_ps: number of in-process parameter servers.
    """
    if TPU_TEST or GPU_TEST:
      self.skipTest('Async training is not available on GPU/TPU.')
    cluster_resolver = create_in_process_cluster(num_workers, num_ps)
    return tf.distribute.experimental.ParameterServerStrategy(
        cluster_resolver)

  def create_test_trainer(self, config, model_dir=None, task=None):
    """Builds a `Trainer` for `config`, defaulting to a `MockTask`."""
    task = task or mock_task.MockTask(config.task, logging_dir=model_dir)
    ckpt_exporter = train_lib.maybe_create_best_ckpt_exporter(config, model_dir)
    trainer = trainer_lib.Trainer(
        config,
        task,
        model=task.build_model(),
        optimizer=task.create_optimizer(config.trainer.optimizer_config,
                                        config.runtime),
        checkpoint_exporter=ckpt_exporter)
    return trainer

  @combinations.generate(all_strategy_combinations())
  def test_trainer_train(self, distribution):
    with distribution.scope():
      trainer = self.create_test_trainer(self._config)
      logs = trainer.train(tf.convert_to_tensor(5, dtype=tf.int32))
      self.assertIn('training_loss', logs)
      self.assertIn('learning_rate', logs)

  def test_base_async_trainer(self):
    distribution = self._create_ps_strategy()
    with distribution.scope():
      trainer = MockAsyncTrainer()
      trainer.init_async()
      self.assertIsInstance(
          trainer._coordinator,
          tf.distribute.experimental.coordinator.ClusterCoordinator)
      self.assertEqual(trainer.train(tf.constant(10)), 10)
      self.assertEqual(trainer.evaluate(tf.constant(11)), 11)

  def test_async_trainer_train(self):
    distribution = self._create_ps_strategy()
    with distribution.scope():
      config = cfg.ExperimentConfig(**self._config.as_dict())
      config.trainer.eval_tf_while_loop = True
      trainer = self.create_test_trainer(config)
      logs = trainer.train(tf.convert_to_tensor(5, dtype=tf.int32))
      self.assertIn('training_loss', logs)
      self.assertIn('learning_rate', logs)

  def test_async_trainer_validate(self):
    distribution = self._create_ps_strategy()
    with distribution.scope():
      config = cfg.ExperimentConfig(**self._config.as_dict())
      config.trainer.eval_tf_while_loop = True
      trainer = self.create_test_trainer(config)
      logs = trainer.evaluate(tf.convert_to_tensor(5, dtype=tf.int32))
      self.assertIn('acc', logs)
      self.assertIn('validation_loss', logs)

  @combinations.generate(all_strategy_combinations())
  def test_trainer_validate(self, distribution):
    with distribution.scope():
      trainer = self.create_test_trainer(self._config)
      logs = trainer.evaluate(tf.convert_to_tensor(5, dtype=tf.int32))
      self.assertEqual(logs['counter'], 5. * distribution.num_replicas_in_sync)
      self.assertIn('validation_loss', logs)

  @combinations.generate(all_strategy_combinations())
  def test_trainer_validate_without_loss(self, distribution):

    class MockTaskWithoutValidationLoss(mock_task.MockTask):

      def validation_step(self, inputs, model, metrics=None):
        # Disable validation loss.
        logs = super().validation_step(inputs, model)
        del logs[self.loss]
        return logs

    with distribution.scope():
      task = MockTaskWithoutValidationLoss()
      trainer = self.create_test_trainer(self._config, task=task)
      logs = trainer.evaluate(tf.convert_to_tensor(5, dtype=tf.int32))
      self.assertEqual(logs['counter'], 5. * distribution.num_replicas_in_sync)
      self.assertNotIn('validation_loss', logs)

  @combinations.generate(
      combinations.combine(
          mixed_precision_dtype=['float32', 'bfloat16', 'float16'],
          loss_scale=[None, 'dynamic', 128, 256],
      ))
  def test_configure_optimizer(self, mixed_precision_dtype, loss_scale):
    config = cfg.ExperimentConfig(
        runtime=cfg.RuntimeConfig(
            mixed_precision_dtype=mixed_precision_dtype, loss_scale=loss_scale),
        trainer=cfg.TrainerConfig(
            optimizer_config=self._sgd_optimizer_config()))
    trainer = self.create_test_trainer(config)
    # Only float16 combined with a loss scale is expected to wrap the
    # optimizer in a LossScaleOptimizer.
    if mixed_precision_dtype != 'float16':
      self.assertIsInstance(trainer.optimizer, tf.keras.optimizers.SGD)
    elif loss_scale is None:
      self.assertIsInstance(trainer.optimizer, tf.keras.optimizers.SGD)
    else:
      self.assertIsInstance(trainer.optimizer,
                            tf.keras.mixed_precision.LossScaleOptimizer)

    metrics = trainer.train(tf.convert_to_tensor(5, dtype=tf.int32))
    self.assertIn('training_loss', metrics)

  def test_export_best_ckpt(self):
    config = cfg.ExperimentConfig(
        trainer=cfg.TrainerConfig(
            best_checkpoint_export_subdir='best_ckpt',
            best_checkpoint_eval_metric='acc',
            optimizer_config=self._sgd_optimizer_config()))
    model_dir = self.get_temp_dir()
    trainer = self.create_test_trainer(config, model_dir=model_dir)
    trainer.train(tf.convert_to_tensor(1, dtype=tf.int32))
    trainer.evaluate(tf.convert_to_tensor(1, dtype=tf.int32))
    self.assertTrue(
        tf.io.gfile.exists(os.path.join(model_dir, 'best_ckpt', 'info.json')))

  def test_recovery(self):
    config = cfg.ExperimentConfig(
        trainer=cfg.TrainerConfig(
            loss_upper_bound=0.5,
            recovery_max_trials=2,
            optimizer_config=self._sgd_optimizer_config()))
    model_dir = self.get_temp_dir()
    trainer = self.create_test_trainer(config, model_dir=model_dir)
    checkpoint_manager = tf.train.CheckpointManager(
        trainer.checkpoint, self.get_temp_dir(), max_to_keep=2)
    checkpoint_manager.save()
    trainer.add_recovery(config.trainer, checkpoint_manager=checkpoint_manager)
    before_weights = trainer.model.get_weights()
    _ = trainer.train(tf.convert_to_tensor(1, dtype=tf.int32))
    # The training loss is 1.0 and upper_bound is 0.5, so recovery happens.
    after_weights = trainer.model.get_weights()
    for left, right in zip(before_weights, after_weights):
      self.assertAllEqual(left, right)

    # Let the loss be NaN and max_trials = 0 to see RuntimeError.
    config = cfg.ExperimentConfig(
        trainer=cfg.TrainerConfig(
            recovery_max_trials=0,
            optimizer_config=self._sgd_optimizer_config()))
    task = mock_task.MockTask(config.task, logging_dir=model_dir)

    def build_losses(labels, model_outputs, aux_losses=None):
      del labels, model_outputs
      return tf.constant([np.nan], tf.float32) + aux_losses

    task.build_losses = build_losses
    trainer = trainer_lib.Trainer(
        config,
        task,
        model=task.build_model(),
        optimizer=task.create_optimizer(config.trainer.optimizer_config,
                                        config.runtime))
    trainer.add_recovery(config.trainer, checkpoint_manager=checkpoint_manager)
    with self.assertRaises(RuntimeError):
      _ = trainer.train(tf.convert_to_tensor(2, dtype=tf.int32))

  def test_model_with_compiled_loss(self):
    task = mock_task.MockTask()
    model = task.build_model()
    model.compile(loss=tf.keras.losses.CategoricalCrossentropy())
    trainer = trainer_lib.Trainer(
        self._config,
        task,
        model=model,
        optimizer=task.create_optimizer(self._config.trainer.optimizer_config))
    logs = trainer.train(tf.convert_to_tensor(5, dtype=tf.int32))
    self.assertIn('training_loss', logs)
# Run the TensorFlow test runner when this module is executed as a script.
if __name__ == '__main__':
  tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Common configuration settings."""
from typing import Optional, Sequence, Union
import dataclasses
from official.modeling.hyperparams import base_config
from official.modeling.optimization.configs import optimization_config
# Re-exported under this module's namespace; `TrainerConfig.optimizer_config`
# below is declared with this type.
OptimizationConfig = optimization_config.OptimizationConfig
@dataclasses.dataclass
class DataConfig(base_config.Config):
  """The base configuration for building datasets.

  Attributes:
    input_path: The path to the input. It can be either (1) a str indicating
      a file path/pattern, or (2) a str indicating multiple file paths/patterns
      separated by comma (e.g "a, b, c" or no spaces "a,b,c"), or
      (3) a list of str, each of which is a file path/pattern or multiple file
      paths/patterns separated by comma.
      It should not be specified when the following `tfds_name` is specified.
    tfds_name: The name of the tensorflow dataset (TFDS). It should not be
      specified when the above `input_path` is specified.
    tfds_split: A str indicating which split of the data to load from TFDS. It
      is required when above `tfds_name` is specified.
    global_batch_size: The global batch size across all replicas.
    is_training: Whether this data is used for training or not. Defaults to
      `None`, i.e. unset.
    drop_remainder: Whether the last batch should be dropped in the case it has
      fewer than `global_batch_size` elements.
    shuffle_buffer_size: The buffer size used for shuffling training data.
    cache: Whether to cache dataset examples. If `True`, we will cache the
      dataset after applying the decode_fn and parse_fn. It can be used to avoid
      re-reading from disk, re-decoding and re-parsing the example on the
      second epoch, but it requires significant memory overhead.
    cycle_length: The number of files that will be processed concurrently when
      interleaving files.
    block_length: The number of consecutive elements to produce from each input
      element before cycling to another input element when interleaving files.
    deterministic: A boolean controlling whether determinism should be enforced.
    sharding: Whether sharding is used in the input pipeline.
    enable_tf_data_service: A boolean indicating whether to enable tf.data
      service for the input pipeline.
    tf_data_service_address: The URI of a tf.data service to offload
      preprocessing onto during training. The URI should be in the format
      "protocol://address", e.g. "grpc://tf-data-service:5050". It can be
      overridden by `FLAGS.tf_data_service` flag in the binary.
    tf_data_service_job_name: The name of the tf.data service job. This
      argument makes it possible for multiple datasets to share the same job.
      The default behavior is that the dataset creates anonymous, exclusively
      owned jobs.
    tfds_data_dir: A str specifying the directory to read/write TFDS data.
    tfds_as_supervised: A bool. When loading dataset from TFDS, if True, the
      returned tf.data.Dataset will have a 2-tuple structure (input, label)
      according to builder.info.supervised_keys; if False, the default, the
      returned tf.data.Dataset will have a dictionary with all the features.
    tfds_skip_decoding_feature: A str to indicate which features are skipped for
      decoding when loading dataset from TFDS. Use comma to separate multiple
      features. The main use case is to skip the image/video decoding for better
      performance.
    seed: An optional seed to use for deterministic shuffling/preprocessing.
  """
  input_path: Union[Sequence[str], str] = ""
  tfds_name: str = ""
  tfds_split: str = ""
  global_batch_size: int = 0
  # The default is `None` rather than a bool, hence the Optional annotation.
  is_training: Optional[bool] = None
  drop_remainder: bool = True
  shuffle_buffer_size: int = 100
  cache: bool = False
  cycle_length: Optional[int] = None
  block_length: int = 1
  deterministic: Optional[bool] = None
  sharding: bool = True
  enable_tf_data_service: bool = False
  tf_data_service_address: Optional[str] = None
  tf_data_service_job_name: Optional[str] = None
  tfds_data_dir: str = ""
  tfds_as_supervised: bool = False
  tfds_skip_decoding_feature: str = ""
  seed: Optional[int] = None
@dataclasses.dataclass
class RuntimeConfig(base_config.Config):
  """High-level configurations for Runtime.

  These include parameters that are not directly related to the experiment,
  e.g. directories, accelerator type, etc.

  Attributes:
    distribution_strategy: e.g. 'mirrored', 'tpu', etc.
    enable_xla: Whether or not to enable XLA.
    per_gpu_thread_count: thread count per GPU.
    gpu_thread_mode: Whether and how the GPU device uses its own threadpool.
    dataset_num_private_threads: Number of threads for a private threadpool
      created for all datasets computation.
    tpu: The address of the TPU to use, if any.
    num_gpus: The number of GPUs to use, if any.
    worker_hosts: comma-separated list of worker ip:port pairs for running
      multi-worker models with DistributionStrategy.
    task_index: If multi-worker training, the task index of this worker.
    all_reduce_alg: Defines the algorithm for performing all-reduce.
    num_packs: Sets `num_packs` in the cross device ops used in
      MirroredStrategy. For details, see tf.distribute.NcclAllReduce.
    mixed_precision_dtype: dtype of mixed precision policy. It can be 'float32',
      'float16', or 'bfloat16'.
    loss_scale: The type of loss scale, or 'float' value. This is used when
      setting the mixed precision policy.
    run_eagerly: Whether or not to run the experiment eagerly.
    batchnorm_spatial_persistent: Whether or not to enable the spatial
      persistent mode for CuDNN batch norm kernel for improved GPU performance.
    tpu_enable_xla_dynamic_padder: Whether to enable the XLA dynamic padder
      infrastructure for dynamic-shape inputs. If `None`, the XLA default is
      respected.
    num_cores_per_replica: Global model parallelism setting, returned by
      `model_parallelism()`.
    default_shard_dim: Global model parallelism setting, returned by
      `model_parallelism()`.
  """
  distribution_strategy: str = "mirrored"
  enable_xla: bool = False
  gpu_thread_mode: Optional[str] = None
  dataset_num_private_threads: Optional[int] = None
  per_gpu_thread_count: int = 0
  tpu: Optional[str] = None
  num_gpus: int = 0
  worker_hosts: Optional[str] = None
  task_index: int = -1
  all_reduce_alg: Optional[str] = None
  num_packs: int = 1
  mixed_precision_dtype: Optional[str] = None
  loss_scale: Optional[Union[str, float]] = None
  run_eagerly: bool = False
  batchnorm_spatial_persistent: bool = False

  # XLA runtime params.
  # XLA params are only applied to the train_step.
  # These arguments can improve training speed. They can also improve eval, but
  # may reduce usability and users would need to make changes to code.

  # Whether to enable XLA dynamic padder
  # infrastructure to handle dynamic shapes inputs inside XLA. True by
  # default. Disabling this may cause correctness issues with dynamic shapes
  # inputs, as XLA will just assume the inputs are with padded shapes. However
  # users can optionally set it to False to improve device time if masking is
  # already handled in the user side.
  # If None, will respect XLA default.
  tpu_enable_xla_dynamic_padder: Optional[bool] = None

  # Global model parallelism configurations.
  num_cores_per_replica: int = 1
  default_shard_dim: int = -1

  def model_parallelism(self):
    """Returns the model parallelism settings as a keyword-argument dict.

    Returns:
      A dict with keys `num_cores_per_replica` and `default_shard_dim`.
    """
    return dict(
        num_cores_per_replica=self.num_cores_per_replica,
        default_shard_dim=self.default_shard_dim)
@dataclasses.dataclass
class TrainerConfig(base_config.Config):
  """Configuration for trainer.

  Attributes:
    optimizer_config: optimizer config, it includes optimizer, learning rate,
      and warmup schedule configs.
    train_tf_while_loop: whether or not to use tf while loop.
    train_tf_function: whether or not to use tf_function for training loop.
    eval_tf_function: whether or not to use tf_function for eval.
    eval_tf_while_loop: whether or not to use tf while loop for eval.
    allow_tpu_summary: Whether to allow summary happen inside the XLA program
      runs on TPU through automatic outside compilation.
    steps_per_loop: number of steps per loop.
    summary_interval: number of steps between each summary.
    checkpoint_interval: number of steps between checkpoints.
    max_to_keep: max checkpoints to keep.
    continuous_eval_timeout: maximum number of seconds to wait between
      checkpoints, if set to None, continuous eval will wait indefinitely. This
      is only used continuous_train_and_eval and continuous_eval modes. Default
      value is 1 hrs.
    train_steps: number of train steps.
    validation_steps: number of eval steps. If `-1` (the default), the entire
      eval dataset is used.
    validation_interval: number of training steps to run between evaluations.
    best_checkpoint_export_subdir: if set, the trainer will keep track of the
      best evaluation metric, and export the corresponding best checkpoint under
      `model_dir/best_checkpoint_export_subdir`. Note that this only works if
      mode contains eval (such as `train_and_eval`, `continuous_eval`, and
      `continuous_train_and_eval`).
    best_checkpoint_eval_metric: for exporting the best checkpoint, which
      evaluation metric the trainer should monitor. This can be any evaluation
      metric appears on tensorboard.
    best_checkpoint_metric_comp: for exporting the best checkpoint, how the
      trainer should compare the evaluation metrics. This can be either `higher`
      (higher the better) or `lower` (lower the better).
    loss_upper_bound: upper bound on the training loss used by the blow-up
      recovery check.
    recovery_begin_steps: the loss bound is enforced after this many steps.
    recovery_max_trials: when < 0, no recovery module is used; when 0, the
      condition is checked and the job fails if it happens; when > 0, the
      model states are restored.
    validation_summary_subdir: A 'str', sub directory for saving eval summary.
  """
  # A `default_factory` gives every TrainerConfig its own OptimizationConfig.
  # A dataclass instance used directly as the default is shared between
  # instances and, being unhashable, is rejected by `dataclasses` on
  # Python 3.11+.
  optimizer_config: OptimizationConfig = dataclasses.field(
      default_factory=OptimizationConfig)
  # Orbit settings.
  train_tf_while_loop: bool = True
  train_tf_function: bool = True
  eval_tf_function: bool = True
  eval_tf_while_loop: bool = False
  allow_tpu_summary: bool = False
  # Trainer intervals.
  steps_per_loop: int = 1000
  summary_interval: int = 1000
  checkpoint_interval: int = 1000
  # Checkpoint manager.
  max_to_keep: int = 5
  continuous_eval_timeout: int = 60 * 60
  # Train/Eval routines.
  train_steps: int = 0
  # Sets validation steps to be -1 to evaluate the entire dataset.
  validation_steps: int = -1
  validation_interval: int = 1000
  # Best checkpoint export.
  best_checkpoint_export_subdir: str = ""
  best_checkpoint_eval_metric: str = ""
  best_checkpoint_metric_comp: str = "higher"
  # Blowup recovery.
  loss_upper_bound: float = 1e6
  recovery_begin_steps: int = 0  # Enforcing the loss bound after these steps.
  # When max trials < 0, no recovery module; max trials = 0, we will check
  # the condition and fail the job if the condition happens; max trials > 0,
  # we will restore the model states.
  recovery_max_trials: int = 0
  validation_summary_subdir: str = "validation"
@dataclasses.dataclass
class TaskConfig(base_config.Config):
  """Configuration for a task.

  Attributes:
    init_checkpoint: a str, empty by default. Presumably the checkpoint path
      a task initializes from; confirm against the task implementation.
    model: the model configuration, `None` by default.
    train_data: input configuration for training data.
    validation_data: input configuration for validation data.
  """
  init_checkpoint: str = ""
  model: Optional[base_config.Config] = None
  # `default_factory` gives each TaskConfig its own DataConfig objects. A
  # dataclass instance used directly as the default is shared between
  # instances and rejected (unhashable default) on Python 3.11+.
  train_data: DataConfig = dataclasses.field(default_factory=DataConfig)
  validation_data: DataConfig = dataclasses.field(default_factory=DataConfig)
@dataclasses.dataclass
class ExperimentConfig(base_config.Config):
  """Top-level configuration.

  Attributes:
    task: the task configuration.
    trainer: the trainer configuration.
    runtime: the runtime configuration.
  """
  # `default_factory` builds fresh sub-configs per ExperimentConfig. A
  # dataclass instance used directly as the default is shared between
  # instances and rejected (unhashable default) on Python 3.11+.
  task: TaskConfig = dataclasses.field(default_factory=TaskConfig)
  trainer: TrainerConfig = dataclasses.field(default_factory=TrainerConfig)
  runtime: RuntimeConfig = dataclasses.field(default_factory=RuntimeConfig)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Experiment factory methods."""
from official.core import config_definitions as cfg
from official.core import registry
# Module-level registry passed to `registry.register` / `registry.lookup` by
# the functions below; entries are keyed by experiment name.
_REGISTERED_CONFIGS = {}
def register_config_factory(name):
  """Registers an ExperimentConfig factory method under `name`.

  Args:
    name: the experiment name to register under.

  Returns:
    Whatever `registry.register` returns for this registry and name
    (presumably a decorator; confirm against `official.core.registry`).
  """
  registration = registry.register(_REGISTERED_CONFIGS, name)
  return registration
def get_exp_config_creater(exp_name: str):
  """Looks up the ExperimentConfig factory registered under `exp_name`."""
  return registry.lookup(_REGISTERED_CONFIGS, exp_name)
def get_exp_config(exp_name: str) -> cfg.ExperimentConfig:
  """Builds the experiment config by calling the factory for `exp_name`."""
  factory = get_exp_config_creater(exp_name)
  return factory()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment