Unverified commit 55bf4b80 authored by Hongkun Yu, committed by GitHub

Merge branch 'master' into absl

parents 15e0057f 2416dd9c
@@ -2,12 +2,10 @@
 This repository contains a number of different models implemented in [TensorFlow](https://www.tensorflow.org):
-The [official models](official) are a collection of example models that use TensorFlow's high-level APIs. They are intended to be well-maintained, tested, and kept up to date with the latest stable TensorFlow API. They should also be reasonably optimized for fast performance while still being easy to read. We especially recommend newer TensorFlow users to start here.
+The [official models](official) are a collection of example models that use TensorFlow 2's high-level APIs. They are intended to be well-maintained, tested, and kept up to date with the latest stable TensorFlow API. They should also be reasonably optimized for fast performance while still being easy to read. We especially recommend newer TensorFlow users to start here.
 The [research models](https://github.com/tensorflow/models/tree/master/research) are a large collection of models implemented in TensorFlow by researchers. They are not officially supported or available in release branches; it is up to the individual researchers to maintain the models and/or provide support on issues and pull requests.
-The [tutorials folder](tutorials) is a collection of models described in the [TensorFlow tutorials](https://www.tensorflow.org/tutorials/).
 ## Contribution guidelines
 If you want to contribute to models, be sure to review the [contribution guidelines](CONTRIBUTING.md).
...
 # TensorFlow Official Models
-The TensorFlow official models are a collection of example models that use
+The TensorFlow official models are a collection of models that use
 TensorFlow's high-level APIs. They are intended to be well-maintained, tested,
 and kept up to date with the latest TensorFlow API. They should also be
 reasonably optimized for fast performance while still being easy to read.
@@ -83,7 +83,7 @@ installable Official Models package. This is being tracked in
 * [bert](nlp/bert): A powerful pre-trained language representation model:
   BERT, which stands for Bidirectional Encoder Representations from
   Transformers.
-* [transformer](transformer): A transformer model to translate the WMT English
+* [transformer](nlp/transformer): A transformer model to translate the WMT English
   to German dataset.
 * [xlnet](nlp/xlnet): XLNet: Generalized Autoregressive Pretraining for
   Language Understanding.
...
@@ -23,7 +23,7 @@ import tensorflow as tf  # pylint: disable=g-bad-import-order
 from official.benchmark import keras_benchmark
 from official.utils.testing import benchmark_wrappers
-from official.vision.image_classification import resnet_imagenet_main
+from official.vision.image_classification.resnet import resnet_imagenet_main
 MIN_TOP_1_ACCURACY = 0.76
 MAX_TOP_1_ACCURACY = 0.77
@@ -61,18 +61,6 @@ class Resnet50KerasAccuracy(keras_benchmark.KerasBenchmark):
     super(Resnet50KerasAccuracy, self).__init__(
         output_dir=output_dir, flag_methods=flag_methods)
-  def benchmark_graph_8_gpu(self):
-    """Test Keras model with Keras fit/dist_strat and 8 GPUs."""
-    self._setup()
-    FLAGS.num_gpus = 8
-    FLAGS.data_dir = self.data_dir
-    FLAGS.batch_size = 128 * 8
-    FLAGS.train_epochs = 90
-    FLAGS.epochs_between_evals = 10
-    FLAGS.model_dir = self._get_model_dir('benchmark_graph_8_gpu')
-    FLAGS.dtype = 'fp32'
-    self._run_and_report_benchmark()
   def benchmark_8_gpu(self):
     """Test Keras model with eager, dist_strat and 8 GPUs."""
     self._setup()
@@ -135,30 +123,6 @@ class Resnet50KerasAccuracy(keras_benchmark.KerasBenchmark):
     FLAGS.tf_gpu_thread_mode = 'gpu_private'
     self._run_and_report_benchmark()
-  def benchmark_8_gpu_mlperf_like(self):
-    """Test similar to the rules for MLPerf 0.5.
-    Listed below are reasons this comparison is not to the MLSpec, but this is
-    still a decent directional measurement:
-    - Eval is every 4 epochs and again at the end. ~2 extra times.
-    - Learning rate is not tuned to hit 75%, but we know the model is correct.
-    - We measure total time and MLPerf 0.5 excluded some startup time.
-    - Eval is not on the total set, need to set eval batch_size where
-      8*batch_size/50K is even. 250 is a good number.
-    - Not sure if we are doing any extra or too few steps due to epoch bleed.
-    """
-    self._setup()
-    FLAGS.num_gpus = 8
-    FLAGS.data_dir = self.data_dir
-    FLAGS.batch_size = 256 * 8
-    FLAGS.train_epochs = 61
-    FLAGS.epochs_between_evals = 4
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_mlperf_like')
-    FLAGS.dtype = 'fp16'
-    FLAGS.enable_eager = True
-    FLAGS.enable_xla = True
-    self._run_and_report_benchmark(top_1_min=0.736)
   def benchmark_xla_8_gpu_fp16_dynamic(self):
     """Test Keras model with XLA, eager, dist_strat, 8 GPUs, dynamic fp16."""
     self._setup()
@@ -921,129 +885,353 @@ class Resnet50KerasBenchmarkRemoteData(Resnet50KerasBenchmarkBase):
     # Cache dataset so performance is stable after the first epoch.
     def_flags['training_dataset_cache'] = True
     def_flags['log_steps'] = 100
+    # Note that for single GPU and pure eager tests which are less likely to be
+    # input bound and more stable, these tests will run for shorter time by
+    # overriding FLAGS.train_epochs, train_steps, log_steps in benchmark
+    # methods, and skip_steps in _run_and_report_benchmark().
     super(Resnet50KerasBenchmarkRemoteData, self).__init__(
         output_dir=output_dir, default_flags=def_flags)
-  @benchmark_wrappers.enable_runtime_flags
-  def _run_and_report_benchmark(self):
-    # skip the first epoch for performance measurement.
-    super(Resnet50KerasBenchmarkRemoteData,
-          self)._run_and_report_benchmark(skip_steps=600)
-class TrivialKerasBenchmarkReal(keras_benchmark.KerasBenchmark):
-  """Trivial model with real data benchmark tests."""
-  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
-    flag_methods = [resnet_imagenet_main.define_imagenet_keras_flags]
-    def_flags = {}
-    def_flags['use_trivial_model'] = True
-    def_flags['skip_eval'] = True
-    def_flags['report_accuracy_metrics'] = False
-    def_flags['dtype'] = 'fp16'
-    def_flags['data_dir'] = os.path.join(root_data_dir, 'imagenet')
-    def_flags['train_steps'] = 600
-    def_flags['log_steps'] = 100
-    def_flags['distribution_strategy'] = 'mirrored'
-    super(TrivialKerasBenchmarkReal, self).__init__(
-        output_dir=output_dir,
-        flag_methods=flag_methods,
-        default_flags=def_flags)
-  @benchmark_wrappers.enable_runtime_flags
-  def _run_and_report_benchmark(self):
-    start_time_sec = time.time()
-    stats = resnet_imagenet_main.run(FLAGS)
-    wall_time_sec = time.time() - start_time_sec
-    super(TrivialKerasBenchmarkReal, self)._report_benchmark(
-        stats,
-        wall_time_sec,
-        total_batch_size=FLAGS.batch_size,
-        log_steps=FLAGS.log_steps)
-  def benchmark_8_gpu_warmup(self):
-    """Dummy test that runs over an epoch to warmup the machine."""
+  def _override_flags_to_run_test_shorter(self):
+    FLAGS.train_epochs = 1
+    FLAGS.train_steps = 300
+    FLAGS.log_steps = 10
+  def benchmark_1_gpu_no_dist_strat(self):
+    """Test Keras model with 1 GPU, no distribution strategy."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = True
+    FLAGS.distribution_strategy = 'off'
+    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_dist_strat')
+    FLAGS.batch_size = 128
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
+  def benchmark_1_gpu_no_dist_strat_run_eagerly(self):
+    """Test Keras model with 1 GPU, no distribution strategy, run eagerly."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = True
+    FLAGS.run_eagerly = True
+    FLAGS.distribution_strategy = 'off'
+    FLAGS.model_dir = self._get_model_dir(
+        'benchmark_1_gpu_no_dist_strat_run_eagerly')
+    FLAGS.batch_size = 64
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
+  def benchmark_1_gpu_no_dist_strat_run_eagerly_tweaked(self):
+    """Test Keras model with 1 GPU, no distribution strategy, run eagerly."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = True
+    FLAGS.run_eagerly = True
+    FLAGS.explicit_gpu_placement = True
+    FLAGS.distribution_strategy = 'off'
+    FLAGS.model_dir = self._get_model_dir(
+        'benchmark_1_gpu_no_dist_strat_run_eagerly_tweaked')
+    FLAGS.batch_size = 64
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
+  def benchmark_1_gpu_no_dist_strat_run_eagerly_fp16(self):
+    """Test with 1 GPU, no distribution strategy, fp16, run eagerly."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = True
+    FLAGS.run_eagerly = True
+    FLAGS.distribution_strategy = 'off'
+    FLAGS.model_dir = self._get_model_dir(
+        'benchmark_1_gpu_no_dist_strat_run_eagerly_fp16')
+    FLAGS.dtype = 'fp16'
+    FLAGS.batch_size = 128
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
+  def benchmark_1_gpu_no_dist_strat_run_eagerly_fp16_tweaked(self):
+    """Test with 1 GPU, no distribution strategy, fp16, run eagerly."""
     self._setup()
-    FLAGS.num_gpus = 8
+    FLAGS.num_gpus = 1
     FLAGS.enable_eager = True
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_warmup')
-    FLAGS.batch_size = 256 * 8
-    FLAGS.train_steps = 700
+    FLAGS.run_eagerly = True
+    FLAGS.explicit_gpu_placement = True
+    FLAGS.distribution_strategy = 'off'
+    FLAGS.model_dir = self._get_model_dir(
+        'benchmark_1_gpu_no_dist_strat_run_eagerly_fp16_tweaked')
+    FLAGS.dtype = 'fp16'
+    FLAGS.batch_size = 128
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
+  def benchmark_graph_1_gpu_no_dist_strat(self):
+    """Test Keras model in legacy graph mode with 1 GPU, no dist strat."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = False
+    FLAGS.distribution_strategy = 'off'
+    FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu_no_dist_strat')
+    FLAGS.batch_size = 96  # BatchNorm is less efficient in legacy graph mode
+                           # due to its reliance on v1 cond.
+    self._override_flags_to_run_test_shorter()
     self._run_and_report_benchmark()
   def benchmark_1_gpu(self):
-    """Test trivial Keras model (input pipeline) with 1 GPU."""
+    """Test Keras model with 1 GPU."""
     self._setup()
     FLAGS.num_gpus = 1
     FLAGS.enable_eager = True
-    FLAGS.enable_xla = True
+    FLAGS.distribution_strategy = 'one_device'
     FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
+    FLAGS.batch_size = 128
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
+  def benchmark_1_gpu_amp(self):
+    """Test Keras model with 1 GPU with automatic mixed precision."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = True
+    FLAGS.dtype = 'fp16'
+    FLAGS.fp16_implementation = 'graph_rewrite'
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_amp')
     FLAGS.batch_size = 256
+    self._override_flags_to_run_test_shorter()
     self._run_and_report_benchmark()
-  def benchmark_graph_1_gpu(self):
-    """Test trivial Keras model (input pipeline) with 1 GPU."""
+  def benchmark_xla_1_gpu(self):
+    """Test Keras model with XLA and 1 GPU."""
     self._setup()
     FLAGS.num_gpus = 1
-    FLAGS.enable_eager = False
+    FLAGS.enable_eager = True
     FLAGS.enable_xla = True
-    FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu')
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu')
+    FLAGS.batch_size = 128
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
+  def benchmark_xla_1_gpu_amp(self):
+    """Test Keras model with XLA and 1 GPU with automatic mixed precision."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = True
+    FLAGS.dtype = 'fp16'
+    FLAGS.fp16_implementation = 'graph_rewrite'
+    FLAGS.enable_xla = True
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_amp')
     FLAGS.batch_size = 256
+    self._override_flags_to_run_test_shorter()
     self._run_and_report_benchmark()
-  def benchmark_8_gpu(self):
-    """Test trivial Keras model (input pipeline) with 8 GPUs."""
+  def benchmark_1_gpu_fp16(self):
+    """Test Keras model with 1 GPU and fp16."""
     self._setup()
-    FLAGS.num_gpus = 8
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = True
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_fp16')
+    FLAGS.dtype = 'fp16'
+    FLAGS.batch_size = 256
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
+  def benchmark_1_gpu_fp16_dynamic(self):
+    """Test Keras model with 1 GPU, fp16, and dynamic loss scaling."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = True
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_fp16_dynamic')
+    FLAGS.dtype = 'fp16'
+    FLAGS.batch_size = 256
+    FLAGS.loss_scale = 'dynamic'
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
+  def benchmark_xla_1_gpu_fp16(self):
+    """Test Keras model with XLA, 1 GPU and fp16."""
+    self._setup()
+    FLAGS.num_gpus = 1
     FLAGS.enable_eager = True
     FLAGS.enable_xla = True
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
-    FLAGS.batch_size = 256 * 8
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_fp16')
+    FLAGS.dtype = 'fp16'
+    FLAGS.batch_size = 256
+    self._override_flags_to_run_test_shorter()
     self._run_and_report_benchmark()
-  def benchmark_8_gpu_tweaked(self):
-    """Test trivial Keras model with tuning and 8 GPUs."""
+  def benchmark_xla_1_gpu_fp16_tweaked(self):
+    """Test Keras model with XLA, 1 GPU, fp16, and manual config tuning."""
     self._setup()
-    FLAGS.num_gpus = 8
+    FLAGS.num_gpus = 1
     FLAGS.enable_eager = True
     FLAGS.enable_xla = True
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_tweaked')
-    FLAGS.batch_size = 256 * 8
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_fp16_tweaked')
+    FLAGS.dtype = 'fp16'
+    FLAGS.batch_size = 256
     FLAGS.tf_gpu_thread_mode = 'gpu_private'
-    FLAGS.datasets_num_private_threads = 48
+    self._override_flags_to_run_test_shorter()
     self._run_and_report_benchmark()
-  def benchmark_graph_8_gpu(self):
-    """Test trivial Keras model in legacy graph mode with 8 GPUs."""
+  def benchmark_xla_1_gpu_fp16_dynamic(self):
+    """Test Keras model with XLA, 1 GPU, fp16, and dynamic loss scaling."""
     self._setup()
-    FLAGS.num_gpus = 8
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = True
+    FLAGS.enable_xla = True
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_fp16_dynamic')
+    FLAGS.dtype = 'fp16'
+    FLAGS.batch_size = 256
+    FLAGS.loss_scale = 'dynamic'
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
+  def benchmark_graph_1_gpu(self):
+    """Test Keras model in legacy graph mode with 1 GPU."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = False
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu')
+    FLAGS.batch_size = 128
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
+  def benchmark_graph_xla_1_gpu(self):
+    """Test Keras model in legacy graph mode with XLA and 1 GPU."""
+    self._setup()
+    FLAGS.num_gpus = 1
     FLAGS.enable_eager = False
     FLAGS.enable_xla = True
-    FLAGS.model_dir = self._get_model_dir('benchmark_graph_8_gpu')
-    FLAGS.batch_size = 256 * 8
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.model_dir = self._get_model_dir('benchmark_graph_xla_1_gpu')
+    FLAGS.batch_size = 128
+    self._override_flags_to_run_test_shorter()
     self._run_and_report_benchmark()
-  def benchmark_graph_8_gpu_tweaked(self):
-    """Test trivial Keras model in legacy graph mode with tuning and 8 GPUs."""
+  def benchmark_graph_1_gpu_fp16(self):
+    """Test Keras model in legacy graph mode with 1 GPU and fp16."""
     self._setup()
-    FLAGS.num_gpus = 8
+    FLAGS.num_gpus = 1
+    FLAGS.dtype = 'fp16'
+    FLAGS.enable_eager = False
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu_fp16')
+    FLAGS.batch_size = 256
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
+  def benchmark_graph_xla_1_gpu_fp16(self):
+    """Test Keras model in legacy graph mode with 1 GPU, fp16 and XLA."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.dtype = 'fp16'
     FLAGS.enable_eager = False
     FLAGS.enable_xla = True
-    FLAGS.model_dir = self._get_model_dir('benchmark_graph_8_gpu_tweaked')
-    FLAGS.batch_size = 256 * 8
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.model_dir = self._get_model_dir('benchmark_graph_xla_1_gpu_fp16')
+    FLAGS.batch_size = 256
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
+  def benchmark_graph_xla_1_gpu_fp16_tweaked(self):
+    """Test Keras model in legacy graph with 1 GPU, fp16, XLA, and tuning."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = False
+    FLAGS.enable_xla = True
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.model_dir = self._get_model_dir(
+        'benchmark_graph_xla_1_gpu_fp16_tweaked')
+    FLAGS.dtype = 'fp16'
+    FLAGS.batch_size = 256
     FLAGS.tf_gpu_thread_mode = 'gpu_private'
-    FLAGS.datasets_num_private_threads = 48
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
+  @benchmark_wrappers.enable_runtime_flags
+  def _run_and_report_benchmark(self):
+    if FLAGS.num_gpus == 1 or FLAGS.run_eagerly:
+      # For single GPU and pure eager tests which are less likely to be input
+      # bound and more stable, run for shorter time and use the default
+      # skip_steps.
+      skip_steps = None
+    else:
+      # skip the first epoch for performance measurement.
+      skip_steps = 600
+    super(Resnet50KerasBenchmarkRemoteData,
+          self)._run_and_report_benchmark(skip_steps=skip_steps)
+class TrivialKerasBenchmarkReal(keras_benchmark.KerasBenchmark):
+  """Trivial model with real data benchmark tests."""
+  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
+    flag_methods = [resnet_imagenet_main.define_imagenet_keras_flags]
+    def_flags = {}
+    def_flags['use_trivial_model'] = True
+    def_flags['skip_eval'] = True
+    def_flags['report_accuracy_metrics'] = False
+    def_flags['dtype'] = 'fp16'
+    def_flags['data_dir'] = os.path.join(root_data_dir, 'imagenet')
+    def_flags['train_steps'] = 600
+    def_flags['log_steps'] = 100
+    def_flags['distribution_strategy'] = 'mirrored'
+    super(TrivialKerasBenchmarkReal, self).__init__(
+        output_dir=output_dir,
+        flag_methods=flag_methods,
+        default_flags=def_flags)
+  @benchmark_wrappers.enable_runtime_flags
+  def _run_and_report_benchmark(self):
+    start_time_sec = time.time()
+    stats = resnet_imagenet_main.run(FLAGS)
+    wall_time_sec = time.time() - start_time_sec
+    super(TrivialKerasBenchmarkReal, self)._report_benchmark(
+        stats,
+        wall_time_sec,
+        total_batch_size=FLAGS.batch_size,
+        log_steps=FLAGS.log_steps)
+  def benchmark_8_gpu_warmup(self):
+    """Dummy test that runs over an epoch to warmup the machine."""
+    self._setup()
+    FLAGS.num_gpus = 8
+    FLAGS.enable_eager = True
+    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_warmup')
+    FLAGS.batch_size = 256 * 8
+    FLAGS.train_steps = 700
     self._run_and_report_benchmark()
   def fill_report_object(self, stats):
...
@@ -28,8 +28,8 @@ from official.utils.flags import core as flags_core
 from official.utils.logs import logger
 from official.utils.misc import distribution_utils
 from official.utils.misc import keras_utils
-from official.vision.image_classification import cifar_preprocessing
-from official.vision.image_classification import common
+from official.vision.image_classification.resnet import cifar_preprocessing
+from official.vision.image_classification.resnet import common
 LR_SCHEDULE = [  # (multiplier, epoch to start) tuples
...
@@ -27,7 +27,7 @@ from tensorflow.python.platform import googletest
 from official.benchmark.models import resnet_cifar_main
 from official.utils.misc import keras_utils
 from official.utils.testing import integration
-from official.vision.image_classification import cifar_preprocessing
+from official.vision.image_classification.resnet import cifar_preprocessing
 class KerasCifarTest(googletest.TestCase):
...
@@ -22,8 +22,8 @@ import time
 from absl import flags
 import tensorflow as tf
-from official.vision.image_classification import common
-from official.vision.image_classification import resnet_ctl_imagenet_main
+from official.vision.image_classification.resnet import common
+from official.vision.image_classification.resnet import resnet_ctl_imagenet_main
 from official.utils.testing.perfzero_benchmark import PerfZeroBenchmark
 from official.utils.testing import benchmark_wrappers
 from official.utils.flags import core as flags_core
@@ -87,10 +87,9 @@ class CtlBenchmark(PerfZeroBenchmark):
       # first entry in the time_log is start of step 0. The rest of the
       # entries are the end of each step recorded
       time_log = stats['step_timestamp_log']
-      elapsed = time_log[-1].timestamp - time_log[warmup].timestamp
-      num_examples = (
-          total_batch_size * log_steps * (len(time_log) - warmup - 1))
-      examples_per_sec = num_examples / elapsed
+      steps_elapsed = time_log[-1].batch_index - time_log[warmup].batch_index
+      time_elapsed = time_log[-1].timestamp - time_log[warmup].timestamp
+      examples_per_sec = total_batch_size * (steps_elapsed / time_elapsed)
       metrics.append({'name': 'exp_per_second', 'value': examples_per_sec})
     if 'avg_exp_per_second' in stats:
...
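Editor's note on the throughput change above: the old `exp_per_second` value assumed exactly `log_steps` steps between timestamp entries, while the new code derives the elapsed step count from the recorded batch indices, so the result stays correct even if the logging interval varies. A rough, self-contained sketch of the new calculation follows; the `BatchTimestamp` record name is an assumption, only the `batch_index`/`timestamp` fields appear in the diff.

```python
# Illustrative sketch only -- mirrors the new examples-per-second math above.
from collections import namedtuple

# Assumed stand-in for the entries stored in stats['step_timestamp_log'].
BatchTimestamp = namedtuple('BatchTimestamp', ['batch_index', 'timestamp'])

def examples_per_second(time_log, total_batch_size, warmup=1):
  """Throughput between the first post-warmup entry and the last entry."""
  steps_elapsed = time_log[-1].batch_index - time_log[warmup].batch_index
  time_elapsed = time_log[-1].timestamp - time_log[warmup].timestamp
  return total_batch_size * (steps_elapsed / time_elapsed)

# Example: entries logged every 100 steps with a global batch size of 1024.
log = [BatchTimestamp(0, 0.0), BatchTimestamp(100, 50.0), BatchTimestamp(200, 98.0)]
print(examples_per_second(log, total_batch_size=1024))  # 1024 * 100 / 48, about 2133 examples/sec
```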
@@ -41,8 +41,14 @@ class TfHubMemoryUsageBenchmark(PerfZeroBenchmark):
         output_dir=output_dir, default_flags=default_flags, **kwargs)
     if hub_model_handle_list:
       for hub_model_handle in hub_model_handle_list.split(';'):
+        # Converts a model handle of the form
+        # https://tfhub.dev/google/nnlm-en-dim128/1 to a valid python method
+        # name like google_nnlm_en_dim128_1.
+        hub_model_method_name = hub_model_handle.replace(
+            'https://tfhub.dev',
+            '').replace('/', '_').replace('-', '_').strip('_')
         setattr(
-            self, 'benchmark_' + hub_model_handle,
+            self, 'benchmark_' + hub_model_method_name,
             functools.partial(self.benchmark_memory_usage, hub_model_handle))
   def benchmark_memory_usage(
...
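The added comment above documents the handle-to-method-name conversion; benchmark methods must be valid Python attribute names, which a raw tfhub.dev URL is not. A small standalone illustration of the same string transformation:

```python
# Illustrative sketch of the conversion used when registering benchmark methods.
def handle_to_method_name(hub_model_handle):
  """E.g. 'https://tfhub.dev/google/nnlm-en-dim128/1' -> 'google_nnlm_en_dim128_1'."""
  return hub_model_handle.replace('https://tfhub.dev', '').replace(
      '/', '_').replace('-', '_').strip('_')

assert handle_to_method_name(
    'https://tfhub.dev/google/nnlm-en-dim128/1') == 'google_nnlm_en_dim128_1'
```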
@@ -102,6 +102,7 @@ def run_customized_training_loop(
       strategy=None,
       model_fn=None,
       loss_fn=None,
+      scale_loss=True,
       model_dir=None,
       train_input_fn=None,
       steps_per_epoch=None,
@@ -129,6 +130,8 @@ def run_customized_training_loop(
         to be used for initial checkpoint -- if provided.
       loss_fn: Function with signature func(labels, logits) and returns a loss
         tensor.
+      scale_loss: Whether to divide the raw loss by number of replicas before
+        gradients calculation.
       model_dir: Model directory used during training for restoring/saving model
         weights.
       train_input_fn: Function that returns a tf.data.Dataset used for training.
@@ -211,7 +214,7 @@ def run_customized_training_loop(
   if run_eagerly:
     if isinstance(strategy, tf.distribute.experimental.TPUStrategy):
       raise ValueError(
-          'TPUStrategy should not run eagerly as it heavily replies on graph'
+          'TPUStrategy should not run eagerly as it heavily relies on graph'
           ' optimization for the distributed system.')
   if eval_input_fn and (eval_steps is None or metric_fn is None):
@@ -223,9 +226,6 @@ def run_customized_training_loop(
         'if `metric_fn` is specified, metric_fn must be a callable.')
   total_training_steps = steps_per_epoch * epochs
-  # To reduce unnecessary send/receive input pipeline operation, we place input
-  # pipeline ops in worker task.
   train_iterator = _get_input_iterator(train_input_fn, strategy)
   with distribution_utils.get_strategy_scope(strategy):
@@ -287,6 +287,12 @@ def run_customized_training_loop(
       with tf.GradientTape() as tape:
         model_outputs = model(inputs, training=True)
         loss = loss_fn(labels, model_outputs)
+        # Raw loss is used for reporting in metrics/logs.
+        raw_loss = loss
+        if scale_loss:
+          # Scales down the loss for gradients to be invariant from replicas.
+          loss = loss / strategy.num_replicas_in_sync
       if explicit_allreduce:
         grad_utils.minimize_using_explicit_allreduce(tape, optimizer, loss,
                                                      training_vars,
@@ -303,7 +309,7 @@ def run_customized_training_loop(
         grads = tape.gradient(loss, training_vars)
         optimizer.apply_gradients(zip(grads, training_vars))
       # For reporting, the metric takes the mean of losses.
-      train_loss_metric.update_state(loss)
+      train_loss_metric.update_state(raw_loss)
       for metric in train_metrics:
         metric.update_state(labels, model_outputs)
@@ -324,7 +330,7 @@ def run_customized_training_loop(
                          'retracing.')
       for _ in tf.range(steps):
-        strategy.experimental_run_v2(_replicated_step, args=(next(iterator),))
+        strategy.run(_replicated_step, args=(next(iterator),))
     def train_single_step(iterator):
       """Performs a distributed training step.
@@ -335,7 +341,7 @@ def run_customized_training_loop(
       Raises:
         ValueError: Any of the arguments or tensor shapes are invalid.
       """
-      strategy.experimental_run_v2(_replicated_step, args=(next(iterator),))
+      strategy.run(_replicated_step, args=(next(iterator),))
     def test_step(iterator):
       """Calculates evaluation metrics on distributed devices."""
@@ -348,7 +354,7 @@ def run_customized_training_loop(
         for metric in eval_metrics:
           metric.update_state(labels, model_outputs)
-      strategy.experimental_run_v2(_test_step_fn, args=(next(iterator),))
+      strategy.run(_test_step_fn, args=(next(iterator),))
   if not run_eagerly:
     train_single_step = tf.function(train_single_step)
...
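The new `scale_loss` option above divides the per-replica loss by `strategy.num_replicas_in_sync` before gradients are taken, while the unscaled `raw_loss` is still what gets reported. The reason is that a synchronous strategy sums gradients across replicas, so dividing the loss keeps the effective update equivalent to training on the full global batch. A minimal sketch of the idea under a generic MirroredStrategy setup; the names here are illustrative and this is not the training loop above.

```python
# Minimal sketch of per-replica loss scaling; assumes a generic tf.distribute setup.
import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
  model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
  optimizer = tf.keras.optimizers.SGD(0.1)

def replica_step(features, labels, scale_loss=True):
  with tf.GradientTape() as tape:
    predictions = model(features, training=True)
    loss = tf.reduce_mean(tf.square(labels - predictions))  # mean over the local batch
    raw_loss = loss  # unscaled value, suitable for metrics/logging
    if scale_loss:
      # Gradients are summed across replicas by the strategy, so dividing the
      # per-replica loss keeps the update invariant to the replica count.
      loss = loss / strategy.num_replicas_in_sync
  grads = tape.gradient(loss, model.trainable_variables)
  optimizer.apply_gradients(zip(grads, model.trainable_variables))
  return raw_loss

# Invoked once per replica, e.g. via strategy.run(replica_step, args=(x, y)).
```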
@@ -233,5 +233,4 @@ class ModelTrainingUtilsTest(tf.test.TestCase, parameterized.TestCase):
 if __name__ == '__main__':
-  assert tf.version.VERSION.startswith('2.')
   tf.test.main()
@@ -243,10 +243,10 @@ class DistributedExecutor(object):
         raise ValueError('steps should be a Tensor. Python object may cause '
                          'retracing.')
-      per_replica_losses = strategy.experimental_run_v2(
+      per_replica_losses = strategy.run(
           _replicated_step, args=(next(iterator),))
       for _ in tf.range(num_steps - 1):
-        per_replica_losses = strategy.experimental_run_v2(
+        per_replica_losses = strategy.run(
             _replicated_step, args=(next(iterator),))
       # For reporting, we return the mean of losses.
@@ -278,7 +278,7 @@ class DistributedExecutor(object):
           metric.update_state(labels, model_outputs)
         return labels, model_outputs
-      return strategy.experimental_run_v2(_test_step_fn, args=(next(iterator),))
+      return strategy.run(_test_step_fn, args=(next(iterator),))
     return test_step
...
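Several files in this commit replace `strategy.experimental_run_v2(...)` with `strategy.run(...)`. The semantics are unchanged, the given step function is executed once on every replica and the per-replica results are returned; only the method name changed as the API left experimental status. A short, self-contained sketch of the call pattern used throughout; the dataset and step function here are illustrative.

```python
# Sketch of the strategy.run() call pattern that replaces experimental_run_v2.
import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()
dataset = tf.data.Dataset.from_tensor_slices(tf.ones([8, 2])).batch(4)
dist_iterator = iter(strategy.experimental_distribute_dataset(dataset))

def step_fn(batch):
  # Runs once per replica; each replica sees its shard of the global batch.
  return tf.reduce_sum(batch)

# Previously: strategy.experimental_run_v2(step_fn, args=(next(dist_iterator),))
per_replica = strategy.run(step_fn, args=(next(dist_iterator),))
total = strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica, axis=None)
print(total)  # sum over the full global batch
```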
@@ -61,7 +61,7 @@ common_flags.define_common_bert_flags()
 FLAGS = flags.FLAGS
-def get_loss_fn(num_classes, loss_factor=1.0):
+def get_loss_fn(num_classes):
   """Gets the classification loss function."""
   def classification_loss_fn(labels, logits):
@@ -72,9 +72,7 @@ def get_loss_fn(num_classes, loss_factor=1.0):
         tf.cast(labels, dtype=tf.int32), depth=num_classes, dtype=tf.float32)
     per_example_loss = -tf.reduce_sum(
         tf.cast(one_hot_labels, dtype=tf.float32) * log_probs, axis=-1)
-    loss = tf.reduce_mean(per_example_loss)
-    loss *= loss_factor
-    return loss
+    return tf.reduce_mean(per_example_loss)
   return classification_loss_fn
@@ -135,17 +133,7 @@ def run_bert_classifier(strategy,
         use_graph_rewrite=common_flags.use_graph_rewrite())
     return classifier_model, core_model
-  # During distributed training, loss used for gradient computation is
-  # summed over from all replicas. When Keras compile/fit() API is used,
-  # the fit() API internally normalizes the loss by dividing the loss by
-  # the number of replicas used for computation. However, when custom
-  # training loop is used this is not done automatically and should be
-  # done manually by the end user.
-  loss_multiplier = 1.0
-  if FLAGS.scale_loss and not use_keras_compile_fit:
-    loss_multiplier = 1.0 / strategy.num_replicas_in_sync
-  loss_fn = get_loss_fn(num_classes, loss_factor=loss_multiplier)
+  loss_fn = get_loss_fn(num_classes)
   # Defines evaluation metrics function, which will create metrics in the
   # correct device and strategy scope.
@@ -267,7 +255,7 @@ def get_predictions_and_labels(strategy, trained_model, eval_input_fn,
       model_outputs = trained_model(inputs, training=False)
       return model_outputs, labels
-    outputs, labels = strategy.experimental_run_v2(
+    outputs, labels = strategy.run(
         _test_step_fn, args=(next(iterator),))
     # outputs: current batch logits as a tuple of shard logits
     outputs = tf.nest.map_structure(strategy.experimental_local_results,
...
@@ -74,11 +74,11 @@ def get_pretrain_dataset_fn(input_file_pattern, seq_length,
   return _dataset_fn
-def get_loss_fn(loss_factor=1.0):
+def get_loss_fn():
   """Returns loss function for BERT pretraining."""
   def _bert_pretrain_loss_fn(unused_labels, losses, **unused_args):
-    return tf.reduce_mean(losses) * loss_factor
+    return tf.reduce_mean(losses)
   return _bert_pretrain_loss_fn
@@ -116,9 +116,8 @@ def run_customized_training(strategy,
   trained_model = model_training_utils.run_customized_training_loop(
       strategy=strategy,
       model_fn=_get_pretrain_model,
-      loss_fn=get_loss_fn(
-          loss_factor=1.0 /
-          strategy.num_replicas_in_sync if FLAGS.scale_loss else 1.0),
+      loss_fn=get_loss_fn(),
+      scale_loss=FLAGS.scale_loss,
       model_dir=model_dir,
       train_input_fn=train_input_fn,
       steps_per_epoch=steps_per_epoch,
...
@@ -24,6 +24,7 @@ from absl import logging
 import tensorflow as tf
 from official.modeling import model_training_utils
+from official.modeling import performance
 from official.nlp import optimization
 from official.nlp.bert import bert_models
 from official.nlp.bert import common_flags
@@ -89,8 +90,7 @@ FLAGS = flags.FLAGS
 def squad_loss_fn(start_positions,
                   end_positions,
                   start_logits,
-                  end_logits,
-                  loss_factor=1.0):
+                  end_logits):
   """Returns sparse categorical crossentropy for start/end logits."""
   start_loss = tf.keras.losses.sparse_categorical_crossentropy(
       start_positions, start_logits, from_logits=True)
@@ -98,11 +98,10 @@ def squad_loss_fn(start_positions,
       end_positions, end_logits, from_logits=True)
   total_loss = (tf.reduce_mean(start_loss) + tf.reduce_mean(end_loss)) / 2
-  total_loss *= loss_factor
   return total_loss
-def get_loss_fn(loss_factor=1.0):
+def get_loss_fn():
   """Gets a loss function for squad task."""
   def _loss_fn(labels, model_outputs):
@@ -113,8 +112,7 @@ def get_loss_fn(loss_factor=1.0):
         start_positions,
         end_positions,
         start_logits,
-        end_logits,
-        loss_factor=loss_factor)
+        end_logits)
   return _loss_fn
@@ -194,8 +192,7 @@ def predict_squad_customized(strategy, input_meta_data, bert_config,
           start_logits=start_logits,
           end_logits=end_logits)
-    outputs = strategy.experimental_run_v2(
-        _replicated_step, args=(next(iterator),))
+    outputs = strategy.run(_replicated_step, args=(next(iterator),))
     return tf.nest.map_structure(strategy.experimental_local_results, outputs)
   all_results = []
@@ -219,10 +216,7 @@ def train_squad(strategy,
                     ' strategy.')
   # Enables XLA in Session Config. Should not be set for TPU.
   keras_utils.set_config_v2(FLAGS.enable_xla)
-  use_float16 = common_flags.use_float16()
-  if use_float16:
-    tf.keras.mixed_precision.experimental.set_policy('mixed_float16')
+  performance.set_mixed_precision_policy(common_flags.dtype())
   epochs = FLAGS.num_train_epochs
   num_train_examples = input_meta_data['train_data_size']
@@ -242,33 +236,16 @@ def train_squad(strategy,
         max_seq_length,
         hub_module_url=FLAGS.hub_module_url,
         hub_module_trainable=FLAGS.hub_module_trainable)
-    squad_model.optimizer = optimization.create_optimizer(
-        FLAGS.learning_rate, steps_per_epoch * epochs, warmup_steps)
-    if use_float16:
-      # Wraps optimizer with a LossScaleOptimizer. This is done automatically
-      # in compile() with the "mixed_float16" policy, but since we do not call
-      # compile(), we must wrap the optimizer manually.
-      squad_model.optimizer = (
-          tf.keras.mixed_precision.experimental.LossScaleOptimizer(
-              squad_model.optimizer, loss_scale=common_flags.get_loss_scale()))
-    if FLAGS.fp16_implementation == 'graph_rewrite':
-      # Note: when flags_obj.fp16_implementation == "graph_rewrite", dtype as
-      # determined by flags_core.get_tf_dtype(flags_obj) would be 'float32'
-      # which will ensure tf.compat.v2.keras.mixed_precision and
-      # tf.train.experimental.enable_mixed_precision_graph_rewrite do not double
-      # up.
-      squad_model.optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
-          squad_model.optimizer)
+    optimizer = optimization.create_optimizer(FLAGS.learning_rate,
+                                              steps_per_epoch * epochs,
+                                              warmup_steps)
+    squad_model.optimizer = performance.configure_optimizer(
+        optimizer,
+        use_float16=common_flags.use_float16(),
+        use_graph_rewrite=common_flags.use_graph_rewrite())
     return squad_model, core_model
-  # The original BERT model does not scale the loss by
-  # 1/num_replicas_in_sync. It could be an accident. So, in order to use
-  # the same hyper parameter, we do the same thing here by keeping each
-  # replica loss as it is.
-  loss_fn = get_loss_fn(
-      loss_factor=1.0 /
-      strategy.num_replicas_in_sync if FLAGS.scale_loss else 1.0)
   # If explicit_allreduce = True, apply_gradients() no longer implicitly
   # allreduce gradients, users manually allreduce gradient and pass the
   # allreduced grads_and_vars to apply_gradients(). clip_by_global_norm will be
@@ -281,7 +258,7 @@ def train_squad(strategy,
   model_training_utils.run_customized_training_loop(
       strategy=strategy,
       model_fn=_get_squad_model,
-      loss_fn=loss_fn,
+      loss_fn=get_loss_fn(),
      model_dir=FLAGS.model_dir,
      steps_per_epoch=steps_per_epoch,
      steps_per_loop=FLAGS.steps_per_loop,
...
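Here the hand-rolled mixed-precision setup is replaced by `performance.set_mixed_precision_policy()` and `performance.configure_optimizer()`. The diff does not show those helpers' bodies; as a rough sketch reconstructed purely from the block they replace, such a helper presumably looks something like the following. This is an assumption, not the actual `official.modeling.performance` implementation.

```python
# Hypothetical reconstruction based on the removed code above; the real helper
# in official/modeling/performance.py may differ.
import tensorflow as tf

def configure_optimizer(optimizer, use_float16=False, use_graph_rewrite=False,
                        loss_scale='dynamic'):
  if use_float16:
    # compile() would wrap the optimizer automatically under the
    # 'mixed_float16' policy; a custom training loop has to do it manually.
    optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
        optimizer, loss_scale=loss_scale)
  if use_graph_rewrite:
    # The graph rewrite path expects a float32 model, so the two mechanisms
    # are not applied to the same graph twice.
    optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
        optimizer)
  return optimizer
```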
@@ -98,7 +98,6 @@ def convert_checkpoint(bert_config, output_path, v1_checkpoint):
 def main(_):
-  tf.enable_v2_behavior()
   output_path = FLAGS.converted_checkpoint_path
   v1_checkpoint = FLAGS.checkpoint_to_convert
   bert_config = configs.BertConfig.from_json_file(FLAGS.bert_config_file)
...
@@ -24,6 +24,7 @@ import os
 from absl import logging
 import tensorflow as tf
+import tensorflow_datasets as tfds
 from official.nlp.bert import tokenization
@@ -386,6 +387,99 @@ class QnliProcessor(DataProcessor):
     return examples
+class TfdsProcessor(DataProcessor):
+  """Processor for generic text classification TFDS data set.
+  The TFDS parameters are expected to be provided in the tfds_params string, in
+  a comma-separated list of parameter assignments.
+  Examples:
+    tfds_params="dataset=scicite,text_key=string"
+    tfds_params="dataset=imdb_reviews,test_split=,dev_split=test"
+    tfds_params="dataset=glue/cola,text_key=sentence"
+    tfds_params="dataset=glue/sst2,text_key=sentence"
+    tfds_params="dataset=glue/qnli,text_key=question,text_b_key=sentence"
+    tfds_params="dataset=glue/mrpc,text_key=sentence1,text_b_key=sentence2"
+  Possible parameters (please refer to the documentation of TensorFlow Datasets
+  (TFDS) for the meaning of individual parameters):
+    dataset: Required dataset name (potentially with subset and version number).
+    data_dir: Optional TFDS source root directory.
+    train_split: Name of the train split (defaults to `train`).
+    dev_split: Name of the dev split (defaults to `validation`).
+    test_split: Name of the test split (defaults to `test`).
+    text_key: Key of the text_a feature (defaults to `text`).
+    text_b_key: Key of the second text feature if available.
+    label_key: Key of the label feature (defaults to `label`).
+    test_text_key: Key of the text feature to use in test set.
+    test_text_b_key: Key of the second text feature to use in test set.
+    test_label: String to be used as the label for all test examples.
+  """
+  def __init__(self, tfds_params,
+               process_text_fn=tokenization.convert_to_unicode):
+    super(TfdsProcessor, self).__init__(process_text_fn)
+    self._process_tfds_params_str(tfds_params)
+    self.dataset, info = tfds.load(self.dataset_name, data_dir=self.data_dir,
+                                   with_info=True)
+    self._labels = list(range(info.features[self.label_key].num_classes))
+  def _process_tfds_params_str(self, params_str):
+    """Extracts TFDS parameters from a comma-separated assignments string."""
+    tuples = [x.split("=") for x in params_str.split(",")]
+    d = {k.strip(): v.strip() for k, v in tuples}
+    self.dataset_name = d["dataset"]  # Required.
+    self.data_dir = d.get("data_dir", None)
+    self.train_split = d.get("train_split", "train")
+    self.dev_split = d.get("dev_split", "validation")
+    self.test_split = d.get("test_split", "test")
+    self.text_key = d.get("text_key", "text")
+    self.text_b_key = d.get("text_b_key", None)
+    self.label_key = d.get("label_key", "label")
+    self.test_text_key = d.get("test_text_key", self.text_key)
+    self.test_text_b_key = d.get("test_text_b_key", self.text_b_key)
+    self.test_label = d.get("test_label", "test_example")
+  def get_train_examples(self, data_dir):
+    assert data_dir is None
+    return self._create_examples(self.train_split, "train")
+  def get_dev_examples(self, data_dir):
+    assert data_dir is None
+    return self._create_examples(self.dev_split, "dev")
+  def get_test_examples(self, data_dir):
+    assert data_dir is None
+    return self._create_examples(self.test_split, "test")
+  def get_labels(self):
+    return self._labels
+  def get_processor_name(self):
+    return "TFDS_" + self.dataset_name
+  def _create_examples(self, split_name, set_type):
+    """Creates examples for the training and dev sets."""
+    if split_name not in self.dataset:
+      raise ValueError("Split {} not available.".format(split_name))
+    dataset = self.dataset[split_name].as_numpy_iterator()
+    examples = []
+    text_b = None
+    for i, example in enumerate(dataset):
+      guid = "%s-%s" % (set_type, i)
+      if set_type == "test":
+        text_a = self.process_text_fn(example[self.test_text_key])
+        if self.test_text_b_key:
+          text_b = self.process_text_fn(example[self.test_text_b_key])
+        label = self.test_label
+      else:
+        text_a = self.process_text_fn(example[self.text_key])
+        if self.text_b_key:
+          text_b = self.process_text_fn(example[self.text_b_key])
+        label = int(example[self.label_key])
+      examples.append(
+          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples
 def convert_single_example(ex_index, example, label_list, max_seq_length,
                            tokenizer):
   """Converts a single `InputExample` into a single `InputFeatures`."""
...
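For reference, the `tfds_params` string accepted by the new `TfdsProcessor` is an ordinary comma-separated list of key=value pairs; anything not supplied falls back to the defaults listed in the docstring. A small illustration of how one of the example strings above is parsed, mirroring `_process_tfds_params_str`:

```python
# Illustrative parse of a tfds_params string from the docstring above.
params_str = "dataset=glue/qnli,text_key=question,text_b_key=sentence"
d = {k.strip(): v.strip() for k, v in (kv.split("=") for kv in params_str.split(","))}

assert d["dataset"] == "glue/qnli"
assert d["text_b_key"] == "sentence"
# Unspecified keys use the documented defaults, e.g. label_key -> "label",
# train_split -> "train", dev_split -> "validation".
```

A processor built this way, `classifier_data_lib.TfdsProcessor(tfds_params=..., process_text_fn=...)`, then exposes the same `get_train_examples` / `get_dev_examples` / `get_test_examples` interface as the existing GLUE processors.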
@@ -104,22 +104,16 @@ flags.DEFINE_enum(
     "or sentence_piece tokenizer. Canonical BERT uses word_piece tokenizer, "
     "while ALBERT uses sentence_piece tokenizer.")
+flags.DEFINE_string("tfds_params", "",
+                    "Comma-separated list of TFDS parameter assignments for "
+                    "generic classification data import (for more details "
+                    "see the TfdsProcessor class documentation).")
 def generate_classifier_dataset():
   """Generates classifier dataset and returns input meta data."""
-  assert FLAGS.input_data_dir and FLAGS.classification_task_name
-  processors = {
-      "cola": classifier_data_lib.ColaProcessor,
-      "mnli": classifier_data_lib.MnliProcessor,
-      "mrpc": classifier_data_lib.MrpcProcessor,
-      "qnli": classifier_data_lib.QnliProcessor,
-      "sst-2": classifier_data_lib.SstProcessor,
-      "xnli": classifier_data_lib.XnliProcessor,
-  }
-  task_name = FLAGS.classification_task_name.lower()
-  if task_name not in processors:
-    raise ValueError("Task not found: %s" % (task_name))
+  assert (FLAGS.input_data_dir and FLAGS.classification_task_name
+          or FLAGS.tfds_params)
   if FLAGS.tokenizer_impl == "word_piece":
     tokenizer = tokenization.FullTokenizer(
@@ -131,14 +125,38 @@ def generate_classifier_dataset():
     processor_text_fn = functools.partial(
         tokenization.preprocess_text, lower=FLAGS.do_lower_case)
-  processor = processors[task_name](processor_text_fn)
-  return classifier_data_lib.generate_tf_record_from_data_file(
-      processor,
-      FLAGS.input_data_dir,
-      tokenizer,
-      train_data_output_path=FLAGS.train_data_output_path,
-      eval_data_output_path=FLAGS.eval_data_output_path,
-      max_seq_length=FLAGS.max_seq_length)
+  if FLAGS.tfds_params:
+    processor = classifier_data_lib.TfdsProcessor(
+        tfds_params=FLAGS.tfds_params,
+        process_text_fn=processor_text_fn)
+    return classifier_data_lib.generate_tf_record_from_data_file(
+        processor,
+        None,
+        tokenizer,
+        train_data_output_path=FLAGS.train_data_output_path,
+        eval_data_output_path=FLAGS.eval_data_output_path,
+        max_seq_length=FLAGS.max_seq_length)
+  else:
+    processors = {
+        "cola": classifier_data_lib.ColaProcessor,
+        "mnli": classifier_data_lib.MnliProcessor,
+        "mrpc": classifier_data_lib.MrpcProcessor,
+        "qnli": classifier_data_lib.QnliProcessor,
+        "sst-2": classifier_data_lib.SstProcessor,
+        "xnli": classifier_data_lib.XnliProcessor,
+    }
+    task_name = FLAGS.classification_task_name.lower()
+    if task_name not in processors:
+      raise ValueError("Task not found: %s" % (task_name))
+    processor = processors[task_name](processor_text_fn)
+    return classifier_data_lib.generate_tf_record_from_data_file(
+        processor,
+        FLAGS.input_data_dir,
+        tokenizer,
+        train_data_output_path=FLAGS.train_data_output_path,
+        eval_data_output_path=FLAGS.eval_data_output_path,
+        max_seq_length=FLAGS.max_seq_length)
 def generate_squad_dataset():
...
@@ -47,8 +47,10 @@ class UnicodeRegex(object):
     self.symbol_re = re.compile("([" + self.property_chars("S") + "])")
   def property_chars(self, prefix):
-    return "".join(six.unichr(x) for x in range(sys.maxunicode)
-                   if unicodedata.category(six.unichr(x)).startswith(prefix))
+    return "".join(
+        six.unichr(x)
+        for x in range(sys.maxunicode)
+        if unicodedata.category(six.unichr(x)).startswith(prefix))
 uregex = UnicodeRegex()
@@ -92,9 +94,10 @@ def bleu_wrapper(ref_filename, hyp_filename, case_sensitive=False):
       tf.io.gfile.GFile(hyp_filename).read()).strip().splitlines()
   if len(ref_lines) != len(hyp_lines):
-    raise ValueError("Reference and translation files have different number of "
-                     "lines. If training only a few steps (100-200), the "
-                     "translation may be empty.")
+    raise ValueError(
+        "Reference and translation files have different number of "
+        "lines (%d VS %d). If training only a few steps (100-200), the "
+        "translation may be empty." % (len(ref_lines), len(hyp_lines)))
   if not case_sensitive:
     ref_lines = [x.lower() for x in ref_lines]
     hyp_lines = [x.lower() for x in hyp_lines]
@@ -116,18 +119,23 @@ def main(unused_argv):
 def define_compute_bleu_flags():
   """Add flags for computing BLEU score."""
   flags.DEFINE_string(
-      name="translation", default=None,
+      name="translation",
+      default=None,
       help=flags_core.help_wrap("File containing translated text."))
   flags.mark_flag_as_required("translation")
   flags.DEFINE_string(
-      name="reference", default=None,
+      name="reference",
+      default=None,
       help=flags_core.help_wrap("File containing reference translation."))
   flags.mark_flag_as_required("reference")
   flags.DEFINE_enum(
-      name="bleu_variant", short_name="bv", default="both",
-      enum_values=["both", "uncased", "cased"], case_sensitive=False,
+      name="bleu_variant",
+      short_name="bv",
+      default="both",
+      enum_values=["both", "uncased", "cased"],
+      case_sensitive=False,
       help=flags_core.help_wrap(
          "Specify one or more BLEU variants to calculate. Variants: \"cased\""
          ", \"uncased\", or \"both\"."))
...
@@ -280,7 +280,7 @@ class TransformerTask(object):
       for _ in tf.range(steps):
         train_loss_metric.reset_states()
-        self.distribution_strategy.experimental_run_v2(
+        self.distribution_strategy.run(
             _step_fn, args=(next(iterator),))
     cased_score, uncased_score = None, None
...
@@ -132,7 +132,7 @@ def translate_file(model,
       val_outputs, _ = model([val_inputs], training=False)
       return tag, val_outputs
-    return distribution_strategy.experimental_run_v2(_step_fn, args=(inputs,))
+    return distribution_strategy.run(_step_fn, args=(inputs,))
   translations = []
   if distribution_strategy:
@@ -151,7 +151,7 @@ def translate_file(model,
         replica_id = replica_context.replica_id_in_sync_group
         return replica_id, text[replica_id]
-      text = distribution_strategy.experimental_run_v2(text_as_per_replica)
+      text = distribution_strategy.run(text_as_per_replica)
       outputs = distribution_strategy.experimental_local_results(
           predict_step(text))
       tags, unordered_val_outputs = outputs[0]
...
...@@ -29,6 +29,8 @@ import six ...@@ -29,6 +29,8 @@ import six
from six.moves import xrange # pylint: disable=redefined-builtin from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow as tf import tensorflow as tf
# pylint: disable=g-complex-comprehension
PAD = "<pad>" PAD = "<pad>"
PAD_ID = 0 PAD_ID = 0
EOS = "<EOS>" EOS = "<EOS>"
...@@ -46,27 +48,36 @@ _UNESCAPE_REGEX = re.compile(r"\\u|\\\\|\\([0-9]+);") ...@@ -46,27 +48,36 @@ _UNESCAPE_REGEX = re.compile(r"\\u|\\\\|\\([0-9]+);")
_UNDEFINED_UNICODE = u"\u3013" _UNDEFINED_UNICODE = u"\u3013"
def alphanumeric_char_set():
return set(
six.unichr(i)
for i in xrange(sys.maxunicode)
if (unicodedata.category(six.unichr(i)).startswith("L") or
unicodedata.category(six.unichr(i)).startswith("N")))
# Set contains all letter and number characters. # Set contains all letter and number characters.
_ALPHANUMERIC_CHAR_SET = set( _ALPHANUMERIC_CHAR_SET = alphanumeric_char_set()
six.unichr(i) for i in xrange(sys.maxunicode)
if (unicodedata.category(six.unichr(i)).startswith("L") or
unicodedata.category(six.unichr(i)).startswith("N")))
# min_count is the minimum number of times a subtoken must appear in the data # min_count is the minimum number of times a subtoken must appear in the data
# before before it is added to the vocabulary. The value is found using binary # before before it is added to the vocabulary. The value is found using binary
# search to obtain the target vocabulary size. # search to obtain the target vocabulary size.
_MIN_MIN_COUNT = 1 # min value to use when binary searching for min_count _MIN_MIN_COUNT = 1 # min value to use when binary searching for min_count
_MAX_MIN_COUNT = 1000 # max value to use when binary searching for min_count _MAX_MIN_COUNT = 1000 # max value to use when binary searching for min_count
class Subtokenizer(object): class Subtokenizer(object):
"""Encodes and decodes strings to/from integer IDs.""" """Encodes and decodes strings to/from integer IDs."""
def __init__(self, vocab_file, reserved_tokens=None): def __init__(self, vocab_file, reserved_tokens=None, master_char_set=None):
"""Initializes class, creating a vocab file if data_files is provided.""" """Initializes class, creating a vocab file if data_files is provided."""
logging.info("Initializing Subtokenizer from file %s." % logging.info("Initializing Subtokenizer from file %s." %
vocab_file) vocab_file)
if master_char_set is None:
master_char_set = _ALPHANUMERIC_CHAR_SET
if reserved_tokens is None: if reserved_tokens is None:
reserved_tokens = RESERVED_TOKENS reserved_tokens = RESERVED_TOKENS
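A hedged usage sketch of the new `master_char_set` hook introduced above; the import path and vocab file name are assumptions, not part of this diff. By default behaviour is unchanged, since `master_char_set` falls back to `_ALPHANUMERIC_CHAR_SET` inside `__init__`.

```python
# Import path assumed; adjust to wherever this module lives in your checkout.
from official.nlp.transformer.utils.tokenizer import (Subtokenizer,
                                                      alphanumeric_char_set)

# Default behaviour: letters and numbers are the "word" characters.
default_tokenizer = Subtokenizer("vocab.ende.32768")  # hypothetical vocab file

# Callers can widen the word-character set, e.g. to keep apostrophes attached
# to alphanumeric runs when text is split into tokens.
custom_chars = alphanumeric_char_set() | {u"'"}
custom_tokenizer = Subtokenizer(
    "vocab.ende.32768", master_char_set=custom_chars)
```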
@@ -79,13 +90,20 @@ class Subtokenizer(object):
       self.max_subtoken_length = max(self.max_subtoken_length, len(subtoken))
 
     # Create cache to speed up subtokenization
-    self._cache_size = 2 ** 20
+    self._cache_size = 2**20
     self._cache = [(None, None)] * self._cache_size
+    self._master_char_set = master_char_set
 
   @staticmethod
-  def init_from_files(
-      vocab_file, files, target_vocab_size, threshold, min_count=None,
-      file_byte_limit=1e6, reserved_tokens=None, correct_strip=True):
+  def init_from_files(vocab_file,
+                      files,
+                      target_vocab_size,
+                      threshold,
+                      min_count=None,
+                      file_byte_limit=1e6,
+                      reserved_tokens=None,
+                      correct_strip=True,
+                      master_char_set=None):
     """Create subtoken vocabulary based on files, and save vocab to file.
 
     Args:
@@ -102,10 +120,13 @@ class Subtokenizer(object):
       reserved_tokens: List of string tokens that are guaranteed to be at the
         beginning of the subtoken vocabulary list.
       correct_strip: Whether to convert text to unicode before strip.
+      master_char_set: the char set.
 
     Returns:
       Subtokenizer object
     """
+    if master_char_set is None:
+      master_char_set = _ALPHANUMERIC_CHAR_SET
     if reserved_tokens is None:
       reserved_tokens = RESERVED_TOKENS
@@ -113,7 +134,8 @@ class Subtokenizer(object):
       logging.info("Vocab file already exists (%s)" % vocab_file)
     else:
       logging.info("Begin steps to create subtoken vocabulary...")
-      token_counts = _count_tokens(files, file_byte_limit, correct_strip)
+      token_counts = _count_tokens(files, file_byte_limit, correct_strip,
+                                   master_char_set)
       alphabet = _generate_alphabet_dict(token_counts)
       subtoken_list = _generate_subtokens_with_target_vocab_size(
           token_counts, alphabet, target_vocab_size, threshold, min_count,
@@ -121,15 +143,18 @@ class Subtokenizer(object):
       logging.info("Generated vocabulary with %d subtokens." %
                    len(subtoken_list))
       _save_vocab_file(vocab_file, subtoken_list)
-    return Subtokenizer(vocab_file)
+    return Subtokenizer(vocab_file, master_char_set=master_char_set)
 
   def encode(self, raw_string, add_eos=False):
     """Encodes a string into a list of int subtoken ids."""
     ret = []
-    tokens = _split_string_to_tokens(native_to_unicode(raw_string))
+    tokens = _split_string_to_tokens(
+        native_to_unicode(raw_string), self._master_char_set)
     for token in tokens:
      ret.extend(self._token_to_subtoken_ids(token))
     if add_eos:
+      assert EOS in self.subtoken_list, \
+          "Can't append 'EOS' because it is not in list of known subtokens."
      ret.append(EOS_ID)
    return ret
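A hedged end-to-end sketch of the `init_from_files` / `encode` path touched above; the import path, file names, and vocab-size values are illustrative assumptions. With `add_eos=True`, `encode()` now asserts that `EOS` is actually present in the loaded vocabulary before appending `EOS_ID`.

```python
# Import path assumed, as in the earlier sketch.
from official.nlp.transformer.utils import tokenizer

# Builds (or reloads) a subword vocabulary from hypothetical training files.
subtokenizer = tokenizer.Subtokenizer.init_from_files(
    vocab_file="vocab.ende.32768",
    files=["train.en", "train.de"],
    target_vocab_size=32768,
    threshold=327)

ids = subtokenizer.encode("Hello world", add_eos=True)
assert ids[-1] == tokenizer.EOS_ID  # would raise earlier if EOS were missing
print(subtokenizer.decode(ids[:-1]))  # expected to round-trip: "Hello world"
```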
@@ -162,13 +187,14 @@ class Subtokenizer(object):
           "Subtokens argument passed into decode() must be a list of integers.")
 
     return _unicode_to_native(
-        _join_tokens_to_string(self._subtoken_ids_to_tokens(subtokens)))
+        _join_tokens_to_string(
+            self._subtoken_ids_to_tokens(subtokens), self._master_char_set))
 
   def _subtoken_ids_to_tokens(self, subtokens):
     """Convert list of int subtoken ids to a list of string tokens."""
     escaped_tokens = "".join([
-        self.subtoken_list[s] for s in subtokens
-        if s < len(self.subtoken_list)])
+        self.subtoken_list[s] for s in subtokens if s < len(self.subtoken_list)
+    ])
     escaped_tokens = escaped_tokens.split("_")
 
     # All tokens in the vocabulary list have been escaped (see _escape_token())
@@ -205,7 +231,7 @@ def _load_vocab_file(vocab_file, reserved_tokens=None):
 def native_to_unicode(s):
   """Convert string to unicode (required in Python 2)."""
   try:  # Python 2
     return s if isinstance(s, unicode) else s.decode("utf-8")
   except NameError:  # Python 3
     return s
@@ -213,22 +239,22 @@ def native_to_unicode(s):
 def _unicode_to_native(s):
   """Convert string from unicode to native format (required in Python 2)."""
   try:  # Python 2
     return s.encode("utf-8") if isinstance(s, unicode) else s
   except NameError:  # Python 3
     return s
 
 
-def _split_string_to_tokens(text):
+def _split_string_to_tokens(text, master_char_set):
   """Splits text to a list of string tokens."""
   if not text:
     return []
   ret = []
   token_start = 0
   # Classify each character in the input string
-  is_alnum = [c in _ALPHANUMERIC_CHAR_SET for c in text]
+  is_master = [c in master_char_set for c in text]
   for pos in xrange(1, len(text)):
-    if is_alnum[pos] != is_alnum[pos - 1]:
+    if is_master[pos] != is_master[pos - 1]:
       token = text[token_start:pos]
       if token != u" " or token_start == 0:
         ret.append(token)
@@ -238,12 +264,12 @@ def _split_string_to_tokens(text):
   return ret
 
 
-def _join_tokens_to_string(tokens):
+def _join_tokens_to_string(tokens, master_char_set):
   """Join a list of string tokens into a single string."""
-  token_is_alnum = [t[0] in _ALPHANUMERIC_CHAR_SET for t in tokens]
+  token_is_master = [t[0] in master_char_set for t in tokens]
   ret = []
   for i, token in enumerate(tokens):
-    if i > 0 and token_is_alnum[i - 1] and token_is_alnum[i]:
+    if i > 0 and token_is_master[i - 1] and token_is_master[i]:
       ret.append(u" ")
     ret.append(token)
   return "".join(ret)
@@ -325,7 +351,10 @@ def _unescape_token(token):
   return _UNESCAPE_REGEX.sub(match, token)
 
 
-def _count_tokens(files, file_byte_limit=1e6, correct_strip=True):
+def _count_tokens(files,
+                  file_byte_limit=1e6,
+                  correct_strip=True,
+                  master_char_set=None):
   """Return token counts of words in the files.
 
   Samples file_byte_limit bytes from each file, and counts the words that appear
@@ -338,11 +367,15 @@ def _count_tokens(files, file_byte_limit=1e6, correct_strip=True):
       vocabulary generation for PY2. Sets correct_strip to False in PY2 to
       reproduce previous common public result. Sets correct_strip to True will
       let PY2 and PY3 get a consistent vocabulary.
+    master_char_set: the char set.
 
   Returns:
     Dictionary mapping tokens to the number of times they appear in the sampled
     lines from the files.
   """
+  if master_char_set is None:
+    master_char_set = _ALPHANUMERIC_CHAR_SET
+
   token_counts = collections.defaultdict(int)
 
   for filepath in files:
@@ -363,7 +396,8 @@ def _count_tokens(files, file_byte_limit=1e6, correct_strip=True):
           counter = 0
 
           # Add words to token counts
-          for token in _split_string_to_tokens(native_to_unicode(line)):
+          for token in _split_string_to_tokens(
+              native_to_unicode(line), master_char_set):
             token_counts[token] += 1
   return token_counts
 
 
@@ -395,9 +429,12 @@ def _split_token_to_subtokens(token, subtoken_dict, max_subtoken_length):
   return ret
 
 
-def _generate_subtokens_with_target_vocab_size(
-    token_counts, alphabet, target_size, threshold, min_count=None,
-    reserved_tokens=None):
+def _generate_subtokens_with_target_vocab_size(token_counts,
+                                               alphabet,
+                                               target_size,
+                                               threshold,
+                                               min_count=None,
+                                               reserved_tokens=None):
   """Generate subtoken vocabulary close to the target size."""
   if reserved_tokens is None:
     reserved_tokens = RESERVED_TOKENS
@@ -450,8 +487,8 @@ def _generate_alphabet_dict(iterable, reserved_tokens=None):
   return alphabet
 
 
-def _count_and_gen_subtokens(
-    token_counts, alphabet, subtoken_dict, max_subtoken_length):
+def _count_and_gen_subtokens(token_counts, alphabet, subtoken_dict,
+                             max_subtoken_length):
   """Count number of times subtokens appear, and generate new subtokens.
 
   Args:
@@ -469,8 +506,8 @@ def _count_and_gen_subtokens(
   subtoken_counts = collections.defaultdict(int)
   for token, count in six.iteritems(token_counts):
     token = _escape_token(token, alphabet)
-    subtokens = _split_token_to_subtokens(
-        token, subtoken_dict, max_subtoken_length)
+    subtokens = _split_token_to_subtokens(token, subtoken_dict,
+                                          max_subtoken_length)
 
     # Generate new subtokens by taking substrings from token.
     start = 0
@@ -504,8 +541,10 @@ def _filter_and_bucket_subtokens(subtoken_counts, min_count):
   return subtoken_buckets
 
 
-def _gen_new_subtoken_list(
-    subtoken_counts, min_count, alphabet, reserved_tokens=None):
+def _gen_new_subtoken_list(subtoken_counts,
+                           min_count,
+                           alphabet,
+                           reserved_tokens=None):
   """Generate candidate subtokens ordered by count, and new max subtoken length.
 
   Add subtokens to the candiate list in order of length (longest subtokens
@@ -576,9 +615,11 @@ def _gen_new_subtoken_list(
   return subtoken_list, max_subtoken_length
 
 
-def _generate_subtokens(
-    token_counts, alphabet, min_count, num_iterations=4,
-    reserved_tokens=None):
+def _generate_subtokens(token_counts,
+                        alphabet,
+                        min_count,
+                        num_iterations=4,
+                        reserved_tokens=None):
   """Create a list of subtokens in decreasing order of frequency.
 
   Args:
@@ -610,8 +651,9 @@ def _generate_subtokens(
     # Create dict mapping subtoken->count, with additional subtokens created
     # from substrings taken from the tokens.
-    subtoken_counts = _count_and_gen_subtokens(
-        token_counts, alphabet, subtoken_dict, max_subtoken_length)
+    subtoken_counts = _count_and_gen_subtokens(token_counts, alphabet,
+                                               subtoken_dict,
+                                               max_subtoken_length)
 
     # Generate new list of subtokens sorted by subtoken count.
     subtoken_list, max_subtoken_length = _gen_new_subtoken_list(
...