Merge pull request #1 from tensorflow/master

new pull

Merge pull request #1 from tensorflow/master
new pull
f16a7b5b · vedanshu · GitHub · 8e9296ff · 8f58f396 · 8e9296ff
Unverified Commit f16a7b5b authored May 04, 2021 by vedanshu Committed by GitHub May 04, 2021
20 changed files
--- a/official/benchmark/models/cifar_preprocessing.py
+++ b/official/benchmark/models/cifar_preprocessing.py
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Provides utilities to Cifar-10 dataset."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-from absl import logging
-import tensorflow as tf
-
-from official.vision.image_classification.resnet import imagenet_preprocessing
-
-HEIGHT = 32
-WIDTH = 32
-NUM_CHANNELS = 3
-_DEFAULT_IMAGE_BYTES = HEIGHT * WIDTH * NUM_CHANNELS
-# The record is the image plus a one-byte label
-_RECORD_BYTES = _DEFAULT_IMAGE_BYTES + 1
-
-# TODO(tobyboyd): Change to best practice 45K(train)/5K(val)/10K(test) splits.
-NUM_IMAGES = {
-    'train': 50000,
-    'validation': 10000,
-}
-_NUM_DATA_FILES = 5
-NUM_CLASSES = 10
-
-
-def parse_record(raw_record, is_training, dtype):
-  """Parses a record containing a training example of an image.
-
-  The input record is parsed into a label and image, and the image is passed
-  through preprocessing steps (cropping, flipping, and so on).
-
-  This method converts the label to one hot to fit the loss function.
-
-  Args:
-    raw_record: scalar Tensor tf.string containing a serialized
-      Example protocol buffer.
-    is_training: A boolean denoting whether the input is for training.
-    dtype: Data type to use for input images.
-
-  Returns:
-    Tuple with processed image tensor and one-hot-encoded label tensor.
-  """
-  # Convert bytes to a vector of uint8 that is record_bytes long.
-  record_vector = tf.io.decode_raw(raw_record, tf.uint8)
-
-  # The first byte represents the label, which we convert from uint8 to int32
-  # and then to one-hot.
-  label = tf.cast(record_vector[0], tf.int32)
-
-  # The remaining bytes after the label represent the image, which we reshape
-  # from [depth * height * width] to [depth, height, width].
-  depth_major = tf.reshape(record_vector[1:_RECORD_BYTES],
-                           [NUM_CHANNELS, HEIGHT, WIDTH])
-
-  # Convert from [depth, height, width] to [height, width, depth], and cast as
-  # float32.
-  image = tf.cast(tf.transpose(a=depth_major, perm=[1, 2, 0]), tf.float32)
-
-  image = preprocess_image(image, is_training)
-  image = tf.cast(image, dtype)
-
-  return image, label
-
-
-def preprocess_image(image, is_training):
-  """Preprocess a single image of layout [height, width, depth]."""
-  if is_training:
-    # Resize the image to add four extra pixels on each side.
-    image = tf.image.resize_with_crop_or_pad(
-        image, HEIGHT + 8, WIDTH + 8)
-
-    # Randomly crop a [HEIGHT, WIDTH] section of the image.
-    image = tf.image.random_crop(image, [HEIGHT, WIDTH, NUM_CHANNELS])
-
-    # Randomly flip the image horizontally.
-    image = tf.image.random_flip_left_right(image)
-
-  # Subtract off the mean and divide by the variance of the pixels.
-  image = tf.image.per_image_standardization(image)
-  return image
-
-
-def get_filenames(is_training, data_dir):
-  """Returns a list of filenames."""
-  assert tf.io.gfile.exists(data_dir), (
-      'Run cifar10_download_and_extract.py first to download and extract the '
-      'CIFAR-10 data.')
-
-  if is_training:
-    return [
-        os.path.join(data_dir, 'data_batch_%d.bin' % i)
-        for i in range(1, _NUM_DATA_FILES + 1)
-    ]
-  else:
-    return [os.path.join(data_dir, 'test_batch.bin')]
-
-
-def input_fn(is_training,
-             data_dir,
-             batch_size,
-             dtype=tf.float32,
-             datasets_num_private_threads=None,
-             parse_record_fn=parse_record,
-             input_context=None,
-             drop_remainder=False):
-  """Input function which provides batches for train or eval.
-
-  Args:
-    is_training: A boolean denoting whether the input is for training.
-    data_dir: The directory containing the input data.
-    batch_size: The number of samples per batch.
-    dtype: Data type to use for images/features
-    datasets_num_private_threads: Number of private threads for tf.data.
-    parse_record_fn: Function to use for parsing the records.
-    input_context: A `tf.distribute.InputContext` object passed in by
-      `tf.distribute.Strategy`.
-    drop_remainder: A boolean indicates whether to drop the remainder of the
-      batches. If True, the batch dimension will be static.
-
-  Returns:
-    A dataset that can be used for iteration.
-  """
-  filenames = get_filenames(is_training, data_dir)
-  dataset = tf.data.FixedLengthRecordDataset(filenames, _RECORD_BYTES)
-
-  if input_context:
-    logging.info(
-        'Sharding the dataset: input_pipeline_id=%d num_input_pipelines=%d',
-        input_context.input_pipeline_id, input_context.num_input_pipelines)
-    dataset = dataset.shard(input_context.num_input_pipelines,
-                            input_context.input_pipeline_id)
-
-  return imagenet_preprocessing.process_record_dataset(
-      dataset=dataset,
-      is_training=is_training,
-      batch_size=batch_size,
-      shuffle_buffer=NUM_IMAGES['train'],
-      parse_record_fn=parse_record_fn,
-      dtype=dtype,
-      datasets_num_private_threads=datasets_num_private_threads,
-      drop_remainder=drop_remainder
-  )
--- a/official/benchmark/models/resnet_cifar_main.py
+++ b/official/benchmark/models/resnet_cifar_main.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Runs a ResNet model on the Cifar-10 dataset."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from absl import app
-from absl import flags
-from absl import logging
-import numpy as np
-import tensorflow as tf
-from official.benchmark.models import cifar_preprocessing
-from official.benchmark.models import resnet_cifar_model
-from official.benchmark.models import synthetic_util
-from official.utils.flags import core as flags_core
-from official.utils.misc import distribution_utils
-from official.utils.misc import keras_utils
-from official.vision.image_classification.resnet import common
-
-
-LR_SCHEDULE = [  # (multiplier, epoch to start) tuples
-    (0.1, 91), (0.01, 136), (0.001, 182)
-]
-
-
-def learning_rate_schedule(current_epoch,
-                           current_batch,
-                           batches_per_epoch,
-                           batch_size):
-  """Handles linear scaling rule and LR decay.
-
-  Scale learning rate at epoch boundaries provided in LR_SCHEDULE by the
-  provided scaling factor.
-
-  Args:
-    current_epoch: integer, current epoch indexed from 0.
-    current_batch: integer, current batch in the current epoch, indexed from 0.
-    batches_per_epoch: integer, number of steps in an epoch.
-    batch_size: integer, total batch sized.
-
-  Returns:
-    Adjusted learning rate.
-  """
-  del current_batch, batches_per_epoch  # not used
-  initial_learning_rate = common.BASE_LEARNING_RATE * batch_size / 128
-  learning_rate = initial_learning_rate
-  for mult, start_epoch in LR_SCHEDULE:
-    if current_epoch >= start_epoch:
-      learning_rate = initial_learning_rate * mult
-    else:
-      break
-  return learning_rate
-
-
-class LearningRateBatchScheduler(tf.keras.callbacks.Callback):
-  """Callback to update learning rate on every batch (not epoch boundaries).
-
-  N.B. Only support Keras optimizers, not TF optimizers.
-
-  Attributes:
-      schedule: a function that takes an epoch index and a batch index as input
-          (both integer, indexed from 0) and returns a new learning rate as
-          output (float).
-  """
-
-  def __init__(self, schedule, batch_size, steps_per_epoch):
-    super(LearningRateBatchScheduler, self).__init__()
-    self.schedule = schedule
-    self.steps_per_epoch = steps_per_epoch
-    self.batch_size = batch_size
-    self.epochs = -1
-    self.prev_lr = -1
-
-  def on_epoch_begin(self, epoch, logs=None):
-    if not hasattr(self.model.optimizer, 'learning_rate'):
-      raise ValueError('Optimizer must have a "learning_rate" attribute.')
-    self.epochs += 1
-
-  def on_batch_begin(self, batch, logs=None):
-    """Executes before step begins."""
-    lr = self.schedule(self.epochs,
-                       batch,
-                       self.steps_per_epoch,
-                       self.batch_size)
-    if not isinstance(lr, (float, np.float32, np.float64)):
-      raise ValueError('The output of the "schedule" function should be float.')
-    if lr != self.prev_lr:
-      self.model.optimizer.learning_rate = lr  # lr should be a float here
-      self.prev_lr = lr
-      logging.debug(
-          'Epoch %05d Batch %05d: LearningRateBatchScheduler '
-          'change learning rate to %s.', self.epochs, batch, lr)
-
-
-def run(flags_obj):
-  """Run ResNet Cifar-10 training and eval loop using native Keras APIs.
-
-  Args:
-    flags_obj: An object containing parsed flag values.
-
-  Raises:
-    ValueError: If fp16 is passed as it is not currently supported.
-
-  Returns:
-    Dictionary of training and eval stats.
-  """
-  keras_utils.set_session_config(
-      enable_xla=flags_obj.enable_xla)
-
-  # Execute flag override logic for better model performance
-  if flags_obj.tf_gpu_thread_mode:
-    keras_utils.set_gpu_thread_mode_and_count(
-        per_gpu_thread_count=flags_obj.per_gpu_thread_count,
-        gpu_thread_mode=flags_obj.tf_gpu_thread_mode,
-        num_gpus=flags_obj.num_gpus,
-        datasets_num_private_threads=flags_obj.datasets_num_private_threads)
-  common.set_cudnn_batchnorm_mode()
-
-  dtype = flags_core.get_tf_dtype(flags_obj)
-  if dtype == 'fp16':
-    raise ValueError('dtype fp16 is not supported in Keras. Use the default '
-                     'value(fp32).')
-
-  data_format = flags_obj.data_format
-  if data_format is None:
-    data_format = ('channels_first' if tf.config.list_physical_devices('GPU')
-                   else 'channels_last')
-  tf.keras.backend.set_image_data_format(data_format)
-
-  strategy = distribution_utils.get_distribution_strategy(
-      distribution_strategy=flags_obj.distribution_strategy,
-      num_gpus=flags_obj.num_gpus,
-      all_reduce_alg=flags_obj.all_reduce_alg,
-      num_packs=flags_obj.num_packs)
-
-  if strategy:
-    # flags_obj.enable_get_next_as_optional controls whether enabling
-    # get_next_as_optional behavior in DistributedIterator. If true, last
-    # partial batch can be supported.
-    strategy.extended.experimental_enable_get_next_as_optional = (
-        flags_obj.enable_get_next_as_optional
-    )
-
-  strategy_scope = distribution_utils.get_strategy_scope(strategy)
-
-  if flags_obj.use_synthetic_data:
-    synthetic_util.set_up_synthetic_data()
-    input_fn = common.get_synth_input_fn(
-        height=cifar_preprocessing.HEIGHT,
-        width=cifar_preprocessing.WIDTH,
-        num_channels=cifar_preprocessing.NUM_CHANNELS,
-        num_classes=cifar_preprocessing.NUM_CLASSES,
-        dtype=flags_core.get_tf_dtype(flags_obj),
-        drop_remainder=True)
-  else:
-    synthetic_util.undo_set_up_synthetic_data()
-    input_fn = cifar_preprocessing.input_fn
-
-  train_input_dataset = input_fn(
-      is_training=True,
-      data_dir=flags_obj.data_dir,
-      batch_size=flags_obj.batch_size,
-      parse_record_fn=cifar_preprocessing.parse_record,
-      datasets_num_private_threads=flags_obj.datasets_num_private_threads,
-      dtype=dtype,
-      # Setting drop_remainder to avoid the partial batch logic in normalization
-      # layer, which triggers tf.where and leads to extra memory copy of input
-      # sizes between host and GPU.
-      drop_remainder=(not flags_obj.enable_get_next_as_optional))
-
-  eval_input_dataset = None
-  if not flags_obj.skip_eval:
-    eval_input_dataset = input_fn(
-        is_training=False,
-        data_dir=flags_obj.data_dir,
-        batch_size=flags_obj.batch_size,
-        parse_record_fn=cifar_preprocessing.parse_record)
-
-  steps_per_epoch = (
-      cifar_preprocessing.NUM_IMAGES['train'] // flags_obj.batch_size)
-  lr_schedule = 0.1
-  if flags_obj.use_tensor_lr:
-    initial_learning_rate = common.BASE_LEARNING_RATE * flags_obj.batch_size / 128
-    lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
-        boundaries=list(p[1] * steps_per_epoch for p in LR_SCHEDULE),
-        values=[initial_learning_rate] +
-        list(p[0] * initial_learning_rate for p in LR_SCHEDULE))
-
-  with strategy_scope:
-    optimizer = common.get_optimizer(lr_schedule)
-    model = resnet_cifar_model.resnet56(classes=cifar_preprocessing.NUM_CLASSES)
-    model.compile(
-        loss='sparse_categorical_crossentropy',
-        optimizer=optimizer,
-        metrics=(['sparse_categorical_accuracy']
-                 if flags_obj.report_accuracy_metrics else None),
-        run_eagerly=flags_obj.run_eagerly)
-
-  train_epochs = flags_obj.train_epochs
-
-  callbacks = common.get_callbacks()
-
-  if not flags_obj.use_tensor_lr:
-    lr_callback = LearningRateBatchScheduler(
-        schedule=learning_rate_schedule,
-        batch_size=flags_obj.batch_size,
-        steps_per_epoch=steps_per_epoch)
-    callbacks.append(lr_callback)
-
-  # if mutliple epochs, ignore the train_steps flag.
-  if train_epochs <= 1 and flags_obj.train_steps:
-    steps_per_epoch = min(flags_obj.train_steps, steps_per_epoch)
-    train_epochs = 1
-
-  num_eval_steps = (cifar_preprocessing.NUM_IMAGES['validation'] //
-                    flags_obj.batch_size)
-
-  validation_data = eval_input_dataset
-  if flags_obj.skip_eval:
-    if flags_obj.set_learning_phase_to_train:
-      # TODO(haoyuzhang): Understand slowdown of setting learning phase when
-      # not using distribution strategy.
-      tf.keras.backend.set_learning_phase(1)
-    num_eval_steps = None
-    validation_data = None
-
-  if not strategy and flags_obj.explicit_gpu_placement:
-    # TODO(b/135607227): Add device scope automatically in Keras training loop
-    # when not using distribition strategy.
-    no_dist_strat_device = tf.device('/device:GPU:0')
-    no_dist_strat_device.__enter__()
-
-  history = model.fit(train_input_dataset,
-                      epochs=train_epochs,
-                      steps_per_epoch=steps_per_epoch,
-                      callbacks=callbacks,
-                      validation_steps=num_eval_steps,
-                      validation_data=validation_data,
-                      validation_freq=flags_obj.epochs_between_evals,
-                      verbose=2)
-  eval_output = None
-  if not flags_obj.skip_eval:
-    eval_output = model.evaluate(eval_input_dataset,
-                                 steps=num_eval_steps,
-                                 verbose=2)
-
-  if not strategy and flags_obj.explicit_gpu_placement:
-    no_dist_strat_device.__exit__()
-
-  stats = common.build_stats(history, eval_output, callbacks)
-  return stats
-
-
-def define_cifar_flags():
-  common.define_keras_flags(dynamic_loss_scale=False)
-
-  flags_core.set_defaults(data_dir='/tmp/cifar10_data/cifar-10-batches-bin',
-                          model_dir='/tmp/cifar10_model',
-                          epochs_between_evals=10,
-                          batch_size=128)
-
-
-def main(_):
-  return run(flags.FLAGS)
-
-
-if __name__ == '__main__':
-  logging.set_verbosity(logging.INFO)
-  define_cifar_flags()
-  app.run(main)
--- a/official/benchmark/models/resnet_cifar_model.py
+++ b/official/benchmark/models/resnet_cifar_model.py
--- a/official/benchmark/models/resnet_cifar_test.py
+++ b/official/benchmark/models/resnet_cifar_test.py
--- a/official/benchmark/models/resnet_imagenet_main.py
+++ b/official/benchmark/models/resnet_imagenet_main.py
--- a/official/benchmark/models/resnet_imagenet_test.py
+++ b/official/benchmark/models/resnet_imagenet_test.py
--- a/official/benchmark/models/resnet_imagenet_test_tpu.py
+++ b/official/benchmark/models/resnet_imagenet_test_tpu.py
--- a/official/benchmark/models/shakespeare/README.md
+++ b/official/benchmark/models/shakespeare/README.md
-# Shakespeare character LSTM model
-
-This is an implemention of a simple character LSTM used to generate text.
-
-## Instructions
-
-First download the source data:
-
-```
-wget https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
-```
-
-Note that files other than shakepeare.txt can also be used to train the model to generater other text.
-
-Then train the model:
-
-```python
-python3 shakespeare_main.py --training_data shakespeare.txt \
-    --model_dir /tmp/shakespeare
-```
-
-This will place model checkpoints in `/tmp/shakespeare`, so that we can use them to make predictions.
-
-Then generate predictions:
-
-```python
-python3 shakespeare_main.py --training_data shakespeare.txt \
-    --model_dir /tmp/shakespeare --notrain --predict_context=ROMEO:
-```
-
-Change `--predict_context` and `--predict_length` to suit your needs.
--- a/official/benchmark/models/shakespeare/__init__.py
+++ b/official/benchmark/models/shakespeare/__init__.py
-
--- a/official/benchmark/models/shakespeare/shakespeare_main.py
+++ b/official/benchmark/models/shakespeare/shakespeare_main.py
--- a/official/benchmark/models/synthetic_util.py
+++ b/official/benchmark/models/synthetic_util.py
--- a/official/benchmark/ncf_keras_benchmark.py
+++ b/official/benchmark/ncf_keras_benchmark.py
--- a/official/benchmark/nhnet_benchmark.py
+++ b/official/benchmark/nhnet_benchmark.py
--- a/official/benchmark/owner_utils.py
+++ b/official/benchmark/owner_utils.py
--- a/official/benchmark/owner_utils_test.py
+++ b/official/benchmark/owner_utils_test.py
--- a/official/benchmark/perfzero_benchmark.py
+++ b/official/benchmark/perfzero_benchmark.py
--- a/official/benchmark/resnet_ctl_imagenet_benchmark.py
+++ b/official/benchmark/resnet_ctl_imagenet_benchmark.py
--- a/official/benchmark/retinanet_benchmark.py
+++ b/official/benchmark/retinanet_benchmark.py
--- a/official/benchmark/shakespeare_benchmark.py
+++ b/official/benchmark/shakespeare_benchmark.py
--- a/official/benchmark/tfhub_memory_usage_benchmark.py
+++ b/official/benchmark/tfhub_memory_usage_benchmark.py