# Lint as: python3
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Runs an Image Classification model."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import pprint
from typing import Any, Tuple, Text, Optional, Mapping

from absl import app
from absl import flags
from absl import logging
import tensorflow as tf

from official.modeling import performance
from official.modeling.hyperparams import params_dict
from official.utils import hyperparams_flags
from official.utils.logs import logger
from official.utils.misc import distribution_utils
from official.utils.misc import keras_utils
from official.vision.image_classification import callbacks as custom_callbacks
from official.vision.image_classification import dataset_factory
from official.vision.image_classification import optimizer_factory
from official.vision.image_classification.configs import base_configs
from official.vision.image_classification.configs import configs
from official.vision.image_classification.efficientnet import efficientnet_model
from official.vision.image_classification.resnet import common
from official.vision.image_classification.resnet import resnet_model


def get_models() -> Mapping[str, tf.keras.Model]:
  """Returns the mapping from model type name to Keras model."""
  return {
      'efficientnet': efficientnet_model.EfficientNet.from_name,
      'resnet': resnet_model.resnet50,
  }


def get_dtype_map() -> Mapping[str, tf.dtypes.DType]:
  """Returns the mapping from dtype string representations to TF dtypes."""
  return {
      'float32': tf.float32,
      'bfloat16': tf.bfloat16,
      'float16': tf.float16,
      'fp32': tf.float32,
      'bf16': tf.bfloat16,
      'fp16': tf.float16,
  }
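
# Example (illustrative): both spellings of a dtype resolve to the same TF
# type, e.g. get_dtype_map()['bf16'] is get_dtype_map()['bfloat16'].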


def _get_metrics(one_hot: bool) -> Mapping[Text, Any]:
  """Get a dict of available metrics to track."""
  if one_hot:
    return {
        # (name, metric_fn)
        'acc': tf.keras.metrics.CategoricalAccuracy(name='accuracy'),
        'accuracy': tf.keras.metrics.CategoricalAccuracy(name='accuracy'),
        'top_1': tf.keras.metrics.CategoricalAccuracy(name='accuracy'),
        'top_5': tf.keras.metrics.TopKCategoricalAccuracy(
            k=5,
            name='top_5_accuracy'),
    }
  else:
    return {
        # (name, metric_fn)
        'acc': tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy'),
        'accuracy': tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy'),
        'top_1': tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy'),
        'top_5': tf.keras.metrics.SparseTopKCategoricalAccuracy(
            k=5,
            name='top_5_accuracy'),
    }
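
# The training loop selects metrics from this map by name, e.g. with
# params.train.metrics = ['accuracy', 'top_5'] (an illustrative config value),
# the compiled model tracks top-1 and top-5 accuracy.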


def get_image_size_from_model(
    params: base_configs.ExperimentConfig) -> Optional[int]:
  """If the given model has a preferred image size, return it."""
  if params.model_name == 'efficientnet':
    efficientnet_name = params.model.model_params.model_name
    if efficientnet_name in efficientnet_model.MODEL_CONFIGS:
      return efficientnet_model.MODEL_CONFIGS[efficientnet_name].resolution
  return None
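
# Example (illustrative): for an 'efficientnet' model with
# model_params.model_name == 'efficientnet-b0', this returns the B0 training
# resolution (224 in the reference configs); for ResNet it returns None.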


def _get_dataset_builders(params: base_configs.ExperimentConfig,
                          strategy: tf.distribute.Strategy,
                          one_hot: bool
                         ) -> Tuple[Any, Any]:
  """Creates and returns the train and validation dataset builders."""
  if one_hot:
    logging.warning('label_smoothing > 0, so datasets will be one-hot '
                    'encoded.')
  else:
    logging.warning('label_smoothing not applied, so datasets will not be '
                    'one-hot encoded.')

  num_devices = strategy.num_replicas_in_sync if strategy else 1

  image_size = get_image_size_from_model(params)

  dataset_configs = [
      params.train_dataset, params.validation_dataset
  ]
  builders = []

  for config in dataset_configs:
    if config is not None and config.has_data:
      builder = dataset_factory.DatasetBuilder(
          config,
          image_size=image_size or config.image_size,
          num_devices=num_devices,
          one_hot=one_hot)
    else:
      builder = None
    builders.append(builder)

  return builders


def get_loss_scale(params: base_configs.ExperimentConfig,
                   fp16_default: float = 128.) -> Any:
  """Returns the loss scale: 'dynamic', an explicit value, or a default."""
  loss_scale = params.runtime.loss_scale
  if loss_scale == 'dynamic':
    return loss_scale
  elif loss_scale is not None:
    return float(loss_scale)
  elif (params.train_dataset.dtype == 'float32' or
        params.train_dataset.dtype == 'bfloat16'):
    return 1.
  else:
    assert params.train_dataset.dtype == 'float16'
    return fp16_default
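
# Worked example (illustrative): with runtime.loss_scale unset and a float16
# train dataset, get_loss_scale(params) falls through to fp16_default (128.),
# while a configured value of 'dynamic' is passed through unchanged to enable
# dynamic loss scaling.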


def _get_params_from_flags(flags_obj: flags.FlagValues):
  """Get ParamsDict from flags."""
  model = flags_obj.model_type.lower()
  dataset = flags_obj.dataset.lower()
  params = configs.get_config(model=model, dataset=dataset)

  flags_overrides = {
      'model_dir': flags_obj.model_dir,
      'mode': flags_obj.mode,
      'model': {
          'name': model,
      },
      'runtime': {
          'run_eagerly': flags_obj.run_eagerly,
          'tpu': flags_obj.tpu,
      },
      'train_dataset': {
          'data_dir': flags_obj.data_dir,
      },
      'validation_dataset': {
          'data_dir': flags_obj.data_dir,
      },
      'train': {
          'time_history': {
              'log_steps': flags_obj.log_steps,
          },
      },
  }

  overriding_configs = (flags_obj.config_file,
                        flags_obj.params_override,
                        flags_overrides)
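  # Listed from lowest to highest precedence: each source below overrides the
  # ones applied before it, so flag values win over --params_override, which
  # wins over --config_file.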

  pp = pprint.PrettyPrinter()

  logging.info('Base params: %s', pp.pformat(params.as_dict()))

  for param in overriding_configs:
    logging.info('Overriding params: %s', param)
    # Set is_strict to false because we can have dynamic dict parameters.
    params = params_dict.override_params_dict(params, param, is_strict=False)

  params.validate()
  params.lock()

  logging.info('Final model parameters: %s', pp.pformat(params.as_dict()))
  return params


def resume_from_checkpoint(model: tf.keras.Model,
                           model_dir: str,
                           train_steps: int) -> int:
  """Resumes from the latest checkpoint, if possible.

  Loads the model weights and optimizer settings from a checkpoint.
  This function should be used in case of preemption recovery.

  Args:
    model: The model whose weights should be restored.
    model_dir: The directory where model weights were saved.
    train_steps: The number of steps to train.

  Returns:
    The epoch of the latest checkpoint, or 0 if not restoring.

  """
  logging.info('Load from checkpoint is enabled.')
  latest_checkpoint = tf.train.latest_checkpoint(model_dir)
  logging.info('latest_checkpoint: %s', latest_checkpoint)
  if not latest_checkpoint:
    logging.info('No checkpoint detected.')
    return 0

  logging.info('Checkpoint file %s found and restoring from '
               'checkpoint', latest_checkpoint)
  model.load_weights(latest_checkpoint)
  initial_epoch = model.optimizer.iterations // train_steps
  logging.info('Completed loading from checkpoint.')
  logging.info('Resuming from epoch %d', initial_epoch)
  return int(initial_epoch)
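
# Example (illustrative): with train_steps == 1000 steps per epoch and a
# restored optimizer at iterations == 2500, training resumes at epoch 2.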


def initialize(params: base_configs.ExperimentConfig,
               dataset_builder: dataset_factory.DatasetBuilder):
  """Initializes backend related initializations."""
  keras_utils.set_session_config(
      enable_xla=params.runtime.enable_xla)
  if params.runtime.gpu_thread_mode:
    keras_utils.set_gpu_thread_mode_and_count(
        per_gpu_thread_count=params.runtime.per_gpu_thread_count,
        gpu_thread_mode=params.runtime.gpu_thread_mode,
        num_gpus=params.runtime.num_gpus,
        datasets_num_private_threads=params.runtime.dataset_num_private_threads)

  performance.set_mixed_precision_policy(dataset_builder.dtype,
                                         get_loss_scale(params))
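  # channels_first (NCHW) is generally faster for convolutions on GPUs with
  # cuDNN, while channels_last is the default layout elsewhere.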
  if tf.config.list_physical_devices('GPU'):
    data_format = 'channels_first'
  else:
    data_format = 'channels_last'
  tf.keras.backend.set_image_data_format(data_format)
  distribution_utils.configure_cluster(
      params.runtime.worker_hosts,
      params.runtime.task_index)
  if params.runtime.run_eagerly:
    # Enable eager execution to allow step-by-step debugging
    tf.config.experimental_run_functions_eagerly(True)


def define_classifier_flags():
  """Defines common flags for image classification."""
  hyperparams_flags.initialize_common_flags()
  flags.DEFINE_string(
      'data_dir',
      default=None,
      help='The location of the input data.')
  flags.DEFINE_string(
      'mode',
      default=None,
      help='Mode to run: `train_and_eval` or `export_only`.')
  flags.DEFINE_bool(
      'run_eagerly',
      default=None,
      help='Use eager execution and disable autograph for debugging.')
  flags.DEFINE_string(
      'model_type',
      default=None,
      help='The type of the model, e.g. `efficientnet` or `resnet`.')
  flags.DEFINE_string(
      'dataset',
      default=None,
      help='The name of the dataset, e.g. `imagenet`.')
  flags.DEFINE_integer(
      'log_steps',
      default=100,
      help='The interval of steps between logging of batch level stats.')
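
# Example invocation (illustrative; paths and flag values are hypothetical):
#   python classifier_trainer.py --mode=train_and_eval --model_type=resnet \
#     --dataset=imagenet --data_dir=/path/to/imagenet --model_dir=/tmp/model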


def serialize_config(params: base_configs.ExperimentConfig,
                     model_dir: str):
  """Serializes and saves the experiment config."""
  params_save_path = os.path.join(model_dir, 'params.yaml')
  logging.info('Saving experiment configuration to %s', params_save_path)
  tf.io.gfile.makedirs(model_dir)
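  # The saved params.yaml captures the final merged config; it can typically
  # be fed back through --config_file to reproduce a run.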
  params_dict.save_params_dict_to_yaml(params, params_save_path)


def train_and_eval(
    params: base_configs.ExperimentConfig,
    strategy_override: Optional[tf.distribute.Strategy]) -> Mapping[str, Any]:
  """Runs the train and eval path using compile/fit."""
  logging.info('Running train and eval.')

  # Note: for TPUs, strategy and scope should be created before the dataset
  strategy = strategy_override or distribution_utils.get_distribution_strategy(
      distribution_strategy=params.runtime.distribution_strategy,
      all_reduce_alg=params.runtime.all_reduce_alg,
      num_gpus=params.runtime.num_gpus,
      tpu_address=params.runtime.tpu)

  strategy_scope = distribution_utils.get_strategy_scope(strategy)

  logging.info('Detected %d devices.',
               strategy.num_replicas_in_sync if strategy else 1)

  label_smoothing = params.model.loss.label_smoothing
  one_hot = label_smoothing and label_smoothing > 0
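  # Label smoothing produces soft targets, so the datasets, loss, and metrics
  # must all switch to one-hot encoding together based on this flag.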

  builders = _get_dataset_builders(params, strategy, one_hot)
  datasets = [builder.build() if builder else None for builder in builders]

  # Unpack the datasets and builders for the train and validation splits.
  train_builder, validation_builder = builders  # pylint: disable=unbalanced-tuple-unpacking
  train_dataset, validation_dataset = datasets

  train_epochs = params.train.epochs
  train_steps = params.train.steps or train_builder.num_steps
  validation_steps = params.evaluation.steps or validation_builder.num_steps

  initialize(params, train_builder)

  logging.info('Global batch size: %d', train_builder.global_batch_size)

  with strategy_scope:
    model_params = params.model.model_params.as_dict()
    model = get_models()[params.model.name](**model_params)
    learning_rate = optimizer_factory.build_learning_rate(
        params=params.model.learning_rate,
        batch_size=train_builder.global_batch_size,
        train_steps=train_steps)
    optimizer = optimizer_factory.build_optimizer(
        optimizer_name=params.model.optimizer.name,
        base_learning_rate=learning_rate,
        params=params.model.optimizer.as_dict())

    metrics_map = _get_metrics(one_hot)
    metrics = [metrics_map[metric] for metric in params.train.metrics]

    if one_hot:
      loss_obj = tf.keras.losses.CategoricalCrossentropy(
          label_smoothing=params.model.loss.label_smoothing)
    else:
      loss_obj = tf.keras.losses.SparseCategoricalCrossentropy()
    model.compile(optimizer=optimizer,
                  loss=loss_obj,
                  metrics=metrics,
                  experimental_steps_per_execution=params.train.steps_per_loop)

    initial_epoch = 0
    if params.train.resume_checkpoint:
      initial_epoch = resume_from_checkpoint(model=model,
                                             model_dir=params.model_dir,
                                             train_steps=train_steps)

    callbacks = custom_callbacks.get_callbacks(
        model_checkpoint=params.train.callbacks.enable_checkpoint_and_export,
        include_tensorboard=params.train.callbacks.enable_tensorboard,
        time_history=params.train.callbacks.enable_time_history,
        track_lr=params.train.tensorboard.track_lr,
        write_model_weights=params.train.tensorboard.write_model_weights,
        initial_step=initial_epoch * train_steps,
        batch_size=train_builder.global_batch_size,
        log_steps=params.train.time_history.log_steps,
        model_dir=params.model_dir)

  serialize_config(params=params, model_dir=params.model_dir)

  if params.evaluation.skip_eval:
    validation_kwargs = {}
  else:
    validation_kwargs = {
        'validation_data': validation_dataset,
        'validation_steps': validation_steps,
        'validation_freq': params.evaluation.epochs_between_evals,
    }

  history = model.fit(
      train_dataset,
      epochs=train_epochs,
      steps_per_epoch=train_steps,
      initial_epoch=initial_epoch,
      callbacks=callbacks,
      verbose=2,
      **validation_kwargs)

  validation_output = None
  if not params.evaluation.skip_eval:
    validation_output = model.evaluate(
        validation_dataset, steps=validation_steps, verbose=2)
411

  # TODO(dankondratyuk): eval and save final test accuracy
  stats = common.build_stats(history,
                             validation_output,
                             callbacks)
  return stats


def export(params: base_configs.ExperimentConfig):
  """Runs the model export functionality."""
  logging.info('Exporting model.')
  model_params = params.model.model_params.as_dict()
  model = get_models()[params.model.name](**model_params)
458
  checkpoint = params.export.checkpoint
  if checkpoint is None:
    logging.info('No export checkpoint was provided. Using the latest '
                 'checkpoint from model_dir.')
    checkpoint = tf.train.latest_checkpoint(params.model_dir)

  model.load_weights(checkpoint)
  model.save(params.export.destination)
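  # The exported SavedModel can be reloaded for serving or fine-tuning, e.g.
  # with tf.keras.models.load_model(params.export.destination).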


def run(flags_obj: flags.FlagValues,
        strategy_override: Optional[tf.distribute.Strategy] = None
       ) -> Mapping[str, Any]:
  """Runs an Image Classification model using native Keras APIs.

  Args:
    flags_obj: An object containing parsed flag values.
    strategy_override: An optional `tf.distribute.Strategy` that replaces the
      strategy built from the runtime params.

  Returns:
    Dictionary of training/eval stats.
  """
  params = _get_params_from_flags(flags_obj)
  if params.mode == 'train_and_eval':
    return train_and_eval(params, strategy_override)
  elif params.mode == 'export_only':
    export(params)
  else:
    raise ValueError('{} is not a valid mode.'.format(params.mode))


def main(_):
  with logger.benchmark_context(flags.FLAGS):
    stats = run(flags.FLAGS)
  if stats:
    logging.info('Run stats:\n%s', stats)


if __name__ == '__main__':
  logging.set_verbosity(logging.INFO)
  define_classifier_flags()
  flags.mark_flag_as_required('data_dir')
  flags.mark_flag_as_required('mode')
  flags.mark_flag_as_required('model_type')
  flags.mark_flag_as_required('dataset')

  app.run(main)