classifier_trainer.py 15.7 KB
Newer Older
Allen Wang's avatar
Allen Wang committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# Lint as: python3
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Runs an Image Classification model."""

import os
import pprint
from typing import Any, Tuple, Text, Optional, Mapping

from absl import app
from absl import flags
from absl import logging
Hongkun Yu's avatar
Hongkun Yu committed
25
import tensorflow as tf
Allen Wang's avatar
Allen Wang committed
26

27
from official.modeling import hyperparams
Allen Wang's avatar
Allen Wang committed
28
29
30
31
32
33
34
35
36
37
38
39
40
from official.modeling import performance
from official.utils import hyperparams_flags
from official.utils.misc import distribution_utils
from official.utils.misc import keras_utils
from official.vision.image_classification import callbacks as custom_callbacks
from official.vision.image_classification import dataset_factory
from official.vision.image_classification import optimizer_factory
from official.vision.image_classification.configs import base_configs
from official.vision.image_classification.configs import configs
from official.vision.image_classification.efficientnet import efficientnet_model
from official.vision.image_classification.resnet import common
from official.vision.image_classification.resnet import resnet_model

Allen Wang's avatar
Allen Wang committed
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58

def get_models() -> Mapping[str, Any]:
  """Returns the mapping from model type name to a Keras model builder.

  The values are callables rather than `tf.keras.Model` instances: callers in
  this file look an entry up by `params.model.name` and invoke it with the
  model params as keyword arguments to construct the model.

  Returns:
    A dict keyed by the lower-case model type name.
  """
  return {
      'efficientnet': efficientnet_model.EfficientNet.from_name,
      'resnet': resnet_model.resnet50,
  }


def get_dtype_map() -> Mapping[str, tf.dtypes.DType]:
  """Builds the lookup table from dtype names to TensorFlow dtypes.

  The full names 'float32', 'bfloat16' and 'float16' are accepted as keys, as
  are the short aliases 'fp32' and 'bf16'.

  Returns:
    A dict mapping each supported dtype string to its `tf.dtypes.DType`.
  """
  dtype_by_name = dict(
      float32=tf.float32,
      bfloat16=tf.bfloat16,
      float16=tf.float16,
      fp32=tf.float32,
      bf16=tf.bfloat16,
  )
  return dtype_by_name
Allen Wang's avatar
Allen Wang committed
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97


def _get_metrics(one_hot: bool) -> Mapping[Text, Any]:
  """Builds the table of metrics that can be requested by name.

  Args:
    one_hot: When truthy, the categorical (one-hot label) metric classes are
      used; otherwise their sparse counterparts are used.

  Returns:
    A dict with the keys 'acc', 'accuracy', 'top_1' and 'top_5'. The first
    three each hold their own accuracy metric instance named 'accuracy'; the
    last holds a top-5 metric named 'top_5_accuracy'.
  """
  keras_metrics = tf.keras.metrics
  if one_hot:
    accuracy_cls = keras_metrics.CategoricalAccuracy
    top_k_cls = keras_metrics.TopKCategoricalAccuracy
  else:
    accuracy_cls = keras_metrics.SparseCategoricalAccuracy
    top_k_cls = keras_metrics.SparseTopKCategoricalAccuracy

  # Every alias gets a separate metric object, created in a fixed order.
  available = {
      alias: accuracy_cls(name='accuracy')
      for alias in ('acc', 'accuracy', 'top_1')
  }
  available['top_5'] = top_k_cls(k=5, name='top_5_accuracy')
  return available


def get_image_size_from_model(
    params: base_configs.ExperimentConfig) -> Optional[int]:
  """Looks up the input resolution preferred by the configured model.

  Only EfficientNet variants listed in `efficientnet_model.MODEL_CONFIGS`
  provide a preferred resolution; every other case yields None.

  Args:
    params: The experiment configuration.

  Returns:
    The `resolution` of the matching EfficientNet config, or None.
  """
  # NOTE(review): this reads `params.model_name`, while model construction in
  # this file keys on `params.model.name` -- confirm both are kept in sync.
  if params.model_name != 'efficientnet':
    return None

  variant = params.model.model_params.model_name
  known_variants = efficientnet_model.MODEL_CONFIGS
  if variant not in known_variants:
    return None
  return known_variants[variant].resolution


def _get_dataset_builders(params: base_configs.ExperimentConfig,
                          strategy: tf.distribute.Strategy,
                          one_hot: bool
                         ) -> Tuple[Any, Any]:
  """Create and return train and validation dataset builders.

  Args:
    params: The experiment configuration; `train_dataset` and
      `validation_dataset` are read from it.
    strategy: The distribution strategy. When falsy, a single device is
      assumed.
    one_hot: Whether the datasets should produce one-hot encoded labels.

  Returns:
    A two-element list `[train_builder, validation_builder]`. An entry is None
    when the corresponding dataset config is None or reports no data.
  """
  if one_hot:
    logging.warning('label_smoothing > 0, so datasets will be one hot encoded.')
  else:
    logging.warning('label_smoothing not applied, so datasets will not be one '
                    'hot encoded.')

  num_devices = strategy.num_replicas_in_sync if strategy else 1

  # A model-preferred image size, when there is one, overrides the size
  # declared in the dataset config.
  image_size = get_image_size_from_model(params)

  dataset_configs = [
      params.train_dataset, params.validation_dataset
  ]
  builders = []

  for config in dataset_configs:
    if config is not None and config.has_data:
      builder = dataset_factory.DatasetBuilder(
          config,
          image_size=image_size or config.image_size,
          num_devices=num_devices,
          one_hot=one_hot)
    else:
      builder = None
    builders.append(builder)

  return builders


def get_loss_scale(params: base_configs.ExperimentConfig,
                   fp16_default: float = 128.) -> float:
  """Returns the loss scale for initializations.

  Args:
    params: The experiment configuration; `runtime.loss_scale` and
      `train_dataset.dtype` are read from it.
    fp16_default: The loss scale used for 'float16' data when no explicit
      loss scale is configured.

  Returns:
    The string 'dynamic' (unchanged) when that is the configured loss scale;
    otherwise the configured loss scale converted to float; otherwise 1. for
    'float32'/'bfloat16' data and `fp16_default` for 'float16' data.

  Raises:
    AssertionError: If no loss scale is configured and the training dataset
      dtype is not one of 'float32', 'bfloat16' or 'float16'.
  """
  loss_scale = params.runtime.loss_scale
  if loss_scale == 'dynamic':
    return loss_scale
  if loss_scale is not None:
    return float(loss_scale)

  # No explicit loss scale: derive a default from the training data dtype.
  dtype = params.train_dataset.dtype
  if dtype in ('float32', 'bfloat16'):
    return 1.
  assert dtype == 'float16', (
      'Cannot infer a default loss scale for dtype {!r}.'.format(dtype))
  return fp16_default


def _get_params_from_flags(flags_obj: flags.FlagValues):
  """Get ParamsDict from flags.

  The base config is selected by the lower-cased `model_type` and `dataset`
  flags. Overrides are then applied in order: the `config_file` flag, the
  `params_override` flag, and finally values taken directly from individual
  flags, so the latter win. The result is validated and locked.

  Args:
    flags_obj: An object containing parsed flag values.

  Returns:
    The validated and locked experiment params.
  """
  model = flags_obj.model_type.lower()
  dataset = flags_obj.dataset.lower()
  params = configs.get_config(model=model, dataset=dataset)

  flags_overrides = {
      'model_dir': flags_obj.model_dir,
      'mode': flags_obj.mode,
      'model': {
          'name': model,
      },
      'runtime': {
          'run_eagerly': flags_obj.run_eagerly,
          'tpu': flags_obj.tpu,
      },
      'train_dataset': {
          'data_dir': flags_obj.data_dir,
      },
      'validation_dataset': {
          'data_dir': flags_obj.data_dir,
      },
      'train': {
          'time_history': {
              'log_steps': flags_obj.log_steps,
          },
      },
  }

  overriding_configs = (flags_obj.config_file,
                        flags_obj.params_override,
                        flags_overrides)

  pp = pprint.PrettyPrinter()

  logging.info('Base params: %s', pp.pformat(params.as_dict()))

  for param in overriding_configs:
    logging.info('Overriding params: %s', param)
    params = hyperparams.override_params_dict(params, param, is_strict=True)

  params.validate()
  params.lock()

  logging.info('Final model parameters: %s', pp.pformat(params.as_dict()))
  return params


def resume_from_checkpoint(model: tf.keras.Model,
                           model_dir: str,
                           train_steps: int) -> int:
  """Restores the model from the newest checkpoint in `model_dir`, if any.

  Intended for recovering after a preemption. When a checkpoint is found it
  is loaded with `model.load_weights`, and the epoch to resume from is
  derived by floor-dividing the optimizer's iteration count by `train_steps`.

  Args:
    model: The model whose weights should be restored.
    model_dir: The directory where model weights were saved.
    train_steps: The number of training steps that make up one epoch.

  Returns:
    The epoch to resume from, or 0 when no checkpoint was found.
  """
  logging.info('Load from checkpoint is enabled.')
  checkpoint_path = tf.train.latest_checkpoint(model_dir)
  logging.info('latest_checkpoint: %s', checkpoint_path)

  if checkpoint_path:
    logging.info(
        'Checkpoint file %s found and restoring from checkpoint',
        checkpoint_path)
    model.load_weights(checkpoint_path)
    completed_epochs = model.optimizer.iterations // train_steps
    logging.info('Completed loading from checkpoint.')
    logging.info('Resuming from epoch %d', completed_epochs)
    return int(completed_epochs)

  logging.info('No checkpoint detected.')
  return 0


Allen Wang's avatar
Allen Wang committed
226
227
def initialize(params: base_configs.ExperimentConfig,
               dataset_builder: dataset_factory.DatasetBuilder):
  """Initializes backend related initializations.

  Configures the session (XLA), the mixed precision policy, the Keras image
  data format, the cluster, optional eager execution of functions, and the
  GPU-specific thread and batchnorm settings.

  Args:
    params: The experiment configuration; its `runtime` section drives the
      settings applied here.
    dataset_builder: The dataset builder whose `dtype` selects the mixed
      precision policy.
  """
  keras_utils.set_session_config(
      enable_xla=params.runtime.enable_xla)
  performance.set_mixed_precision_policy(dataset_builder.dtype,
                                         get_loss_scale(params))

  # Query the GPU list once; it drives both the data format and the
  # GPU-only tuning below.
  has_gpu = bool(tf.config.list_physical_devices('GPU'))
  data_format = 'channels_first' if has_gpu else 'channels_last'
  tf.keras.backend.set_image_data_format(data_format)
  distribution_utils.configure_cluster(
      params.runtime.worker_hosts,
      params.runtime.task_index)
  if params.runtime.run_eagerly:
    # Enable eager execution to allow step-by-step debugging
    tf.config.experimental_run_functions_eagerly(True)
  if has_gpu:
    if params.runtime.gpu_thread_mode:
      keras_utils.set_gpu_thread_mode_and_count(
          per_gpu_thread_count=params.runtime.per_gpu_thread_count,
          gpu_thread_mode=params.runtime.gpu_thread_mode,
          num_gpus=params.runtime.num_gpus,
          datasets_num_private_threads=params.runtime.dataset_num_private_threads)  # pylint:disable=line-too-long
    if params.runtime.batchnorm_spatial_persistent:
      os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
Allen Wang's avatar
Allen Wang committed
253
254
255
256
257
258
259
260
261
262
263
264
265
266


def define_classifier_flags():
  """Defines common flags for image classification.

  Registers the shared hyperparameter flags, then the trainer-specific flags
  `data_dir`, `mode`, `run_eagerly`, `model_type`, `dataset` and `log_steps`.
  """
  hyperparams_flags.initialize_common_flags()
  flags.DEFINE_string(
      'data_dir',
      default=None,
      help='The location of the input data.')
  # The help text lists exactly the modes accepted by `run()`.
  flags.DEFINE_string(
      'mode',
      default=None,
      help='Mode to run: `train_and_eval` or `export_only`.')
  flags.DEFINE_bool(
      'run_eagerly',
      default=None,
      help='Use eager execution and disable autograph for debugging.')
  flags.DEFINE_string(
      'model_type',
      default=None,
      help='The type of the model, e.g. EfficientNet, etc.')
  flags.DEFINE_string(
      'dataset',
      default=None,
      help='The name of the dataset, e.g. ImageNet, etc.')
  flags.DEFINE_integer(
      'log_steps',
      default=100,
      help='The interval of steps between logging of batch level stats.')
Allen Wang's avatar
Allen Wang committed
282
283
284
285
286
287
288
289


def serialize_config(params: base_configs.ExperimentConfig,
                     model_dir: str):
  """Serializes and saves the experiment config.

  The config is written as YAML to `params.yaml` inside `model_dir`, which is
  created first if needed.

  Args:
    params: The experiment configuration to save.
    model_dir: The directory that receives `params.yaml`.
  """
  params_save_path = os.path.join(model_dir, 'params.yaml')
  logging.info('Saving experiment configuration to %s', params_save_path)
  tf.io.gfile.makedirs(model_dir)
  hyperparams.save_params_dict_to_yaml(params, params_save_path)
Allen Wang's avatar
Allen Wang committed
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307


def train_and_eval(
    params: base_configs.ExperimentConfig,
    strategy_override: tf.distribute.Strategy) -> Mapping[str, Any]:
  """Runs the train and eval path using compile/fit.

  Args:
    params: The experiment configuration.
    strategy_override: A `tf.distribute.Strategy` to use instead of the one
      derived from `params.runtime`; when falsy, the strategy is built from
      the runtime config.

  Returns:
    The stats built by `common.build_stats` from the training history, the
    final evaluation output (None when evaluation is skipped) and the
    callbacks.
  """
  logging.info('Running train and eval.')

  # Note: for TPUs, strategy and scope should be created before the dataset
  strategy = strategy_override or distribution_utils.get_distribution_strategy(
      distribution_strategy=params.runtime.distribution_strategy,
      all_reduce_alg=params.runtime.all_reduce_alg,
      num_gpus=params.runtime.num_gpus,
      tpu_address=params.runtime.tpu)

  strategy_scope = distribution_utils.get_strategy_scope(strategy)

  logging.info('Detected %d devices.',
               strategy.num_replicas_in_sync if strategy else 1)

  label_smoothing = params.model.loss.label_smoothing
  # Coerce to a real bool: the bare `and` expression would otherwise leak
  # None or 0.0 into the `one_hot` arguments further down.
  one_hot = bool(label_smoothing and label_smoothing > 0)

  builders = _get_dataset_builders(params, strategy, one_hot)
  datasets = [builder.build(strategy)
              if builder else None for builder in builders]

  # Unpack datasets and builders based on train/val/test splits
  train_builder, validation_builder = builders  # pylint: disable=unbalanced-tuple-unpacking
  train_dataset, validation_dataset = datasets

  skip_eval = params.evaluation.skip_eval

  train_epochs = params.train.epochs
  train_steps = params.train.steps or train_builder.num_steps
  # The validation builder may be None (no validation data); only consult it
  # when evaluation will actually run.
  if skip_eval:
    validation_steps = None
  else:
    validation_steps = params.evaluation.steps or validation_builder.num_steps

  initialize(params, train_builder)

  logging.info('Global batch size: %d', train_builder.global_batch_size)

  with strategy_scope:
    model_params = params.model.model_params.as_dict()
    model = get_models()[params.model.name](**model_params)
    learning_rate = optimizer_factory.build_learning_rate(
        params=params.model.learning_rate,
        batch_size=train_builder.global_batch_size,
        train_steps=train_steps)
    optimizer = optimizer_factory.build_optimizer(
        optimizer_name=params.model.optimizer.name,
        base_learning_rate=learning_rate,
        params=params.model.optimizer.as_dict())

    metrics_map = _get_metrics(one_hot)
    metrics = [metrics_map[metric] for metric in params.train.metrics]

    if one_hot:
      loss_obj = tf.keras.losses.CategoricalCrossentropy(
          label_smoothing=params.model.loss.label_smoothing)
    else:
      loss_obj = tf.keras.losses.SparseCategoricalCrossentropy()
    model.compile(optimizer=optimizer,
                  loss=loss_obj,
                  metrics=metrics,
                  experimental_steps_per_execution=params.train.steps_per_loop)

    initial_epoch = 0
    if params.train.resume_checkpoint:
      initial_epoch = resume_from_checkpoint(model=model,
                                             model_dir=params.model_dir,
                                             train_steps=train_steps)

    callbacks = custom_callbacks.get_callbacks(
        model_checkpoint=params.train.callbacks.enable_checkpoint_and_export,
        include_tensorboard=params.train.callbacks.enable_tensorboard,
        time_history=params.train.callbacks.enable_time_history,
        track_lr=params.train.tensorboard.track_lr,
        write_model_weights=params.train.tensorboard.write_model_weights,
        initial_step=initial_epoch * train_steps,
        batch_size=train_builder.global_batch_size,
        log_steps=params.train.time_history.log_steps,
        model_dir=params.model_dir)

  serialize_config(params=params, model_dir=params.model_dir)

  if skip_eval:
    validation_kwargs = {}
  else:
    validation_kwargs = {
        'validation_data': validation_dataset,
        'validation_steps': validation_steps,
        'validation_freq': params.evaluation.epochs_between_evals,
    }

  history = model.fit(
      train_dataset,
      epochs=train_epochs,
      steps_per_epoch=train_steps,
      initial_epoch=initial_epoch,
      callbacks=callbacks,
      verbose=2,
      **validation_kwargs)

  validation_output = None
  if not skip_eval:
    validation_output = model.evaluate(
        validation_dataset, steps=validation_steps, verbose=2)

  # TODO(dankondratyuk): eval and save final test accuracy
  stats = common.build_stats(history,
                             validation_output,
                             callbacks)
  return stats


def export(params: base_configs.ExperimentConfig):
  """Runs the model export functionality.

  Builds the model named by `params.model.name`, loads weights from
  `params.export.checkpoint` (falling back to the latest checkpoint in
  `params.model_dir` when that is None) and saves the model to
  `params.export.destination`.

  Args:
    params: The experiment configuration.

  Raises:
    ValueError: If no export checkpoint was provided and none can be found in
      `params.model_dir`.
  """
  logging.info('Exporting model.')
  model_params = params.model.model_params.as_dict()
  model = get_models()[params.model.name](**model_params)
  checkpoint = params.export.checkpoint
  if checkpoint is None:
    logging.info('No export checkpoint was provided. Using the latest '
                 'checkpoint from model_dir.')
    checkpoint = tf.train.latest_checkpoint(params.model_dir)
    # `latest_checkpoint` yields None when the directory has no checkpoint;
    # fail with a clear message instead of inside `load_weights`.
    if not checkpoint:
      raise ValueError(
          'No checkpoint found in model_dir {} to export.'.format(
              params.model_dir))

  model.load_weights(checkpoint)
  model.save(params.export.destination)


def run(flags_obj: flags.FlagValues,
        strategy_override: tf.distribute.Strategy = None) -> Mapping[str, Any]:
  """Dispatches an image classification job according to the configured mode.

  Args:
    flags_obj: An object containing parsed flag values.
    strategy_override: A `tf.distribute.Strategy` object to use for model.

  Returns:
    The dictionary of training/eval stats for mode 'train_and_eval'; None for
    mode 'export_only'.

  Raises:
    ValueError: If the configured mode is neither 'train_and_eval' nor
      'export_only'.
  """
  params = _get_params_from_flags(flags_obj)
  mode = params.mode

  if mode == 'train_and_eval':
    return train_and_eval(params, strategy_override)
  if mode != 'export_only':
    raise ValueError('{} is not a valid mode.'.format(mode))

  export(params)
  return None


def main(_):
  """Runs the job described by the parsed flags and logs any returned stats."""
  stats = run(flags.FLAGS)
  if stats:
    logging.info('Run stats:\n%s', stats)


if __name__ == '__main__':
  logging.set_verbosity(logging.INFO)
  define_classifier_flags()
  # These flags have no usable default, so the script refuses to start
  # without them.
  for required_flag in ('data_dir', 'mode', 'model_type', 'dataset'):
    flags.mark_flag_as_required(required_flag)

  app.run(main)