# Lint as: python3
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Runs an Image Classification model."""

import os
import pprint
from typing import Any, Tuple, Text, Optional, Mapping

from absl import app
from absl import flags
from absl import logging
import tensorflow as tf
from official.common import distribute_utils
from official.modeling import hyperparams
from official.modeling import performance
from official.utils import hyperparams_flags
from official.utils.misc import keras_utils
from official.vision.image_classification import callbacks as custom_callbacks
from official.vision.image_classification import dataset_factory
from official.vision.image_classification import optimizer_factory
from official.vision.image_classification.configs import base_configs
from official.vision.image_classification.configs import configs
from official.vision.image_classification.efficientnet import efficientnet_model
from official.vision.image_classification.resnet import common
from official.vision.image_classification.resnet import resnet_model


def get_models() -> Mapping[str, tf.keras.Model]:
  """Returns the mapping from model type name to Keras model."""
  # Each entry maps a model-type string to a callable that constructs the
  # corresponding Keras model from keyword parameters.
  model_builders = {
      'efficientnet': efficientnet_model.EfficientNet.from_name,
      'resnet': resnet_model.resnet50,
  }
  return model_builders


def get_dtype_map() -> Mapping[str, tf.dtypes.DType]:
  """Returns the mapping from dtype string representations to TF dtypes.

  Both long-form names ('float32', 'bfloat16', 'float16') and short-form
  aliases ('fp32', 'bf16', 'fp16') are accepted.
  """
  return {
      'float32': tf.float32,
      'bfloat16': tf.bfloat16,
      'float16': tf.float16,
      'fp32': tf.float32,
      'bf16': tf.bfloat16,
      # 'fp16' added for consistency: 'fp32' and 'bf16' short aliases already
      # existed, but the float16 one was missing.
      'fp16': tf.float16,
  }


def _get_metrics(one_hot: bool) -> Mapping[Text, Any]:
  """Get a dict of available metrics to track.

  Args:
    one_hot: Whether labels are one-hot encoded. One-hot labels use the dense
      (categorical) metrics; integer labels use the sparse variants.

  Returns:
    A mapping from metric name to a freshly constructed Keras metric.
  """
  if one_hot:
    accuracy_cls = tf.keras.metrics.CategoricalAccuracy
    top_k_cls = tf.keras.metrics.TopKCategoricalAccuracy
  else:
    accuracy_cls = tf.keras.metrics.SparseCategoricalAccuracy
    top_k_cls = tf.keras.metrics.SparseTopKCategoricalAccuracy
  # 'acc', 'accuracy' and 'top_1' are aliases for top-1 accuracy; each key
  # gets its own metric instance so state is not shared between them.
  return {
      'acc': accuracy_cls(name='accuracy'),
      'accuracy': accuracy_cls(name='accuracy'),
      'top_1': accuracy_cls(name='accuracy'),
      'top_5': top_k_cls(k=5, name='top_5_accuracy'),
  }


def get_image_size_from_model(
    params: base_configs.ExperimentConfig) -> Optional[int]:
  """If the given model has a preferred image size, return it."""
  # Only EfficientNet variants declare a preferred resolution.
  if params.model_name != 'efficientnet':
    return None
  variant = params.model.model_params.model_name
  config = efficientnet_model.MODEL_CONFIGS.get(variant)
  return config.resolution if config is not None else None


def _get_dataset_builders(params: base_configs.ExperimentConfig,
                          strategy: tf.distribute.Strategy,
                          one_hot: bool) -> Tuple[Any, Any]:
  """Create and return train and validation dataset builders.

  Args:
    params: The experiment config holding train/validation dataset configs.
    strategy: Distribution strategy (may be None for single-device runs).
    one_hot: Whether labels should be one-hot encoded.

  Returns:
    A (train_builder, validation_builder) pair; an element is None when the
    corresponding dataset config is absent or has no data.
  """
  if one_hot:
    logging.warning('label_smoothing > 0, so datasets will be one hot encoded.')
  else:
    logging.warning('label_smoothing not applied, so datasets will not be one '
                    'hot encoded.')

  num_devices = strategy.num_replicas_in_sync if strategy else 1

  # A model-preferred size (e.g. EfficientNet resolution) wins over the
  # size declared in the dataset config.
  image_size = get_image_size_from_model(params)

  builders = []
  for config in (params.train_dataset, params.validation_dataset):
    if config is None or not config.has_data:
      builders.append(None)
      continue
    builders.append(
        dataset_factory.DatasetBuilder(
            config,
            image_size=image_size or config.image_size,
            num_devices=num_devices,
            one_hot=one_hot))

  return builders


def get_loss_scale(params: base_configs.ExperimentConfig,
                   fp16_default: float = 128.) -> float:
  """Returns the loss scale for initializations.

  Args:
    params: The experiment config; `params.runtime.loss_scale` takes
      precedence when set, otherwise the train dataset dtype decides.
    fp16_default: Loss scale to use for float16 when none is configured.

  Returns:
    The string 'dynamic', or a float loss scale.
  """
  loss_scale = params.runtime.loss_scale
  # An explicitly configured loss scale always wins.
  if loss_scale == 'dynamic':
    return loss_scale
  if loss_scale is not None:
    return float(loss_scale)
  # No explicit scale: float32/bfloat16 need no scaling, float16 does.
  dtype = params.train_dataset.dtype
  if dtype in ('float32', 'bfloat16'):
    return 1.
  assert dtype == 'float16'
  return fp16_default


def _get_params_from_flags(flags_obj: flags.FlagValues):
  """Get ParamsDict from flags."""
  model = flags_obj.model_type.lower()
  dataset = flags_obj.dataset.lower()
  params = configs.get_config(model=model, dataset=dataset)

  data_dir = flags_obj.data_dir
  flags_overrides = {
      'model_dir': flags_obj.model_dir,
      'mode': flags_obj.mode,
      'model': {
          'name': model,
      },
      'runtime': {
          'run_eagerly': flags_obj.run_eagerly,
          'tpu': flags_obj.tpu,
      },
      'train_dataset': {
          'data_dir': data_dir,
      },
      'validation_dataset': {
          'data_dir': data_dir,
      },
      'train': {
          'time_history': {
              'log_steps': flags_obj.log_steps,
          },
      },
  }

  printer = pprint.PrettyPrinter()
  logging.info('Base params: %s', printer.pformat(params.as_dict()))

  # Later overrides win: the YAML config file first, then --params_override,
  # then explicit command-line flags.
  for override in (flags_obj.config_file, flags_obj.params_override,
                   flags_overrides):
    logging.info('Overriding params: %s', override)
    params = hyperparams.override_params_dict(params, override, is_strict=True)

  params.validate()
  params.lock()

  logging.info('Final model parameters: %s', printer.pformat(params.as_dict()))
  return params


def resume_from_checkpoint(model: tf.keras.Model, model_dir: str,
                           train_steps: int) -> int:
  """Resumes from the latest checkpoint, if possible.

  Loads the model weights and optimizer settings from a checkpoint.
  This function should be used in case of preemption recovery.

  Args:
    model: The model whose weights should be restored.
    model_dir: The directory where model weights were saved.
    train_steps: The number of steps to train.

  Returns:
    The epoch of the latest checkpoint, or 0 if not restoring.

  """
  logging.info('Load from checkpoint is enabled.')
  checkpoint_path = tf.train.latest_checkpoint(model_dir)
  logging.info('latest_checkpoint: %s', checkpoint_path)
  if checkpoint_path:
    logging.info('Checkpoint file %s found and restoring from '
                 'checkpoint', checkpoint_path)
    model.load_weights(checkpoint_path)
    # One epoch equals `train_steps` optimizer updates, so the epoch to
    # resume from is derived from the optimizer's step counter.
    resume_epoch = int(model.optimizer.iterations // train_steps)
    logging.info('Completed loading from checkpoint.')
    logging.info('Resuming from epoch %d', resume_epoch)
    return resume_epoch
  logging.info('No checkpoint detected.')
  return 0


def initialize(params: base_configs.ExperimentConfig,
               dataset_builder: dataset_factory.DatasetBuilder):
  """Initializes backend related initializations.

  Configures XLA, mixed precision, the Keras image data format, eager
  execution and GPU thread/cudnn settings based on the experiment config.
  Must be called before the model is built.

  Args:
    params: The experiment config providing runtime settings.
    dataset_builder: The training dataset builder; its dtype drives the
      mixed-precision policy.
  """
  keras_utils.set_session_config(enable_xla=params.runtime.enable_xla)
  performance.set_mixed_precision_policy(dataset_builder.dtype,
                                         use_experimental_api=False)
  # Convolutions are fastest with NCHW on GPU and NHWC elsewhere.
  if tf.config.list_physical_devices('GPU'):
    data_format = 'channels_first'
  else:
    data_format = 'channels_last'
  tf.keras.backend.set_image_data_format(data_format)
  if params.runtime.run_eagerly:
    # Enable eager execution to allow step-by-step debugging.
    # `tf.config.experimental_run_functions_eagerly` is deprecated; use the
    # stable `tf.config.run_functions_eagerly` API instead.
    tf.config.run_functions_eagerly(True)
  if tf.config.list_physical_devices('GPU'):
    if params.runtime.gpu_thread_mode:
      keras_utils.set_gpu_thread_mode_and_count(
          per_gpu_thread_count=params.runtime.per_gpu_thread_count,
          gpu_thread_mode=params.runtime.gpu_thread_mode,
          num_gpus=params.runtime.num_gpus,
          datasets_num_private_threads=params.runtime
          .dataset_num_private_threads)  # pylint:disable=line-too-long
    if params.runtime.batchnorm_spatial_persistent:
      # Enables the faster persistent cuDNN batch-norm kernel.
      os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'


def define_classifier_flags():
  """Defines common flags for image classification."""
  hyperparams_flags.initialize_common_flags()
  # absl DEFINE_* signature is (name, default, help); flags with a None
  # default may be marked required by the caller.
  flags.DEFINE_string(
      'data_dir', None, 'The location of the input data.')
  flags.DEFINE_string(
      'mode', None,
      'Mode to run: `train`, `eval`, `train_and_eval` or `export`.')
  flags.DEFINE_bool(
      'run_eagerly', None,
      'Use eager execution and disable autograph for debugging.')
  flags.DEFINE_string(
      'model_type', None,
      'The type of the model, e.g. EfficientNet, etc.')
  flags.DEFINE_string(
      'dataset', None,
      'The name of the dataset, e.g. ImageNet, etc.')
  flags.DEFINE_integer(
      'log_steps', 100,
      'The interval of steps between logging of batch level stats.')


def serialize_config(params: base_configs.ExperimentConfig, model_dir: str):
  """Serializes and saves the experiment config."""
  save_path = os.path.join(model_dir, 'params.yaml')
  logging.info('Saving experiment configuration to %s', save_path)
  # Ensure the destination directory exists before writing the YAML.
  tf.io.gfile.makedirs(model_dir)
  hyperparams.save_params_dict_to_yaml(params, save_path)


def train_and_eval(
    params: base_configs.ExperimentConfig,
    strategy_override: tf.distribute.Strategy) -> Mapping[str, Any]:
  """Runs the train and eval path using compile/fit.

  Args:
    params: The experiment configuration (datasets, model, optimizer,
      callbacks, evaluation settings).
    strategy_override: If not None, used instead of the strategy derived
      from `params.runtime`.

  Returns:
    A dictionary of training/eval stats as built by `common.build_stats`.
  """
  logging.info('Running train and eval.')

  # Configure TF_CONFIG for multi-worker setups before creating the strategy.
  distribute_utils.configure_cluster(params.runtime.worker_hosts,
                                     params.runtime.task_index)

  # Note: for TPUs, strategy and scope should be created before the dataset
  strategy = strategy_override or distribute_utils.get_distribution_strategy(
      distribution_strategy=params.runtime.distribution_strategy,
      all_reduce_alg=params.runtime.all_reduce_alg,
      num_gpus=params.runtime.num_gpus,
      tpu_address=params.runtime.tpu)

  strategy_scope = distribute_utils.get_strategy_scope(strategy)

  logging.info('Detected %d devices.',
               strategy.num_replicas_in_sync if strategy else 1)

  # Labels are one-hot encoded iff label smoothing is enabled (smoothing
  # requires dense labels).
  label_smoothing = params.model.loss.label_smoothing
  one_hot = label_smoothing and label_smoothing > 0

  builders = _get_dataset_builders(params, strategy, one_hot)
  datasets = [
      builder.build(strategy) if builder else None for builder in builders
  ]

  # Unpack datasets and builders based on train/val/test splits
  train_builder, validation_builder = builders  # pylint: disable=unbalanced-tuple-unpacking
  train_dataset, validation_dataset = datasets

  # Explicit step counts from the config take precedence over the counts
  # derived from dataset size.
  train_epochs = params.train.epochs
  train_steps = params.train.steps or train_builder.num_steps
  validation_steps = params.evaluation.steps or validation_builder.num_steps

  # Backend setup (mixed precision, data format, eager mode) must happen
  # before the model is constructed below.
  initialize(params, train_builder)

  logging.info('Global batch size: %d', train_builder.global_batch_size)

  # Model, optimizer and metrics must be created under the strategy scope so
  # their variables are correctly mirrored/distributed.
  with strategy_scope:
    model_params = params.model.model_params.as_dict()
    model = get_models()[params.model.name](**model_params)
    learning_rate = optimizer_factory.build_learning_rate(
        params=params.model.learning_rate,
        batch_size=train_builder.global_batch_size,
        train_epochs=train_epochs,
        train_steps=train_steps)
    optimizer = optimizer_factory.build_optimizer(
        optimizer_name=params.model.optimizer.name,
        base_learning_rate=learning_rate,
        params=params.model.optimizer.as_dict(),
        model=model)
    # Wraps the optimizer with loss scaling when training in float16.
    optimizer = performance.configure_optimizer(
        optimizer,
        use_float16=train_builder.dtype == 'float16',
        loss_scale=get_loss_scale(params),
        use_experimental_api=True)

    metrics_map = _get_metrics(one_hot)
    metrics = [metrics_map[metric] for metric in params.train.metrics]
    # With `set_epoch_loop`, a whole epoch runs inside one tf.function call.
    steps_per_loop = train_steps if params.train.set_epoch_loop else 1

    if one_hot:
      loss_obj = tf.keras.losses.CategoricalCrossentropy(
          label_smoothing=params.model.loss.label_smoothing)
    else:
      loss_obj = tf.keras.losses.SparseCategoricalCrossentropy()
    model.compile(
        optimizer=optimizer,
        loss=loss_obj,
        metrics=metrics,
        steps_per_execution=steps_per_loop)

    # Preemption recovery: restore weights/optimizer state when a checkpoint
    # exists in model_dir.
    initial_epoch = 0
    if params.train.resume_checkpoint:
      initial_epoch = resume_from_checkpoint(
          model=model, model_dir=params.model_dir, train_steps=train_steps)

    callbacks = custom_callbacks.get_callbacks(
        model_checkpoint=params.train.callbacks.enable_checkpoint_and_export,
        include_tensorboard=params.train.callbacks.enable_tensorboard,
        time_history=params.train.callbacks.enable_time_history,
        track_lr=params.train.tensorboard.track_lr,
        write_model_weights=params.train.tensorboard.write_model_weights,
        initial_step=initial_epoch * train_steps,
        batch_size=train_builder.global_batch_size,
        log_steps=params.train.time_history.log_steps,
        model_dir=params.model_dir,
        backup_and_restore=params.train.callbacks.enable_backup_and_restore)

  # Persist the fully-resolved config alongside checkpoints for posterity.
  serialize_config(params=params, model_dir=params.model_dir)

  if params.evaluation.skip_eval:
    validation_kwargs = {}
  else:
    validation_kwargs = {
        'validation_data': validation_dataset,
        'validation_steps': validation_steps,
        'validation_freq': params.evaluation.epochs_between_evals,
    }

  history = model.fit(
      train_dataset,
      epochs=train_epochs,
      steps_per_epoch=train_steps,
      initial_epoch=initial_epoch,
      callbacks=callbacks,
      verbose=2,
      **validation_kwargs)

  validation_output = None
  if not params.evaluation.skip_eval:
    validation_output = model.evaluate(
        validation_dataset, steps=validation_steps, verbose=2)

  # TODO(dankondratyuk): eval and save final test accuracy
  stats = common.build_stats(history, validation_output, callbacks)
  return stats


def export(params: base_configs.ExperimentConfig):
  """Runs the model export functionality."""
  logging.info('Exporting model.')
  model_builder = get_models()[params.model.name]
  model = model_builder(**params.model.model_params.as_dict())

  # Fall back to the newest checkpoint in model_dir when none is configured.
  checkpoint = params.export.checkpoint
  if checkpoint is None:
    logging.info('No export checkpoint was provided. Using the latest '
                 'checkpoint from model_dir.')
    checkpoint = tf.train.latest_checkpoint(params.model_dir)

  model.load_weights(checkpoint)
  model.save(params.export.destination)


def run(flags_obj: flags.FlagValues,
        strategy_override: tf.distribute.Strategy = None) -> Mapping[str, Any]:
  """Runs Image Classification model using native Keras APIs.

  Args:
    flags_obj: An object containing parsed flag values.
    strategy_override: A `tf.distribute.Strategy` object to use for model.

  Returns:
    Dictionary of training/eval stats

  Raises:
    ValueError: If the configured mode is not supported.
  """
  params = _get_params_from_flags(flags_obj)
  mode = params.mode
  if mode == 'train_and_eval':
    return train_and_eval(params, strategy_override)
  if mode == 'export_only':
    export(params)
    return None
  raise ValueError('{} is not a valid mode.'.format(mode))


def main(_):
  """absl entry point: runs the classifier and logs the resulting stats."""
  run_stats = run(flags.FLAGS)
  if run_stats:
    logging.info('Run stats:\n%s', run_stats)


if __name__ == '__main__':
  logging.set_verbosity(logging.INFO)
  define_classifier_flags()
  # These flags have no usable defaults; fail fast when any is missing.
  for required_flag in ('data_dir', 'mode', 'model_type', 'dataset'):
    flags.mark_flag_as_required(required_flag)

  app.run(main)