resnet_ctl_imagenet_benchmark.py 16 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes CTL benchmarks and accuracy tests."""
Hongkun Yu's avatar
Hongkun Yu committed
16
# pylint: disable=line-too-long,g-bad-import-order
17
18
from __future__ import print_function

Jin Young Sohn's avatar
Jin Young Sohn committed
19
import os  # pylint: disable=unused-import
20
21
22
23
24
import time

from absl import flags
import tensorflow as tf

A. Unique TensorFlower's avatar
A. Unique TensorFlower committed
25
from official.benchmark import owner_utils
Fan Yang's avatar
Fan Yang committed
26
27
from official.legacy.image_classification.resnet import common
from official.legacy.image_classification.resnet import resnet_ctl_imagenet_main
28
from official.benchmark.perfzero_benchmark import PerfZeroBenchmark
29
from official.benchmark import benchmark_wrappers
30
from official.utils.flags import core as flags_core
31

A. Unique TensorFlower's avatar
A. Unique TensorFlower committed
32
# ImageNet TFRecord location that the TPU benchmarks assign to FLAGS.data_dir
# (see Resnet50CtlBenchmarkBase._set_df_common).
IMAGENET_DEFAULT_DATA_PATH = 'gs://mlcompass-data/imagenet/imagenet-2012-tfrecord'
A. Unique TensorFlower's avatar
A. Unique TensorFlower committed
33
34
35
# TODO(emizan): Remove this comment once dataset caching is confirmed to perform
# as well as, or better than, the uncached local SSD dataset below.
# IMAGENET_EXP_DATA_PATH = 'gs://mlcompass-data/imagenet/imagenet-2012-tfrecord'
A. Unique TensorFlower's avatar
A. Unique TensorFlower committed
36

37
38
39
40
41
42
43
44
45
# Bounds passed by Resnet50CtlAccuracy as the min/max passing values of the
# 'accuracy_top_1' metric.
MIN_TOP_1_ACCURACY = 0.76
MAX_TOP_1_ACCURACY = 0.77

# Shared absl flag container; benchmark methods assign their settings on it.
FLAGS = flags.FLAGS


class CtlBenchmark(PerfZeroBenchmark):
  """Base benchmark class with methods to simplify testing."""

  def __init__(self,
               output_dir=None,
               default_flags=None,
               flag_methods=None,
               **kwargs):
    """Stores the flag configuration and forwards it to PerfZeroBenchmark.

    Args:
      output_dir: directory where to output e.g. log files.
      default_flags: mapping of flag name to default value (subclasses in this
        file pass a dict); a falsy value is replaced by an empty dict.
      flag_methods: flag-defining callables (subclasses in this file pass a
        list); a falsy value is replaced by an empty container.
      **kwargs: arbitrary named arguments, forwarded unchanged to the base
        class constructor.
    """
    self.default_flags = default_flags or {}
    self.flag_methods = flag_methods or {}
    super(CtlBenchmark, self).__init__(
        output_dir=output_dir,
        default_flags=self.default_flags,
        flag_methods=self.flag_methods,
        **kwargs)

  def _report_benchmark(self,
                        stats,
                        wall_time_sec,
                        top_1_max=None,
                        top_1_min=None,
                        total_batch_size=None,
                        log_steps=None,
                        warmup=1,
                        start_time_sec=None):
    """Report benchmark results by writing to local protobuf file.

    Metrics are only emitted when the data they need is present; a missing or
    too-short timestamp log skips the timing metrics instead of raising.

    Args:
      stats: dict returned from keras models with known entries.
      wall_time_sec: the duration of the benchmark execution in seconds.
      top_1_max: highest passing level for top_1 accuracy.
      top_1_min: lowest passing level for top_1 accuracy.
      total_batch_size: Global batch-size. When None, 'exp_per_second' is not
        reported.
      log_steps: How often the log was created for stats['step_timestamp_log'].
        Accepted for interface compatibility; not read by this method.
      warmup: number of entries in stats['step_timestamp_log'] to ignore. A
        falsy value (e.g. 0) disables the 'exp_per_second' metric.
      start_time_sec: the start time of the program in seconds since epoch.
    """
    metrics = []
    if 'eval_acc' in stats:
      # Train metrics are read unconditionally once eval metrics are present.
      metrics.append({
          'name': 'accuracy_top_1',
          'value': stats['eval_acc'],
          'min_value': top_1_min,
          'max_value': top_1_max
      })
      metrics.append({'name': 'eval_loss', 'value': stats['eval_loss']})

      metrics.append({
          'name': 'top_1_train_accuracy',
          'value': stats['train_acc']
      })
      metrics.append({'name': 'train_loss', 'value': stats['train_loss']})

    # An absent log is treated like an empty one so both timing sections below
    # can share the same length checks.
    time_log = stats.get('step_timestamp_log', [])

    if (warmup and total_batch_size is not None and
        len(time_log) > warmup + 1):
      # first entry in the time_log is start of step 0. The rest of the
      # entries are the end of each step recorded
      steps_elapsed = time_log[-1].batch_index - time_log[warmup].batch_index
      time_elapsed = time_log[-1].timestamp - time_log[warmup].timestamp
      # Identical timestamps would otherwise divide by zero.
      if time_elapsed > 0:
        examples_per_sec = total_batch_size * (steps_elapsed / time_elapsed)
        metrics.append({'name': 'exp_per_second', 'value': examples_per_sec})

    if 'avg_exp_per_second' in stats:
      metrics.append({
          'name': 'avg_exp_per_second',
          'value': stats['avg_exp_per_second']
      })

    # Requires at least one log entry; an empty log has no time_log[0].
    if start_time_sec and len(time_log) > 0:
      # time_log[0] is recorded at the beginning of the first step.
      startup_time = time_log[0].timestamp - start_time_sec
      metrics.append({'name': 'startup_time', 'value': startup_time})

    flags_str = flags_core.get_nondefault_flags_as_str()
    self.report_benchmark(
        iters=-1,
        wall_time=wall_time_sec,
        metrics=metrics,
        extras={'flags': flags_str})
125
126
127
128
129
130
131
132
133
134
135
136


class Resnet50CtlAccuracy(CtlBenchmark):
  """Benchmark accuracy tests for ResNet50 in CTL."""

  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
    """A benchmark class.

    Args:
      output_dir: directory where to output e.g. log files
      root_data_dir: directory under which to look for dataset. Required in
        practice: the dataset path is built as <root_data_dir>/imagenet.
      **kwargs: arbitrary named arguments. This is needed to make the
        constructor forward compatible in case PerfZero provides more named
        arguments before updating the constructor. They are accepted and
        ignored here.

    Raises:
      TypeError: if root_data_dir is None.
    """
    if root_data_dir is None:
      # os.path.join(None, ...) would raise the same exception type with a
      # message that does not name the offending argument.
      raise TypeError(
          'root_data_dir must be a path, not None; it is needed to locate '
          'the imagenet dataset.')

    flag_methods = [common.define_keras_flags]

    self.data_dir = os.path.join(root_data_dir, 'imagenet')
    super(Resnet50CtlAccuracy, self).__init__(
        output_dir=output_dir, flag_methods=flag_methods)

  def benchmark_8_gpu(self):
    """Test Keras model with eager, dist_strat and 8 GPUs."""
    self._setup()
    FLAGS.num_gpus = 8
    FLAGS.data_dir = self.data_dir
    FLAGS.batch_size = 128 * 8
    FLAGS.train_epochs = 90
    FLAGS.epochs_between_evals = 10
    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
    FLAGS.dtype = 'fp32'
    self._run_and_report_benchmark()

  def benchmark_8_gpu_fp16(self):
    """Test Keras model with eager, 8 GPUs with tf.keras mixed precision."""
    self._setup()
    FLAGS.num_gpus = 8
    FLAGS.data_dir = self.data_dir
    FLAGS.batch_size = 256 * 8
    FLAGS.train_epochs = 90
    FLAGS.epochs_between_evals = 10
    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_fp16')
    FLAGS.dtype = 'fp16'
    self._run_and_report_benchmark()

  @benchmark_wrappers.enable_runtime_flags
  def _run_and_report_benchmark(self):
    """Runs the CTL training entry point and reports accuracy and timing."""
    start_time_sec = time.time()
    # FLAGS is the module-level alias of flags.FLAGS, used here for
    # consistency with the rest of the file.
    stats = resnet_ctl_imagenet_main.run(FLAGS)
    wall_time_sec = time.time() - start_time_sec

    super(Resnet50CtlAccuracy, self)._report_benchmark(
        stats,
        wall_time_sec,
        top_1_min=MIN_TOP_1_ACCURACY,
        top_1_max=MAX_TOP_1_ACCURACY,
        total_batch_size=FLAGS.batch_size,
        log_steps=100,
        start_time_sec=start_time_sec)
185
186
187
188
189


class Resnet50CtlBenchmarkBase(CtlBenchmark):
  """Resnet50 benchmarks."""

  def __init__(self, output_dir=None, default_flags=None, **kwargs):
    """Registers the keras flag definitions and forwards to CtlBenchmark.

    Args:
      output_dir: directory where to output e.g. log files.
      default_flags: mapping of flag name to default value, passed through.
      **kwargs: arbitrary named arguments, forwarded to CtlBenchmark.
    """
    flag_methods = [common.define_keras_flags]

    super(Resnet50CtlBenchmarkBase, self).__init__(
        output_dir=output_dir,
        flag_methods=flag_methods,
        default_flags=default_flags,
        **kwargs)

  @benchmark_wrappers.enable_runtime_flags
  def _run_and_report_benchmark(self):
    """Runs the CTL training entry point and reports performance metrics."""
    start_time_sec = time.time()
    stats = resnet_ctl_imagenet_main.run(FLAGS)
    wall_time_sec = time.time() - start_time_sec

    # Warmup means the number of logged step time entries that are excluded in
    # performance report. Default to exclude 1 FLAGS.log_steps time.
    super(Resnet50CtlBenchmarkBase, self)._report_benchmark(
        stats,
        wall_time_sec,
        total_batch_size=FLAGS.batch_size,
        log_steps=FLAGS.log_steps,
        warmup=1,
        start_time_sec=start_time_sec)

  def benchmark_1_gpu_no_dist_strat(self):
    """Test Keras model with 1 GPU, no distribution strategy."""
    self._setup()

    FLAGS.num_gpus = 1
    FLAGS.distribution_strategy = 'off'
    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_dist_strat')
    FLAGS.batch_size = 128
    self._run_and_report_benchmark()

  def benchmark_1_gpu(self):
    """Test Keras model with 1 GPU."""
    self._setup()

    FLAGS.num_gpus = 1
    FLAGS.distribution_strategy = 'one_device'
    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
    FLAGS.batch_size = 128
    self._run_and_report_benchmark()

  def benchmark_1_gpu_fp16(self):
    """Test Keras model with 1 GPU with tf.keras mixed precision."""
    self._setup()

    FLAGS.num_gpus = 1
    FLAGS.distribution_strategy = 'one_device'
    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_fp16')
    FLAGS.batch_size = 256
    FLAGS.dtype = 'fp16'
    self._run_and_report_benchmark()

  def benchmark_1_gpu_eager(self):
    """Test Keras model with 1 GPU in pure eager mode."""
    self._setup()

    FLAGS.num_gpus = 1
    FLAGS.distribution_strategy = 'one_device'
    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_eager')
    FLAGS.batch_size = 120
    FLAGS.use_tf_function = False
    FLAGS.use_tf_while_loop = False
    FLAGS.single_l2_loss_op = True
    self._run_and_report_benchmark()

  def benchmark_1_gpu_fp16_eager(self):
    """Test Keras model with 1 GPU with fp16 and pure eager mode."""
    self._setup()

    FLAGS.num_gpus = 1
    FLAGS.distribution_strategy = 'one_device'
    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_fp16_eager')
    FLAGS.batch_size = 232
    FLAGS.dtype = 'fp16'
    FLAGS.use_tf_function = False
    FLAGS.use_tf_while_loop = False
    FLAGS.single_l2_loss_op = True
    self._run_and_report_benchmark()

  def benchmark_8_gpu(self):
    """Test Keras model with 8 GPUs."""
    self._setup()

    FLAGS.num_gpus = 8
    FLAGS.distribution_strategy = 'mirrored'
    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
    FLAGS.batch_size = 128 * 8  # 8 GPUs
    self._run_and_report_benchmark()

  def benchmark_8_gpu_fp32_no_tf32(self):
    """Test Keras model with 8 GPUs. Runs in FP32 by disabling TF32 execution."""
    self._setup()
    # The TF32 switch is process-wide. Remember the previous value and put it
    # back afterwards so benchmarks that run later in the same process are not
    # silently measured with TF32 disabled.
    tf32_was_enabled = (
        tf.config.experimental.tensor_float_32_execution_enabled())
    tf.config.experimental.enable_tensor_float_32_execution(False)
    try:
      FLAGS.num_gpus = 8
      FLAGS.distribution_strategy = 'mirrored'
      FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_fp32_no_tf32')
      FLAGS.batch_size = 128 * 8  # 8 GPUs
      self._run_and_report_benchmark()
    finally:
      tf.config.experimental.enable_tensor_float_32_execution(
          tf32_was_enabled)

  def benchmark_8_gpu_fp16(self):
    """Test Keras model with 8 GPUs with tf.keras mixed precision."""
    self._setup()

    FLAGS.num_gpus = 8
    FLAGS.distribution_strategy = 'mirrored'
    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_fp16')
    FLAGS.batch_size = 256 * 8  # 8 GPUs
    FLAGS.dtype = 'fp16'
    self._run_and_report_benchmark()

  def benchmark_xla_8_gpu_fp16(self):
    """Test Keras model with 8 GPUs, tf.keras mixed precision and XLA enabled."""
    self._setup()

    FLAGS.num_gpus = 8
    FLAGS.distribution_strategy = 'mirrored'
    FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu_fp16')
    FLAGS.batch_size = 256 * 8  # 8 GPUs
    FLAGS.dtype = 'fp16'
    FLAGS.enable_xla = True
    self._run_and_report_benchmark()

  def benchmark_8_gpu_eager(self):
    """Test Keras model with 8 GPUs, eager, fp32."""
    self._setup()

    FLAGS.num_gpus = 8
    FLAGS.use_tf_function = False
    FLAGS.use_tf_while_loop = False
    FLAGS.distribution_strategy = 'mirrored'
    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_eager')
    FLAGS.batch_size = 128
    self._run_and_report_benchmark()

  def benchmark_8_gpu_eager_fp16(self):
    """Test Keras model with 8 GPUs, eager, fp16."""
    self._setup()

    FLAGS.num_gpus = 8
    FLAGS.dtype = 'fp16'
    FLAGS.use_tf_function = False
    FLAGS.use_tf_while_loop = False
    FLAGS.distribution_strategy = 'mirrored'
    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_eager_fp16')
    FLAGS.batch_size = 128
    self._run_and_report_benchmark()

  def _set_df_common(self):
    """Sets the flag values shared by all TPU benchmarks in this class."""
    FLAGS.steps_per_loop = 500
    FLAGS.train_epochs = 2
    FLAGS.train_steps = None
    FLAGS.skip_eval = True
    FLAGS.enable_eager = True
    FLAGS.enable_tensorboard = False
    FLAGS.distribution_strategy = 'tpu'
    FLAGS.report_accuracy_metrics = False
    FLAGS.log_steps = 50
    FLAGS.single_l2_loss_op = True
    FLAGS.use_tf_function = True
    FLAGS.enable_checkpoint_and_export = False
    FLAGS.data_dir = IMAGENET_DEFAULT_DATA_PATH

  def benchmark_2x2_tpu_bf16(self):
    """Run the 2x2 TPU configuration with bf16 and batch size 1024."""
    self._setup()
    self._set_df_common()
    FLAGS.batch_size = 1024
    FLAGS.dtype = 'bf16'
    FLAGS.model_dir = self._get_model_dir('benchmark_2x2_tpu_bf16')
    self._run_and_report_benchmark()

  @owner_utils.Owner('tf-graph-compiler')
  def benchmark_2x2_tpu_bf16_mlir(self):
    """Run the 2x2 TPU bf16 configuration with the MLIR Bridge enabled."""
    self._setup()
    self._set_df_common()
    FLAGS.batch_size = 1024
    FLAGS.dtype = 'bf16'
    tf.config.experimental.enable_mlir_bridge()
    FLAGS.model_dir = self._get_model_dir('benchmark_2x2_tpu_bf16_mlir')
    self._run_and_report_benchmark()

  def benchmark_4x4_tpu_bf16(self):
    """Run the 4x4 TPU bf16 configuration with training dataset caching."""
    self._setup()
    self._set_df_common()
    FLAGS.batch_size = 8192
    FLAGS.train_epochs = 4  # Overrides the 2 epochs set by _set_df_common().
    FLAGS.dtype = 'bf16'
    FLAGS.model_dir = self._get_model_dir('benchmark_4x4_tpu_bf16')
    # Same value _set_df_common() assigns; restated next to the caching flag.
    FLAGS.data_dir = IMAGENET_DEFAULT_DATA_PATH
    FLAGS.training_dataset_cache = True
    self._run_and_report_benchmark()

  @owner_utils.Owner('tf-graph-compiler')
  def benchmark_4x4_tpu_bf16_mlir(self):
    """Run resnet model on 4x4 with the MLIR Bridge enabled."""
    self._setup()
    self._set_df_common()
    FLAGS.batch_size = 4096
    FLAGS.dtype = 'bf16'
    FLAGS.model_dir = self._get_model_dir('benchmark_4x4_tpu_bf16_mlir')
    tf.config.experimental.enable_mlir_bridge()
    self._run_and_report_benchmark()

  def benchmark_8x8_tpu_bf16(self):
    """Run the 8x8 TPU configuration with bf16 and batch size 8192."""
    self._setup()
    self._set_df_common()
    FLAGS.batch_size = 8192
    FLAGS.dtype = 'bf16'
    FLAGS.model_dir = self._get_model_dir('benchmark_8x8_tpu_bf16')
    self._run_and_report_benchmark()

  @owner_utils.Owner('tf-graph-compiler')
  def benchmark_8x8_tpu_bf16_mlir(self):
    """Run the 8x8 TPU bf16 configuration with the MLIR Bridge enabled."""
    self._setup()
    self._set_df_common()
    FLAGS.batch_size = 8192
    FLAGS.dtype = 'bf16'
    FLAGS.model_dir = self._get_model_dir('benchmark_8x8_tpu_bf16_mlir')
    tf.config.experimental.enable_mlir_bridge()
    self._run_and_report_benchmark()

  def benchmark_8x8_tpu(self):
    """Run the 8x8 TPU configuration with batch size 8192 and no dtype override."""
    self._setup()
    self._set_df_common()
    FLAGS.batch_size = 8192
    FLAGS.model_dir = self._get_model_dir('benchmark_8x8_tpu')
    self._run_and_report_benchmark()

  @owner_utils.Owner('tf-graph-compiler')
  def benchmark_8x8_tpu_mlir(self):
    """Run the 8x8 TPU configuration with the MLIR Bridge enabled."""
    self._setup()
    self._set_df_common()
    FLAGS.batch_size = 8192
    FLAGS.model_dir = self._get_model_dir('benchmark_8x8_tpu_mlir')
    tf.config.experimental.enable_mlir_bridge()
    self._run_and_report_benchmark()

  def benchmark_8x16_tpu_bf16(self):
    """Run the 8x16 TPU configuration with bf16 and batch size 8192."""
    self._setup()
    self._set_df_common()
    FLAGS.batch_size = 8192
    FLAGS.dtype = 'bf16'
    FLAGS.model_dir = self._get_model_dir('benchmark_8x16_tpu_bf16')
    self._run_and_report_benchmark()

  def fill_report_object(self, stats):
    """Delegates to the parent fill_report_object with batch size and log steps.

    NOTE(review): CtlBenchmark does not define fill_report_object in this
    file; presumably it is provided further up the hierarchy - confirm.

    Args:
      stats: stats object handed through to the parent implementation.
    """
    super(Resnet50CtlBenchmarkBase, self).fill_report_object(
        stats, total_batch_size=FLAGS.batch_size, log_steps=FLAGS.log_steps)
441
442
443
444
445
446
447
448
449
450


class Resnet50CtlBenchmarkSynth(Resnet50CtlBenchmarkBase):
  """Resnet50 synthetic benchmark tests."""

  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
    """Configures short, eval-free runs on synthetic data.

    Args:
      output_dir: directory where to output e.g. log files.
      root_data_dir: accepted for signature parity with the other benchmark
        classes; not read here.
      **kwargs: arbitrary named arguments, forwarded to the base class.
    """
    synthetic_defaults = {
        'skip_eval': True,
        'use_synthetic_data': True,
        'train_steps': 110,
        'steps_per_loop': 10,
        'log_steps': 10,
    }

    super(Resnet50CtlBenchmarkSynth, self).__init__(
        output_dir=output_dir, default_flags=synthetic_defaults, **kwargs)
456
457
458
459
460
461
462
463


class Resnet50CtlBenchmarkReal(Resnet50CtlBenchmarkBase):
  """Resnet50 real data benchmark tests."""

  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
    """Configures short, eval-free runs on the real ImageNet dataset.

    Args:
      output_dir: directory where to output e.g. log files.
      root_data_dir: directory under which to look for dataset. Required in
        practice: the data_dir default is built as <root_data_dir>/imagenet.
      **kwargs: arbitrary named arguments, forwarded to the base class.

    Raises:
      TypeError: if root_data_dir is None.
    """
    if root_data_dir is None:
      # os.path.join(None, ...) would raise the same exception type with a
      # message that does not name the offending argument.
      raise TypeError(
          'root_data_dir must be a path, not None; it is needed to locate '
          'the imagenet dataset.')

    def_flags = {}
    def_flags['skip_eval'] = True
    def_flags['data_dir'] = os.path.join(root_data_dir, 'imagenet')
    def_flags['train_steps'] = 110
    def_flags['steps_per_loop'] = 10
    def_flags['log_steps'] = 10

    super(Resnet50CtlBenchmarkReal, self).__init__(
        output_dir=output_dir, default_flags=def_flags, **kwargs)
472

473

474
475
# Script entry point: hands control to the TensorFlow test runner.
if __name__ == '__main__':
  tf.test.main()