reference.log 95.9 KB
Newer Older
liangjing's avatar
liangjing committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
nohup: ignoring input
:::MLL 1679230527.145 cache_clear: {"value": true, "metadata": {"lineno": 116, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 12:55:27.145380 140547769902912 mlp_log.py:80] :::MLL 1679230527.145 cache_clear: {"value": true, "metadata": {"lineno": 116, "file": "./resnet_ctl_imagenet_main.py"}}
:::MLL 1679230527.146 init_start: {"value": null, "metadata": {"lineno": 117, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 12:55:27.146378 140547769902912 mlp_log.py:80] :::MLL 1679230527.146 init_start: {"value": null, "metadata": {"lineno": 117, "file": "./resnet_ctl_imagenet_main.py"}}
:::MLL 1679230527.147 submission_benchmark: {"value": "resnet", "metadata": {"lineno": 118, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 12:55:27.147078 140547769902912 mlp_log.py:80] :::MLL 1679230527.147 submission_benchmark: {"value": "resnet", "metadata": {"lineno": 118, "file": "./resnet_ctl_imagenet_main.py"}}
:::MLL 1679230527.148 submission_division: {"value": "closed", "metadata": {"lineno": 119, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 12:55:27.147791 140547769902912 mlp_log.py:80] :::MLL 1679230527.148 submission_division: {"value": "closed", "metadata": {"lineno": 119, "file": "./resnet_ctl_imagenet_main.py"}}
:::MLL 1679230527.148 submission_org: {"value": "google", "metadata": {"lineno": 120, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 12:55:27.148500 140547769902912 mlp_log.py:80] :::MLL 1679230527.148 submission_org: {"value": "google", "metadata": {"lineno": 120, "file": "./resnet_ctl_imagenet_main.py"}}
:::MLL 1679230527.149 submission_platform: {"value": "gpu-v100-8", "metadata": {"lineno": 121, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 12:55:27.149215 140547769902912 mlp_log.py:80] :::MLL 1679230527.149 submission_platform: {"value": "gpu-v100-8", "metadata": {"lineno": 121, "file": "./resnet_ctl_imagenet_main.py"}}
:::MLL 1679230527.150 submission_status: {"value": "cloud", "metadata": {"lineno": 124, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 12:55:27.149919 140547769902912 mlp_log.py:80] :::MLL 1679230527.150 submission_status: {"value": "cloud", "metadata": {"lineno": 124, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 12:55:27.150071 140547769902912 common.py:617] Module ./resnet_ctl_imagenet_main.py:
I0319 12:55:27.150561 140547769902912 common.py:620] 	 flags_obj.use_tf_function = True
I0319 12:55:27.150646 140547769902912 common.py:620] 	 flags_obj.single_l2_loss_op = True
I0319 12:55:27.150727 140547769902912 common.py:620] 	 flags_obj.cache_decoded_image = False
I0319 12:55:27.150808 140547769902912 common.py:620] 	 flags_obj.enable_device_warmup = True
I0319 12:55:27.150889 140547769902912 common.py:620] 	 flags_obj.device_warmup_steps = 1
I0319 12:55:27.150968 140547769902912 common.py:620] 	 flags_obj.num_replicas = 32
I0319 12:55:27.151046 140547769902912 common.py:617] Module absl.app:
I0319 12:55:27.151130 140547769902912 common.py:620] 	 flags_obj.run_with_pdb = False
I0319 12:55:27.151208 140547769902912 common.py:620] 	 flags_obj.pdb_post_mortem = False
I0319 12:55:27.151290 140547769902912 common.py:620] 	 flags_obj.pdb = False
I0319 12:55:27.151383 140547769902912 common.py:620] 	 flags_obj.run_with_profiling = False
I0319 12:55:27.151461 140547769902912 common.py:620] 	 flags_obj.profile_file = None
I0319 12:55:27.151540 140547769902912 common.py:620] 	 flags_obj.use_cprofile_for_profiling = True
I0319 12:55:27.151618 140547769902912 common.py:620] 	 flags_obj.only_check_args = False
I0319 12:55:27.151695 140547769902912 common.py:620] 	 flags_obj.help = False
I0319 12:55:27.151774 140547769902912 common.py:620] 	 flags_obj.helpshort = False
I0319 12:55:27.151850 140547769902912 common.py:620] 	 flags_obj.helpfull = False
I0319 12:55:27.151929 140547769902912 common.py:620] 	 flags_obj.helpxml = False
I0319 12:55:27.152006 140547769902912 common.py:617] Module absl.logging:
I0319 12:55:27.152086 140547769902912 common.py:620] 	 flags_obj.logtostderr = False
I0319 12:55:27.152163 140547769902912 common.py:620] 	 flags_obj.alsologtostderr = False
I0319 12:55:27.152240 140547769902912 common.py:620] 	 flags_obj.log_dir = 
I0319 12:55:27.152339 140547769902912 common.py:620] 	 flags_obj.verbosity = 0
I0319 12:55:27.152423 140547769902912 common.py:620] 	 flags_obj.logger_levels = {}
I0319 12:55:27.152507 140547769902912 common.py:620] 	 flags_obj.stderrthreshold = fatal
I0319 12:55:27.152584 140547769902912 common.py:620] 	 flags_obj.showprefixforinfo = True
I0319 12:55:27.152662 140547769902912 common.py:617] Module absl.testing.absltest:
I0319 12:55:27.152743 140547769902912 common.py:620] 	 flags_obj.test_srcdir = 
I0319 12:55:27.152820 140547769902912 common.py:620] 	 flags_obj.test_tmpdir = /tmp/absl_testing
I0319 12:55:27.152901 140547769902912 common.py:620] 	 flags_obj.test_random_seed = 301
I0319 12:55:27.152981 140547769902912 common.py:620] 	 flags_obj.test_randomize_ordering_seed = 1
I0319 12:55:27.153058 140547769902912 common.py:620] 	 flags_obj.xml_output_file = 
I0319 12:55:27.153135 140547769902912 common.py:617] Module common:
I0319 12:55:27.153217 140547769902912 common.py:620] 	 flags_obj.enable_eager = True
I0319 12:55:27.153294 140547769902912 common.py:620] 	 flags_obj.skip_eval = False
I0319 12:55:27.153382 140547769902912 common.py:620] 	 flags_obj.set_learning_phase_to_train = True
I0319 12:55:27.153460 140547769902912 common.py:620] 	 flags_obj.explicit_gpu_placement = False
I0319 12:55:27.153537 140547769902912 common.py:620] 	 flags_obj.use_trivial_model = False
I0319 12:55:27.153614 140547769902912 common.py:620] 	 flags_obj.report_accuracy_metrics = True
I0319 12:55:27.153692 140547769902912 common.py:620] 	 flags_obj.lr_schedule = polynomial
I0319 12:55:27.153769 140547769902912 common.py:620] 	 flags_obj.enable_tensorboard = False
I0319 12:55:27.153845 140547769902912 common.py:620] 	 flags_obj.train_steps = None
I0319 12:55:27.153923 140547769902912 common.py:620] 	 flags_obj.profile_steps = None
I0319 12:55:27.154000 140547769902912 common.py:620] 	 flags_obj.batchnorm_spatial_persistent = True
I0319 12:55:27.154076 140547769902912 common.py:620] 	 flags_obj.enable_get_next_as_optional = False
I0319 12:55:27.154153 140547769902912 common.py:620] 	 flags_obj.enable_checkpoint_and_export = False
I0319 12:55:27.154229 140547769902912 common.py:620] 	 flags_obj.tpu = 
I0319 12:55:27.154305 140547769902912 common.py:620] 	 flags_obj.tpu_zone = 
I0319 12:55:27.154394 140547769902912 common.py:620] 	 flags_obj.steps_per_loop = 514
I0319 12:55:27.154473 140547769902912 common.py:620] 	 flags_obj.use_tf_while_loop = True
I0319 12:55:27.154549 140547769902912 common.py:620] 	 flags_obj.use_tf_keras_layers = False
I0319 12:55:27.154627 140547769902912 common.py:620] 	 flags_obj.base_learning_rate = 4.9
I0319 12:55:27.154710 140547769902912 common.py:620] 	 flags_obj.optimizer = LARS
I0319 12:55:27.154787 140547769902912 common.py:620] 	 flags_obj.drop_train_remainder = True
I0319 12:55:27.154863 140547769902912 common.py:620] 	 flags_obj.drop_eval_remainder = False
I0319 12:55:27.154940 140547769902912 common.py:620] 	 flags_obj.label_smoothing = 0.1
I0319 12:55:27.155020 140547769902912 common.py:620] 	 flags_obj.num_classes = 1000
I0319 12:55:27.155099 140547769902912 common.py:620] 	 flags_obj.eval_offset_epochs = 3
I0319 12:55:27.155177 140547769902912 common.py:620] 	 flags_obj.target_accuracy = 0.759
I0319 12:55:27.155256 140547769902912 common.py:617] Module lars_util:
I0319 12:55:27.155346 140547769902912 common.py:620] 	 flags_obj.end_learning_rate = None
I0319 12:55:27.155426 140547769902912 common.py:620] 	 flags_obj.lars_epsilon = 0.0
I0319 12:55:27.155504 140547769902912 common.py:620] 	 flags_obj.warmup_epochs = 5.0
I0319 12:55:27.155582 140547769902912 common.py:620] 	 flags_obj.momentum = 0.9
I0319 12:55:27.155662 140547769902912 common.py:617] Module resnet_model:
I0319 12:55:27.155743 140547769902912 common.py:620] 	 flags_obj.weight_decay = 0.0002
I0319 12:55:27.155822 140547769902912 common.py:620] 	 flags_obj.num_accumulation_steps = 1
I0319 12:55:27.155900 140547769902912 common.py:617] Module resnet_runnable:
I0319 12:55:27.155981 140547769902912 common.py:620] 	 flags_obj.trace_warmup = False
I0319 12:55:27.156070 140547769902912 common.py:617] Module tensorflow.python.ops.parallel_for.pfor:
I0319 12:55:27.156152 140547769902912 common.py:620] 	 flags_obj.op_conversion_fallback_to_while_loop = True
I0319 12:55:27.156228 140547769902912 common.py:617] Module tensorflow.python.tpu.client.client:
I0319 12:55:27.156317 140547769902912 common.py:620] 	 flags_obj.runtime_oom_exit = True
I0319 12:55:27.156397 140547769902912 common.py:620] 	 flags_obj.hbm_oom_exit = True
I0319 12:55:27.156476 140547769902912 common.py:617] Module tf2_common.utils.flags._base:
I0319 12:55:27.156557 140547769902912 common.py:620] 	 flags_obj.data_dir = /data/tf-imagenet/imagenet
I0319 12:55:27.156634 140547769902912 common.py:620] 	 flags_obj.model_dir = /tmp
I0319 12:55:27.156712 140547769902912 common.py:620] 	 flags_obj.clean = False
I0319 12:55:27.156790 140547769902912 common.py:620] 	 flags_obj.train_epochs = 70
I0319 12:55:27.156867 140547769902912 common.py:620] 	 flags_obj.epochs_between_evals = 4
I0319 12:55:27.156945 140547769902912 common.py:620] 	 flags_obj.batch_size = 2496
I0319 12:55:27.157022 140547769902912 common.py:620] 	 flags_obj.num_gpus = 8
I0319 12:55:27.157100 140547769902912 common.py:620] 	 flags_obj.run_eagerly = False
I0319 12:55:27.157177 140547769902912 common.py:620] 	 flags_obj.distribution_strategy = mirrored
I0319 12:55:27.157255 140547769902912 common.py:617] Module tf2_common.utils.flags._benchmark:
I0319 12:55:27.157347 140547769902912 common.py:620] 	 flags_obj.benchmark_logger_type = BaseBenchmarkLogger
I0319 12:55:27.157434 140547769902912 common.py:620] 	 flags_obj.benchmark_test_id = None
I0319 12:55:27.157512 140547769902912 common.py:620] 	 flags_obj.log_steps = 125
I0319 12:55:27.157588 140547769902912 common.py:620] 	 flags_obj.benchmark_log_dir = None
I0319 12:55:27.157666 140547769902912 common.py:620] 	 flags_obj.gcp_project = None
I0319 12:55:27.157744 140547769902912 common.py:620] 	 flags_obj.bigquery_data_set = test_benchmark
I0319 12:55:27.157821 140547769902912 common.py:620] 	 flags_obj.bigquery_run_table = benchmark_run
I0319 12:55:27.157899 140547769902912 common.py:620] 	 flags_obj.bigquery_run_status_table = benchmark_run_status
I0319 12:55:27.157977 140547769902912 common.py:620] 	 flags_obj.bigquery_metric_table = benchmark_metric
I0319 12:55:27.158053 140547769902912 common.py:617] Module tf2_common.utils.flags._distribution:
I0319 12:55:27.158134 140547769902912 common.py:620] 	 flags_obj.worker_hosts = None
I0319 12:55:27.158211 140547769902912 common.py:620] 	 flags_obj.task_index = -1
I0319 12:55:27.158288 140547769902912 common.py:617] Module tf2_common.utils.flags._misc:
I0319 12:55:27.158379 140547769902912 common.py:620] 	 flags_obj.data_format = None
I0319 12:55:27.158457 140547769902912 common.py:617] Module tf2_common.utils.flags._performance:
I0319 12:55:27.158539 140547769902912 common.py:620] 	 flags_obj.use_synthetic_data = False
I0319 12:55:27.158615 140547769902912 common.py:620] 	 flags_obj.dtype = fp16
I0319 12:55:27.158691 140547769902912 common.py:620] 	 flags_obj.loss_scale = None
I0319 12:55:27.158768 140547769902912 common.py:620] 	 flags_obj.fp16_implementation = keras
I0319 12:55:27.158844 140547769902912 common.py:620] 	 flags_obj.all_reduce_alg = nccl
I0319 12:55:27.158921 140547769902912 common.py:620] 	 flags_obj.num_packs = 1
I0319 12:55:27.158999 140547769902912 common.py:620] 	 flags_obj.tf_gpu_thread_mode = gpu_private
I0319 12:55:27.159075 140547769902912 common.py:620] 	 flags_obj.per_gpu_thread_count = 0
I0319 12:55:27.159153 140547769902912 common.py:620] 	 flags_obj.datasets_num_private_threads = 32
I0319 12:55:27.159230 140547769902912 common.py:620] 	 flags_obj.training_dataset_cache = True
I0319 12:55:27.159306 140547769902912 common.py:620] 	 flags_obj.training_prefetch_batchs = 128
I0319 12:55:27.159394 140547769902912 common.py:620] 	 flags_obj.eval_dataset_cache = True
I0319 12:55:27.159471 140547769902912 common.py:620] 	 flags_obj.eval_prefetch_batchs = 192
I0319 12:55:27.159548 140547769902912 common.py:620] 	 flags_obj.tf_data_experimental_slack = False
I0319 12:55:27.159631 140547769902912 common.py:620] 	 flags_obj.enable_xla = False
I0319 12:55:27.159710 140547769902912 common.py:620] 	 flags_obj.force_v2_in_keras_compile = None
WARNING:tensorflow:Mixed precision compatibility check (mixed_float16): WARNING
Your GPUs may run slowly with dtype policy mixed_float16 because they do not have compute capability of at least 7.0. Your GPUs:
  Z100L, no compute capability (probably not an Nvidia GPU) (x8)
See https://developer.nvidia.com/cuda-gpus for a list of GPUs and their compute capabilities.
If you will use compatible GPU(s) not attached to this host, e.g. by running a multi-worker model, you can ignore this warning. This message will only be logged once
W0319 12:55:27.160811 140547769902912 device_compatibility_check.py:107] Mixed precision compatibility check (mixed_float16): WARNING
Your GPUs may run slowly with dtype policy mixed_float16 because they do not have compute capability of at least 7.0. Your GPUs:
  Z100L, no compute capability (probably not an Nvidia GPU) (x8)
See https://developer.nvidia.com/cuda-gpus for a list of GPUs and their compute capabilities.
If you will use compatible GPU(s) not attached to this host, e.g. by running a multi-worker model, you can ignore this warning. This message will only be logged once
I0319 12:55:27.161139 140547769902912 keras_utils.py:243] Logical CPU cores: 128
I0319 12:55:27.161378 140547769902912 keras_utils.py:249] TF_GPU_THREAD_COUNT: 2
I0319 12:55:27.161468 140547769902912 keras_utils.py:251] TF_GPU_THREAD_MODE: gpu_private
I0319 12:55:27.161551 140547769902912 keras_utils.py:261] Recommended datasets_num_private_threads: 64
2023-03-19 12:55:27.162998: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE3 SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-19 12:55:27.181835: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2023-03-19 12:55:27.181964: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 32252 MB memory:  -> device: 0, name: Z100L, pci bus id: 0000:07:00.0
2023-03-19 12:55:27.582374: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2023-03-19 12:55:27.582493: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 32252 MB memory:  -> device: 1, name: Z100L, pci bus id: 0000:0a:00.0
2023-03-19 12:55:27.961772: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2023-03-19 12:55:27.961893: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:2 with 32252 MB memory:  -> device: 2, name: Z100L, pci bus id: 0000:15:00.0
2023-03-19 12:55:28.339247: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2023-03-19 12:55:28.339376: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:3 with 32252 MB memory:  -> device: 3, name: Z100L, pci bus id: 0000:0f:00.0
2023-03-19 12:55:28.719486: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2023-03-19 12:55:28.719627: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:4 with 32252 MB memory:  -> device: 4, name: Z100L, pci bus id: 0000:85:00.0
2023-03-19 12:55:29.097492: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2023-03-19 12:55:29.097606: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:5 with 32252 MB memory:  -> device: 5, name: Z100L, pci bus id: 0000:7f:00.0
2023-03-19 12:55:29.475299: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2023-03-19 12:55:29.475428: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:6 with 32252 MB memory:  -> device: 6, name: Z100L, pci bus id: 0000:77:00.0
2023-03-19 12:55:29.855076: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2023-03-19 12:55:29.855191: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:7 with 32252 MB memory:  -> device: 7, name: Z100L, pci bus id: 0000:7a:00.0
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3', '/job:localhost/replica:0/task:0/device:GPU:4', '/job:localhost/replica:0/task:0/device:GPU:5', '/job:localhost/replica:0/task:0/device:GPU:6', '/job:localhost/replica:0/task:0/device:GPU:7')
I0319 12:55:30.261204 140547769902912 mirrored_strategy.py:376] Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3', '/job:localhost/replica:0/task:0/device:GPU:4', '/job:localhost/replica:0/task:0/device:GPU:5', '/job:localhost/replica:0/task:0/device:GPU:6', '/job:localhost/replica:0/task:0/device:GPU:7')
num_index -1
enter the tf.float16 set policy
Compute dtype: float16
Variable dtype: float32
:::MLL 1679230530.264 global_batch_size: {"value": 2496, "metadata": {"lineno": 190, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 12:55:30.263783 140547769902912 mlp_log.py:80] :::MLL 1679230530.264 global_batch_size: {"value": 2496, "metadata": {"lineno": 190, "file": "./resnet_ctl_imagenet_main.py"}}
:::MLL 1679230530.265 train_samples: {"value": 1281167, "metadata": {"lineno": 191, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 12:55:30.264862 140547769902912 mlp_log.py:80] :::MLL 1679230530.265 train_samples: {"value": 1281167, "metadata": {"lineno": 191, "file": "./resnet_ctl_imagenet_main.py"}}
:::MLL 1679230530.266 eval_samples: {"value": 50000, "metadata": {"lineno": 193, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 12:55:30.265909 140547769902912 mlp_log.py:80] :::MLL 1679230530.266 eval_samples: {"value": 50000, "metadata": {"lineno": 193, "file": "./resnet_ctl_imagenet_main.py"}}
:::MLL 1679230530.267 model_bn_span: {"value": 312, "metadata": {"lineno": 195, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 12:55:30.266957 140547769902912 mlp_log.py:80] :::MLL 1679230530.267 model_bn_span: {"value": 312, "metadata": {"lineno": 195, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 12:55:30.267157 140547769902912 resnet_ctl_imagenet_main.py:204] Training 71 epochs, each epoch has 513 steps, total steps: 36423; Eval 21 steps
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0319 12:55:30.377633 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0319 12:55:30.390385 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0319 12:55:30.400095 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0319 12:55:30.402572 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0319 12:55:30.414422 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0319 12:55:30.426609 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0319 12:55:30.486386 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0319 12:55:30.488949 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0319 12:55:30.497610 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0319 12:55:30.500023 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
:::MLL 1679230535.378 opt_name: {"value": "lars", "metadata": {"lineno": 101, "file": "/root/resnet50/lars_util.py"}}
I0319 12:55:35.377869 140547769902912 mlp_log.py:80] :::MLL 1679230535.378 opt_name: {"value": "lars", "metadata": {"lineno": 101, "file": "/root/resnet50/lars_util.py"}}
:::MLL 1679230535.379 lars_epsilon: {"value": 0.0, "metadata": {"lineno": 103, "file": "/root/resnet50/lars_util.py"}}
I0319 12:55:35.378870 140547769902912 mlp_log.py:80] :::MLL 1679230535.379 lars_epsilon: {"value": 0.0, "metadata": {"lineno": 103, "file": "/root/resnet50/lars_util.py"}}
:::MLL 1679230535.380 lars_opt_weight_decay: {"value": 0.0002, "metadata": {"lineno": 104, "file": "/root/resnet50/lars_util.py"}}
I0319 12:55:35.379752 140547769902912 mlp_log.py:80] :::MLL 1679230535.380 lars_opt_weight_decay: {"value": 0.0002, "metadata": {"lineno": 104, "file": "/root/resnet50/lars_util.py"}}
:::MLL 1679230535.381 lars_opt_base_learning_rate: {"value": 4.9, "metadata": {"lineno": 106, "file": "/root/resnet50/lars_util.py"}}
I0319 12:55:35.380624 140547769902912 mlp_log.py:80] :::MLL 1679230535.381 lars_opt_base_learning_rate: {"value": 4.9, "metadata": {"lineno": 106, "file": "/root/resnet50/lars_util.py"}}
:::MLL 1679230535.381 lars_opt_learning_rate_warmup_epochs: {"value": 5.0, "metadata": {"lineno": 108, "file": "/root/resnet50/lars_util.py"}}
I0319 12:55:35.381502 140547769902912 mlp_log.py:80] :::MLL 1679230535.381 lars_opt_learning_rate_warmup_epochs: {"value": 5.0, "metadata": {"lineno": 108, "file": "/root/resnet50/lars_util.py"}}
:::MLL 1679230535.382 lars_opt_end_learning_rate: {"value": 0.0001, "metadata": {"lineno": 110, "file": "/root/resnet50/lars_util.py"}}
I0319 12:55:35.382365 140547769902912 mlp_log.py:80] :::MLL 1679230535.382 lars_opt_end_learning_rate: {"value": 0.0001, "metadata": {"lineno": 110, "file": "/root/resnet50/lars_util.py"}}
:::MLL 1679230535.384 lars_opt_learning_rate_decay_steps: {"value": 33346, "metadata": {"lineno": 115, "file": "/root/resnet50/lars_util.py"}}
I0319 12:55:35.383680 140547769902912 mlp_log.py:80] :::MLL 1679230535.384 lars_opt_learning_rate_decay_steps: {"value": 33346, "metadata": {"lineno": 115, "file": "/root/resnet50/lars_util.py"}}
:::MLL 1679230535.385 lars_opt_learning_rate_decay_poly_power: {"value": 2.0, "metadata": {"lineno": 117, "file": "/root/resnet50/lars_util.py"}}
I0319 12:55:35.384541 140547769902912 mlp_log.py:80] :::MLL 1679230535.385 lars_opt_learning_rate_decay_poly_power: {"value": 2.0, "metadata": {"lineno": 117, "file": "/root/resnet50/lars_util.py"}}
:::MLL 1679230535.385 lars_opt_momentum: {"value": 0.9, "metadata": {"lineno": 119, "file": "/root/resnet50/lars_util.py"}}
I0319 12:55:35.385398 140547769902912 mlp_log.py:80] :::MLL 1679230535.385 lars_opt_momentum: {"value": 0.9, "metadata": {"lineno": 119, "file": "/root/resnet50/lars_util.py"}}
I0319 12:55:35.494630 140547769902912 resnet_ctl_imagenet_main.py:238] Warmup for 1 steps.
I0319 12:55:35.496956 140547769902912 controller.py:340] Warmup at step 0 of 1
I0319 12:55:35.497112 140547769902912 controller.py:345] Entering warmup loop with 1 steps, at step 0 of 1
WARNING:tensorflow:From /root/resnet50/tf2_common/training/utils.py:139: StrategyBase.experimental_distribute_datasets_from_function (from tensorflow.python.distribute.distribute_lib) is deprecated and will be removed in a future version.
Instructions for updating:
rename to distribute_datasets_from_function
W0319 12:55:35.497444 140547769902912 deprecation.py:341] From /root/resnet50/tf2_common/training/utils.py:139: StrategyBase.experimental_distribute_datasets_from_function (from tensorflow.python.distribute.distribute_lib) is deprecated and will be removed in a future version.
Instructions for updating:
rename to distribute_datasets_from_function
I0319 12:55:35.897564 140547769902912 resnet_runnable.py:484] Entering the warmup loop.
WARNING:tensorflow:From /usr/local/lib/python3.8/site-packages/tensorflow/python/autograph/impl/api.py:464: calling function (from tensorflow.python.eager.def_function) with experimental_compile is deprecated and will be removed in a future version.
Instructions for updating:
experimental_compile is deprecated, use jit_compile instead
W0319 12:55:37.124004 140547769902912 deprecation.py:545] From /usr/local/lib/python3.8/site-packages/tensorflow/python/autograph/impl/api.py:464: calling function (from tensorflow.python.eager.def_function) with experimental_compile is deprecated and will be removed in a future version.
Instructions for updating:
experimental_compile is deprecated, use jit_compile instead
INFO:tensorflow:batch_all_reduce: 161 all-reduces with algorithm = nccl, num_packs = 1
I0319 12:55:55.412617 140547769902912 cross_device_ops.py:900] batch_all_reduce: 161 all-reduces with algorithm = nccl, num_packs = 1
INFO:tensorflow:batch_all_reduce: 161 all-reduces with algorithm = nccl, num_packs = 1
I0319 12:56:48.352646 140547769902912 cross_device_ops.py:900] batch_all_reduce: 161 all-reduces with algorithm = nccl, num_packs = 1
I0319 13:00:32.592645 140547769902912 resnet_runnable.py:497] Exiting the warmup loop.
I0319 13:00:32.595108 140547769902912 controller.py:220] step: 1        steps_per_second: 0.00
enter fp16 computing
step: 1        steps_per_second: 0.00
:::MLL 1679230832.596 init_stop: {"value": null, "metadata": {"lineno": 258, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 13:00:32.596201 140547769902912 mlp_log.py:80] :::MLL 1679230832.596 init_stop: {"value": null, "metadata": {"lineno": 258, "file": "./resnet_ctl_imagenet_main.py"}}
:::MLL 1679230832.597 run_start: {"value": null, "metadata": {"lineno": 267, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 13:00:32.596997 140547769902912 mlp_log.py:80] :::MLL 1679230832.597 run_start: {"value": null, "metadata": {"lineno": 267, "file": "./resnet_ctl_imagenet_main.py"}}
:::MLL 1679230832.598 block_start: {"value": null, "metadata": {"first_epoch_num": 1, "epoch_count": 3, "lineno": 268, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 13:00:32.597745 140547769902912 mlp_log.py:80] :::MLL 1679230832.598 block_start: {"value": null, "metadata": {"first_epoch_num": 1, "epoch_count": 3, "lineno": 268, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 13:00:32.599620 140547769902912 controller.py:247] Train at step 0 of 36423
I0319 13:00:32.599745 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 0 of 36423
I0319 13:00:32.612586 140547769902912 imagenet_preprocessing.py:338] Sharding the dataset: input_pipeline_id=0 num_input_pipelines=1
W0319 13:00:32.634842 140547769902912 options.py:503] options.experimental_threading is deprecated. Use options.threading instead.
I0319 13:00:32.636068 140547769902912 imagenet_preprocessing.py:104] datasets_num_private_threads: 32
I0319 13:00:32.637336 140547769902912 imagenet_preprocessing.py:118] Num classes: 1000
I0319 13:00:32.637444 140547769902912 imagenet_preprocessing.py:119] One hot: True
I0319 13:08:32.765698 140547769902912 keras_utils.py:120] TimeHistory: 2676.05 examples/second between steps 0 and 513
I0319 13:08:32.769956 140547769902912 controller.py:220] step: 513        steps_per_second: 1.07        {'train_loss': 101.53466, 'train_accuracy': 0.025109181}
I0319 13:08:32.770123 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 513 of 36423
I0319 13:16:30.476807 140547769902912 keras_utils.py:120] TimeHistory: 2680.53 examples/second between steps 513 and 1026
I0319 13:16:30.481098 140547769902912 controller.py:220] step: 1026        steps_per_second: 1.07        {'train_loss': 80.75745, 'train_accuracy': 0.13648738}
I0319 13:16:30.481256 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 1026 of 36423
I0319 13:24:28.062501 140547769902912 keras_utils.py:120] TimeHistory: 2681.24 examples/second between steps 1026 and 1539
I0319 13:24:28.066748 140547769902912 controller.py:220] step: 1539        steps_per_second: 1.07        {'train_loss': 68.72967, 'train_accuracy': 0.25144324}
I0319 13:24:28.066913 140547769902912 controller.py:185] Start evaluation at step: 1539
I0319 13:24:28.070569 140547769902912 imagenet_preprocessing.py:338] Sharding the dataset: input_pipeline_id=0 num_input_pipelines=1
W0319 13:24:28.088642 140547769902912 options.py:503] options.experimental_threading is deprecated. Use options.threading instead.
I0319 13:24:28.089705 140547769902912 imagenet_preprocessing.py:104] datasets_num_private_threads: 32
I0319 13:24:28.089835 140547769902912 imagenet_preprocessing.py:118] Num classes: 1000
I0319 13:24:28.089923 140547769902912 imagenet_preprocessing.py:119] One hot: True
step: 513        steps_per_second: 1.07        {'train_loss': 101.53466, 'train_accuracy': 0.025109181}
step: 1026        steps_per_second: 1.07        {'train_loss': 80.75745, 'train_accuracy': 0.13648738}
step: 1539        steps_per_second: 1.07        {'train_loss': 68.72967, 'train_accuracy': 0.25144324}
:::MLL 1679232268.928 eval_start: {"value": null, "metadata": {"epoch_num": 3, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 13:24:28.927603 140547769902912 mlp_log.py:80] :::MLL 1679232268.928 eval_start: {"value": null, "metadata": {"epoch_num": 3, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679232301.308 eval_stop: {"value": null, "metadata": {"epoch_num": 3, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 13:25:01.308466 140547769902912 mlp_log.py:80] :::MLL 1679232301.308 eval_stop: {"value": null, "metadata": {"epoch_num": 3, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679232301.317 eval_accuracy: {"value": 0.255840003490448, "metadata": {"epoch_num": 3, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 13:25:01.317326 140547769902912 mlp_log.py:80] :::MLL 1679232301.317 eval_accuracy: {"value": 0.255840003490448, "metadata": {"epoch_num": 3, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679232301.318 block_stop: {"value": null, "metadata": {"first_epoch_num": 1, "epoch_count": 3, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 13:25:01.318364 140547769902912 mlp_log.py:80] :::MLL 1679232301.318 block_stop: {"value": null, "metadata": {"first_epoch_num": 1, "epoch_count": 3, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679232301.319 block_start: {"value": null, "metadata": {"first_epoch_num": 4, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 13:25:01.319331 140547769902912 mlp_log.py:80] :::MLL 1679232301.319 block_start: {"value": null, "metadata": {"first_epoch_num": 4, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 13:25:01.329561 140547769902912 controller.py:220] step: 1539        evaluation metric: {'test_loss': 0.49958566, 'test_accuracy': 0.25584, 'continue_training': True}
I0319 13:25:01.329745 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 1539 of 36423
I0319 13:32:58.584241 140547769902912 keras_utils.py:120] TimeHistory: 2683.07 examples/second between steps 1539 and 2052
I0319 13:32:58.588519 140547769902912 controller.py:220] step: 2052        steps_per_second: 1.00        {'train_loss': 61.880257, 'train_accuracy': 0.3291465}
I0319 13:32:58.588680 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 2052 of 36423
I0319 13:40:56.833560 140547769902912 keras_utils.py:120] TimeHistory: 2677.52 examples/second between steps 2052 and 2565
I0319 13:40:56.837803 140547769902912 controller.py:220] step: 2565        steps_per_second: 1.07        {'train_loss': 58.142868, 'train_accuracy': 0.3752999}
I0319 13:40:56.837963 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 2565 of 36423
I0319 13:48:55.233101 140547769902912 keras_utils.py:120] TimeHistory: 2676.68 examples/second between steps 2565 and 3078
I0319 13:48:55.237374 140547769902912 controller.py:220] step: 3078        steps_per_second: 1.07        {'train_loss': 55.290226, 'train_accuracy': 0.41178867}
I0319 13:48:55.237531 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 3078 of 36423
I0319 13:56:53.574455 140547769902912 keras_utils.py:120] TimeHistory: 2677.00 examples/second between steps 3078 and 3591
I0319 13:56:53.578727 140547769902912 controller.py:220] step: 3591        steps_per_second: 1.07        {'train_loss': 52.677834, 'train_accuracy': 0.4466218}
I0319 13:56:53.578876 140547769902912 controller.py:185] Start evaluation at step: 3591
step: 1539        evaluation metric: {'test_loss': 0.49958566, 'test_accuracy': 0.25584, 'continue_training': True}
step: 2052        steps_per_second: 1.00        {'train_loss': 61.880257, 'train_accuracy': 0.3291465}
step: 2565        steps_per_second: 1.07        {'train_loss': 58.142868, 'train_accuracy': 0.3752999}
step: 3078        steps_per_second: 1.07        {'train_loss': 55.290226, 'train_accuracy': 0.41178867}
step: 3591        steps_per_second: 1.07        {'train_loss': 52.677834, 'train_accuracy': 0.4466218}
:::MLL 1679234214.081 eval_start: {"value": null, "metadata": {"epoch_num": 7, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 13:56:54.080654 140547769902912 mlp_log.py:80] :::MLL 1679234214.081 eval_start: {"value": null, "metadata": {"epoch_num": 7, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679234225.254 eval_stop: {"value": null, "metadata": {"epoch_num": 7, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 13:57:05.254401 140547769902912 mlp_log.py:80] :::MLL 1679234225.254 eval_stop: {"value": null, "metadata": {"epoch_num": 7, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679234225.261 eval_accuracy: {"value": 0.4514999985694885, "metadata": {"epoch_num": 7, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 13:57:05.261220 140547769902912 mlp_log.py:80] :::MLL 1679234225.261 eval_accuracy: {"value": 0.4514999985694885, "metadata": {"epoch_num": 7, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679234225.262 block_stop: {"value": null, "metadata": {"first_epoch_num": 4, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 13:57:05.262227 140547769902912 mlp_log.py:80] :::MLL 1679234225.262 block_stop: {"value": null, "metadata": {"first_epoch_num": 4, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679234225.263 block_start: {"value": null, "metadata": {"first_epoch_num": 8, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 13:57:05.263200 140547769902912 mlp_log.py:80] :::MLL 1679234225.263 block_start: {"value": null, "metadata": {"first_epoch_num": 8, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 13:57:05.272903 140547769902912 controller.py:220] step: 3591        evaluation metric: {'test_loss': 0.38534293, 'test_accuracy': 0.4515, 'continue_training': True}
I0319 13:57:05.273066 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 3591 of 36423
I0319 14:05:03.201216 140547769902912 keras_utils.py:120] TimeHistory: 2679.28 examples/second between steps 3591 and 4104
I0319 14:05:03.205459 140547769902912 controller.py:220] step: 4104        steps_per_second: 1.05        {'train_loss': 50.85758, 'train_accuracy': 0.47165993}
I0319 14:05:03.205613 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 4104 of 36423
I0319 14:13:01.703775 140547769902912 keras_utils.py:120] TimeHistory: 2676.10 examples/second between steps 4104 and 4617
I0319 14:13:01.707995 140547769902912 controller.py:220] step: 4617        steps_per_second: 1.07        {'train_loss': 49.526817, 'train_accuracy': 0.48903587}
I0319 14:13:01.708152 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 4617 of 36423
I0319 14:20:58.757003 140547769902912 keras_utils.py:120] TimeHistory: 2684.23 examples/second between steps 4617 and 5130
I0319 14:20:58.761198 140547769902912 controller.py:220] step: 5130        steps_per_second: 1.08        {'train_loss': 48.474247, 'train_accuracy': 0.5037846}
I0319 14:20:58.761370 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 5130 of 36423
I0319 14:28:56.838135 140547769902912 keras_utils.py:120] TimeHistory: 2678.46 examples/second between steps 5130 and 5643
I0319 14:28:56.842247 140547769902912 controller.py:220] step: 5643        steps_per_second: 1.07        {'train_loss': 47.524445, 'train_accuracy': 0.517012}
I0319 14:28:56.842405 140547769902912 controller.py:185] Start evaluation at step: 5643
step: 3591        evaluation metric: {'test_loss': 0.38534293, 'test_accuracy': 0.4515, 'continue_training': True}
step: 4104        steps_per_second: 1.05        {'train_loss': 50.85758, 'train_accuracy': 0.47165993}
step: 4617        steps_per_second: 1.07        {'train_loss': 49.526817, 'train_accuracy': 0.48903587}
step: 5130        steps_per_second: 1.08        {'train_loss': 48.474247, 'train_accuracy': 0.5037846}
step: 5643        steps_per_second: 1.07        {'train_loss': 47.524445, 'train_accuracy': 0.517012}
:::MLL 1679236137.347 eval_start: {"value": null, "metadata": {"epoch_num": 11, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 14:28:57.346966 140547769902912 mlp_log.py:80] :::MLL 1679236137.347 eval_start: {"value": null, "metadata": {"epoch_num": 11, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679236148.307 eval_stop: {"value": null, "metadata": {"epoch_num": 11, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 14:29:08.307533 140547769902912 mlp_log.py:80] :::MLL 1679236148.307 eval_stop: {"value": null, "metadata": {"epoch_num": 11, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679236148.314 eval_accuracy: {"value": 0.5169399976730347, "metadata": {"epoch_num": 11, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 14:29:08.314471 140547769902912 mlp_log.py:80] :::MLL 1679236148.314 eval_accuracy: {"value": 0.5169399976730347, "metadata": {"epoch_num": 11, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679236148.315 block_stop: {"value": null, "metadata": {"first_epoch_num": 8, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 14:29:08.315475 140547769902912 mlp_log.py:80] :::MLL 1679236148.315 block_stop: {"value": null, "metadata": {"first_epoch_num": 8, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679236148.316 block_start: {"value": null, "metadata": {"first_epoch_num": 12, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 14:29:08.316439 140547769902912 mlp_log.py:80] :::MLL 1679236148.316 block_start: {"value": null, "metadata": {"first_epoch_num": 12, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 14:29:08.326488 140547769902912 controller.py:220] step: 5643        evaluation metric: {'test_loss': 0.34546962, 'test_accuracy': 0.51694, 'continue_training': True}
I0319 14:29:08.326648 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 5643 of 36423
I0319 14:37:05.725753 140547769902912 keras_utils.py:120] TimeHistory: 2682.26 examples/second between steps 5643 and 6156
I0319 14:37:05.729918 140547769902912 controller.py:220] step: 6156        steps_per_second: 1.05        {'train_loss': 46.72335, 'train_accuracy': 0.5281839}
I0319 14:37:05.730074 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 6156 of 36423
I0319 14:45:03.411590 140547769902912 keras_utils.py:120] TimeHistory: 2680.68 examples/second between steps 6156 and 6669
I0319 14:45:03.415779 140547769902912 controller.py:220] step: 6669        steps_per_second: 1.07        {'train_loss': 46.06021, 'train_accuracy': 0.53761417}
I0319 14:45:03.415935 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 6669 of 36423
I0319 14:53:02.156559 140547769902912 keras_utils.py:120] TimeHistory: 2674.74 examples/second between steps 6669 and 7182
I0319 14:53:02.160710 140547769902912 controller.py:220] step: 7182        steps_per_second: 1.07        {'train_loss': 45.366295, 'train_accuracy': 0.54672974}
I0319 14:53:02.160865 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 7182 of 36423
I0319 15:01:00.511001 140547769902912 keras_utils.py:120] TimeHistory: 2676.93 examples/second between steps 7182 and 7695
I0319 15:01:00.515219 140547769902912 controller.py:220] step: 7695        steps_per_second: 1.07        {'train_loss': 44.782856, 'train_accuracy': 0.5550253}
I0319 15:01:00.517019 140547769902912 controller.py:185] Start evaluation at step: 7695
step: 5643        evaluation metric: {'test_loss': 0.34546962, 'test_accuracy': 0.51694, 'continue_training': True}
step: 6156        steps_per_second: 1.05        {'train_loss': 46.72335, 'train_accuracy': 0.5281839}
step: 6669        steps_per_second: 1.07        {'train_loss': 46.06021, 'train_accuracy': 0.53761417}
step: 7182        steps_per_second: 1.07        {'train_loss': 45.366295, 'train_accuracy': 0.54672974}
step: 7695        steps_per_second: 1.07        {'train_loss': 44.782856, 'train_accuracy': 0.5550253}
:::MLL 1679238061.002 eval_start: {"value": null, "metadata": {"epoch_num": 15, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 15:01:01.002238 140547769902912 mlp_log.py:80] :::MLL 1679238061.002 eval_start: {"value": null, "metadata": {"epoch_num": 15, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679238071.832 eval_stop: {"value": null, "metadata": {"epoch_num": 15, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 15:01:11.832513 140547769902912 mlp_log.py:80] :::MLL 1679238071.832 eval_stop: {"value": null, "metadata": {"epoch_num": 15, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679238071.839 eval_accuracy: {"value": 0.5540599822998047, "metadata": {"epoch_num": 15, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 15:01:11.839387 140547769902912 mlp_log.py:80] :::MLL 1679238071.839 eval_accuracy: {"value": 0.5540599822998047, "metadata": {"epoch_num": 15, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679238071.840 block_stop: {"value": null, "metadata": {"first_epoch_num": 12, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 15:01:11.840405 140547769902912 mlp_log.py:80] :::MLL 1679238071.840 block_stop: {"value": null, "metadata": {"first_epoch_num": 12, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679238071.841 block_start: {"value": null, "metadata": {"first_epoch_num": 16, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 15:01:11.841379 140547769902912 mlp_log.py:80] :::MLL 1679238071.841 block_start: {"value": null, "metadata": {"first_epoch_num": 16, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 15:01:11.851153 140547769902912 controller.py:220] step: 7695        evaluation metric: {'test_loss': 0.3284506, 'test_accuracy': 0.55406, 'continue_training': True}
I0319 15:01:11.851322 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 7695 of 36423
I0319 15:09:09.903125 140547769902912 keras_utils.py:120] TimeHistory: 2678.59 examples/second between steps 7695 and 8208
I0319 15:09:09.907292 140547769902912 controller.py:220] step: 8208        steps_per_second: 1.05        {'train_loss': 44.193314, 'train_accuracy': 0.56363946}
I0319 15:09:09.907462 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 8208 of 36423
I0319 15:17:08.328512 140547769902912 keras_utils.py:120] TimeHistory: 2676.53 examples/second between steps 8208 and 8721
I0319 15:17:08.332779 140547769902912 controller.py:220] step: 8721        steps_per_second: 1.07        {'train_loss': 43.65782, 'train_accuracy': 0.5716288}
I0319 15:17:08.332940 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 8721 of 36423
I0319 15:25:06.558547 140547769902912 keras_utils.py:120] TimeHistory: 2677.62 examples/second between steps 8721 and 9234
I0319 15:25:06.562764 140547769902912 controller.py:220] step: 9234        steps_per_second: 1.07        {'train_loss': 43.085396, 'train_accuracy': 0.5789591}
I0319 15:25:06.562925 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 9234 of 36423
I0319 15:33:04.438484 140547769902912 keras_utils.py:120] TimeHistory: 2679.59 examples/second between steps 9234 and 9747
I0319 15:33:04.442654 140547769902912 controller.py:220] step: 9747        steps_per_second: 1.07        {'train_loss': 42.59366, 'train_accuracy': 0.58631825}
I0319 15:33:04.442804 140547769902912 controller.py:185] Start evaluation at step: 9747
step: 7695        evaluation metric: {'test_loss': 0.3284506, 'test_accuracy': 0.55406, 'continue_training': True}
step: 8208        steps_per_second: 1.05        {'train_loss': 44.193314, 'train_accuracy': 0.56363946}
step: 8721        steps_per_second: 1.07        {'train_loss': 43.65782, 'train_accuracy': 0.5716288}
step: 9234        steps_per_second: 1.07        {'train_loss': 43.085396, 'train_accuracy': 0.5789591}
step: 9747        steps_per_second: 1.07        {'train_loss': 42.59366, 'train_accuracy': 0.58631825}
:::MLL 1679239984.931 eval_start: {"value": null, "metadata": {"epoch_num": 19, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 15:33:04.930735 140547769902912 mlp_log.py:80] :::MLL 1679239984.931 eval_start: {"value": null, "metadata": {"epoch_num": 19, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679239996.094 eval_stop: {"value": null, "metadata": {"epoch_num": 19, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 15:33:16.094051 140547769902912 mlp_log.py:80] :::MLL 1679239996.094 eval_stop: {"value": null, "metadata": {"epoch_num": 19, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679239996.101 eval_accuracy: {"value": 0.6158000230789185, "metadata": {"epoch_num": 19, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 15:33:16.100932 140547769902912 mlp_log.py:80] :::MLL 1679239996.101 eval_accuracy: {"value": 0.6158000230789185, "metadata": {"epoch_num": 19, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679239996.102 block_stop: {"value": null, "metadata": {"first_epoch_num": 16, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 15:33:16.101949 140547769902912 mlp_log.py:80] :::MLL 1679239996.102 block_stop: {"value": null, "metadata": {"first_epoch_num": 16, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679239996.103 block_start: {"value": null, "metadata": {"first_epoch_num": 20, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 15:33:16.102918 140547769902912 mlp_log.py:80] :::MLL 1679239996.103 block_start: {"value": null, "metadata": {"first_epoch_num": 20, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 15:33:16.112729 140547769902912 controller.py:220] step: 9747        evaluation metric: {'test_loss': 0.29739872, 'test_accuracy': 0.6158, 'continue_training': True}
I0319 15:33:16.112884 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 9747 of 36423
I0319 15:41:14.392338 140547769902912 keras_utils.py:120] TimeHistory: 2677.32 examples/second between steps 9747 and 10260
I0319 15:41:14.396505 140547769902912 controller.py:220] step: 10260        steps_per_second: 1.05        {'train_loss': 42.066074, 'train_accuracy': 0.59374607}
I0319 15:41:14.396659 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 10260 of 36423
I0319 15:49:11.961558 140547769902912 keras_utils.py:120] TimeHistory: 2681.33 examples/second between steps 10260 and 10773
I0319 15:49:11.965767 140547769902912 controller.py:220] step: 10773        steps_per_second: 1.07        {'train_loss': 41.57502, 'train_accuracy': 0.60048044}
I0319 15:49:11.965925 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 10773 of 36423
I0319 15:57:09.164847 140547769902912 keras_utils.py:120] TimeHistory: 2683.39 examples/second between steps 10773 and 11286
I0319 15:57:09.168977 140547769902912 controller.py:220] step: 11286        steps_per_second: 1.08        {'train_loss': 41.104015, 'train_accuracy': 0.6074765}
I0319 15:57:09.169133 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 11286 of 36423
I0319 16:05:06.888276 140547769902912 keras_utils.py:120] TimeHistory: 2680.46 examples/second between steps 11286 and 11799
I0319 16:05:06.892483 140547769902912 controller.py:220] step: 11799        steps_per_second: 1.07        {'train_loss': 40.675106, 'train_accuracy': 0.6140507}
I0319 16:05:06.892634 140547769902912 controller.py:185] Start evaluation at step: 11799
step: 9747        evaluation metric: {'test_loss': 0.29739872, 'test_accuracy': 0.6158, 'continue_training': True}
step: 10260        steps_per_second: 1.05        {'train_loss': 42.066074, 'train_accuracy': 0.59374607}
step: 10773        steps_per_second: 1.07        {'train_loss': 41.57502, 'train_accuracy': 0.60048044}
step: 11286        steps_per_second: 1.08        {'train_loss': 41.104015, 'train_accuracy': 0.6074765}
step: 11799        steps_per_second: 1.07        {'train_loss': 40.675106, 'train_accuracy': 0.6140507}
:::MLL 1679241907.377 eval_start: {"value": null, "metadata": {"epoch_num": 23, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 16:05:07.376655 140547769902912 mlp_log.py:80] :::MLL 1679241907.377 eval_start: {"value": null, "metadata": {"epoch_num": 23, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679241918.161 eval_stop: {"value": null, "metadata": {"epoch_num": 23, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 16:05:18.161060 140547769902912 mlp_log.py:80] :::MLL 1679241918.161 eval_stop: {"value": null, "metadata": {"epoch_num": 23, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679241918.168 eval_accuracy: {"value": 0.6306399703025818, "metadata": {"epoch_num": 23, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 16:05:18.167979 140547769902912 mlp_log.py:80] :::MLL 1679241918.168 eval_accuracy: {"value": 0.6306399703025818, "metadata": {"epoch_num": 23, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679241918.169 block_stop: {"value": null, "metadata": {"first_epoch_num": 20, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 16:05:18.168991 140547769902912 mlp_log.py:80] :::MLL 1679241918.169 block_stop: {"value": null, "metadata": {"first_epoch_num": 20, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679241918.170 block_start: {"value": null, "metadata": {"first_epoch_num": 24, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 16:05:18.169961 140547769902912 mlp_log.py:80] :::MLL 1679241918.170 block_start: {"value": null, "metadata": {"first_epoch_num": 24, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 16:05:18.179913 140547769902912 controller.py:220] step: 11799        evaluation metric: {'test_loss': 0.29088515, 'test_accuracy': 0.63064, 'continue_training': True}
I0319 16:05:18.180072 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 11799 of 36423
I0319 16:13:15.017472 140547769902912 keras_utils.py:120] TimeHistory: 2685.42 examples/second between steps 11799 and 12312
I0319 16:13:15.021653 140547769902912 controller.py:220] step: 12312        steps_per_second: 1.05        {'train_loss': 40.224228, 'train_accuracy': 0.6205242}
I0319 16:13:15.021814 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 12312 of 36423
I0319 16:21:11.873815 140547769902912 keras_utils.py:120] TimeHistory: 2685.34 examples/second between steps 12312 and 12825
I0319 16:21:11.877966 140547769902912 controller.py:220] step: 12825        steps_per_second: 1.08        {'train_loss': 39.75526, 'train_accuracy': 0.627093}
I0319 16:21:11.878120 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 12825 of 36423
I0319 16:29:08.757629 140547769902912 keras_utils.py:120] TimeHistory: 2685.19 examples/second between steps 12825 and 13338
I0319 16:29:08.761925 140547769902912 controller.py:220] step: 13338        steps_per_second: 1.08        {'train_loss': 39.357185, 'train_accuracy': 0.6333713}
I0319 16:29:08.762086 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 13338 of 36423
I0319 16:37:05.957099 140547769902912 keras_utils.py:120] TimeHistory: 2683.41 examples/second between steps 13338 and 13851
I0319 16:37:05.961228 140547769902912 controller.py:220] step: 13851        steps_per_second: 1.08        {'train_loss': 38.924423, 'train_accuracy': 0.6391099}
I0319 16:37:05.961388 140547769902912 controller.py:185] Start evaluation at step: 13851
step: 11799        evaluation metric: {'test_loss': 0.29088515, 'test_accuracy': 0.63064, 'continue_training': True}
step: 12312        steps_per_second: 1.05        {'train_loss': 40.224228, 'train_accuracy': 0.6205242}
step: 12825        steps_per_second: 1.08        {'train_loss': 39.75526, 'train_accuracy': 0.627093}
step: 13338        steps_per_second: 1.08        {'train_loss': 39.357185, 'train_accuracy': 0.6333713}
step: 13851        steps_per_second: 1.08        {'train_loss': 38.924423, 'train_accuracy': 0.6391099}
:::MLL 1679243826.441 eval_start: {"value": null, "metadata": {"epoch_num": 27, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 16:37:06.441277 140547769902912 mlp_log.py:80] :::MLL 1679243826.441 eval_start: {"value": null, "metadata": {"epoch_num": 27, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679243837.448 eval_stop: {"value": null, "metadata": {"epoch_num": 27, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 16:37:17.448269 140547769902912 mlp_log.py:80] :::MLL 1679243837.448 eval_stop: {"value": null, "metadata": {"epoch_num": 27, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679243837.455 eval_accuracy: {"value": 0.6637200117111206, "metadata": {"epoch_num": 27, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 16:37:17.455250 140547769902912 mlp_log.py:80] :::MLL 1679243837.455 eval_accuracy: {"value": 0.6637200117111206, "metadata": {"epoch_num": 27, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679243837.456 block_stop: {"value": null, "metadata": {"first_epoch_num": 24, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 16:37:17.456276 140547769902912 mlp_log.py:80] :::MLL 1679243837.456 block_stop: {"value": null, "metadata": {"first_epoch_num": 24, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679243837.457 block_start: {"value": null, "metadata": {"first_epoch_num": 28, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 16:37:17.457254 140547769902912 mlp_log.py:80] :::MLL 1679243837.457 block_start: {"value": null, "metadata": {"first_epoch_num": 28, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 16:37:17.467283 140547769902912 controller.py:220] step: 13851        evaluation metric: {'test_loss': 0.27420917, 'test_accuracy': 0.66372, 'continue_training': True}
I0319 16:37:17.467454 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 13851 of 36423
I0319 16:45:14.272286 140547769902912 keras_utils.py:120] TimeHistory: 2685.60 examples/second between steps 13851 and 14364
I0319 16:45:14.276514 140547769902912 controller.py:220] step: 14364        steps_per_second: 1.05        {'train_loss': 38.50588, 'train_accuracy': 0.645977}
I0319 16:45:14.276674 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 14364 of 36423
I0319 16:53:12.242927 140547769902912 keras_utils.py:120] TimeHistory: 2679.08 examples/second between steps 14364 and 14877
I0319 16:53:12.247173 140547769902912 controller.py:220] step: 14877        steps_per_second: 1.07        {'train_loss': 38.042336, 'train_accuracy': 0.65228266}
I0319 16:53:12.247342 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 14877 of 36423
I0319 17:01:08.925324 140547769902912 keras_utils.py:120] TimeHistory: 2686.32 examples/second between steps 14877 and 15390
I0319 17:01:08.929522 140547769902912 controller.py:220] step: 15390        steps_per_second: 1.08        {'train_loss': 37.64315, 'train_accuracy': 0.65857184}
I0319 17:01:08.929681 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 15390 of 36423
I0319 17:09:05.460300 140547769902912 keras_utils.py:120] TimeHistory: 2687.14 examples/second between steps 15390 and 15903
I0319 17:09:05.464558 140547769902912 controller.py:220] step: 15903        steps_per_second: 1.08        {'train_loss': 37.25062, 'train_accuracy': 0.66452134}
I0319 17:09:05.464712 140547769902912 controller.py:185] Start evaluation at step: 15903
step: 13851        evaluation metric: {'test_loss': 0.27420917, 'test_accuracy': 0.66372, 'continue_training': True}
step: 14364        steps_per_second: 1.05        {'train_loss': 38.50588, 'train_accuracy': 0.645977}
step: 14877        steps_per_second: 1.07        {'train_loss': 38.042336, 'train_accuracy': 0.65228266}
step: 15390        steps_per_second: 1.08        {'train_loss': 37.64315, 'train_accuracy': 0.65857184}
step: 15903        steps_per_second: 1.08        {'train_loss': 37.25062, 'train_accuracy': 0.66452134}
:::MLL 1679245745.958 eval_start: {"value": null, "metadata": {"epoch_num": 31, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 17:09:05.958450 140547769902912 mlp_log.py:80] :::MLL 1679245745.958 eval_start: {"value": null, "metadata": {"epoch_num": 31, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679245756.709 eval_stop: {"value": null, "metadata": {"epoch_num": 31, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 17:09:16.709334 140547769902912 mlp_log.py:80] :::MLL 1679245756.709 eval_stop: {"value": null, "metadata": {"epoch_num": 31, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679245756.716 eval_accuracy: {"value": 0.663860023021698, "metadata": {"epoch_num": 31, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 17:09:16.716322 140547769902912 mlp_log.py:80] :::MLL 1679245756.716 eval_accuracy: {"value": 0.663860023021698, "metadata": {"epoch_num": 31, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679245756.717 block_stop: {"value": null, "metadata": {"first_epoch_num": 28, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 17:09:16.717343 140547769902912 mlp_log.py:80] :::MLL 1679245756.717 block_stop: {"value": null, "metadata": {"first_epoch_num": 28, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679245756.718 block_start: {"value": null, "metadata": {"first_epoch_num": 32, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 17:09:16.718302 140547769902912 mlp_log.py:80] :::MLL 1679245756.718 block_start: {"value": null, "metadata": {"first_epoch_num": 32, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 17:09:16.728244 140547769902912 controller.py:220] step: 15903        evaluation metric: {'test_loss': 0.2736155, 'test_accuracy': 0.66386, 'continue_training': True}
I0319 17:09:16.728415 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 15903 of 36423
I0319 17:17:12.753624 140547769902912 keras_utils.py:120] TimeHistory: 2690.00 examples/second between steps 15903 and 16416
I0319 17:17:12.757766 140547769902912 controller.py:220] step: 16416        steps_per_second: 1.05        {'train_loss': 36.87168, 'train_accuracy': 0.6701756}
I0319 17:17:12.757923 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 16416 of 36423
I0319 17:25:08.728839 140547769902912 keras_utils.py:120] TimeHistory: 2690.31 examples/second between steps 16416 and 16929
I0319 17:25:08.733042 140547769902912 controller.py:220] step: 16929        steps_per_second: 1.08        {'train_loss': 36.469055, 'train_accuracy': 0.67674124}
I0319 17:25:08.733199 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 16929 of 36423
I0319 17:33:05.759370 140547769902912 keras_utils.py:120] TimeHistory: 2684.36 examples/second between steps 16929 and 17442
I0319 17:33:05.763500 140547769902912 controller.py:220] step: 17442        steps_per_second: 1.08        {'train_loss': 36.071156, 'train_accuracy': 0.6823971}
I0319 17:33:05.763653 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 17442 of 36423
I0319 17:41:02.449225 140547769902912 keras_utils.py:120] TimeHistory: 2686.27 examples/second between steps 17442 and 17955
I0319 17:41:02.453442 140547769902912 controller.py:220] step: 17955        steps_per_second: 1.08        {'train_loss': 35.67699, 'train_accuracy': 0.68873394}
I0319 17:41:02.453614 140547769902912 controller.py:185] Start evaluation at step: 17955
step: 15903        evaluation metric: {'test_loss': 0.2736155, 'test_accuracy': 0.66386, 'continue_training': True}
step: 16416        steps_per_second: 1.05        {'train_loss': 36.87168, 'train_accuracy': 0.6701756}
step: 16929        steps_per_second: 1.08        {'train_loss': 36.469055, 'train_accuracy': 0.67674124}
step: 17442        steps_per_second: 1.08        {'train_loss': 36.071156, 'train_accuracy': 0.6823971}
step: 17955        steps_per_second: 1.08        {'train_loss': 35.67699, 'train_accuracy': 0.68873394}
:::MLL 1679247662.938 eval_start: {"value": null, "metadata": {"epoch_num": 35, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 17:41:02.938170 140547769902912 mlp_log.py:80] :::MLL 1679247662.938 eval_start: {"value": null, "metadata": {"epoch_num": 35, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679247673.817 eval_stop: {"value": null, "metadata": {"epoch_num": 35, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 17:41:13.817046 140547769902912 mlp_log.py:80] :::MLL 1679247673.817 eval_stop: {"value": null, "metadata": {"epoch_num": 35, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679247673.824 eval_accuracy: {"value": 0.6931399703025818, "metadata": {"epoch_num": 35, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 17:41:13.824302 140547769902912 mlp_log.py:80] :::MLL 1679247673.824 eval_accuracy: {"value": 0.6931399703025818, "metadata": {"epoch_num": 35, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679247673.825 block_stop: {"value": null, "metadata": {"first_epoch_num": 32, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 17:41:13.825345 140547769902912 mlp_log.py:80] :::MLL 1679247673.825 block_stop: {"value": null, "metadata": {"first_epoch_num": 32, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679247673.826 block_start: {"value": null, "metadata": {"first_epoch_num": 36, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 17:41:13.826306 140547769902912 mlp_log.py:80] :::MLL 1679247673.826 block_start: {"value": null, "metadata": {"first_epoch_num": 36, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 17:41:13.836492 140547769902912 controller.py:220] step: 17955        evaluation metric: {'test_loss': 0.25965777, 'test_accuracy': 0.69314, 'continue_training': True}
I0319 17:41:13.836662 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 17955 of 36423
I0319 17:49:10.447629 140547769902912 keras_utils.py:120] TimeHistory: 2686.69 examples/second between steps 17955 and 18468
I0319 17:49:10.451847 140547769902912 controller.py:220] step: 18468        steps_per_second: 1.05        {'train_loss': 35.29787, 'train_accuracy': 0.6943679}
I0319 17:49:10.452003 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 18468 of 36423
I0319 17:57:07.115317 140547769902912 keras_utils.py:120] TimeHistory: 2686.40 examples/second between steps 18468 and 18981
I0319 17:57:07.119469 140547769902912 controller.py:220] step: 18981        steps_per_second: 1.08        {'train_loss': 34.908035, 'train_accuracy': 0.70074695}
I0319 17:57:07.119627 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 18981 of 36423
I0319 18:05:03.520790 140547769902912 keras_utils.py:120] TimeHistory: 2687.88 examples/second between steps 18981 and 19494
I0319 18:05:03.524950 140547769902912 controller.py:220] step: 19494        steps_per_second: 1.08        {'train_loss': 34.57146, 'train_accuracy': 0.7056616}
I0319 18:05:03.525108 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 19494 of 36423
I0319 18:13:00.146009 140547769902912 keras_utils.py:120] TimeHistory: 2686.66 examples/second between steps 19494 and 20007
I0319 18:13:00.150213 140547769902912 controller.py:220] step: 20007        steps_per_second: 1.08        {'train_loss': 34.178963, 'train_accuracy': 0.71188754}
I0319 18:13:00.150395 140547769902912 controller.py:185] Start evaluation at step: 20007
step: 17955        evaluation metric: {'test_loss': 0.25965777, 'test_accuracy': 0.69314, 'continue_training': True}
step: 18468        steps_per_second: 1.05        {'train_loss': 35.29787, 'train_accuracy': 0.6943679}
step: 18981        steps_per_second: 1.08        {'train_loss': 34.908035, 'train_accuracy': 0.70074695}
step: 19494        steps_per_second: 1.08        {'train_loss': 34.57146, 'train_accuracy': 0.7056616}
step: 20007        steps_per_second: 1.08        {'train_loss': 34.178963, 'train_accuracy': 0.71188754}
:::MLL 1679249580.639 eval_start: {"value": null, "metadata": {"epoch_num": 39, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 18:13:00.638623 140547769902912 mlp_log.py:80] :::MLL 1679249580.639 eval_start: {"value": null, "metadata": {"epoch_num": 39, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679249591.474 eval_stop: {"value": null, "metadata": {"epoch_num": 39, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 18:13:11.473854 140547769902912 mlp_log.py:80] :::MLL 1679249591.474 eval_stop: {"value": null, "metadata": {"epoch_num": 39, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679249591.482 eval_accuracy: {"value": 0.7071400284767151, "metadata": {"epoch_num": 39, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 18:13:11.482290 140547769902912 mlp_log.py:80] :::MLL 1679249591.482 eval_accuracy: {"value": 0.7071400284767151, "metadata": {"epoch_num": 39, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679249591.483 block_stop: {"value": null, "metadata": {"first_epoch_num": 36, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 18:13:11.483335 140547769902912 mlp_log.py:80] :::MLL 1679249591.483 block_stop: {"value": null, "metadata": {"first_epoch_num": 36, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679249591.484 block_start: {"value": null, "metadata": {"first_epoch_num": 40, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 18:13:11.484290 140547769902912 mlp_log.py:80] :::MLL 1679249591.484 block_start: {"value": null, "metadata": {"first_epoch_num": 40, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 18:13:11.494453 140547769902912 controller.py:220] step: 20007        evaluation metric: {'test_loss': 0.25293344, 'test_accuracy': 0.70714, 'continue_training': True}
I0319 18:13:11.494655 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 20007 of 36423
I0319 18:21:07.807034 140547769902912 keras_utils.py:120] TimeHistory: 2688.38 examples/second between steps 20007 and 20520
I0319 18:21:07.811231 140547769902912 controller.py:220] step: 20520        steps_per_second: 1.05        {'train_loss': 33.828243, 'train_accuracy': 0.7174528}
I0319 18:21:07.811421 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 20520 of 36423
I0319 18:29:04.059954 140547769902912 keras_utils.py:120] TimeHistory: 2688.74 examples/second between steps 20520 and 21033
I0319 18:29:04.064098 140547769902912 controller.py:220] step: 21033        steps_per_second: 1.08        {'train_loss': 33.491177, 'train_accuracy': 0.7229907}
I0319 18:29:04.064261 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 21033 of 36423
I0319 18:37:00.501375 140547769902912 keras_utils.py:120] TimeHistory: 2687.68 examples/second between steps 21033 and 21546
I0319 18:37:00.505643 140547769902912 controller.py:220] step: 21546        steps_per_second: 1.08        {'train_loss': 33.12628, 'train_accuracy': 0.72929555}
I0319 18:37:00.505825 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 21546 of 36423
I0319 18:44:57.448547 140547769902912 keras_utils.py:120] TimeHistory: 2684.84 examples/second between steps 21546 and 22059
I0319 18:44:57.453764 140547769902912 controller.py:220] step: 22059        steps_per_second: 1.08        {'train_loss': 32.781902, 'train_accuracy': 0.7347241}
I0319 18:44:57.453983 140547769902912 controller.py:185] Start evaluation at step: 22059
step: 20007        evaluation metric: {'test_loss': 0.25293344, 'test_accuracy': 0.70714, 'continue_training': True}
step: 20520        steps_per_second: 1.05        {'train_loss': 33.828243, 'train_accuracy': 0.7174528}
step: 21033        steps_per_second: 1.08        {'train_loss': 33.491177, 'train_accuracy': 0.7229907}
step: 21546        steps_per_second: 1.08        {'train_loss': 33.12628, 'train_accuracy': 0.72929555}
step: 22059        steps_per_second: 1.08        {'train_loss': 32.781902, 'train_accuracy': 0.7347241}
:::MLL 1679251497.962 eval_start: {"value": null, "metadata": {"epoch_num": 43, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 18:44:57.962047 140547769902912 mlp_log.py:80] :::MLL 1679251497.962 eval_start: {"value": null, "metadata": {"epoch_num": 43, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679251508.834 eval_stop: {"value": null, "metadata": {"epoch_num": 43, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 18:45:08.834538 140547769902912 mlp_log.py:80] :::MLL 1679251508.834 eval_stop: {"value": null, "metadata": {"epoch_num": 43, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679251508.842 eval_accuracy: {"value": 0.7310600280761719, "metadata": {"epoch_num": 43, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 18:45:08.841803 140547769902912 mlp_log.py:80] :::MLL 1679251508.842 eval_accuracy: {"value": 0.7310600280761719, "metadata": {"epoch_num": 43, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679251508.843 block_stop: {"value": null, "metadata": {"first_epoch_num": 40, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 18:45:08.842825 140547769902912 mlp_log.py:80] :::MLL 1679251508.843 block_stop: {"value": null, "metadata": {"first_epoch_num": 40, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679251508.844 block_start: {"value": null, "metadata": {"first_epoch_num": 44, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 18:45:08.843794 140547769902912 mlp_log.py:80] :::MLL 1679251508.844 block_start: {"value": null, "metadata": {"first_epoch_num": 44, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 18:45:08.853801 140547769902912 controller.py:220] step: 22059        evaluation metric: {'test_loss': 0.24191059, 'test_accuracy': 0.73106, 'continue_training': True}
I0319 18:45:08.853981 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 22059 of 36423
I0319 18:53:05.613945 140547769902912 keras_utils.py:120] TimeHistory: 2685.85 examples/second between steps 22059 and 22572
I0319 18:53:05.618196 140547769902912 controller.py:220] step: 22572        steps_per_second: 1.05        {'train_loss': 32.501007, 'train_accuracy': 0.7391007}
I0319 18:53:05.618384 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 22572 of 36423
I0319 19:01:03.341272 140547769902912 keras_utils.py:120] TimeHistory: 2680.44 examples/second between steps 22572 and 23085
I0319 19:01:03.345571 140547769902912 controller.py:220] step: 23085        steps_per_second: 1.07        {'train_loss': 32.159527, 'train_accuracy': 0.74508685}
I0319 19:01:03.345741 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 23085 of 36423
I0319 19:09:00.970286 140547769902912 keras_utils.py:120] TimeHistory: 2680.99 examples/second between steps 23085 and 23598
I0319 19:09:00.974560 140547769902912 controller.py:220] step: 23598        steps_per_second: 1.07        {'train_loss': 31.819632, 'train_accuracy': 0.7504209}
I0319 19:09:00.974729 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 23598 of 36423
I0319 19:16:57.904174 140547769902912 keras_utils.py:120] TimeHistory: 2684.90 examples/second between steps 23598 and 24111
I0319 19:16:57.908411 140547769902912 controller.py:220] step: 24111        steps_per_second: 1.08        {'train_loss': 31.538153, 'train_accuracy': 0.75502324}
I0319 19:16:57.908590 140547769902912 controller.py:185] Start evaluation at step: 24111
step: 22059        evaluation metric: {'test_loss': 0.24191059, 'test_accuracy': 0.73106, 'continue_training': True}
step: 22572        steps_per_second: 1.05        {'train_loss': 32.501007, 'train_accuracy': 0.7391007}
step: 23085        steps_per_second: 1.07        {'train_loss': 32.159527, 'train_accuracy': 0.74508685}
step: 23598        steps_per_second: 1.07        {'train_loss': 31.819632, 'train_accuracy': 0.7504209}
step: 24111        steps_per_second: 1.08        {'train_loss': 31.538153, 'train_accuracy': 0.75502324}
:::MLL 1679253418.442 eval_start: {"value": null, "metadata": {"epoch_num": 47, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 19:16:58.442478 140547769902912 mlp_log.py:80] :::MLL 1679253418.442 eval_start: {"value": null, "metadata": {"epoch_num": 47, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679253429.278 eval_stop: {"value": null, "metadata": {"epoch_num": 47, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 19:17:09.277749 140547769902912 mlp_log.py:80] :::MLL 1679253429.278 eval_stop: {"value": null, "metadata": {"epoch_num": 47, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679253429.285 eval_accuracy: {"value": 0.7399600148200989, "metadata": {"epoch_num": 47, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 19:17:09.285157 140547769902912 mlp_log.py:80] :::MLL 1679253429.285 eval_accuracy: {"value": 0.7399600148200989, "metadata": {"epoch_num": 47, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679253429.286 block_stop: {"value": null, "metadata": {"first_epoch_num": 44, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 19:17:09.286182 140547769902912 mlp_log.py:80] :::MLL 1679253429.286 block_stop: {"value": null, "metadata": {"first_epoch_num": 44, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679253429.287 block_start: {"value": null, "metadata": {"first_epoch_num": 48, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 19:17:09.287138 140547769902912 mlp_log.py:80] :::MLL 1679253429.287 block_start: {"value": null, "metadata": {"first_epoch_num": 48, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 19:17:09.297158 140547769902912 controller.py:220] step: 24111        evaluation metric: {'test_loss': 0.23783618, 'test_accuracy': 0.73996, 'continue_training': True}
I0319 19:17:09.297350 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 24111 of 36423
I0319 19:25:05.843054 140547769902912 keras_utils.py:120] TimeHistory: 2687.06 examples/second between steps 24111 and 24624
I0319 19:25:05.847337 140547769902912 controller.py:220] step: 24624        steps_per_second: 1.05        {'train_loss': 31.25745, 'train_accuracy': 0.7601792}
I0319 19:25:05.847517 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 24624 of 36423
I0319 19:33:01.919262 140547769902912 keras_utils.py:120] TimeHistory: 2689.74 examples/second between steps 24624 and 25137
I0319 19:33:01.923496 140547769902912 controller.py:220] step: 25137        steps_per_second: 1.08        {'train_loss': 30.94866, 'train_accuracy': 0.7650783}
I0319 19:33:01.923671 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 25137 of 36423
I0319 19:40:58.928619 140547769902912 keras_utils.py:120] TimeHistory: 2684.48 examples/second between steps 25137 and 25650
I0319 19:40:58.932954 140547769902912 controller.py:220] step: 25650        steps_per_second: 1.08        {'train_loss': 30.675001, 'train_accuracy': 0.76950336}
I0319 19:40:58.933148 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 25650 of 36423
I0319 19:48:55.050780 140547769902912 keras_utils.py:120] TimeHistory: 2689.48 examples/second between steps 25650 and 26163
I0319 19:48:55.054964 140547769902912 controller.py:220] step: 26163        steps_per_second: 1.08        {'train_loss': 30.428179, 'train_accuracy': 0.7739565}
I0319 19:48:55.055132 140547769902912 controller.py:185] Start evaluation at step: 26163
step: 24111        evaluation metric: {'test_loss': 0.23783618, 'test_accuracy': 0.73996, 'continue_training': True}
step: 24624        steps_per_second: 1.05        {'train_loss': 31.25745, 'train_accuracy': 0.7601792}
step: 25137        steps_per_second: 1.08        {'train_loss': 30.94866, 'train_accuracy': 0.7650783}
step: 25650        steps_per_second: 1.08        {'train_loss': 30.675001, 'train_accuracy': 0.76950336}
step: 26163        steps_per_second: 1.08        {'train_loss': 30.428179, 'train_accuracy': 0.7739565}
:::MLL 1679255335.547 eval_start: {"value": null, "metadata": {"epoch_num": 51, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 19:48:55.547407 140547769902912 mlp_log.py:80] :::MLL 1679255335.547 eval_start: {"value": null, "metadata": {"epoch_num": 51, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679255346.553 eval_stop: {"value": null, "metadata": {"epoch_num": 51, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 19:49:06.552730 140547769902912 mlp_log.py:80] :::MLL 1679255346.553 eval_stop: {"value": null, "metadata": {"epoch_num": 51, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679255346.560 eval_accuracy: {"value": 0.7473400235176086, "metadata": {"epoch_num": 51, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 19:49:06.559940 140547769902912 mlp_log.py:80] :::MLL 1679255346.560 eval_accuracy: {"value": 0.7473400235176086, "metadata": {"epoch_num": 51, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679255346.561 block_stop: {"value": null, "metadata": {"first_epoch_num": 48, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 19:49:06.560959 140547769902912 mlp_log.py:80] :::MLL 1679255346.561 block_stop: {"value": null, "metadata": {"first_epoch_num": 48, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679255346.562 block_start: {"value": null, "metadata": {"first_epoch_num": 52, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 19:49:06.561913 140547769902912 mlp_log.py:80] :::MLL 1679255346.562 block_start: {"value": null, "metadata": {"first_epoch_num": 52, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 19:49:06.571880 140547769902912 controller.py:220] step: 26163        evaluation metric: {'test_loss': 0.23394844, 'test_accuracy': 0.74734, 'continue_training': True}
I0319 19:49:06.572060 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 26163 of 36423
I0319 19:57:04.827503 140547769902912 keras_utils.py:120] TimeHistory: 2677.45 examples/second between steps 26163 and 26676
I0319 19:57:04.831851 140547769902912 controller.py:220] step: 26676        steps_per_second: 1.05        {'train_loss': 30.180855, 'train_accuracy': 0.7779777}
I0319 19:57:04.832029 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 26676 of 36423
I0319 20:05:01.335356 140547769902912 keras_utils.py:120] TimeHistory: 2687.31 examples/second between steps 26676 and 27189
I0319 20:05:01.339524 140547769902912 controller.py:220] step: 27189        steps_per_second: 1.08        {'train_loss': 29.918842, 'train_accuracy': 0.7823356}
I0319 20:05:01.339692 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 27189 of 36423
I0319 20:12:58.183544 140547769902912 keras_utils.py:120] TimeHistory: 2685.39 examples/second between steps 27189 and 27702
I0319 20:12:58.187861 140547769902912 controller.py:220] step: 27702        steps_per_second: 1.08        {'train_loss': 29.700476, 'train_accuracy': 0.78678167}
I0319 20:12:58.188049 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 27702 of 36423
I0319 20:20:54.531436 140547769902912 keras_utils.py:120] TimeHistory: 2688.23 examples/second between steps 27702 and 28215
I0319 20:20:54.535721 140547769902912 controller.py:220] step: 28215        steps_per_second: 1.08        {'train_loss': 29.481922, 'train_accuracy': 0.79037726}
I0319 20:20:54.535894 140547769902912 controller.py:185] Start evaluation at step: 28215
step: 26163        evaluation metric: {'test_loss': 0.23394844, 'test_accuracy': 0.74734, 'continue_training': True}
step: 26676        steps_per_second: 1.05        {'train_loss': 30.180855, 'train_accuracy': 0.7779777}
step: 27189        steps_per_second: 1.08        {'train_loss': 29.918842, 'train_accuracy': 0.7823356}
step: 27702        steps_per_second: 1.08        {'train_loss': 29.700476, 'train_accuracy': 0.78678167}
step: 28215        steps_per_second: 1.08        {'train_loss': 29.481922, 'train_accuracy': 0.79037726}
:::MLL 1679257255.034 eval_start: {"value": null, "metadata": {"epoch_num": 55, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 20:20:55.034361 140547769902912 mlp_log.py:80] :::MLL 1679257255.034 eval_start: {"value": null, "metadata": {"epoch_num": 55, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679257265.511 eval_stop: {"value": null, "metadata": {"epoch_num": 55, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 20:21:05.511004 140547769902912 mlp_log.py:80] :::MLL 1679257265.511 eval_stop: {"value": null, "metadata": {"epoch_num": 55, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679257265.518 eval_accuracy: {"value": 0.7565600275993347, "metadata": {"epoch_num": 55, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 20:21:05.518299 140547769902912 mlp_log.py:80] :::MLL 1679257265.518 eval_accuracy: {"value": 0.7565600275993347, "metadata": {"epoch_num": 55, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679257265.519 block_stop: {"value": null, "metadata": {"first_epoch_num": 52, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 20:21:05.519329 140547769902912 mlp_log.py:80] :::MLL 1679257265.519 block_stop: {"value": null, "metadata": {"first_epoch_num": 52, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679257265.520 block_start: {"value": null, "metadata": {"first_epoch_num": 56, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 20:21:05.520274 140547769902912 mlp_log.py:80] :::MLL 1679257265.520 block_start: {"value": null, "metadata": {"first_epoch_num": 56, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 20:21:05.530137 140547769902912 controller.py:220] step: 28215        evaluation metric: {'test_loss': 0.23033953, 'test_accuracy': 0.75656, 'continue_training': True}
I0319 20:21:05.530332 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 28215 of 36423
I0319 20:29:02.549445 140547769902912 keras_utils.py:120] TimeHistory: 2684.41 examples/second between steps 28215 and 28728
I0319 20:29:02.553695 140547769902912 controller.py:220] step: 28728        steps_per_second: 1.05        {'train_loss': 29.281693, 'train_accuracy': 0.7941939}
I0319 20:29:02.553875 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 28728 of 36423
I0319 20:36:59.701035 140547769902912 keras_utils.py:120] TimeHistory: 2683.68 examples/second between steps 28728 and 29241
I0319 20:36:59.705335 140547769902912 controller.py:220] step: 29241        steps_per_second: 1.08        {'train_loss': 29.111845, 'train_accuracy': 0.79688674}
I0319 20:36:59.705515 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 29241 of 36423
I0319 20:44:56.506052 140547769902912 keras_utils.py:120] TimeHistory: 2685.63 examples/second between steps 29241 and 29754
I0319 20:44:56.510352 140547769902912 controller.py:220] step: 29754        steps_per_second: 1.08        {'train_loss': 28.945818, 'train_accuracy': 0.8003339}
I0319 20:44:56.510533 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 29754 of 36423
I0319 20:52:52.982735 140547769902912 keras_utils.py:120] TimeHistory: 2687.48 examples/second between steps 29754 and 30267
I0319 20:52:52.987001 140547769902912 controller.py:220] step: 30267        steps_per_second: 1.08        {'train_loss': 28.79324, 'train_accuracy': 0.80263704}
I0319 20:52:52.987169 140547769902912 controller.py:185] Start evaluation at step: 30267
step: 28215        evaluation metric: {'test_loss': 0.23033953, 'test_accuracy': 0.75656, 'continue_training': True}
step: 28728        steps_per_second: 1.05        {'train_loss': 29.281693, 'train_accuracy': 0.7941939}
step: 29241        steps_per_second: 1.08        {'train_loss': 29.111845, 'train_accuracy': 0.79688674}
step: 29754        steps_per_second: 1.08        {'train_loss': 28.945818, 'train_accuracy': 0.8003339}
step: 30267        steps_per_second: 1.08        {'train_loss': 28.79324, 'train_accuracy': 0.80263704}
:::MLL 1679259173.484 eval_start: {"value": null, "metadata": {"epoch_num": 59, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 20:52:53.483794 140547769902912 mlp_log.py:80] :::MLL 1679259173.484 eval_start: {"value": null, "metadata": {"epoch_num": 59, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679259184.442 eval_stop: {"value": null, "metadata": {"epoch_num": 59, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 20:53:04.441937 140547769902912 mlp_log.py:80] :::MLL 1679259184.442 eval_stop: {"value": null, "metadata": {"epoch_num": 59, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679259184.452 eval_accuracy: {"value": 0.7594199776649475, "metadata": {"epoch_num": 59, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 20:53:04.452280 140547769902912 mlp_log.py:80] :::MLL 1679259184.452 eval_accuracy: {"value": 0.7594199776649475, "metadata": {"epoch_num": 59, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679259184.453 block_stop: {"value": null, "metadata": {"first_epoch_num": 56, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 20:53:04.453428 140547769902912 mlp_log.py:80] :::MLL 1679259184.453 block_stop: {"value": null, "metadata": {"first_epoch_num": 56, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 20:53:04.463417 140547769902912 controller.py:220] step: 30267        evaluation metric: {'test_loss': 0.22827734, 'test_accuracy': 0.75942, 'continue_training': False}
step: 30267        evaluation metric: {'test_loss': 0.22827734, 'test_accuracy': 0.75942, 'continue_training': False}
:::MLL 1679259184.464 run_stop: {"value": null, "metadata": {"status": "success", "lineno": 279, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 20:53:04.464306 140547769902912 mlp_log.py:80] :::MLL 1679259184.464 run_stop: {"value": null, "metadata": {"status": "success", "lineno": 279, "file": "./resnet_ctl_imagenet_main.py"}}
:::MLL 1679259184.465 run_final: {"value": null, "metadata": {"lineno": 281, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 20:53:04.465044 140547769902912 mlp_log.py:80] :::MLL 1679259184.465 run_final: {"value": null, "metadata": {"lineno": 281, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 20:53:04.484413 140547769902912 resnet_ctl_imagenet_main.py:298] Run stats:
{'eval_loss': 0.22827734, 'eval_acc': 0.75942, 'train_loss': 28.79324, 'train_acc': 0.80263704, 'step_timestamp_log': ['BatchTimestamp<batch_index: 0, timestamp: 1679230834.281045>', 'BatchTimestamp<batch_index: 513, timestamp: 1679231312.765514>', 'BatchTimestamp<batch_index: 1026, timestamp: 1679231790.4766295>', 'BatchTimestamp<batch_index: 1539, timestamp: 1679232268.0622654>', 'BatchTimestamp<batch_index: 2052, timestamp: 1679232778.5840638>', 'BatchTimestamp<batch_index: 2565, timestamp: 1679233256.8333852>', 'BatchTimestamp<batch_index: 3078, timestamp: 1679233735.232925>', 'BatchTimestamp<batch_index: 3591, timestamp: 1679234213.5742714>', 'BatchTimestamp<batch_index: 4104, timestamp: 1679234703.2010417>', 'BatchTimestamp<batch_index: 4617, timestamp: 1679235181.7035873>', 'BatchTimestamp<batch_index: 5130, timestamp: 1679235658.7568266>', 'BatchTimestamp<batch_index: 5643, timestamp: 1679236136.837958>', 'BatchTimestamp<batch_index: 6156, timestamp: 1679236625.7255764>', 'BatchTimestamp<batch_index: 6669, timestamp: 1679237103.411414>', 'BatchTimestamp<batch_index: 7182, timestamp: 1679237582.1563838>', 'BatchTimestamp<batch_index: 7695, timestamp: 1679238060.5108216>', 'BatchTimestamp<batch_index: 8208, timestamp: 1679238549.9029462>', 'BatchTimestamp<batch_index: 8721, timestamp: 1679239028.3283317>', 'BatchTimestamp<batch_index: 9234, timestamp: 1679239506.558369>', 'BatchTimestamp<batch_index: 9747, timestamp: 1679239984.4382937>', 'BatchTimestamp<batch_index: 10260, timestamp: 1679240474.3921533>', 'BatchTimestamp<batch_index: 10773, timestamp: 1679240951.96138>', 'BatchTimestamp<batch_index: 11286, timestamp: 1679241429.1646736>', 'BatchTimestamp<batch_index: 11799, timestamp: 1679241906.888098>', 'BatchTimestamp<batch_index: 12312, timestamp: 1679242395.0172863>', 'BatchTimestamp<batch_index: 12825, timestamp: 1679242871.8736327>', 'BatchTimestamp<batch_index: 13338, timestamp: 1679243348.7574499>', 'BatchTimestamp<batch_index: 13851, timestamp: 1679243825.9569237>', 'BatchTimestamp<batch_index: 14364, timestamp: 1679244314.2721043>', 'BatchTimestamp<batch_index: 14877, timestamp: 1679244792.2427475>', 'BatchTimestamp<batch_index: 15390, timestamp: 1679245268.9251325>', 'BatchTimestamp<batch_index: 15903, timestamp: 1679245745.4601164>', 'BatchTimestamp<batch_index: 16416, timestamp: 1679246232.7534444>', 'BatchTimestamp<batch_index: 16929, timestamp: 1679246708.728656>', 'BatchTimestamp<batch_index: 17442, timestamp: 1679247185.7591805>', 'BatchTimestamp<batch_index: 17955, timestamp: 1679247662.4490402>', 'BatchTimestamp<batch_index: 18468, timestamp: 1679248150.4474506>', 'BatchTimestamp<batch_index: 18981, timestamp: 1679248627.1151292>', 'BatchTimestamp<batch_index: 19494, timestamp: 1679249103.5206127>', 'BatchTimestamp<batch_index: 20007, timestamp: 1679249580.1458325>', 'BatchTimestamp<batch_index: 20520, timestamp: 1679250067.8068252>', 'BatchTimestamp<batch_index: 21033, timestamp: 1679250544.0597591>', 'BatchTimestamp<batch_index: 21546, timestamp: 1679251020.501157>', 'BatchTimestamp<batch_index: 22059, timestamp: 1679251497.4479887>', 'BatchTimestamp<batch_index: 22572, timestamp: 1679251985.6137266>', 'BatchTimestamp<batch_index: 23085, timestamp: 1679252463.3410485>', 'BatchTimestamp<batch_index: 23598, timestamp: 1679252940.9701052>', 'BatchTimestamp<batch_index: 24111, timestamp: 1679253417.9039862>', 'BatchTimestamp<batch_index: 24624, timestamp: 1679253905.8428304>', 'BatchTimestamp<batch_index: 25137, timestamp: 1679254381.919039>', 'BatchTimestamp<batch_index: 25650, timestamp: 1679254858.9284008>', 'BatchTimestamp<batch_index: 26163, timestamp: 1679255335.0505683>', 'BatchTimestamp<batch_index: 26676, timestamp: 1679255824.8272724>', 'BatchTimestamp<batch_index: 27189, timestamp: 1679256301.335169>', 'BatchTimestamp<batch_index: 27702, timestamp: 1679256778.1833446>', 'BatchTimestamp<batch_index: 28215, timestamp: 1679257254.531205>', 'BatchTimestamp<batch_index: 28728, timestamp: 1679257742.5492072>', 'BatchTimestamp<batch_index: 29241, timestamp: 1679258219.7008102>', 'BatchTimestamp<batch_index: 29754, timestamp: 1679258696.5058227>', 'BatchTimestamp<batch_index: 30267, timestamp: 1679259172.9825177>'], 'train_finish_time': 1679259184.4644349, 'avg_exp_per_second': 2683.1639506599217}