Commit f28387e7 authored by liangjing's avatar liangjing
Browse files

Delete reference.log

parent b6571f85
nohup: ignoring input
:::MLL 1679230527.145 cache_clear: {"value": true, "metadata": {"lineno": 116, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 12:55:27.145380 140547769902912 mlp_log.py:80] :::MLL 1679230527.145 cache_clear: {"value": true, "metadata": {"lineno": 116, "file": "./resnet_ctl_imagenet_main.py"}}
:::MLL 1679230527.146 init_start: {"value": null, "metadata": {"lineno": 117, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 12:55:27.146378 140547769902912 mlp_log.py:80] :::MLL 1679230527.146 init_start: {"value": null, "metadata": {"lineno": 117, "file": "./resnet_ctl_imagenet_main.py"}}
:::MLL 1679230527.147 submission_benchmark: {"value": "resnet", "metadata": {"lineno": 118, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 12:55:27.147078 140547769902912 mlp_log.py:80] :::MLL 1679230527.147 submission_benchmark: {"value": "resnet", "metadata": {"lineno": 118, "file": "./resnet_ctl_imagenet_main.py"}}
:::MLL 1679230527.148 submission_division: {"value": "closed", "metadata": {"lineno": 119, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 12:55:27.147791 140547769902912 mlp_log.py:80] :::MLL 1679230527.148 submission_division: {"value": "closed", "metadata": {"lineno": 119, "file": "./resnet_ctl_imagenet_main.py"}}
:::MLL 1679230527.148 submission_org: {"value": "google", "metadata": {"lineno": 120, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 12:55:27.148500 140547769902912 mlp_log.py:80] :::MLL 1679230527.148 submission_org: {"value": "google", "metadata": {"lineno": 120, "file": "./resnet_ctl_imagenet_main.py"}}
:::MLL 1679230527.149 submission_platform: {"value": "gpu-v100-8", "metadata": {"lineno": 121, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 12:55:27.149215 140547769902912 mlp_log.py:80] :::MLL 1679230527.149 submission_platform: {"value": "gpu-v100-8", "metadata": {"lineno": 121, "file": "./resnet_ctl_imagenet_main.py"}}
:::MLL 1679230527.150 submission_status: {"value": "cloud", "metadata": {"lineno": 124, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 12:55:27.149919 140547769902912 mlp_log.py:80] :::MLL 1679230527.150 submission_status: {"value": "cloud", "metadata": {"lineno": 124, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 12:55:27.150071 140547769902912 common.py:617] Module ./resnet_ctl_imagenet_main.py:
I0319 12:55:27.150561 140547769902912 common.py:620] flags_obj.use_tf_function = True
I0319 12:55:27.150646 140547769902912 common.py:620] flags_obj.single_l2_loss_op = True
I0319 12:55:27.150727 140547769902912 common.py:620] flags_obj.cache_decoded_image = False
I0319 12:55:27.150808 140547769902912 common.py:620] flags_obj.enable_device_warmup = True
I0319 12:55:27.150889 140547769902912 common.py:620] flags_obj.device_warmup_steps = 1
I0319 12:55:27.150968 140547769902912 common.py:620] flags_obj.num_replicas = 32
I0319 12:55:27.151046 140547769902912 common.py:617] Module absl.app:
I0319 12:55:27.151130 140547769902912 common.py:620] flags_obj.run_with_pdb = False
I0319 12:55:27.151208 140547769902912 common.py:620] flags_obj.pdb_post_mortem = False
I0319 12:55:27.151290 140547769902912 common.py:620] flags_obj.pdb = False
I0319 12:55:27.151383 140547769902912 common.py:620] flags_obj.run_with_profiling = False
I0319 12:55:27.151461 140547769902912 common.py:620] flags_obj.profile_file = None
I0319 12:55:27.151540 140547769902912 common.py:620] flags_obj.use_cprofile_for_profiling = True
I0319 12:55:27.151618 140547769902912 common.py:620] flags_obj.only_check_args = False
I0319 12:55:27.151695 140547769902912 common.py:620] flags_obj.help = False
I0319 12:55:27.151774 140547769902912 common.py:620] flags_obj.helpshort = False
I0319 12:55:27.151850 140547769902912 common.py:620] flags_obj.helpfull = False
I0319 12:55:27.151929 140547769902912 common.py:620] flags_obj.helpxml = False
I0319 12:55:27.152006 140547769902912 common.py:617] Module absl.logging:
I0319 12:55:27.152086 140547769902912 common.py:620] flags_obj.logtostderr = False
I0319 12:55:27.152163 140547769902912 common.py:620] flags_obj.alsologtostderr = False
I0319 12:55:27.152240 140547769902912 common.py:620] flags_obj.log_dir =
I0319 12:55:27.152339 140547769902912 common.py:620] flags_obj.verbosity = 0
I0319 12:55:27.152423 140547769902912 common.py:620] flags_obj.logger_levels = {}
I0319 12:55:27.152507 140547769902912 common.py:620] flags_obj.stderrthreshold = fatal
I0319 12:55:27.152584 140547769902912 common.py:620] flags_obj.showprefixforinfo = True
I0319 12:55:27.152662 140547769902912 common.py:617] Module absl.testing.absltest:
I0319 12:55:27.152743 140547769902912 common.py:620] flags_obj.test_srcdir =
I0319 12:55:27.152820 140547769902912 common.py:620] flags_obj.test_tmpdir = /tmp/absl_testing
I0319 12:55:27.152901 140547769902912 common.py:620] flags_obj.test_random_seed = 301
I0319 12:55:27.152981 140547769902912 common.py:620] flags_obj.test_randomize_ordering_seed = 1
I0319 12:55:27.153058 140547769902912 common.py:620] flags_obj.xml_output_file =
I0319 12:55:27.153135 140547769902912 common.py:617] Module common:
I0319 12:55:27.153217 140547769902912 common.py:620] flags_obj.enable_eager = True
I0319 12:55:27.153294 140547769902912 common.py:620] flags_obj.skip_eval = False
I0319 12:55:27.153382 140547769902912 common.py:620] flags_obj.set_learning_phase_to_train = True
I0319 12:55:27.153460 140547769902912 common.py:620] flags_obj.explicit_gpu_placement = False
I0319 12:55:27.153537 140547769902912 common.py:620] flags_obj.use_trivial_model = False
I0319 12:55:27.153614 140547769902912 common.py:620] flags_obj.report_accuracy_metrics = True
I0319 12:55:27.153692 140547769902912 common.py:620] flags_obj.lr_schedule = polynomial
I0319 12:55:27.153769 140547769902912 common.py:620] flags_obj.enable_tensorboard = False
I0319 12:55:27.153845 140547769902912 common.py:620] flags_obj.train_steps = None
I0319 12:55:27.153923 140547769902912 common.py:620] flags_obj.profile_steps = None
I0319 12:55:27.154000 140547769902912 common.py:620] flags_obj.batchnorm_spatial_persistent = True
I0319 12:55:27.154076 140547769902912 common.py:620] flags_obj.enable_get_next_as_optional = False
I0319 12:55:27.154153 140547769902912 common.py:620] flags_obj.enable_checkpoint_and_export = False
I0319 12:55:27.154229 140547769902912 common.py:620] flags_obj.tpu =
I0319 12:55:27.154305 140547769902912 common.py:620] flags_obj.tpu_zone =
I0319 12:55:27.154394 140547769902912 common.py:620] flags_obj.steps_per_loop = 514
I0319 12:55:27.154473 140547769902912 common.py:620] flags_obj.use_tf_while_loop = True
I0319 12:55:27.154549 140547769902912 common.py:620] flags_obj.use_tf_keras_layers = False
I0319 12:55:27.154627 140547769902912 common.py:620] flags_obj.base_learning_rate = 4.9
I0319 12:55:27.154710 140547769902912 common.py:620] flags_obj.optimizer = LARS
I0319 12:55:27.154787 140547769902912 common.py:620] flags_obj.drop_train_remainder = True
I0319 12:55:27.154863 140547769902912 common.py:620] flags_obj.drop_eval_remainder = False
I0319 12:55:27.154940 140547769902912 common.py:620] flags_obj.label_smoothing = 0.1
I0319 12:55:27.155020 140547769902912 common.py:620] flags_obj.num_classes = 1000
I0319 12:55:27.155099 140547769902912 common.py:620] flags_obj.eval_offset_epochs = 3
I0319 12:55:27.155177 140547769902912 common.py:620] flags_obj.target_accuracy = 0.759
I0319 12:55:27.155256 140547769902912 common.py:617] Module lars_util:
I0319 12:55:27.155346 140547769902912 common.py:620] flags_obj.end_learning_rate = None
I0319 12:55:27.155426 140547769902912 common.py:620] flags_obj.lars_epsilon = 0.0
I0319 12:55:27.155504 140547769902912 common.py:620] flags_obj.warmup_epochs = 5.0
I0319 12:55:27.155582 140547769902912 common.py:620] flags_obj.momentum = 0.9
I0319 12:55:27.155662 140547769902912 common.py:617] Module resnet_model:
I0319 12:55:27.155743 140547769902912 common.py:620] flags_obj.weight_decay = 0.0002
I0319 12:55:27.155822 140547769902912 common.py:620] flags_obj.num_accumulation_steps = 1
I0319 12:55:27.155900 140547769902912 common.py:617] Module resnet_runnable:
I0319 12:55:27.155981 140547769902912 common.py:620] flags_obj.trace_warmup = False
I0319 12:55:27.156070 140547769902912 common.py:617] Module tensorflow.python.ops.parallel_for.pfor:
I0319 12:55:27.156152 140547769902912 common.py:620] flags_obj.op_conversion_fallback_to_while_loop = True
I0319 12:55:27.156228 140547769902912 common.py:617] Module tensorflow.python.tpu.client.client:
I0319 12:55:27.156317 140547769902912 common.py:620] flags_obj.runtime_oom_exit = True
I0319 12:55:27.156397 140547769902912 common.py:620] flags_obj.hbm_oom_exit = True
I0319 12:55:27.156476 140547769902912 common.py:617] Module tf2_common.utils.flags._base:
I0319 12:55:27.156557 140547769902912 common.py:620] flags_obj.data_dir = /data/tf-imagenet/imagenet
I0319 12:55:27.156634 140547769902912 common.py:620] flags_obj.model_dir = /tmp
I0319 12:55:27.156712 140547769902912 common.py:620] flags_obj.clean = False
I0319 12:55:27.156790 140547769902912 common.py:620] flags_obj.train_epochs = 70
I0319 12:55:27.156867 140547769902912 common.py:620] flags_obj.epochs_between_evals = 4
I0319 12:55:27.156945 140547769902912 common.py:620] flags_obj.batch_size = 2496
I0319 12:55:27.157022 140547769902912 common.py:620] flags_obj.num_gpus = 8
I0319 12:55:27.157100 140547769902912 common.py:620] flags_obj.run_eagerly = False
I0319 12:55:27.157177 140547769902912 common.py:620] flags_obj.distribution_strategy = mirrored
I0319 12:55:27.157255 140547769902912 common.py:617] Module tf2_common.utils.flags._benchmark:
I0319 12:55:27.157347 140547769902912 common.py:620] flags_obj.benchmark_logger_type = BaseBenchmarkLogger
I0319 12:55:27.157434 140547769902912 common.py:620] flags_obj.benchmark_test_id = None
I0319 12:55:27.157512 140547769902912 common.py:620] flags_obj.log_steps = 125
I0319 12:55:27.157588 140547769902912 common.py:620] flags_obj.benchmark_log_dir = None
I0319 12:55:27.157666 140547769902912 common.py:620] flags_obj.gcp_project = None
I0319 12:55:27.157744 140547769902912 common.py:620] flags_obj.bigquery_data_set = test_benchmark
I0319 12:55:27.157821 140547769902912 common.py:620] flags_obj.bigquery_run_table = benchmark_run
I0319 12:55:27.157899 140547769902912 common.py:620] flags_obj.bigquery_run_status_table = benchmark_run_status
I0319 12:55:27.157977 140547769902912 common.py:620] flags_obj.bigquery_metric_table = benchmark_metric
I0319 12:55:27.158053 140547769902912 common.py:617] Module tf2_common.utils.flags._distribution:
I0319 12:55:27.158134 140547769902912 common.py:620] flags_obj.worker_hosts = None
I0319 12:55:27.158211 140547769902912 common.py:620] flags_obj.task_index = -1
I0319 12:55:27.158288 140547769902912 common.py:617] Module tf2_common.utils.flags._misc:
I0319 12:55:27.158379 140547769902912 common.py:620] flags_obj.data_format = None
I0319 12:55:27.158457 140547769902912 common.py:617] Module tf2_common.utils.flags._performance:
I0319 12:55:27.158539 140547769902912 common.py:620] flags_obj.use_synthetic_data = False
I0319 12:55:27.158615 140547769902912 common.py:620] flags_obj.dtype = fp16
I0319 12:55:27.158691 140547769902912 common.py:620] flags_obj.loss_scale = None
I0319 12:55:27.158768 140547769902912 common.py:620] flags_obj.fp16_implementation = keras
I0319 12:55:27.158844 140547769902912 common.py:620] flags_obj.all_reduce_alg = nccl
I0319 12:55:27.158921 140547769902912 common.py:620] flags_obj.num_packs = 1
I0319 12:55:27.158999 140547769902912 common.py:620] flags_obj.tf_gpu_thread_mode = gpu_private
I0319 12:55:27.159075 140547769902912 common.py:620] flags_obj.per_gpu_thread_count = 0
I0319 12:55:27.159153 140547769902912 common.py:620] flags_obj.datasets_num_private_threads = 32
I0319 12:55:27.159230 140547769902912 common.py:620] flags_obj.training_dataset_cache = True
I0319 12:55:27.159306 140547769902912 common.py:620] flags_obj.training_prefetch_batchs = 128
I0319 12:55:27.159394 140547769902912 common.py:620] flags_obj.eval_dataset_cache = True
I0319 12:55:27.159471 140547769902912 common.py:620] flags_obj.eval_prefetch_batchs = 192
I0319 12:55:27.159548 140547769902912 common.py:620] flags_obj.tf_data_experimental_slack = False
I0319 12:55:27.159631 140547769902912 common.py:620] flags_obj.enable_xla = False
I0319 12:55:27.159710 140547769902912 common.py:620] flags_obj.force_v2_in_keras_compile = None
WARNING:tensorflow:Mixed precision compatibility check (mixed_float16): WARNING
Your GPUs may run slowly with dtype policy mixed_float16 because they do not have compute capability of at least 7.0. Your GPUs:
Z100L, no compute capability (probably not an Nvidia GPU) (x8)
See https://developer.nvidia.com/cuda-gpus for a list of GPUs and their compute capabilities.
If you will use compatible GPU(s) not attached to this host, e.g. by running a multi-worker model, you can ignore this warning. This message will only be logged once
W0319 12:55:27.160811 140547769902912 device_compatibility_check.py:107] Mixed precision compatibility check (mixed_float16): WARNING
Your GPUs may run slowly with dtype policy mixed_float16 because they do not have compute capability of at least 7.0. Your GPUs:
Z100L, no compute capability (probably not an Nvidia GPU) (x8)
See https://developer.nvidia.com/cuda-gpus for a list of GPUs and their compute capabilities.
If you will use compatible GPU(s) not attached to this host, e.g. by running a multi-worker model, you can ignore this warning. This message will only be logged once
I0319 12:55:27.161139 140547769902912 keras_utils.py:243] Logical CPU cores: 128
I0319 12:55:27.161378 140547769902912 keras_utils.py:249] TF_GPU_THREAD_COUNT: 2
I0319 12:55:27.161468 140547769902912 keras_utils.py:251] TF_GPU_THREAD_MODE: gpu_private
I0319 12:55:27.161551 140547769902912 keras_utils.py:261] Recommended datasets_num_private_threads: 64
2023-03-19 12:55:27.162998: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-19 12:55:27.181835: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2023-03-19 12:55:27.181964: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 32252 MB memory: -> device: 0, name: Z100L, pci bus id: 0000:07:00.0
2023-03-19 12:55:27.582374: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2023-03-19 12:55:27.582493: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 32252 MB memory: -> device: 1, name: Z100L, pci bus id: 0000:0a:00.0
2023-03-19 12:55:27.961772: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2023-03-19 12:55:27.961893: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:2 with 32252 MB memory: -> device: 2, name: Z100L, pci bus id: 0000:15:00.0
2023-03-19 12:55:28.339247: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2023-03-19 12:55:28.339376: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:3 with 32252 MB memory: -> device: 3, name: Z100L, pci bus id: 0000:0f:00.0
2023-03-19 12:55:28.719486: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2023-03-19 12:55:28.719627: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:4 with 32252 MB memory: -> device: 4, name: Z100L, pci bus id: 0000:85:00.0
2023-03-19 12:55:29.097492: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2023-03-19 12:55:29.097606: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:5 with 32252 MB memory: -> device: 5, name: Z100L, pci bus id: 0000:7f:00.0
2023-03-19 12:55:29.475299: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2023-03-19 12:55:29.475428: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:6 with 32252 MB memory: -> device: 6, name: Z100L, pci bus id: 0000:77:00.0
2023-03-19 12:55:29.855076: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2023-03-19 12:55:29.855191: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:7 with 32252 MB memory: -> device: 7, name: Z100L, pci bus id: 0000:7a:00.0
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3', '/job:localhost/replica:0/task:0/device:GPU:4', '/job:localhost/replica:0/task:0/device:GPU:5', '/job:localhost/replica:0/task:0/device:GPU:6', '/job:localhost/replica:0/task:0/device:GPU:7')
I0319 12:55:30.261204 140547769902912 mirrored_strategy.py:376] Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3', '/job:localhost/replica:0/task:0/device:GPU:4', '/job:localhost/replica:0/task:0/device:GPU:5', '/job:localhost/replica:0/task:0/device:GPU:6', '/job:localhost/replica:0/task:0/device:GPU:7')
num_index -1
enter the tf.float16 set policy
Compute dtype: float16
Variable dtype: float32
:::MLL 1679230530.264 global_batch_size: {"value": 2496, "metadata": {"lineno": 190, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 12:55:30.263783 140547769902912 mlp_log.py:80] :::MLL 1679230530.264 global_batch_size: {"value": 2496, "metadata": {"lineno": 190, "file": "./resnet_ctl_imagenet_main.py"}}
:::MLL 1679230530.265 train_samples: {"value": 1281167, "metadata": {"lineno": 191, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 12:55:30.264862 140547769902912 mlp_log.py:80] :::MLL 1679230530.265 train_samples: {"value": 1281167, "metadata": {"lineno": 191, "file": "./resnet_ctl_imagenet_main.py"}}
:::MLL 1679230530.266 eval_samples: {"value": 50000, "metadata": {"lineno": 193, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 12:55:30.265909 140547769902912 mlp_log.py:80] :::MLL 1679230530.266 eval_samples: {"value": 50000, "metadata": {"lineno": 193, "file": "./resnet_ctl_imagenet_main.py"}}
:::MLL 1679230530.267 model_bn_span: {"value": 312, "metadata": {"lineno": 195, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 12:55:30.266957 140547769902912 mlp_log.py:80] :::MLL 1679230530.267 model_bn_span: {"value": 312, "metadata": {"lineno": 195, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 12:55:30.267157 140547769902912 resnet_ctl_imagenet_main.py:204] Training 71 epochs, each epoch has 513 steps, total steps: 36423; Eval 21 steps
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0319 12:55:30.377633 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0319 12:55:30.390385 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0319 12:55:30.400095 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0319 12:55:30.402572 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0319 12:55:30.414422 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0319 12:55:30.426609 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0319 12:55:30.486386 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0319 12:55:30.488949 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0319 12:55:30.497610 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0319 12:55:30.500023 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
:::MLL 1679230535.378 opt_name: {"value": "lars", "metadata": {"lineno": 101, "file": "/root/resnet50/lars_util.py"}}
I0319 12:55:35.377869 140547769902912 mlp_log.py:80] :::MLL 1679230535.378 opt_name: {"value": "lars", "metadata": {"lineno": 101, "file": "/root/resnet50/lars_util.py"}}
:::MLL 1679230535.379 lars_epsilon: {"value": 0.0, "metadata": {"lineno": 103, "file": "/root/resnet50/lars_util.py"}}
I0319 12:55:35.378870 140547769902912 mlp_log.py:80] :::MLL 1679230535.379 lars_epsilon: {"value": 0.0, "metadata": {"lineno": 103, "file": "/root/resnet50/lars_util.py"}}
:::MLL 1679230535.380 lars_opt_weight_decay: {"value": 0.0002, "metadata": {"lineno": 104, "file": "/root/resnet50/lars_util.py"}}
I0319 12:55:35.379752 140547769902912 mlp_log.py:80] :::MLL 1679230535.380 lars_opt_weight_decay: {"value": 0.0002, "metadata": {"lineno": 104, "file": "/root/resnet50/lars_util.py"}}
:::MLL 1679230535.381 lars_opt_base_learning_rate: {"value": 4.9, "metadata": {"lineno": 106, "file": "/root/resnet50/lars_util.py"}}
I0319 12:55:35.380624 140547769902912 mlp_log.py:80] :::MLL 1679230535.381 lars_opt_base_learning_rate: {"value": 4.9, "metadata": {"lineno": 106, "file": "/root/resnet50/lars_util.py"}}
:::MLL 1679230535.381 lars_opt_learning_rate_warmup_epochs: {"value": 5.0, "metadata": {"lineno": 108, "file": "/root/resnet50/lars_util.py"}}
I0319 12:55:35.381502 140547769902912 mlp_log.py:80] :::MLL 1679230535.381 lars_opt_learning_rate_warmup_epochs: {"value": 5.0, "metadata": {"lineno": 108, "file": "/root/resnet50/lars_util.py"}}
:::MLL 1679230535.382 lars_opt_end_learning_rate: {"value": 0.0001, "metadata": {"lineno": 110, "file": "/root/resnet50/lars_util.py"}}
I0319 12:55:35.382365 140547769902912 mlp_log.py:80] :::MLL 1679230535.382 lars_opt_end_learning_rate: {"value": 0.0001, "metadata": {"lineno": 110, "file": "/root/resnet50/lars_util.py"}}
:::MLL 1679230535.384 lars_opt_learning_rate_decay_steps: {"value": 33346, "metadata": {"lineno": 115, "file": "/root/resnet50/lars_util.py"}}
I0319 12:55:35.383680 140547769902912 mlp_log.py:80] :::MLL 1679230535.384 lars_opt_learning_rate_decay_steps: {"value": 33346, "metadata": {"lineno": 115, "file": "/root/resnet50/lars_util.py"}}
:::MLL 1679230535.385 lars_opt_learning_rate_decay_poly_power: {"value": 2.0, "metadata": {"lineno": 117, "file": "/root/resnet50/lars_util.py"}}
I0319 12:55:35.384541 140547769902912 mlp_log.py:80] :::MLL 1679230535.385 lars_opt_learning_rate_decay_poly_power: {"value": 2.0, "metadata": {"lineno": 117, "file": "/root/resnet50/lars_util.py"}}
:::MLL 1679230535.385 lars_opt_momentum: {"value": 0.9, "metadata": {"lineno": 119, "file": "/root/resnet50/lars_util.py"}}
I0319 12:55:35.385398 140547769902912 mlp_log.py:80] :::MLL 1679230535.385 lars_opt_momentum: {"value": 0.9, "metadata": {"lineno": 119, "file": "/root/resnet50/lars_util.py"}}
I0319 12:55:35.494630 140547769902912 resnet_ctl_imagenet_main.py:238] Warmup for 1 steps.
I0319 12:55:35.496956 140547769902912 controller.py:340] Warmup at step 0 of 1
I0319 12:55:35.497112 140547769902912 controller.py:345] Entering warmup loop with 1 steps, at step 0 of 1
WARNING:tensorflow:From /root/resnet50/tf2_common/training/utils.py:139: StrategyBase.experimental_distribute_datasets_from_function (from tensorflow.python.distribute.distribute_lib) is deprecated and will be removed in a future version.
Instructions for updating:
rename to distribute_datasets_from_function
W0319 12:55:35.497444 140547769902912 deprecation.py:341] From /root/resnet50/tf2_common/training/utils.py:139: StrategyBase.experimental_distribute_datasets_from_function (from tensorflow.python.distribute.distribute_lib) is deprecated and will be removed in a future version.
Instructions for updating:
rename to distribute_datasets_from_function
I0319 12:55:35.897564 140547769902912 resnet_runnable.py:484] Entering the warmup loop.
WARNING:tensorflow:From /usr/local/lib/python3.8/site-packages/tensorflow/python/autograph/impl/api.py:464: calling function (from tensorflow.python.eager.def_function) with experimental_compile is deprecated and will be removed in a future version.
Instructions for updating:
experimental_compile is deprecated, use jit_compile instead
W0319 12:55:37.124004 140547769902912 deprecation.py:545] From /usr/local/lib/python3.8/site-packages/tensorflow/python/autograph/impl/api.py:464: calling function (from tensorflow.python.eager.def_function) with experimental_compile is deprecated and will be removed in a future version.
Instructions for updating:
experimental_compile is deprecated, use jit_compile instead
INFO:tensorflow:batch_all_reduce: 161 all-reduces with algorithm = nccl, num_packs = 1
I0319 12:55:55.412617 140547769902912 cross_device_ops.py:900] batch_all_reduce: 161 all-reduces with algorithm = nccl, num_packs = 1
INFO:tensorflow:batch_all_reduce: 161 all-reduces with algorithm = nccl, num_packs = 1
I0319 12:56:48.352646 140547769902912 cross_device_ops.py:900] batch_all_reduce: 161 all-reduces with algorithm = nccl, num_packs = 1
I0319 13:00:32.592645 140547769902912 resnet_runnable.py:497] Exiting the warmup loop.
I0319 13:00:32.595108 140547769902912 controller.py:220] step: 1 steps_per_second: 0.00
enter fp16 computing
step: 1 steps_per_second: 0.00
:::MLL 1679230832.596 init_stop: {"value": null, "metadata": {"lineno": 258, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 13:00:32.596201 140547769902912 mlp_log.py:80] :::MLL 1679230832.596 init_stop: {"value": null, "metadata": {"lineno": 258, "file": "./resnet_ctl_imagenet_main.py"}}
:::MLL 1679230832.597 run_start: {"value": null, "metadata": {"lineno": 267, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 13:00:32.596997 140547769902912 mlp_log.py:80] :::MLL 1679230832.597 run_start: {"value": null, "metadata": {"lineno": 267, "file": "./resnet_ctl_imagenet_main.py"}}
:::MLL 1679230832.598 block_start: {"value": null, "metadata": {"first_epoch_num": 1, "epoch_count": 3, "lineno": 268, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 13:00:32.597745 140547769902912 mlp_log.py:80] :::MLL 1679230832.598 block_start: {"value": null, "metadata": {"first_epoch_num": 1, "epoch_count": 3, "lineno": 268, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 13:00:32.599620 140547769902912 controller.py:247] Train at step 0 of 36423
I0319 13:00:32.599745 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 0 of 36423
I0319 13:00:32.612586 140547769902912 imagenet_preprocessing.py:338] Sharding the dataset: input_pipeline_id=0 num_input_pipelines=1
W0319 13:00:32.634842 140547769902912 options.py:503] options.experimental_threading is deprecated. Use options.threading instead.
I0319 13:00:32.636068 140547769902912 imagenet_preprocessing.py:104] datasets_num_private_threads: 32
I0319 13:00:32.637336 140547769902912 imagenet_preprocessing.py:118] Num classes: 1000
I0319 13:00:32.637444 140547769902912 imagenet_preprocessing.py:119] One hot: True
I0319 13:08:32.765698 140547769902912 keras_utils.py:120] TimeHistory: 2676.05 examples/second between steps 0 and 513
I0319 13:08:32.769956 140547769902912 controller.py:220] step: 513 steps_per_second: 1.07 {'train_loss': 101.53466, 'train_accuracy': 0.025109181}
I0319 13:08:32.770123 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 513 of 36423
I0319 13:16:30.476807 140547769902912 keras_utils.py:120] TimeHistory: 2680.53 examples/second between steps 513 and 1026
I0319 13:16:30.481098 140547769902912 controller.py:220] step: 1026 steps_per_second: 1.07 {'train_loss': 80.75745, 'train_accuracy': 0.13648738}
I0319 13:16:30.481256 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 1026 of 36423
I0319 13:24:28.062501 140547769902912 keras_utils.py:120] TimeHistory: 2681.24 examples/second between steps 1026 and 1539
I0319 13:24:28.066748 140547769902912 controller.py:220] step: 1539 steps_per_second: 1.07 {'train_loss': 68.72967, 'train_accuracy': 0.25144324}
I0319 13:24:28.066913 140547769902912 controller.py:185] Start evaluation at step: 1539
I0319 13:24:28.070569 140547769902912 imagenet_preprocessing.py:338] Sharding the dataset: input_pipeline_id=0 num_input_pipelines=1
W0319 13:24:28.088642 140547769902912 options.py:503] options.experimental_threading is deprecated. Use options.threading instead.
I0319 13:24:28.089705 140547769902912 imagenet_preprocessing.py:104] datasets_num_private_threads: 32
I0319 13:24:28.089835 140547769902912 imagenet_preprocessing.py:118] Num classes: 1000
I0319 13:24:28.089923 140547769902912 imagenet_preprocessing.py:119] One hot: True
step: 513 steps_per_second: 1.07 {'train_loss': 101.53466, 'train_accuracy': 0.025109181}
step: 1026 steps_per_second: 1.07 {'train_loss': 80.75745, 'train_accuracy': 0.13648738}
step: 1539 steps_per_second: 1.07 {'train_loss': 68.72967, 'train_accuracy': 0.25144324}
:::MLL 1679232268.928 eval_start: {"value": null, "metadata": {"epoch_num": 3, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 13:24:28.927603 140547769902912 mlp_log.py:80] :::MLL 1679232268.928 eval_start: {"value": null, "metadata": {"epoch_num": 3, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679232301.308 eval_stop: {"value": null, "metadata": {"epoch_num": 3, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 13:25:01.308466 140547769902912 mlp_log.py:80] :::MLL 1679232301.308 eval_stop: {"value": null, "metadata": {"epoch_num": 3, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679232301.317 eval_accuracy: {"value": 0.255840003490448, "metadata": {"epoch_num": 3, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 13:25:01.317326 140547769902912 mlp_log.py:80] :::MLL 1679232301.317 eval_accuracy: {"value": 0.255840003490448, "metadata": {"epoch_num": 3, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679232301.318 block_stop: {"value": null, "metadata": {"first_epoch_num": 1, "epoch_count": 3, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 13:25:01.318364 140547769902912 mlp_log.py:80] :::MLL 1679232301.318 block_stop: {"value": null, "metadata": {"first_epoch_num": 1, "epoch_count": 3, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679232301.319 block_start: {"value": null, "metadata": {"first_epoch_num": 4, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 13:25:01.319331 140547769902912 mlp_log.py:80] :::MLL 1679232301.319 block_start: {"value": null, "metadata": {"first_epoch_num": 4, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 13:25:01.329561 140547769902912 controller.py:220] step: 1539 evaluation metric: {'test_loss': 0.49958566, 'test_accuracy': 0.25584, 'continue_training': True}
I0319 13:25:01.329745 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 1539 of 36423
I0319 13:32:58.584241 140547769902912 keras_utils.py:120] TimeHistory: 2683.07 examples/second between steps 1539 and 2052
I0319 13:32:58.588519 140547769902912 controller.py:220] step: 2052 steps_per_second: 1.00 {'train_loss': 61.880257, 'train_accuracy': 0.3291465}
I0319 13:32:58.588680 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 2052 of 36423
I0319 13:40:56.833560 140547769902912 keras_utils.py:120] TimeHistory: 2677.52 examples/second between steps 2052 and 2565
I0319 13:40:56.837803 140547769902912 controller.py:220] step: 2565 steps_per_second: 1.07 {'train_loss': 58.142868, 'train_accuracy': 0.3752999}
I0319 13:40:56.837963 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 2565 of 36423
I0319 13:48:55.233101 140547769902912 keras_utils.py:120] TimeHistory: 2676.68 examples/second between steps 2565 and 3078
I0319 13:48:55.237374 140547769902912 controller.py:220] step: 3078 steps_per_second: 1.07 {'train_loss': 55.290226, 'train_accuracy': 0.41178867}
I0319 13:48:55.237531 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 3078 of 36423
I0319 13:56:53.574455 140547769902912 keras_utils.py:120] TimeHistory: 2677.00 examples/second between steps 3078 and 3591
I0319 13:56:53.578727 140547769902912 controller.py:220] step: 3591 steps_per_second: 1.07 {'train_loss': 52.677834, 'train_accuracy': 0.4466218}
I0319 13:56:53.578876 140547769902912 controller.py:185] Start evaluation at step: 3591
step: 1539 evaluation metric: {'test_loss': 0.49958566, 'test_accuracy': 0.25584, 'continue_training': True}
step: 2052 steps_per_second: 1.00 {'train_loss': 61.880257, 'train_accuracy': 0.3291465}
step: 2565 steps_per_second: 1.07 {'train_loss': 58.142868, 'train_accuracy': 0.3752999}
step: 3078 steps_per_second: 1.07 {'train_loss': 55.290226, 'train_accuracy': 0.41178867}
step: 3591 steps_per_second: 1.07 {'train_loss': 52.677834, 'train_accuracy': 0.4466218}
:::MLL 1679234214.081 eval_start: {"value": null, "metadata": {"epoch_num": 7, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 13:56:54.080654 140547769902912 mlp_log.py:80] :::MLL 1679234214.081 eval_start: {"value": null, "metadata": {"epoch_num": 7, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679234225.254 eval_stop: {"value": null, "metadata": {"epoch_num": 7, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 13:57:05.254401 140547769902912 mlp_log.py:80] :::MLL 1679234225.254 eval_stop: {"value": null, "metadata": {"epoch_num": 7, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679234225.261 eval_accuracy: {"value": 0.4514999985694885, "metadata": {"epoch_num": 7, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 13:57:05.261220 140547769902912 mlp_log.py:80] :::MLL 1679234225.261 eval_accuracy: {"value": 0.4514999985694885, "metadata": {"epoch_num": 7, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679234225.262 block_stop: {"value": null, "metadata": {"first_epoch_num": 4, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 13:57:05.262227 140547769902912 mlp_log.py:80] :::MLL 1679234225.262 block_stop: {"value": null, "metadata": {"first_epoch_num": 4, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679234225.263 block_start: {"value": null, "metadata": {"first_epoch_num": 8, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 13:57:05.263200 140547769902912 mlp_log.py:80] :::MLL 1679234225.263 block_start: {"value": null, "metadata": {"first_epoch_num": 8, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 13:57:05.272903 140547769902912 controller.py:220] step: 3591 evaluation metric: {'test_loss': 0.38534293, 'test_accuracy': 0.4515, 'continue_training': True}
I0319 13:57:05.273066 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 3591 of 36423
I0319 14:05:03.201216 140547769902912 keras_utils.py:120] TimeHistory: 2679.28 examples/second between steps 3591 and 4104
I0319 14:05:03.205459 140547769902912 controller.py:220] step: 4104 steps_per_second: 1.05 {'train_loss': 50.85758, 'train_accuracy': 0.47165993}
I0319 14:05:03.205613 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 4104 of 36423
I0319 14:13:01.703775 140547769902912 keras_utils.py:120] TimeHistory: 2676.10 examples/second between steps 4104 and 4617
I0319 14:13:01.707995 140547769902912 controller.py:220] step: 4617 steps_per_second: 1.07 {'train_loss': 49.526817, 'train_accuracy': 0.48903587}
I0319 14:13:01.708152 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 4617 of 36423
I0319 14:20:58.757003 140547769902912 keras_utils.py:120] TimeHistory: 2684.23 examples/second between steps 4617 and 5130
I0319 14:20:58.761198 140547769902912 controller.py:220] step: 5130 steps_per_second: 1.08 {'train_loss': 48.474247, 'train_accuracy': 0.5037846}
I0319 14:20:58.761370 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 5130 of 36423
I0319 14:28:56.838135 140547769902912 keras_utils.py:120] TimeHistory: 2678.46 examples/second between steps 5130 and 5643
I0319 14:28:56.842247 140547769902912 controller.py:220] step: 5643 steps_per_second: 1.07 {'train_loss': 47.524445, 'train_accuracy': 0.517012}
I0319 14:28:56.842405 140547769902912 controller.py:185] Start evaluation at step: 5643
step: 3591 evaluation metric: {'test_loss': 0.38534293, 'test_accuracy': 0.4515, 'continue_training': True}
step: 4104 steps_per_second: 1.05 {'train_loss': 50.85758, 'train_accuracy': 0.47165993}
step: 4617 steps_per_second: 1.07 {'train_loss': 49.526817, 'train_accuracy': 0.48903587}
step: 5130 steps_per_second: 1.08 {'train_loss': 48.474247, 'train_accuracy': 0.5037846}
step: 5643 steps_per_second: 1.07 {'train_loss': 47.524445, 'train_accuracy': 0.517012}
:::MLL 1679236137.347 eval_start: {"value": null, "metadata": {"epoch_num": 11, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 14:28:57.346966 140547769902912 mlp_log.py:80] :::MLL 1679236137.347 eval_start: {"value": null, "metadata": {"epoch_num": 11, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679236148.307 eval_stop: {"value": null, "metadata": {"epoch_num": 11, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 14:29:08.307533 140547769902912 mlp_log.py:80] :::MLL 1679236148.307 eval_stop: {"value": null, "metadata": {"epoch_num": 11, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679236148.314 eval_accuracy: {"value": 0.5169399976730347, "metadata": {"epoch_num": 11, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 14:29:08.314471 140547769902912 mlp_log.py:80] :::MLL 1679236148.314 eval_accuracy: {"value": 0.5169399976730347, "metadata": {"epoch_num": 11, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679236148.315 block_stop: {"value": null, "metadata": {"first_epoch_num": 8, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 14:29:08.315475 140547769902912 mlp_log.py:80] :::MLL 1679236148.315 block_stop: {"value": null, "metadata": {"first_epoch_num": 8, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679236148.316 block_start: {"value": null, "metadata": {"first_epoch_num": 12, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 14:29:08.316439 140547769902912 mlp_log.py:80] :::MLL 1679236148.316 block_start: {"value": null, "metadata": {"first_epoch_num": 12, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 14:29:08.326488 140547769902912 controller.py:220] step: 5643 evaluation metric: {'test_loss': 0.34546962, 'test_accuracy': 0.51694, 'continue_training': True}
I0319 14:29:08.326648 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 5643 of 36423
I0319 14:37:05.725753 140547769902912 keras_utils.py:120] TimeHistory: 2682.26 examples/second between steps 5643 and 6156
I0319 14:37:05.729918 140547769902912 controller.py:220] step: 6156 steps_per_second: 1.05 {'train_loss': 46.72335, 'train_accuracy': 0.5281839}
I0319 14:37:05.730074 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 6156 of 36423
I0319 14:45:03.411590 140547769902912 keras_utils.py:120] TimeHistory: 2680.68 examples/second between steps 6156 and 6669
I0319 14:45:03.415779 140547769902912 controller.py:220] step: 6669 steps_per_second: 1.07 {'train_loss': 46.06021, 'train_accuracy': 0.53761417}
I0319 14:45:03.415935 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 6669 of 36423
I0319 14:53:02.156559 140547769902912 keras_utils.py:120] TimeHistory: 2674.74 examples/second between steps 6669 and 7182
I0319 14:53:02.160710 140547769902912 controller.py:220] step: 7182 steps_per_second: 1.07 {'train_loss': 45.366295, 'train_accuracy': 0.54672974}
I0319 14:53:02.160865 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 7182 of 36423
I0319 15:01:00.511001 140547769902912 keras_utils.py:120] TimeHistory: 2676.93 examples/second between steps 7182 and 7695
I0319 15:01:00.515219 140547769902912 controller.py:220] step: 7695 steps_per_second: 1.07 {'train_loss': 44.782856, 'train_accuracy': 0.5550253}
I0319 15:01:00.517019 140547769902912 controller.py:185] Start evaluation at step: 7695
step: 5643 evaluation metric: {'test_loss': 0.34546962, 'test_accuracy': 0.51694, 'continue_training': True}
step: 6156 steps_per_second: 1.05 {'train_loss': 46.72335, 'train_accuracy': 0.5281839}
step: 6669 steps_per_second: 1.07 {'train_loss': 46.06021, 'train_accuracy': 0.53761417}
step: 7182 steps_per_second: 1.07 {'train_loss': 45.366295, 'train_accuracy': 0.54672974}
step: 7695 steps_per_second: 1.07 {'train_loss': 44.782856, 'train_accuracy': 0.5550253}
:::MLL 1679238061.002 eval_start: {"value": null, "metadata": {"epoch_num": 15, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 15:01:01.002238 140547769902912 mlp_log.py:80] :::MLL 1679238061.002 eval_start: {"value": null, "metadata": {"epoch_num": 15, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679238071.832 eval_stop: {"value": null, "metadata": {"epoch_num": 15, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 15:01:11.832513 140547769902912 mlp_log.py:80] :::MLL 1679238071.832 eval_stop: {"value": null, "metadata": {"epoch_num": 15, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679238071.839 eval_accuracy: {"value": 0.5540599822998047, "metadata": {"epoch_num": 15, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 15:01:11.839387 140547769902912 mlp_log.py:80] :::MLL 1679238071.839 eval_accuracy: {"value": 0.5540599822998047, "metadata": {"epoch_num": 15, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679238071.840 block_stop: {"value": null, "metadata": {"first_epoch_num": 12, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 15:01:11.840405 140547769902912 mlp_log.py:80] :::MLL 1679238071.840 block_stop: {"value": null, "metadata": {"first_epoch_num": 12, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679238071.841 block_start: {"value": null, "metadata": {"first_epoch_num": 16, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 15:01:11.841379 140547769902912 mlp_log.py:80] :::MLL 1679238071.841 block_start: {"value": null, "metadata": {"first_epoch_num": 16, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 15:01:11.851153 140547769902912 controller.py:220] step: 7695 evaluation metric: {'test_loss': 0.3284506, 'test_accuracy': 0.55406, 'continue_training': True}
I0319 15:01:11.851322 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 7695 of 36423
I0319 15:09:09.903125 140547769902912 keras_utils.py:120] TimeHistory: 2678.59 examples/second between steps 7695 and 8208
I0319 15:09:09.907292 140547769902912 controller.py:220] step: 8208 steps_per_second: 1.05 {'train_loss': 44.193314, 'train_accuracy': 0.56363946}
I0319 15:09:09.907462 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 8208 of 36423
I0319 15:17:08.328512 140547769902912 keras_utils.py:120] TimeHistory: 2676.53 examples/second between steps 8208 and 8721
I0319 15:17:08.332779 140547769902912 controller.py:220] step: 8721 steps_per_second: 1.07 {'train_loss': 43.65782, 'train_accuracy': 0.5716288}
I0319 15:17:08.332940 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 8721 of 36423
I0319 15:25:06.558547 140547769902912 keras_utils.py:120] TimeHistory: 2677.62 examples/second between steps 8721 and 9234
I0319 15:25:06.562764 140547769902912 controller.py:220] step: 9234 steps_per_second: 1.07 {'train_loss': 43.085396, 'train_accuracy': 0.5789591}
I0319 15:25:06.562925 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 9234 of 36423
I0319 15:33:04.438484 140547769902912 keras_utils.py:120] TimeHistory: 2679.59 examples/second between steps 9234 and 9747
I0319 15:33:04.442654 140547769902912 controller.py:220] step: 9747 steps_per_second: 1.07 {'train_loss': 42.59366, 'train_accuracy': 0.58631825}
I0319 15:33:04.442804 140547769902912 controller.py:185] Start evaluation at step: 9747
step: 7695 evaluation metric: {'test_loss': 0.3284506, 'test_accuracy': 0.55406, 'continue_training': True}
step: 8208 steps_per_second: 1.05 {'train_loss': 44.193314, 'train_accuracy': 0.56363946}
step: 8721 steps_per_second: 1.07 {'train_loss': 43.65782, 'train_accuracy': 0.5716288}
step: 9234 steps_per_second: 1.07 {'train_loss': 43.085396, 'train_accuracy': 0.5789591}
step: 9747 steps_per_second: 1.07 {'train_loss': 42.59366, 'train_accuracy': 0.58631825}
:::MLL 1679239984.931 eval_start: {"value": null, "metadata": {"epoch_num": 19, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 15:33:04.930735 140547769902912 mlp_log.py:80] :::MLL 1679239984.931 eval_start: {"value": null, "metadata": {"epoch_num": 19, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679239996.094 eval_stop: {"value": null, "metadata": {"epoch_num": 19, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 15:33:16.094051 140547769902912 mlp_log.py:80] :::MLL 1679239996.094 eval_stop: {"value": null, "metadata": {"epoch_num": 19, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679239996.101 eval_accuracy: {"value": 0.6158000230789185, "metadata": {"epoch_num": 19, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 15:33:16.100932 140547769902912 mlp_log.py:80] :::MLL 1679239996.101 eval_accuracy: {"value": 0.6158000230789185, "metadata": {"epoch_num": 19, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679239996.102 block_stop: {"value": null, "metadata": {"first_epoch_num": 16, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 15:33:16.101949 140547769902912 mlp_log.py:80] :::MLL 1679239996.102 block_stop: {"value": null, "metadata": {"first_epoch_num": 16, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679239996.103 block_start: {"value": null, "metadata": {"first_epoch_num": 20, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 15:33:16.102918 140547769902912 mlp_log.py:80] :::MLL 1679239996.103 block_start: {"value": null, "metadata": {"first_epoch_num": 20, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 15:33:16.112729 140547769902912 controller.py:220] step: 9747 evaluation metric: {'test_loss': 0.29739872, 'test_accuracy': 0.6158, 'continue_training': True}
I0319 15:33:16.112884 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 9747 of 36423
I0319 15:41:14.392338 140547769902912 keras_utils.py:120] TimeHistory: 2677.32 examples/second between steps 9747 and 10260
I0319 15:41:14.396505 140547769902912 controller.py:220] step: 10260 steps_per_second: 1.05 {'train_loss': 42.066074, 'train_accuracy': 0.59374607}
I0319 15:41:14.396659 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 10260 of 36423
I0319 15:49:11.961558 140547769902912 keras_utils.py:120] TimeHistory: 2681.33 examples/second between steps 10260 and 10773
I0319 15:49:11.965767 140547769902912 controller.py:220] step: 10773 steps_per_second: 1.07 {'train_loss': 41.57502, 'train_accuracy': 0.60048044}
I0319 15:49:11.965925 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 10773 of 36423
I0319 15:57:09.164847 140547769902912 keras_utils.py:120] TimeHistory: 2683.39 examples/second between steps 10773 and 11286
I0319 15:57:09.168977 140547769902912 controller.py:220] step: 11286 steps_per_second: 1.08 {'train_loss': 41.104015, 'train_accuracy': 0.6074765}
I0319 15:57:09.169133 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 11286 of 36423
I0319 16:05:06.888276 140547769902912 keras_utils.py:120] TimeHistory: 2680.46 examples/second between steps 11286 and 11799
I0319 16:05:06.892483 140547769902912 controller.py:220] step: 11799 steps_per_second: 1.07 {'train_loss': 40.675106, 'train_accuracy': 0.6140507}
I0319 16:05:06.892634 140547769902912 controller.py:185] Start evaluation at step: 11799
step: 9747 evaluation metric: {'test_loss': 0.29739872, 'test_accuracy': 0.6158, 'continue_training': True}
step: 10260 steps_per_second: 1.05 {'train_loss': 42.066074, 'train_accuracy': 0.59374607}
step: 10773 steps_per_second: 1.07 {'train_loss': 41.57502, 'train_accuracy': 0.60048044}
step: 11286 steps_per_second: 1.08 {'train_loss': 41.104015, 'train_accuracy': 0.6074765}
step: 11799 steps_per_second: 1.07 {'train_loss': 40.675106, 'train_accuracy': 0.6140507}
:::MLL 1679241907.377 eval_start: {"value": null, "metadata": {"epoch_num": 23, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 16:05:07.376655 140547769902912 mlp_log.py:80] :::MLL 1679241907.377 eval_start: {"value": null, "metadata": {"epoch_num": 23, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679241918.161 eval_stop: {"value": null, "metadata": {"epoch_num": 23, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 16:05:18.161060 140547769902912 mlp_log.py:80] :::MLL 1679241918.161 eval_stop: {"value": null, "metadata": {"epoch_num": 23, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679241918.168 eval_accuracy: {"value": 0.6306399703025818, "metadata": {"epoch_num": 23, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 16:05:18.167979 140547769902912 mlp_log.py:80] :::MLL 1679241918.168 eval_accuracy: {"value": 0.6306399703025818, "metadata": {"epoch_num": 23, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679241918.169 block_stop: {"value": null, "metadata": {"first_epoch_num": 20, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 16:05:18.168991 140547769902912 mlp_log.py:80] :::MLL 1679241918.169 block_stop: {"value": null, "metadata": {"first_epoch_num": 20, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679241918.170 block_start: {"value": null, "metadata": {"first_epoch_num": 24, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 16:05:18.169961 140547769902912 mlp_log.py:80] :::MLL 1679241918.170 block_start: {"value": null, "metadata": {"first_epoch_num": 24, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 16:05:18.179913 140547769902912 controller.py:220] step: 11799 evaluation metric: {'test_loss': 0.29088515, 'test_accuracy': 0.63064, 'continue_training': True}
I0319 16:05:18.180072 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 11799 of 36423
I0319 16:13:15.017472 140547769902912 keras_utils.py:120] TimeHistory: 2685.42 examples/second between steps 11799 and 12312
I0319 16:13:15.021653 140547769902912 controller.py:220] step: 12312 steps_per_second: 1.05 {'train_loss': 40.224228, 'train_accuracy': 0.6205242}
I0319 16:13:15.021814 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 12312 of 36423
I0319 16:21:11.873815 140547769902912 keras_utils.py:120] TimeHistory: 2685.34 examples/second between steps 12312 and 12825
I0319 16:21:11.877966 140547769902912 controller.py:220] step: 12825 steps_per_second: 1.08 {'train_loss': 39.75526, 'train_accuracy': 0.627093}
I0319 16:21:11.878120 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 12825 of 36423
I0319 16:29:08.757629 140547769902912 keras_utils.py:120] TimeHistory: 2685.19 examples/second between steps 12825 and 13338
I0319 16:29:08.761925 140547769902912 controller.py:220] step: 13338 steps_per_second: 1.08 {'train_loss': 39.357185, 'train_accuracy': 0.6333713}
I0319 16:29:08.762086 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 13338 of 36423
I0319 16:37:05.957099 140547769902912 keras_utils.py:120] TimeHistory: 2683.41 examples/second between steps 13338 and 13851
I0319 16:37:05.961228 140547769902912 controller.py:220] step: 13851 steps_per_second: 1.08 {'train_loss': 38.924423, 'train_accuracy': 0.6391099}
I0319 16:37:05.961388 140547769902912 controller.py:185] Start evaluation at step: 13851
step: 11799 evaluation metric: {'test_loss': 0.29088515, 'test_accuracy': 0.63064, 'continue_training': True}
step: 12312 steps_per_second: 1.05 {'train_loss': 40.224228, 'train_accuracy': 0.6205242}
step: 12825 steps_per_second: 1.08 {'train_loss': 39.75526, 'train_accuracy': 0.627093}
step: 13338 steps_per_second: 1.08 {'train_loss': 39.357185, 'train_accuracy': 0.6333713}
step: 13851 steps_per_second: 1.08 {'train_loss': 38.924423, 'train_accuracy': 0.6391099}
:::MLL 1679243826.441 eval_start: {"value": null, "metadata": {"epoch_num": 27, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 16:37:06.441277 140547769902912 mlp_log.py:80] :::MLL 1679243826.441 eval_start: {"value": null, "metadata": {"epoch_num": 27, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679243837.448 eval_stop: {"value": null, "metadata": {"epoch_num": 27, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 16:37:17.448269 140547769902912 mlp_log.py:80] :::MLL 1679243837.448 eval_stop: {"value": null, "metadata": {"epoch_num": 27, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679243837.455 eval_accuracy: {"value": 0.6637200117111206, "metadata": {"epoch_num": 27, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 16:37:17.455250 140547769902912 mlp_log.py:80] :::MLL 1679243837.455 eval_accuracy: {"value": 0.6637200117111206, "metadata": {"epoch_num": 27, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679243837.456 block_stop: {"value": null, "metadata": {"first_epoch_num": 24, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 16:37:17.456276 140547769902912 mlp_log.py:80] :::MLL 1679243837.456 block_stop: {"value": null, "metadata": {"first_epoch_num": 24, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679243837.457 block_start: {"value": null, "metadata": {"first_epoch_num": 28, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 16:37:17.457254 140547769902912 mlp_log.py:80] :::MLL 1679243837.457 block_start: {"value": null, "metadata": {"first_epoch_num": 28, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 16:37:17.467283 140547769902912 controller.py:220] step: 13851 evaluation metric: {'test_loss': 0.27420917, 'test_accuracy': 0.66372, 'continue_training': True}
I0319 16:37:17.467454 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 13851 of 36423
I0319 16:45:14.272286 140547769902912 keras_utils.py:120] TimeHistory: 2685.60 examples/second between steps 13851 and 14364
I0319 16:45:14.276514 140547769902912 controller.py:220] step: 14364 steps_per_second: 1.05 {'train_loss': 38.50588, 'train_accuracy': 0.645977}
I0319 16:45:14.276674 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 14364 of 36423
I0319 16:53:12.242927 140547769902912 keras_utils.py:120] TimeHistory: 2679.08 examples/second between steps 14364 and 14877
I0319 16:53:12.247173 140547769902912 controller.py:220] step: 14877 steps_per_second: 1.07 {'train_loss': 38.042336, 'train_accuracy': 0.65228266}
I0319 16:53:12.247342 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 14877 of 36423
I0319 17:01:08.925324 140547769902912 keras_utils.py:120] TimeHistory: 2686.32 examples/second between steps 14877 and 15390
I0319 17:01:08.929522 140547769902912 controller.py:220] step: 15390 steps_per_second: 1.08 {'train_loss': 37.64315, 'train_accuracy': 0.65857184}
I0319 17:01:08.929681 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 15390 of 36423
I0319 17:09:05.460300 140547769902912 keras_utils.py:120] TimeHistory: 2687.14 examples/second between steps 15390 and 15903
I0319 17:09:05.464558 140547769902912 controller.py:220] step: 15903 steps_per_second: 1.08 {'train_loss': 37.25062, 'train_accuracy': 0.66452134}
I0319 17:09:05.464712 140547769902912 controller.py:185] Start evaluation at step: 15903
step: 13851 evaluation metric: {'test_loss': 0.27420917, 'test_accuracy': 0.66372, 'continue_training': True}
step: 14364 steps_per_second: 1.05 {'train_loss': 38.50588, 'train_accuracy': 0.645977}
step: 14877 steps_per_second: 1.07 {'train_loss': 38.042336, 'train_accuracy': 0.65228266}
step: 15390 steps_per_second: 1.08 {'train_loss': 37.64315, 'train_accuracy': 0.65857184}
step: 15903 steps_per_second: 1.08 {'train_loss': 37.25062, 'train_accuracy': 0.66452134}
:::MLL 1679245745.958 eval_start: {"value": null, "metadata": {"epoch_num": 31, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 17:09:05.958450 140547769902912 mlp_log.py:80] :::MLL 1679245745.958 eval_start: {"value": null, "metadata": {"epoch_num": 31, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679245756.709 eval_stop: {"value": null, "metadata": {"epoch_num": 31, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 17:09:16.709334 140547769902912 mlp_log.py:80] :::MLL 1679245756.709 eval_stop: {"value": null, "metadata": {"epoch_num": 31, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679245756.716 eval_accuracy: {"value": 0.663860023021698, "metadata": {"epoch_num": 31, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 17:09:16.716322 140547769902912 mlp_log.py:80] :::MLL 1679245756.716 eval_accuracy: {"value": 0.663860023021698, "metadata": {"epoch_num": 31, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679245756.717 block_stop: {"value": null, "metadata": {"first_epoch_num": 28, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 17:09:16.717343 140547769902912 mlp_log.py:80] :::MLL 1679245756.717 block_stop: {"value": null, "metadata": {"first_epoch_num": 28, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679245756.718 block_start: {"value": null, "metadata": {"first_epoch_num": 32, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 17:09:16.718302 140547769902912 mlp_log.py:80] :::MLL 1679245756.718 block_start: {"value": null, "metadata": {"first_epoch_num": 32, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 17:09:16.728244 140547769902912 controller.py:220] step: 15903 evaluation metric: {'test_loss': 0.2736155, 'test_accuracy': 0.66386, 'continue_training': True}
I0319 17:09:16.728415 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 15903 of 36423
I0319 17:17:12.753624 140547769902912 keras_utils.py:120] TimeHistory: 2690.00 examples/second between steps 15903 and 16416
I0319 17:17:12.757766 140547769902912 controller.py:220] step: 16416 steps_per_second: 1.05 {'train_loss': 36.87168, 'train_accuracy': 0.6701756}
I0319 17:17:12.757923 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 16416 of 36423
I0319 17:25:08.728839 140547769902912 keras_utils.py:120] TimeHistory: 2690.31 examples/second between steps 16416 and 16929
I0319 17:25:08.733042 140547769902912 controller.py:220] step: 16929 steps_per_second: 1.08 {'train_loss': 36.469055, 'train_accuracy': 0.67674124}
I0319 17:25:08.733199 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 16929 of 36423
I0319 17:33:05.759370 140547769902912 keras_utils.py:120] TimeHistory: 2684.36 examples/second between steps 16929 and 17442
I0319 17:33:05.763500 140547769902912 controller.py:220] step: 17442 steps_per_second: 1.08 {'train_loss': 36.071156, 'train_accuracy': 0.6823971}
I0319 17:33:05.763653 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 17442 of 36423
I0319 17:41:02.449225 140547769902912 keras_utils.py:120] TimeHistory: 2686.27 examples/second between steps 17442 and 17955
I0319 17:41:02.453442 140547769902912 controller.py:220] step: 17955 steps_per_second: 1.08 {'train_loss': 35.67699, 'train_accuracy': 0.68873394}
I0319 17:41:02.453614 140547769902912 controller.py:185] Start evaluation at step: 17955
step: 15903 evaluation metric: {'test_loss': 0.2736155, 'test_accuracy': 0.66386, 'continue_training': True}
step: 16416 steps_per_second: 1.05 {'train_loss': 36.87168, 'train_accuracy': 0.6701756}
step: 16929 steps_per_second: 1.08 {'train_loss': 36.469055, 'train_accuracy': 0.67674124}
step: 17442 steps_per_second: 1.08 {'train_loss': 36.071156, 'train_accuracy': 0.6823971}
step: 17955 steps_per_second: 1.08 {'train_loss': 35.67699, 'train_accuracy': 0.68873394}
:::MLL 1679247662.938 eval_start: {"value": null, "metadata": {"epoch_num": 35, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 17:41:02.938170 140547769902912 mlp_log.py:80] :::MLL 1679247662.938 eval_start: {"value": null, "metadata": {"epoch_num": 35, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679247673.817 eval_stop: {"value": null, "metadata": {"epoch_num": 35, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 17:41:13.817046 140547769902912 mlp_log.py:80] :::MLL 1679247673.817 eval_stop: {"value": null, "metadata": {"epoch_num": 35, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679247673.824 eval_accuracy: {"value": 0.6931399703025818, "metadata": {"epoch_num": 35, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 17:41:13.824302 140547769902912 mlp_log.py:80] :::MLL 1679247673.824 eval_accuracy: {"value": 0.6931399703025818, "metadata": {"epoch_num": 35, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679247673.825 block_stop: {"value": null, "metadata": {"first_epoch_num": 32, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 17:41:13.825345 140547769902912 mlp_log.py:80] :::MLL 1679247673.825 block_stop: {"value": null, "metadata": {"first_epoch_num": 32, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679247673.826 block_start: {"value": null, "metadata": {"first_epoch_num": 36, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 17:41:13.826306 140547769902912 mlp_log.py:80] :::MLL 1679247673.826 block_start: {"value": null, "metadata": {"first_epoch_num": 36, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 17:41:13.836492 140547769902912 controller.py:220] step: 17955 evaluation metric: {'test_loss': 0.25965777, 'test_accuracy': 0.69314, 'continue_training': True}
I0319 17:41:13.836662 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 17955 of 36423
I0319 17:49:10.447629 140547769902912 keras_utils.py:120] TimeHistory: 2686.69 examples/second between steps 17955 and 18468
I0319 17:49:10.451847 140547769902912 controller.py:220] step: 18468 steps_per_second: 1.05 {'train_loss': 35.29787, 'train_accuracy': 0.6943679}
I0319 17:49:10.452003 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 18468 of 36423
I0319 17:57:07.115317 140547769902912 keras_utils.py:120] TimeHistory: 2686.40 examples/second between steps 18468 and 18981
I0319 17:57:07.119469 140547769902912 controller.py:220] step: 18981 steps_per_second: 1.08 {'train_loss': 34.908035, 'train_accuracy': 0.70074695}
I0319 17:57:07.119627 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 18981 of 36423
I0319 18:05:03.520790 140547769902912 keras_utils.py:120] TimeHistory: 2687.88 examples/second between steps 18981 and 19494
I0319 18:05:03.524950 140547769902912 controller.py:220] step: 19494 steps_per_second: 1.08 {'train_loss': 34.57146, 'train_accuracy': 0.7056616}
I0319 18:05:03.525108 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 19494 of 36423
I0319 18:13:00.146009 140547769902912 keras_utils.py:120] TimeHistory: 2686.66 examples/second between steps 19494 and 20007
I0319 18:13:00.150213 140547769902912 controller.py:220] step: 20007 steps_per_second: 1.08 {'train_loss': 34.178963, 'train_accuracy': 0.71188754}
I0319 18:13:00.150395 140547769902912 controller.py:185] Start evaluation at step: 20007
step: 17955 evaluation metric: {'test_loss': 0.25965777, 'test_accuracy': 0.69314, 'continue_training': True}
step: 18468 steps_per_second: 1.05 {'train_loss': 35.29787, 'train_accuracy': 0.6943679}
step: 18981 steps_per_second: 1.08 {'train_loss': 34.908035, 'train_accuracy': 0.70074695}
step: 19494 steps_per_second: 1.08 {'train_loss': 34.57146, 'train_accuracy': 0.7056616}
step: 20007 steps_per_second: 1.08 {'train_loss': 34.178963, 'train_accuracy': 0.71188754}
:::MLL 1679249580.639 eval_start: {"value": null, "metadata": {"epoch_num": 39, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 18:13:00.638623 140547769902912 mlp_log.py:80] :::MLL 1679249580.639 eval_start: {"value": null, "metadata": {"epoch_num": 39, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679249591.474 eval_stop: {"value": null, "metadata": {"epoch_num": 39, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 18:13:11.473854 140547769902912 mlp_log.py:80] :::MLL 1679249591.474 eval_stop: {"value": null, "metadata": {"epoch_num": 39, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679249591.482 eval_accuracy: {"value": 0.7071400284767151, "metadata": {"epoch_num": 39, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 18:13:11.482290 140547769902912 mlp_log.py:80] :::MLL 1679249591.482 eval_accuracy: {"value": 0.7071400284767151, "metadata": {"epoch_num": 39, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679249591.483 block_stop: {"value": null, "metadata": {"first_epoch_num": 36, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 18:13:11.483335 140547769902912 mlp_log.py:80] :::MLL 1679249591.483 block_stop: {"value": null, "metadata": {"first_epoch_num": 36, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679249591.484 block_start: {"value": null, "metadata": {"first_epoch_num": 40, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 18:13:11.484290 140547769902912 mlp_log.py:80] :::MLL 1679249591.484 block_start: {"value": null, "metadata": {"first_epoch_num": 40, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 18:13:11.494453 140547769902912 controller.py:220] step: 20007 evaluation metric: {'test_loss': 0.25293344, 'test_accuracy': 0.70714, 'continue_training': True}
I0319 18:13:11.494655 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 20007 of 36423
I0319 18:21:07.807034 140547769902912 keras_utils.py:120] TimeHistory: 2688.38 examples/second between steps 20007 and 20520
I0319 18:21:07.811231 140547769902912 controller.py:220] step: 20520 steps_per_second: 1.05 {'train_loss': 33.828243, 'train_accuracy': 0.7174528}
I0319 18:21:07.811421 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 20520 of 36423
I0319 18:29:04.059954 140547769902912 keras_utils.py:120] TimeHistory: 2688.74 examples/second between steps 20520 and 21033
I0319 18:29:04.064098 140547769902912 controller.py:220] step: 21033 steps_per_second: 1.08 {'train_loss': 33.491177, 'train_accuracy': 0.7229907}
I0319 18:29:04.064261 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 21033 of 36423
I0319 18:37:00.501375 140547769902912 keras_utils.py:120] TimeHistory: 2687.68 examples/second between steps 21033 and 21546
I0319 18:37:00.505643 140547769902912 controller.py:220] step: 21546 steps_per_second: 1.08 {'train_loss': 33.12628, 'train_accuracy': 0.72929555}
I0319 18:37:00.505825 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 21546 of 36423
I0319 18:44:57.448547 140547769902912 keras_utils.py:120] TimeHistory: 2684.84 examples/second between steps 21546 and 22059
I0319 18:44:57.453764 140547769902912 controller.py:220] step: 22059 steps_per_second: 1.08 {'train_loss': 32.781902, 'train_accuracy': 0.7347241}
I0319 18:44:57.453983 140547769902912 controller.py:185] Start evaluation at step: 22059
step: 20007 evaluation metric: {'test_loss': 0.25293344, 'test_accuracy': 0.70714, 'continue_training': True}
step: 20520 steps_per_second: 1.05 {'train_loss': 33.828243, 'train_accuracy': 0.7174528}
step: 21033 steps_per_second: 1.08 {'train_loss': 33.491177, 'train_accuracy': 0.7229907}
step: 21546 steps_per_second: 1.08 {'train_loss': 33.12628, 'train_accuracy': 0.72929555}
step: 22059 steps_per_second: 1.08 {'train_loss': 32.781902, 'train_accuracy': 0.7347241}
:::MLL 1679251497.962 eval_start: {"value": null, "metadata": {"epoch_num": 43, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 18:44:57.962047 140547769902912 mlp_log.py:80] :::MLL 1679251497.962 eval_start: {"value": null, "metadata": {"epoch_num": 43, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679251508.834 eval_stop: {"value": null, "metadata": {"epoch_num": 43, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 18:45:08.834538 140547769902912 mlp_log.py:80] :::MLL 1679251508.834 eval_stop: {"value": null, "metadata": {"epoch_num": 43, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679251508.842 eval_accuracy: {"value": 0.7310600280761719, "metadata": {"epoch_num": 43, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 18:45:08.841803 140547769902912 mlp_log.py:80] :::MLL 1679251508.842 eval_accuracy: {"value": 0.7310600280761719, "metadata": {"epoch_num": 43, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679251508.843 block_stop: {"value": null, "metadata": {"first_epoch_num": 40, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 18:45:08.842825 140547769902912 mlp_log.py:80] :::MLL 1679251508.843 block_stop: {"value": null, "metadata": {"first_epoch_num": 40, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679251508.844 block_start: {"value": null, "metadata": {"first_epoch_num": 44, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 18:45:08.843794 140547769902912 mlp_log.py:80] :::MLL 1679251508.844 block_start: {"value": null, "metadata": {"first_epoch_num": 44, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 18:45:08.853801 140547769902912 controller.py:220] step: 22059 evaluation metric: {'test_loss': 0.24191059, 'test_accuracy': 0.73106, 'continue_training': True}
I0319 18:45:08.853981 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 22059 of 36423
I0319 18:53:05.613945 140547769902912 keras_utils.py:120] TimeHistory: 2685.85 examples/second between steps 22059 and 22572
I0319 18:53:05.618196 140547769902912 controller.py:220] step: 22572 steps_per_second: 1.05 {'train_loss': 32.501007, 'train_accuracy': 0.7391007}
I0319 18:53:05.618384 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 22572 of 36423
I0319 19:01:03.341272 140547769902912 keras_utils.py:120] TimeHistory: 2680.44 examples/second between steps 22572 and 23085
I0319 19:01:03.345571 140547769902912 controller.py:220] step: 23085 steps_per_second: 1.07 {'train_loss': 32.159527, 'train_accuracy': 0.74508685}
I0319 19:01:03.345741 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 23085 of 36423
I0319 19:09:00.970286 140547769902912 keras_utils.py:120] TimeHistory: 2680.99 examples/second between steps 23085 and 23598
I0319 19:09:00.974560 140547769902912 controller.py:220] step: 23598 steps_per_second: 1.07 {'train_loss': 31.819632, 'train_accuracy': 0.7504209}
I0319 19:09:00.974729 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 23598 of 36423
I0319 19:16:57.904174 140547769902912 keras_utils.py:120] TimeHistory: 2684.90 examples/second between steps 23598 and 24111
I0319 19:16:57.908411 140547769902912 controller.py:220] step: 24111 steps_per_second: 1.08 {'train_loss': 31.538153, 'train_accuracy': 0.75502324}
I0319 19:16:57.908590 140547769902912 controller.py:185] Start evaluation at step: 24111
step: 22059 evaluation metric: {'test_loss': 0.24191059, 'test_accuracy': 0.73106, 'continue_training': True}
step: 22572 steps_per_second: 1.05 {'train_loss': 32.501007, 'train_accuracy': 0.7391007}
step: 23085 steps_per_second: 1.07 {'train_loss': 32.159527, 'train_accuracy': 0.74508685}
step: 23598 steps_per_second: 1.07 {'train_loss': 31.819632, 'train_accuracy': 0.7504209}
step: 24111 steps_per_second: 1.08 {'train_loss': 31.538153, 'train_accuracy': 0.75502324}
:::MLL 1679253418.442 eval_start: {"value": null, "metadata": {"epoch_num": 47, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 19:16:58.442478 140547769902912 mlp_log.py:80] :::MLL 1679253418.442 eval_start: {"value": null, "metadata": {"epoch_num": 47, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679253429.278 eval_stop: {"value": null, "metadata": {"epoch_num": 47, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 19:17:09.277749 140547769902912 mlp_log.py:80] :::MLL 1679253429.278 eval_stop: {"value": null, "metadata": {"epoch_num": 47, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679253429.285 eval_accuracy: {"value": 0.7399600148200989, "metadata": {"epoch_num": 47, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 19:17:09.285157 140547769902912 mlp_log.py:80] :::MLL 1679253429.285 eval_accuracy: {"value": 0.7399600148200989, "metadata": {"epoch_num": 47, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679253429.286 block_stop: {"value": null, "metadata": {"first_epoch_num": 44, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 19:17:09.286182 140547769902912 mlp_log.py:80] :::MLL 1679253429.286 block_stop: {"value": null, "metadata": {"first_epoch_num": 44, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679253429.287 block_start: {"value": null, "metadata": {"first_epoch_num": 48, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 19:17:09.287138 140547769902912 mlp_log.py:80] :::MLL 1679253429.287 block_start: {"value": null, "metadata": {"first_epoch_num": 48, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 19:17:09.297158 140547769902912 controller.py:220] step: 24111 evaluation metric: {'test_loss': 0.23783618, 'test_accuracy': 0.73996, 'continue_training': True}
I0319 19:17:09.297350 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 24111 of 36423
I0319 19:25:05.843054 140547769902912 keras_utils.py:120] TimeHistory: 2687.06 examples/second between steps 24111 and 24624
I0319 19:25:05.847337 140547769902912 controller.py:220] step: 24624 steps_per_second: 1.05 {'train_loss': 31.25745, 'train_accuracy': 0.7601792}
I0319 19:25:05.847517 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 24624 of 36423
I0319 19:33:01.919262 140547769902912 keras_utils.py:120] TimeHistory: 2689.74 examples/second between steps 24624 and 25137
I0319 19:33:01.923496 140547769902912 controller.py:220] step: 25137 steps_per_second: 1.08 {'train_loss': 30.94866, 'train_accuracy': 0.7650783}
I0319 19:33:01.923671 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 25137 of 36423
I0319 19:40:58.928619 140547769902912 keras_utils.py:120] TimeHistory: 2684.48 examples/second between steps 25137 and 25650
I0319 19:40:58.932954 140547769902912 controller.py:220] step: 25650 steps_per_second: 1.08 {'train_loss': 30.675001, 'train_accuracy': 0.76950336}
I0319 19:40:58.933148 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 25650 of 36423
I0319 19:48:55.050780 140547769902912 keras_utils.py:120] TimeHistory: 2689.48 examples/second between steps 25650 and 26163
I0319 19:48:55.054964 140547769902912 controller.py:220] step: 26163 steps_per_second: 1.08 {'train_loss': 30.428179, 'train_accuracy': 0.7739565}
I0319 19:48:55.055132 140547769902912 controller.py:185] Start evaluation at step: 26163
step: 24111 evaluation metric: {'test_loss': 0.23783618, 'test_accuracy': 0.73996, 'continue_training': True}
step: 24624 steps_per_second: 1.05 {'train_loss': 31.25745, 'train_accuracy': 0.7601792}
step: 25137 steps_per_second: 1.08 {'train_loss': 30.94866, 'train_accuracy': 0.7650783}
step: 25650 steps_per_second: 1.08 {'train_loss': 30.675001, 'train_accuracy': 0.76950336}
step: 26163 steps_per_second: 1.08 {'train_loss': 30.428179, 'train_accuracy': 0.7739565}
:::MLL 1679255335.547 eval_start: {"value": null, "metadata": {"epoch_num": 51, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 19:48:55.547407 140547769902912 mlp_log.py:80] :::MLL 1679255335.547 eval_start: {"value": null, "metadata": {"epoch_num": 51, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679255346.553 eval_stop: {"value": null, "metadata": {"epoch_num": 51, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 19:49:06.552730 140547769902912 mlp_log.py:80] :::MLL 1679255346.553 eval_stop: {"value": null, "metadata": {"epoch_num": 51, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679255346.560 eval_accuracy: {"value": 0.7473400235176086, "metadata": {"epoch_num": 51, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 19:49:06.559940 140547769902912 mlp_log.py:80] :::MLL 1679255346.560 eval_accuracy: {"value": 0.7473400235176086, "metadata": {"epoch_num": 51, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679255346.561 block_stop: {"value": null, "metadata": {"first_epoch_num": 48, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 19:49:06.560959 140547769902912 mlp_log.py:80] :::MLL 1679255346.561 block_stop: {"value": null, "metadata": {"first_epoch_num": 48, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679255346.562 block_start: {"value": null, "metadata": {"first_epoch_num": 52, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 19:49:06.561913 140547769902912 mlp_log.py:80] :::MLL 1679255346.562 block_start: {"value": null, "metadata": {"first_epoch_num": 52, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 19:49:06.571880 140547769902912 controller.py:220] step: 26163 evaluation metric: {'test_loss': 0.23394844, 'test_accuracy': 0.74734, 'continue_training': True}
I0319 19:49:06.572060 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 26163 of 36423
I0319 19:57:04.827503 140547769902912 keras_utils.py:120] TimeHistory: 2677.45 examples/second between steps 26163 and 26676
I0319 19:57:04.831851 140547769902912 controller.py:220] step: 26676 steps_per_second: 1.05 {'train_loss': 30.180855, 'train_accuracy': 0.7779777}
I0319 19:57:04.832029 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 26676 of 36423
I0319 20:05:01.335356 140547769902912 keras_utils.py:120] TimeHistory: 2687.31 examples/second between steps 26676 and 27189
I0319 20:05:01.339524 140547769902912 controller.py:220] step: 27189 steps_per_second: 1.08 {'train_loss': 29.918842, 'train_accuracy': 0.7823356}
I0319 20:05:01.339692 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 27189 of 36423
I0319 20:12:58.183544 140547769902912 keras_utils.py:120] TimeHistory: 2685.39 examples/second between steps 27189 and 27702
I0319 20:12:58.187861 140547769902912 controller.py:220] step: 27702 steps_per_second: 1.08 {'train_loss': 29.700476, 'train_accuracy': 0.78678167}
I0319 20:12:58.188049 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 27702 of 36423
I0319 20:20:54.531436 140547769902912 keras_utils.py:120] TimeHistory: 2688.23 examples/second between steps 27702 and 28215
I0319 20:20:54.535721 140547769902912 controller.py:220] step: 28215 steps_per_second: 1.08 {'train_loss': 29.481922, 'train_accuracy': 0.79037726}
I0319 20:20:54.535894 140547769902912 controller.py:185] Start evaluation at step: 28215
step: 26163 evaluation metric: {'test_loss': 0.23394844, 'test_accuracy': 0.74734, 'continue_training': True}
step: 26676 steps_per_second: 1.05 {'train_loss': 30.180855, 'train_accuracy': 0.7779777}
step: 27189 steps_per_second: 1.08 {'train_loss': 29.918842, 'train_accuracy': 0.7823356}
step: 27702 steps_per_second: 1.08 {'train_loss': 29.700476, 'train_accuracy': 0.78678167}
step: 28215 steps_per_second: 1.08 {'train_loss': 29.481922, 'train_accuracy': 0.79037726}
:::MLL 1679257255.034 eval_start: {"value": null, "metadata": {"epoch_num": 55, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 20:20:55.034361 140547769902912 mlp_log.py:80] :::MLL 1679257255.034 eval_start: {"value": null, "metadata": {"epoch_num": 55, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679257265.511 eval_stop: {"value": null, "metadata": {"epoch_num": 55, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 20:21:05.511004 140547769902912 mlp_log.py:80] :::MLL 1679257265.511 eval_stop: {"value": null, "metadata": {"epoch_num": 55, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679257265.518 eval_accuracy: {"value": 0.7565600275993347, "metadata": {"epoch_num": 55, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 20:21:05.518299 140547769902912 mlp_log.py:80] :::MLL 1679257265.518 eval_accuracy: {"value": 0.7565600275993347, "metadata": {"epoch_num": 55, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679257265.519 block_stop: {"value": null, "metadata": {"first_epoch_num": 52, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 20:21:05.519329 140547769902912 mlp_log.py:80] :::MLL 1679257265.519 block_stop: {"value": null, "metadata": {"first_epoch_num": 52, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679257265.520 block_start: {"value": null, "metadata": {"first_epoch_num": 56, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 20:21:05.520274 140547769902912 mlp_log.py:80] :::MLL 1679257265.520 block_start: {"value": null, "metadata": {"first_epoch_num": 56, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 20:21:05.530137 140547769902912 controller.py:220] step: 28215 evaluation metric: {'test_loss': 0.23033953, 'test_accuracy': 0.75656, 'continue_training': True}
I0319 20:21:05.530332 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 28215 of 36423
I0319 20:29:02.549445 140547769902912 keras_utils.py:120] TimeHistory: 2684.41 examples/second between steps 28215 and 28728
I0319 20:29:02.553695 140547769902912 controller.py:220] step: 28728 steps_per_second: 1.05 {'train_loss': 29.281693, 'train_accuracy': 0.7941939}
I0319 20:29:02.553875 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 28728 of 36423
I0319 20:36:59.701035 140547769902912 keras_utils.py:120] TimeHistory: 2683.68 examples/second between steps 28728 and 29241
I0319 20:36:59.705335 140547769902912 controller.py:220] step: 29241 steps_per_second: 1.08 {'train_loss': 29.111845, 'train_accuracy': 0.79688674}
I0319 20:36:59.705515 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 29241 of 36423
I0319 20:44:56.506052 140547769902912 keras_utils.py:120] TimeHistory: 2685.63 examples/second between steps 29241 and 29754
I0319 20:44:56.510352 140547769902912 controller.py:220] step: 29754 steps_per_second: 1.08 {'train_loss': 28.945818, 'train_accuracy': 0.8003339}
I0319 20:44:56.510533 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 29754 of 36423
I0319 20:52:52.982735 140547769902912 keras_utils.py:120] TimeHistory: 2687.48 examples/second between steps 29754 and 30267
I0319 20:52:52.987001 140547769902912 controller.py:220] step: 30267 steps_per_second: 1.08 {'train_loss': 28.79324, 'train_accuracy': 0.80263704}
I0319 20:52:52.987169 140547769902912 controller.py:185] Start evaluation at step: 30267
step: 28215 evaluation metric: {'test_loss': 0.23033953, 'test_accuracy': 0.75656, 'continue_training': True}
step: 28728 steps_per_second: 1.05 {'train_loss': 29.281693, 'train_accuracy': 0.7941939}
step: 29241 steps_per_second: 1.08 {'train_loss': 29.111845, 'train_accuracy': 0.79688674}
step: 29754 steps_per_second: 1.08 {'train_loss': 28.945818, 'train_accuracy': 0.8003339}
step: 30267 steps_per_second: 1.08 {'train_loss': 28.79324, 'train_accuracy': 0.80263704}
:::MLL 1679259173.484 eval_start: {"value": null, "metadata": {"epoch_num": 59, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 20:52:53.483794 140547769902912 mlp_log.py:80] :::MLL 1679259173.484 eval_start: {"value": null, "metadata": {"epoch_num": 59, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679259184.442 eval_stop: {"value": null, "metadata": {"epoch_num": 59, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 20:53:04.441937 140547769902912 mlp_log.py:80] :::MLL 1679259184.442 eval_stop: {"value": null, "metadata": {"epoch_num": 59, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679259184.452 eval_accuracy: {"value": 0.7594199776649475, "metadata": {"epoch_num": 59, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 20:53:04.452280 140547769902912 mlp_log.py:80] :::MLL 1679259184.452 eval_accuracy: {"value": 0.7594199776649475, "metadata": {"epoch_num": 59, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679259184.453 block_stop: {"value": null, "metadata": {"first_epoch_num": 56, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 20:53:04.453428 140547769902912 mlp_log.py:80] :::MLL 1679259184.453 block_stop: {"value": null, "metadata": {"first_epoch_num": 56, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 20:53:04.463417 140547769902912 controller.py:220] step: 30267 evaluation metric: {'test_loss': 0.22827734, 'test_accuracy': 0.75942, 'continue_training': False}
step: 30267 evaluation metric: {'test_loss': 0.22827734, 'test_accuracy': 0.75942, 'continue_training': False}
:::MLL 1679259184.464 run_stop: {"value": null, "metadata": {"status": "success", "lineno": 279, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 20:53:04.464306 140547769902912 mlp_log.py:80] :::MLL 1679259184.464 run_stop: {"value": null, "metadata": {"status": "success", "lineno": 279, "file": "./resnet_ctl_imagenet_main.py"}}
:::MLL 1679259184.465 run_final: {"value": null, "metadata": {"lineno": 281, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 20:53:04.465044 140547769902912 mlp_log.py:80] :::MLL 1679259184.465 run_final: {"value": null, "metadata": {"lineno": 281, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 20:53:04.484413 140547769902912 resnet_ctl_imagenet_main.py:298] Run stats:
{'eval_loss': 0.22827734, 'eval_acc': 0.75942, 'train_loss': 28.79324, 'train_acc': 0.80263704, 'step_timestamp_log': ['BatchTimestamp<batch_index: 0, timestamp: 1679230834.281045>', 'BatchTimestamp<batch_index: 513, timestamp: 1679231312.765514>', 'BatchTimestamp<batch_index: 1026, timestamp: 1679231790.4766295>', 'BatchTimestamp<batch_index: 1539, timestamp: 1679232268.0622654>', 'BatchTimestamp<batch_index: 2052, timestamp: 1679232778.5840638>', 'BatchTimestamp<batch_index: 2565, timestamp: 1679233256.8333852>', 'BatchTimestamp<batch_index: 3078, timestamp: 1679233735.232925>', 'BatchTimestamp<batch_index: 3591, timestamp: 1679234213.5742714>', 'BatchTimestamp<batch_index: 4104, timestamp: 1679234703.2010417>', 'BatchTimestamp<batch_index: 4617, timestamp: 1679235181.7035873>', 'BatchTimestamp<batch_index: 5130, timestamp: 1679235658.7568266>', 'BatchTimestamp<batch_index: 5643, timestamp: 1679236136.837958>', 'BatchTimestamp<batch_index: 6156, timestamp: 1679236625.7255764>', 'BatchTimestamp<batch_index: 6669, timestamp: 1679237103.411414>', 'BatchTimestamp<batch_index: 7182, timestamp: 1679237582.1563838>', 'BatchTimestamp<batch_index: 7695, timestamp: 1679238060.5108216>', 'BatchTimestamp<batch_index: 8208, timestamp: 1679238549.9029462>', 'BatchTimestamp<batch_index: 8721, timestamp: 1679239028.3283317>', 'BatchTimestamp<batch_index: 9234, timestamp: 1679239506.558369>', 'BatchTimestamp<batch_index: 9747, timestamp: 1679239984.4382937>', 'BatchTimestamp<batch_index: 10260, timestamp: 1679240474.3921533>', 'BatchTimestamp<batch_index: 10773, timestamp: 1679240951.96138>', 'BatchTimestamp<batch_index: 11286, timestamp: 1679241429.1646736>', 'BatchTimestamp<batch_index: 11799, timestamp: 1679241906.888098>', 'BatchTimestamp<batch_index: 12312, timestamp: 1679242395.0172863>', 'BatchTimestamp<batch_index: 12825, timestamp: 1679242871.8736327>', 'BatchTimestamp<batch_index: 13338, timestamp: 1679243348.7574499>', 'BatchTimestamp<batch_index: 13851, timestamp: 1679243825.9569237>', 'BatchTimestamp<batch_index: 14364, timestamp: 1679244314.2721043>', 'BatchTimestamp<batch_index: 14877, timestamp: 1679244792.2427475>', 'BatchTimestamp<batch_index: 15390, timestamp: 1679245268.9251325>', 'BatchTimestamp<batch_index: 15903, timestamp: 1679245745.4601164>', 'BatchTimestamp<batch_index: 16416, timestamp: 1679246232.7534444>', 'BatchTimestamp<batch_index: 16929, timestamp: 1679246708.728656>', 'BatchTimestamp<batch_index: 17442, timestamp: 1679247185.7591805>', 'BatchTimestamp<batch_index: 17955, timestamp: 1679247662.4490402>', 'BatchTimestamp<batch_index: 18468, timestamp: 1679248150.4474506>', 'BatchTimestamp<batch_index: 18981, timestamp: 1679248627.1151292>', 'BatchTimestamp<batch_index: 19494, timestamp: 1679249103.5206127>', 'BatchTimestamp<batch_index: 20007, timestamp: 1679249580.1458325>', 'BatchTimestamp<batch_index: 20520, timestamp: 1679250067.8068252>', 'BatchTimestamp<batch_index: 21033, timestamp: 1679250544.0597591>', 'BatchTimestamp<batch_index: 21546, timestamp: 1679251020.501157>', 'BatchTimestamp<batch_index: 22059, timestamp: 1679251497.4479887>', 'BatchTimestamp<batch_index: 22572, timestamp: 1679251985.6137266>', 'BatchTimestamp<batch_index: 23085, timestamp: 1679252463.3410485>', 'BatchTimestamp<batch_index: 23598, timestamp: 1679252940.9701052>', 'BatchTimestamp<batch_index: 24111, timestamp: 1679253417.9039862>', 'BatchTimestamp<batch_index: 24624, timestamp: 1679253905.8428304>', 'BatchTimestamp<batch_index: 25137, timestamp: 1679254381.919039>', 'BatchTimestamp<batch_index: 25650, timestamp: 1679254858.9284008>', 'BatchTimestamp<batch_index: 26163, timestamp: 1679255335.0505683>', 'BatchTimestamp<batch_index: 26676, timestamp: 1679255824.8272724>', 'BatchTimestamp<batch_index: 27189, timestamp: 1679256301.335169>', 'BatchTimestamp<batch_index: 27702, timestamp: 1679256778.1833446>', 'BatchTimestamp<batch_index: 28215, timestamp: 1679257254.531205>', 'BatchTimestamp<batch_index: 28728, timestamp: 1679257742.5492072>', 'BatchTimestamp<batch_index: 29241, timestamp: 1679258219.7008102>', 'BatchTimestamp<batch_index: 29754, timestamp: 1679258696.5058227>', 'BatchTimestamp<batch_index: 30267, timestamp: 1679259172.9825177>'], 'train_finish_time': 1679259184.4644349, 'avg_exp_per_second': 2683.1639506599217}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment