Unverified Commit f788046c authored by Toby Boyd's avatar Toby Boyd Committed by GitHub
Browse files

Add model_dir to all tests to avoid "resource not found error". (#6143)

* Fix failing test benchmark_graph_1_gpu_no_dist_strat

- The failure only occurs when all of the 1_gpu tests are run
together; it surfaces as the following error:
tensorflow.python.framework.errors_impl.NotFoundError:
Resource localhost/logdir:/tmp/cifar10_model/
N10tensorflow22SummaryWriterInterfaceE does not exist.
[Op:WriteScalarSummary] name: epoch_loss/

Another fix might be to generate a different model_dir
in the core code, but that has other drawbacks, such as
its effect on restarting from the checkpoint.

* Set model_dir for all tests.
parent 25b10ebe
...@@ -51,7 +51,7 @@ class Resnet56KerasAccuracy(keras_benchmark.KerasBenchmark): ...@@ -51,7 +51,7 @@ class Resnet56KerasAccuracy(keras_benchmark.KerasBenchmark):
FLAGS.data_dir = DATA_DIR FLAGS.data_dir = DATA_DIR
FLAGS.batch_size = 128 FLAGS.batch_size = 128
FLAGS.train_epochs = 182 FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir('keras_resnet56_1_gpu') FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu')
FLAGS.dtype = 'fp32' FLAGS.dtype = 'fp32'
self._run_and_report_benchmark() self._run_and_report_benchmark()
...@@ -62,7 +62,7 @@ class Resnet56KerasAccuracy(keras_benchmark.KerasBenchmark): ...@@ -62,7 +62,7 @@ class Resnet56KerasAccuracy(keras_benchmark.KerasBenchmark):
FLAGS.data_dir = DATA_DIR FLAGS.data_dir = DATA_DIR
FLAGS.batch_size = 128 FLAGS.batch_size = 128
FLAGS.train_epochs = 182 FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir('keras_resnet56_eager_1_gpu') FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
FLAGS.dtype = 'fp32' FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True FLAGS.enable_eager = True
self._run_and_report_benchmark() self._run_and_report_benchmark()
...@@ -74,7 +74,7 @@ class Resnet56KerasAccuracy(keras_benchmark.KerasBenchmark): ...@@ -74,7 +74,7 @@ class Resnet56KerasAccuracy(keras_benchmark.KerasBenchmark):
FLAGS.data_dir = DATA_DIR FLAGS.data_dir = DATA_DIR
FLAGS.batch_size = 128 FLAGS.batch_size = 128
FLAGS.train_epochs = 182 FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir('keras_resnet56_eager_2_gpu') FLAGS.model_dir = self._get_model_dir('benchmark_2_gpu')
FLAGS.dtype = 'fp32' FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True FLAGS.enable_eager = True
self._run_and_report_benchmark() self._run_and_report_benchmark()
...@@ -86,7 +86,7 @@ class Resnet56KerasAccuracy(keras_benchmark.KerasBenchmark): ...@@ -86,7 +86,7 @@ class Resnet56KerasAccuracy(keras_benchmark.KerasBenchmark):
FLAGS.data_dir = DATA_DIR FLAGS.data_dir = DATA_DIR
FLAGS.batch_size = 128 FLAGS.batch_size = 128
FLAGS.train_epochs = 182 FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir('keras_resnet56_2_gpu') FLAGS.model_dir = self._get_model_dir('benchmark_graph_2_gpu')
FLAGS.dtype = 'fp32' FLAGS.dtype = 'fp32'
self._run_and_report_benchmark() self._run_and_report_benchmark()
...@@ -98,7 +98,7 @@ class Resnet56KerasAccuracy(keras_benchmark.KerasBenchmark): ...@@ -98,7 +98,7 @@ class Resnet56KerasAccuracy(keras_benchmark.KerasBenchmark):
FLAGS.data_dir = DATA_DIR FLAGS.data_dir = DATA_DIR
FLAGS.batch_size = 128 FLAGS.batch_size = 128
FLAGS.train_epochs = 182 FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir('keras_resnet56_no_dist_strat_1_gpu') FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu_no_dist_strat')
FLAGS.dtype = 'fp32' FLAGS.dtype = 'fp32'
self._run_and_report_benchmark() self._run_and_report_benchmark()
...@@ -145,6 +145,7 @@ class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark): ...@@ -145,6 +145,7 @@ class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.num_gpus = 1 FLAGS.num_gpus = 1
FLAGS.enable_eager = True FLAGS.enable_eager = True
FLAGS.turn_off_distribution_strategy = True FLAGS.turn_off_distribution_strategy = True
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_dist_strat')
FLAGS.batch_size = 128 FLAGS.batch_size = 128
self._run_and_report_benchmark() self._run_and_report_benchmark()
...@@ -153,6 +154,7 @@ class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark): ...@@ -153,6 +154,7 @@ class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.num_gpus = 1 FLAGS.num_gpus = 1
FLAGS.enable_eager = False FLAGS.enable_eager = False
FLAGS.turn_off_distribution_strategy = True FLAGS.turn_off_distribution_strategy = True
FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu_no_dist_strat')
FLAGS.batch_size = 128 FLAGS.batch_size = 128
self._run_and_report_benchmark() self._run_and_report_benchmark()
...@@ -161,6 +163,7 @@ class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark): ...@@ -161,6 +163,7 @@ class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.num_gpus = 1 FLAGS.num_gpus = 1
FLAGS.enable_eager = True FLAGS.enable_eager = True
FLAGS.turn_off_distribution_strategy = False FLAGS.turn_off_distribution_strategy = False
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
FLAGS.batch_size = 128 FLAGS.batch_size = 128
self._run_and_report_benchmark() self._run_and_report_benchmark()
...@@ -169,6 +172,7 @@ class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark): ...@@ -169,6 +172,7 @@ class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.num_gpus = 1 FLAGS.num_gpus = 1
FLAGS.enable_eager = False FLAGS.enable_eager = False
FLAGS.turn_off_distribution_strategy = False FLAGS.turn_off_distribution_strategy = False
FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu')
FLAGS.batch_size = 128 FLAGS.batch_size = 128
self._run_and_report_benchmark() self._run_and_report_benchmark()
...@@ -177,6 +181,7 @@ class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark): ...@@ -177,6 +181,7 @@ class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.num_gpus = 2 FLAGS.num_gpus = 2
FLAGS.enable_eager = True FLAGS.enable_eager = True
FLAGS.turn_off_distribution_strategy = False FLAGS.turn_off_distribution_strategy = False
FLAGS.model_dir = self._get_model_dir('benchmark_2_gpu')
FLAGS.batch_size = 128 * 2 # 2 GPUs FLAGS.batch_size = 128 * 2 # 2 GPUs
self._run_and_report_benchmark() self._run_and_report_benchmark()
...@@ -185,6 +190,7 @@ class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark): ...@@ -185,6 +190,7 @@ class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.num_gpus = 2 FLAGS.num_gpus = 2
FLAGS.enable_eager = False FLAGS.enable_eager = False
FLAGS.turn_off_distribution_strategy = False FLAGS.turn_off_distribution_strategy = False
FLAGS.model_dir = self._get_model_dir('benchmark_graph_2_gpu')
FLAGS.batch_size = 128 * 2 # 2 GPUs FLAGS.batch_size = 128 * 2 # 2 GPUs
self._run_and_report_benchmark() self._run_and_report_benchmark()
......
...@@ -50,7 +50,7 @@ class Resnet50KerasAccuracy(keras_benchmark.KerasBenchmark): ...@@ -50,7 +50,7 @@ class Resnet50KerasAccuracy(keras_benchmark.KerasBenchmark):
FLAGS.data_dir = DATA_DIR FLAGS.data_dir = DATA_DIR
FLAGS.batch_size = 128 * 8 FLAGS.batch_size = 128 * 8
FLAGS.train_epochs = 90 FLAGS.train_epochs = 90
FLAGS.model_dir = self._get_model_dir('keras_resnet50_8_gpu') FLAGS.model_dir = self._get_model_dir('benchmark_graph_8_gpu')
FLAGS.dtype = 'fp32' FLAGS.dtype = 'fp32'
self._run_and_report_benchmark() self._run_and_report_benchmark()
...@@ -61,7 +61,7 @@ class Resnet50KerasAccuracy(keras_benchmark.KerasBenchmark): ...@@ -61,7 +61,7 @@ class Resnet50KerasAccuracy(keras_benchmark.KerasBenchmark):
FLAGS.data_dir = DATA_DIR FLAGS.data_dir = DATA_DIR
FLAGS.batch_size = 128 * 8 FLAGS.batch_size = 128 * 8
FLAGS.train_epochs = 90 FLAGS.train_epochs = 90
FLAGS.model_dir = self._get_model_dir('keras_resnet50_eager_8_gpu') FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
FLAGS.dtype = 'fp32' FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True FLAGS.enable_eager = True
self._run_and_report_benchmark() self._run_and_report_benchmark()
...@@ -113,6 +113,7 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark): ...@@ -113,6 +113,7 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.num_gpus = 1 FLAGS.num_gpus = 1
FLAGS.enable_eager = True FLAGS.enable_eager = True
FLAGS.turn_off_distribution_strategy = True FLAGS.turn_off_distribution_strategy = True
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_dist_strat')
FLAGS.batch_size = 128 FLAGS.batch_size = 128
self._run_and_report_benchmark() self._run_and_report_benchmark()
...@@ -122,6 +123,7 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark): ...@@ -122,6 +123,7 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.num_gpus = 1 FLAGS.num_gpus = 1
FLAGS.enable_eager = False FLAGS.enable_eager = False
FLAGS.turn_off_distribution_strategy = True FLAGS.turn_off_distribution_strategy = True
FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu_no_dist_strat')
FLAGS.batch_size = 128 FLAGS.batch_size = 128
self._run_and_report_benchmark() self._run_and_report_benchmark()
...@@ -131,6 +133,7 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark): ...@@ -131,6 +133,7 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.num_gpus = 1 FLAGS.num_gpus = 1
FLAGS.enable_eager = True FLAGS.enable_eager = True
FLAGS.turn_off_distribution_strategy = False FLAGS.turn_off_distribution_strategy = False
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
FLAGS.batch_size = 128 FLAGS.batch_size = 128
self._run_and_report_benchmark() self._run_and_report_benchmark()
...@@ -140,6 +143,7 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark): ...@@ -140,6 +143,7 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.num_gpus = 1 FLAGS.num_gpus = 1
FLAGS.enable_eager = False FLAGS.enable_eager = False
FLAGS.turn_off_distribution_strategy = False FLAGS.turn_off_distribution_strategy = False
FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu')
FLAGS.batch_size = 128 FLAGS.batch_size = 128
self._run_and_report_benchmark() self._run_and_report_benchmark()
...@@ -149,6 +153,7 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark): ...@@ -149,6 +153,7 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.num_gpus = 8 FLAGS.num_gpus = 8
FLAGS.enable_eager = True FLAGS.enable_eager = True
FLAGS.turn_off_distribution_strategy = False FLAGS.turn_off_distribution_strategy = False
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
FLAGS.batch_size = 128 * 8 # 8 GPUs FLAGS.batch_size = 128 * 8 # 8 GPUs
self._run_and_report_benchmark() self._run_and_report_benchmark()
...@@ -158,6 +163,7 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark): ...@@ -158,6 +163,7 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.num_gpus = 8 FLAGS.num_gpus = 8
FLAGS.enable_eager = False FLAGS.enable_eager = False
FLAGS.turn_off_distribution_strategy = False FLAGS.turn_off_distribution_strategy = False
FLAGS.model_dir = self._get_model_dir('benchmark_graph_8_gpu')
FLAGS.batch_size = 128 * 8 # 8 GPUs FLAGS.batch_size = 128 * 8 # 8 GPUs
self._run_and_report_benchmark() self._run_and_report_benchmark()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment