Unverified Commit f788046c authored by Toby Boyd's avatar Toby Boyd Committed by GitHub
Browse files

Add model_dir to all tests to avoid the "resource not found" error. (#6143)

* Fix failing test benchmark_graph_1_gpu_no_dist_strat

- The failure only occurs when all 1_gpu tests are run
together; the error is:
tensorflow.python.framework.errors_impl.NotFoundError:
Resource localhost/logdir:/tmp/cifar10_model/
N10tensorflow22SummaryWriterInterfaceE does not exist.
[Op:WriteScalarSummary] name: epoch_loss/

Another fix might be to generate a different model_dir
in the core code, but that has other drawbacks, such as
restarting from the checkpoint.

* Set model_dir for all tests.
parent 25b10ebe
......@@ -51,7 +51,7 @@ class Resnet56KerasAccuracy(keras_benchmark.KerasBenchmark):
FLAGS.data_dir = DATA_DIR
FLAGS.batch_size = 128
FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir('keras_resnet56_1_gpu')
FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu')
FLAGS.dtype = 'fp32'
self._run_and_report_benchmark()
......@@ -62,7 +62,7 @@ class Resnet56KerasAccuracy(keras_benchmark.KerasBenchmark):
FLAGS.data_dir = DATA_DIR
FLAGS.batch_size = 128
FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir('keras_resnet56_eager_1_gpu')
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True
self._run_and_report_benchmark()
......@@ -74,7 +74,7 @@ class Resnet56KerasAccuracy(keras_benchmark.KerasBenchmark):
FLAGS.data_dir = DATA_DIR
FLAGS.batch_size = 128
FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir('keras_resnet56_eager_2_gpu')
FLAGS.model_dir = self._get_model_dir('benchmark_2_gpu')
FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True
self._run_and_report_benchmark()
......@@ -86,7 +86,7 @@ class Resnet56KerasAccuracy(keras_benchmark.KerasBenchmark):
FLAGS.data_dir = DATA_DIR
FLAGS.batch_size = 128
FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir('keras_resnet56_2_gpu')
FLAGS.model_dir = self._get_model_dir('benchmark_graph_2_gpu')
FLAGS.dtype = 'fp32'
self._run_and_report_benchmark()
......@@ -98,7 +98,7 @@ class Resnet56KerasAccuracy(keras_benchmark.KerasBenchmark):
FLAGS.data_dir = DATA_DIR
FLAGS.batch_size = 128
FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir('keras_resnet56_no_dist_strat_1_gpu')
FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu_no_dist_strat')
FLAGS.dtype = 'fp32'
self._run_and_report_benchmark()
......@@ -145,6 +145,7 @@ class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.turn_off_distribution_strategy = True
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_dist_strat')
FLAGS.batch_size = 128
self._run_and_report_benchmark()
......@@ -153,6 +154,7 @@ class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.num_gpus = 1
FLAGS.enable_eager = False
FLAGS.turn_off_distribution_strategy = True
FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu_no_dist_strat')
FLAGS.batch_size = 128
self._run_and_report_benchmark()
......@@ -161,6 +163,7 @@ class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.turn_off_distribution_strategy = False
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
FLAGS.batch_size = 128
self._run_and_report_benchmark()
......@@ -169,6 +172,7 @@ class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.num_gpus = 1
FLAGS.enable_eager = False
FLAGS.turn_off_distribution_strategy = False
FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu')
FLAGS.batch_size = 128
self._run_and_report_benchmark()
......@@ -177,6 +181,7 @@ class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.num_gpus = 2
FLAGS.enable_eager = True
FLAGS.turn_off_distribution_strategy = False
FLAGS.model_dir = self._get_model_dir('benchmark_2_gpu')
FLAGS.batch_size = 128 * 2 # 2 GPUs
self._run_and_report_benchmark()
......@@ -185,6 +190,7 @@ class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.num_gpus = 2
FLAGS.enable_eager = False
FLAGS.turn_off_distribution_strategy = False
FLAGS.model_dir = self._get_model_dir('benchmark_graph_2_gpu')
FLAGS.batch_size = 128 * 2 # 2 GPUs
self._run_and_report_benchmark()
......
......@@ -50,7 +50,7 @@ class Resnet50KerasAccuracy(keras_benchmark.KerasBenchmark):
FLAGS.data_dir = DATA_DIR
FLAGS.batch_size = 128 * 8
FLAGS.train_epochs = 90
FLAGS.model_dir = self._get_model_dir('keras_resnet50_8_gpu')
FLAGS.model_dir = self._get_model_dir('benchmark_graph_8_gpu')
FLAGS.dtype = 'fp32'
self._run_and_report_benchmark()
......@@ -61,7 +61,7 @@ class Resnet50KerasAccuracy(keras_benchmark.KerasBenchmark):
FLAGS.data_dir = DATA_DIR
FLAGS.batch_size = 128 * 8
FLAGS.train_epochs = 90
FLAGS.model_dir = self._get_model_dir('keras_resnet50_eager_8_gpu')
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True
self._run_and_report_benchmark()
......@@ -113,6 +113,7 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.turn_off_distribution_strategy = True
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_dist_strat')
FLAGS.batch_size = 128
self._run_and_report_benchmark()
......@@ -122,6 +123,7 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.num_gpus = 1
FLAGS.enable_eager = False
FLAGS.turn_off_distribution_strategy = True
FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu_no_dist_strat')
FLAGS.batch_size = 128
self._run_and_report_benchmark()
......@@ -131,6 +133,7 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.turn_off_distribution_strategy = False
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
FLAGS.batch_size = 128
self._run_and_report_benchmark()
......@@ -140,6 +143,7 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.num_gpus = 1
FLAGS.enable_eager = False
FLAGS.turn_off_distribution_strategy = False
FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu')
FLAGS.batch_size = 128
self._run_and_report_benchmark()
......@@ -149,6 +153,7 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.num_gpus = 8
FLAGS.enable_eager = True
FLAGS.turn_off_distribution_strategy = False
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
FLAGS.batch_size = 128 * 8 # 8 GPUs
self._run_and_report_benchmark()
......@@ -158,6 +163,7 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.num_gpus = 8
FLAGS.enable_eager = False
FLAGS.turn_off_distribution_strategy = False
FLAGS.model_dir = self._get_model_dir('benchmark_graph_8_gpu')
FLAGS.batch_size = 128 * 8 # 8 GPUs
self._run_and_report_benchmark()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment