"src/git@developer.sourcefind.cn:OpenDAS/tilelang.git" did not exist on "b66a93c5dd10c93a1aa788d018bd999f6e987985"
Commit e286da17 authored by qianyj's avatar qianyj
Browse files

Update TensorFlow test method

parent f270c43a
# 简介 # 简介
该测试用例可用于ResNet50/Vgg16等网络的性能测试及精度验证。 该测试用例可用于ResNet50/Vgg16等网络的性能测试及精度验证。
# 单卡测试 (单精度) # 单卡测试 (单精度)
...@@ -19,7 +19,14 @@ ...@@ -19,7 +19,14 @@
## 运行 ## 运行
mpirun -np 4 --hostfile hostfile --bind-to none scripts-run/single_process.sh mpirun -np 4 --hostfile hostfile -mca btl self,tcp --bind-to none scripts-run/single_process.sh
# 多卡测试 (混合精度)
## 运行
先将 scripts-run/single_process.sh 中的参数修改为 --use_fp16=True,然后执行:
mpirun -np 4 --hostfile hostfile -mca btl self,tcp --bind-to none scripts-run/single_process.sh
# 参考资料 # 参考资料
[https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks) [https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks)
......
...@@ -1555,7 +1555,7 @@ class BenchmarkCNN(object): ...@@ -1555,7 +1555,7 @@ class BenchmarkCNN(object):
n_epochs = self.params.eval_during_training_every_n_epochs n_epochs = self.params.eval_during_training_every_n_epochs
self.eval_during_training_at_specified_steps = { self.eval_during_training_at_specified_steps = {
(int(e * num_train_examples_per_epoch + self.batch_size - 1) // (int(e * num_train_examples_per_epoch + self.batch_size - 1) //
self.batch_size) (self.batch_size * self.num_workers ))
for e in np.arange(n_epochs, self.num_epochs, n_epochs)} for e in np.arange(n_epochs, self.num_epochs, n_epochs)}
if self.params.eval_during_training_at_specified_steps: if self.params.eval_during_training_at_specified_steps:
...@@ -1577,7 +1577,7 @@ class BenchmarkCNN(object): ...@@ -1577,7 +1577,7 @@ class BenchmarkCNN(object):
mlperf.logger.log(key=mlperf.tags.EVAL_EPOCH_OFFSET, value=offset) mlperf.logger.log(key=mlperf.tags.EVAL_EPOCH_OFFSET, value=offset)
self.eval_during_training_at_specified_steps = { self.eval_during_training_at_specified_steps = {
(int(e * num_train_examples_per_epoch + self.batch_size - 1) // (int(e * num_train_examples_per_epoch + self.batch_size - 1) //
self.batch_size) (self.batch_size * self.num_workers ))
for e in n_epochs} for e in n_epochs}
except ValueError: except ValueError:
raise ValueError('Param eval_during_training_at_specified_epochs value ' raise ValueError('Param eval_during_training_at_specified_epochs value '
......
# 简介 # 简介
该测试用例用于TensorFlow分类模型性能测试,使用的数据集是imagenet。 该测试用例用于TensorFlow分类模型性能测试,使用的数据集是imagenet。
* 该脚本支持horovod等分布式通信库方式 * 该脚本支持horovod等分布式通信库方式
# 运行 # 运行
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
## 分布式多卡 ## 分布式多卡
mpirun -np ${num_gpu} --hostfile hostfile --bind-to none scripts-run/single_process.sh mpirun -np ${num_gpu} --hostfile hostfile -mca btl self,tcp --bind-to none scripts-run/single_process.sh
hostfile格式参考: hostfile格式参考:
......
...@@ -1555,7 +1555,7 @@ class BenchmarkCNN(object): ...@@ -1555,7 +1555,7 @@ class BenchmarkCNN(object):
n_epochs = self.params.eval_during_training_every_n_epochs n_epochs = self.params.eval_during_training_every_n_epochs
self.eval_during_training_at_specified_steps = { self.eval_during_training_at_specified_steps = {
(int(e * num_train_examples_per_epoch + self.batch_size - 1) // (int(e * num_train_examples_per_epoch + self.batch_size - 1) //
self.batch_size) (self.batch_size * self.num_workers ))
for e in np.arange(n_epochs, self.num_epochs, n_epochs)} for e in np.arange(n_epochs, self.num_epochs, n_epochs)}
if self.params.eval_during_training_at_specified_steps: if self.params.eval_during_training_at_specified_steps:
...@@ -1577,7 +1577,7 @@ class BenchmarkCNN(object): ...@@ -1577,7 +1577,7 @@ class BenchmarkCNN(object):
mlperf.logger.log(key=mlperf.tags.EVAL_EPOCH_OFFSET, value=offset) mlperf.logger.log(key=mlperf.tags.EVAL_EPOCH_OFFSET, value=offset)
self.eval_during_training_at_specified_steps = { self.eval_during_training_at_specified_steps = {
(int(e * num_train_examples_per_epoch + self.batch_size - 1) // (int(e * num_train_examples_per_epoch + self.batch_size - 1) //
self.batch_size) (self.batch_size * self.num_workers ))
for e in n_epochs} for e in n_epochs}
except ValueError: except ValueError:
raise ValueError('Param eval_during_training_at_specified_epochs value ' raise ValueError('Param eval_during_training_at_specified_epochs value '
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment