Benchmarks: Add Feature - Add option to use fp32 instead of tf32 (#213)

**Description** Add option `force_fp32` to use fp32 instead of tf32; the option only takes effect on Ampere or newer GPUs.

Benchmarks: Add Feature - Add option to use fp32 instead of tf32 (#213)
**Description** Add option `force_fp32` to use fp32 instead of tf32; the option only takes effect on Ampere or newer GPUs.
f9442456 · guoshzhao · GitHub · dfbd70b1 · f9442456 · f9442456
Unverified Commit f9442456 authored Sep 28, 2021 by guoshzhao Committed by GitHub Sep 28, 2021
4 changed files
--- a/superbench/benchmarks/model_benchmarks/model_base.py
+++ b/superbench/benchmarks/model_benchmarks/model_base.py
@@ -124,11 +124,27 @@ class ModelBenchmark(Benchmark):
            help='Enable option to pin memory in data loader.',
        )

+        self._parser.add_argument(
+            '--force_fp32',
+            action='store_true',
+            default=False,
+            help='Enable option to use full float32 precision.',
+        )
+
    @abstractmethod
    def _judge_gpu_availability(self):
        """Judge GPUs' availability according to arguments and running environment."""
        pass

+    @abstractmethod
+    def _set_force_fp32(self):
+        """Set the config that controls whether full float32 precision will be used.
+
+        On Ampere or newer GPUs, pytorch and tensorflow will use TF32 instead of FP32 by default.
+        We can disable TF32 execution by setting force_fp32 as True.
+        """
+        pass
+
    @abstractmethod
    def _init_distributed_setting(self):
        """Initialize the distributed library and bind the worker to GPU.
@@ -166,9 +182,10 @@ class ModelBenchmark(Benchmark):
            return False

        self._judge_gpu_availability()
+        self._set_force_fp32()
        logger.info(
-            'Model placement - model: {}, GPU availablility: {}, pin memory: {}.'.format(
-                self._name, self._gpu_available, self._args.pin_memory
+            'Model placement - model: {}, GPU availablility: {}, pin memory: {}, force fp32: {}.'.format(
+                self._name, self._gpu_available, self._args.pin_memory, self._args.force_fp32
            )
        )


--- a/superbench/benchmarks/model_benchmarks/pytorch_base.py
+++ b/superbench/benchmarks/model_benchmarks/pytorch_base.py
@@ -32,6 +32,15 @@ class PytorchBase(ModelBenchmark):
        """Judge GPUs' availability according to arguments and running environment."""
        self._gpu_available = not self._args.no_gpu and torch.cuda.is_available()

+    def _set_force_fp32(self):
+        """Set the config that controls whether full float32 precision will be used.
+
+        On Ampere or newer GPUs, pytorch and tensorflow will use TF32 instead of FP32 by default.
+        Setting force_fp32 as True disables TF32 execution, i.e. allow_tf32 is set to (not force_fp32).
+        """
+        torch.backends.cuda.matmul.allow_tf32 = not self._args.force_fp32
+        torch.backends.cudnn.allow_tf32 = not self._args.force_fp32
+
    def _init_distributed_setting(self):
        """Initialize the distributed library and bind the worker to GPU.


--- a/tests/benchmarks/model_benchmarks/test_model_base.py
+++ b/tests/benchmarks/model_benchmarks/test_model_base.py
@@ -42,6 +42,10 @@ class FakeModelBenchmark(ModelBenchmark):
        """Judge GPUs' availability according to arguments and running environment."""
        self._gpu_available = False

+    def _set_force_fp32(self):
+        """Set the config that controls whether full float32 precision will be used."""
+        pass
+
    def _init_distributed_setting(self):
        """Initialize the distributed library and bind the worker to GPU."""
        return True
@@ -161,6 +165,7 @@ def test_arguments_related_interfaces():
                        Distributed backends. E.g. nccl mpi gloo.
  --no_gpu              Disable GPU training.
  --pin_memory          Enable option to pin memory in data loader.
+  --force_fp32          Enable option to use full float32 precision.
  --hidden_size int     Hidden size.
  --seq_len int         Sequence length."""
    )
@@ -194,6 +199,7 @@ def test_preprocess():
                        Distributed backends. E.g. nccl mpi gloo.
  --no_gpu              Disable GPU training.
  --pin_memory          Enable option to pin memory in data loader.
+  --force_fp32          Enable option to use full float32 precision.
  --hidden_size int     Hidden size.
  --seq_len int         Sequence length."""
    )

--- a/tests/benchmarks/model_benchmarks/test_pytorch_base.py
+++ b/tests/benchmarks/model_benchmarks/test_pytorch_base.py
@@ -178,7 +178,7 @@ def test_pytorch_base():
    BenchmarkRegistry.register_benchmark('pytorch-mnist', PytorchMNIST)

    # Launch benchmark with --no_gpu for testing.
-    parameters = '--batch_size 32 --num_warmup 8 --num_steps 64 --model_action train inference --no_gpu'
+    parameters = '--batch_size 32 --num_warmup 8 --num_steps 64 --model_action train inference --no_gpu --force_fp32'
    benchmark = PytorchMNIST('pytorch-mnist', parameters=parameters)
    assert (benchmark)
    assert (benchmark._preprocess())
@@ -202,6 +202,9 @@ def test_pytorch_base():
    # Test _judge_gpu_availability().
    assert (benchmark._gpu_available is False)

+    # Test _set_force_fp32().
+    assert (benchmark._args.force_fp32 is True)
+
    # Test _init_distributed_setting().
    assert (benchmark._args.distributed_impl is None)
    assert (benchmark._args.distributed_backend is None)