Unverified Commit 76066b6d authored by Yuting Jiang's avatar Yuting Jiang Committed by GitHub
Browse files

Benchmark: Model benchmark - add option to exclude data copy time in model benchmarks (#734)

**Description**
add option to exclude data copy time in model benchmarks.

**Major Revision**
- add an option --exclude_copy_time
- move the timing start point to after the data copy finishes
parent ad8e0143
......@@ -147,6 +147,13 @@ def add_parser_arguments(self):
help='Real-time log every n steps.',
)
self._parser.add_argument(
'--exclude_copy_time',
action='store_true',
default=False,
help='Exclude GPU data copy time from measured time.',
)
@abstractmethod
def _judge_gpu_availability(self):
"""Judge GPUs' availability according to arguments and running environment."""
......
......@@ -174,6 +174,8 @@ def _train_step(self, precision):
start = self._timer()
if self._gpu_available:
sample = sample.cuda()
if self._args.exclude_copy_time:
start = self._timer()
self._optimizer.zero_grad()
if self._fp8_recipe is not None:
with te.fp8_autocast(enabled=True, fp8_recipe=self._fp8_recipe):
......@@ -211,6 +213,8 @@ def _inference_step(self, precision):
start = self._timer()
if self._gpu_available:
sample = sample.cuda()
if self._args.exclude_copy_time:
start = self._timer()
if self._fp8_recipe is not None:
with te.fp8_autocast(enabled=True, fp8_recipe=self._fp8_recipe):
self._model(sample)
......
......@@ -104,6 +104,8 @@ def _train_step(self, precision):
start = self._timer()
if self._gpu_available:
sample = sample.cuda()
if self._args.exclude_copy_time:
start = self._timer()
self._optimizer.zero_grad()
output = self._model(sample)
loss = self._loss_fn(output, self._target)
......@@ -138,6 +140,8 @@ def _inference_step(self, precision):
start = self._timer()
if self._gpu_available:
sample = sample.cuda()
if self._args.exclude_copy_time:
start = self._timer()
self._model(sample)
end = self._timer()
curr_step += 1
......
......@@ -168,6 +168,8 @@ def _train_step(self, precision):
start = self._timer()
if self._gpu_available:
sample = sample.cuda()
if self._args.exclude_copy_time:
start = self._timer()
self._optimizer.zero_grad()
if self._fp8_recipe is not None:
with te.fp8_autocast(enabled=True, fp8_recipe=self._fp8_recipe):
......@@ -205,6 +207,8 @@ def _inference_step(self, precision):
start = self._timer()
if self._gpu_available:
sample = sample.cuda()
if self._args.exclude_copy_time:
start = self._timer()
if self._fp8_recipe is not None:
with te.fp8_autocast(enabled=True, fp8_recipe=self._fp8_recipe):
self._model(sample)
......
......@@ -188,6 +188,8 @@ def _train_step(self, precision):
start = self._timer()
if self._gpu_available:
sample = sample.cuda()
if self._args.exclude_copy_time:
start = self._timer()
self._optimizer.zero_grad()
if self._fp8_recipe is not None:
with te.fp8_autocast(enabled=True, fp8_recipe=self._fp8_recipe):
......@@ -225,6 +227,8 @@ def _inference_step(self, precision):
start = self._timer()
if self._gpu_available:
sample = sample.cuda()
if self._args.exclude_copy_time:
start = self._timer()
if self._fp8_recipe is not None:
with te.fp8_autocast(enabled=True, fp8_recipe=self._fp8_recipe):
self._model(sample)
......
......@@ -144,6 +144,8 @@ def _train_step(self, precision):
start = self._timer()
if self._gpu_available:
sample = sample.cuda()
if self._args.exclude_copy_time:
start = self._timer()
self._optimizer.zero_grad()
output = self._model(sample)
loss = self._loss_fn(output, self._target)
......@@ -178,6 +180,8 @@ def _inference_step(self, precision):
start = self._timer()
if self._gpu_available:
sample = sample.cuda()
if self._args.exclude_copy_time:
start = self._timer()
self._model(sample)
end = self._timer()
curr_step += 1
......
......@@ -202,6 +202,8 @@ def _train_step(self, precision):
start = self._timer()
if self._gpu_available:
sample = sample.cuda()
if self._args.exclude_copy_time:
start = self._timer()
self._optimizer.zero_grad()
if self._fp8_recipe is not None:
with te.fp8_autocast(enabled=True, fp8_recipe=self._fp8_recipe):
......@@ -239,6 +241,8 @@ def _inference_step(self, precision):
start = self._timer()
if self._gpu_available:
sample = sample.cuda()
if self._args.exclude_copy_time:
start = self._timer()
if self._fp8_recipe is not None:
with te.fp8_autocast(enabled=True, fp8_recipe=self._fp8_recipe):
self._model(sample)
......
......@@ -157,6 +157,7 @@ def test_arguments_related_interfaces():
Distributed implementations. E.g. ddp mirrored
multiworkermirrored parameterserver horovod.
--duration int The elapsed time of benchmark in seconds.
--exclude_copy_time Exclude GPU data copy time from measured time.
--force_fp32 Enable option to use full float32 precision.
--hidden_size int Hidden size.
--log_flushing Real-time log flushing.
......@@ -197,6 +198,7 @@ def test_preprocess():
Distributed implementations. E.g. ddp mirrored
multiworkermirrored parameterserver horovod.
--duration int The elapsed time of benchmark in seconds.
--exclude_copy_time Exclude GPU data copy time from measured time.
--force_fp32 Enable option to use full float32 precision.
--hidden_size int Hidden size.
--log_flushing Real-time log flushing.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment