Unverified Commit 76066b6d authored by Yuting Jiang's avatar Yuting Jiang Committed by GitHub
Browse files

Benchmark: Model benchmark - add option to exclude data copy time in model benchmarks (#734)

**Description**
add option to exclude data copy time in model benchmarks.

**Major Revision**
- add an option --exclude_copy_time
- move the timing start point to after the data copy finishes
parent ad8e0143
......@@ -147,6 +147,13 @@ def add_parser_arguments(self):
help='Real-time log every n steps.',
)
self._parser.add_argument(
'--exclude_copy_time',
action='store_true',
default=False,
help='Exclude GPU data copy time from measured time.',
)
@abstractmethod
def _judge_gpu_availability(self):
"""Judge GPUs' availability according to arguments and running environment."""
......
......@@ -174,6 +174,8 @@ def _train_step(self, precision):
start = self._timer()
if self._gpu_available:
sample = sample.cuda()
if self._args.exclude_copy_time:
start = self._timer()
self._optimizer.zero_grad()
if self._fp8_recipe is not None:
with te.fp8_autocast(enabled=True, fp8_recipe=self._fp8_recipe):
......@@ -211,6 +213,8 @@ def _inference_step(self, precision):
start = self._timer()
if self._gpu_available:
sample = sample.cuda()
if self._args.exclude_copy_time:
start = self._timer()
if self._fp8_recipe is not None:
with te.fp8_autocast(enabled=True, fp8_recipe=self._fp8_recipe):
self._model(sample)
......
......@@ -104,6 +104,8 @@ def _train_step(self, precision):
start = self._timer()
if self._gpu_available:
sample = sample.cuda()
if self._args.exclude_copy_time:
start = self._timer()
self._optimizer.zero_grad()
output = self._model(sample)
loss = self._loss_fn(output, self._target)
......@@ -138,6 +140,8 @@ def _inference_step(self, precision):
start = self._timer()
if self._gpu_available:
sample = sample.cuda()
if self._args.exclude_copy_time:
start = self._timer()
self._model(sample)
end = self._timer()
curr_step += 1
......
......@@ -168,6 +168,8 @@ def _train_step(self, precision):
start = self._timer()
if self._gpu_available:
sample = sample.cuda()
if self._args.exclude_copy_time:
start = self._timer()
self._optimizer.zero_grad()
if self._fp8_recipe is not None:
with te.fp8_autocast(enabled=True, fp8_recipe=self._fp8_recipe):
......@@ -205,6 +207,8 @@ def _inference_step(self, precision):
start = self._timer()
if self._gpu_available:
sample = sample.cuda()
if self._args.exclude_copy_time:
start = self._timer()
if self._fp8_recipe is not None:
with te.fp8_autocast(enabled=True, fp8_recipe=self._fp8_recipe):
self._model(sample)
......
......@@ -188,6 +188,8 @@ def _train_step(self, precision):
start = self._timer()
if self._gpu_available:
sample = sample.cuda()
if self._args.exclude_copy_time:
start = self._timer()
self._optimizer.zero_grad()
if self._fp8_recipe is not None:
with te.fp8_autocast(enabled=True, fp8_recipe=self._fp8_recipe):
......@@ -225,6 +227,8 @@ def _inference_step(self, precision):
start = self._timer()
if self._gpu_available:
sample = sample.cuda()
if self._args.exclude_copy_time:
start = self._timer()
if self._fp8_recipe is not None:
with te.fp8_autocast(enabled=True, fp8_recipe=self._fp8_recipe):
self._model(sample)
......
......@@ -144,6 +144,8 @@ def _train_step(self, precision):
start = self._timer()
if self._gpu_available:
sample = sample.cuda()
if self._args.exclude_copy_time:
start = self._timer()
self._optimizer.zero_grad()
output = self._model(sample)
loss = self._loss_fn(output, self._target)
......@@ -178,6 +180,8 @@ def _inference_step(self, precision):
start = self._timer()
if self._gpu_available:
sample = sample.cuda()
if self._args.exclude_copy_time:
start = self._timer()
self._model(sample)
end = self._timer()
curr_step += 1
......
......@@ -202,6 +202,8 @@ def _train_step(self, precision):
start = self._timer()
if self._gpu_available:
sample = sample.cuda()
if self._args.exclude_copy_time:
start = self._timer()
self._optimizer.zero_grad()
if self._fp8_recipe is not None:
with te.fp8_autocast(enabled=True, fp8_recipe=self._fp8_recipe):
......@@ -239,6 +241,8 @@ def _inference_step(self, precision):
start = self._timer()
if self._gpu_available:
sample = sample.cuda()
if self._args.exclude_copy_time:
start = self._timer()
if self._fp8_recipe is not None:
with te.fp8_autocast(enabled=True, fp8_recipe=self._fp8_recipe):
self._model(sample)
......
......@@ -157,6 +157,7 @@ def test_arguments_related_interfaces():
Distributed implementations. E.g. ddp mirrored
multiworkermirrored parameterserver horovod.
--duration int The elapsed time of benchmark in seconds.
--exclude_copy_time Exclude GPU data copy time from measured time.
--force_fp32 Enable option to use full float32 precision.
--hidden_size int Hidden size.
--log_flushing Real-time log flushing.
......@@ -197,6 +198,7 @@ def test_preprocess():
Distributed implementations. E.g. ddp mirrored
multiworkermirrored parameterserver horovod.
--duration int The elapsed time of benchmark in seconds.
--exclude_copy_time Exclude GPU data copy time from measured time.
--force_fp32 Enable option to use full float32 precision.
--hidden_size int Hidden size.
--log_flushing Real-time log flushing.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment