"examples/bench.py" did not exist on "7e59976babdf5bf1d8c77b7468b2bcbf23d53614"
Unverified commit 7184bdd1, authored by Yifan Xiong, committed by GitHub
Browse files

Benchmarks - Update result parsing in tensorrt inference (#541)

* Update result parsing for newer tensorrt versions
* Update arguments when load torchvision models
parent f2599137
......@@ -166,6 +166,7 @@ def run(self):
'numpy>=1.19.2',
'omegaconf==2.0.6',
'openpyxl>=3.0.7',
'packaging>=21.0',
'pandas>=1.1.5',
'pssh @ git+https://github.com/lilydjwg/pssh.git@v2.3.4',
'pyyaml>=5.3',
......
......@@ -5,6 +5,7 @@
from pathlib import Path
from packaging import version
import torch.hub
import torch.onnx
import torchvision.models
......@@ -129,7 +130,9 @@ def export_torchvision_model(self, model_name, batch_size=1):
if not self.check_torchvision_model(model_name):
return ''
file_name = str(self._onnx_model_path / (model_name + '.onnx'))
model = getattr(torchvision.models, model_name)(pretrained=False).eval().cuda()
# the parameter 'pretrained' is deprecated since 0.13 in torchvision
args = {'pretrained': False} if version.parse(torchvision.__version__) < version.parse('0.13') else {}
model = getattr(torchvision.models, model_name)(**args).eval().cuda()
dummy_input = torch.randn((batch_size, 3, 224, 224), device='cuda')
torch.onnx.export(
model,
......
......@@ -145,6 +145,17 @@ def _process_raw_result(self, cmd_idx, raw_output):
self._result.add_result(f'{model}_host_time_{tag}', float(lats[0]))
self._result.add_result(f'{model}_end_to_end_time_{tag}', float(lats[1]))
success = True
if '[I] Latency:' in line or '[I] GPU Compute Time:' in line:
tm = 'gpu' if '[I] GPU Compute Time:' in line else 'host'
self._result.add_result(
f'{model}_{tm}_time_mean',
float(re.findall(r'mean = (\d+\.\d+) ms', line)[0]),
)
self._result.add_result(
f'{model}_{tm}_time_99',
float(re.findall(r'\(99\%\) = (\d+\.\d+) ms', line)[0]),
)
success = True
except BaseException as e:
self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
logger.error(
......
......@@ -116,16 +116,17 @@ def test_tensorrt_inference_params(self):
len(test_case.get('pytorch_models', benchmark._pytorch_models)), len(benchmark._commands)
)
@decorator.load_data('tests/data/tensorrt_inference.log')
def test_tensorrt_inference_result_parsing(self, test_raw_log):
@decorator.load_data('tests/data/tensorrt_inference.1.log')
@decorator.load_data('tests/data/tensorrt_inference.2.log')
def test_tensorrt_inference_result_parsing(self, test_raw_log_1, test_raw_log_2):
"""Test tensorrt-inference benchmark result parsing."""
(benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA)
benchmark = benchmark_cls(self.benchmark_name, parameters='')
benchmark._args = SimpleNamespace(pytorch_models=['model_0', 'model_1'], log_raw_data=False)
benchmark._result = BenchmarkResult(self.benchmark_name, BenchmarkType.MICRO, ReturnCode.SUCCESS, run_count=1)
# Positive case - valid raw output
self.assertTrue(benchmark._process_raw_result(0, test_raw_log))
# Positive case 1 - valid raw output
self.assertTrue(benchmark._process_raw_result(0, test_raw_log_1))
self.assertEqual(ReturnCode.SUCCESS, benchmark.return_code)
self.assertEqual(6 + benchmark.default_metric_count, len(benchmark.result))
......@@ -134,5 +135,12 @@ def test_tensorrt_inference_result_parsing(self, test_raw_log):
self.assertEqual(0.6, benchmark.result[f'model_0_host_time_{tag}'][0])
self.assertEqual(1.0, benchmark.result[f'model_0_end_to_end_time_{tag}'][0])
# Positive case 2 - valid raw output
self.assertTrue(benchmark._process_raw_result(0, test_raw_log_2))
self.assertEqual(ReturnCode.SUCCESS, benchmark.return_code)
for tag in ['mean', '99']:
self.assertEqual(1.5, benchmark.result[f'model_0_gpu_time_{tag}'][1])
self.assertEqual(2.0, benchmark.result[f'model_0_host_time_{tag}'][1])
# Negative case - invalid raw output
self.assertFalse(benchmark._process_raw_result(1, 'Invalid raw output'))
[06/29/2023-08:24:55] [I] === Model Options ===
[06/29/2023-08:24:55] [I] Format: ONNX
[06/29/2023-08:24:55] [I] Model: /root/.cache/torch/hub/onnx/resnet50.onnx
[06/29/2023-08:24:55] [I] Output:
[06/29/2023-08:24:55] [I] === Build Options ===
[06/29/2023-08:24:55] [I] Max batch: explicit batch
[06/29/2023-08:24:55] [I] Memory Pools: workspace: 8192 MiB, dlaSRAM: default, dlaLocalDRAM: default, dlaGlobalDRAM: default
[06/29/2023-08:24:55] [I] minTiming: 1
[06/29/2023-08:24:55] [I] avgTiming: 8
[06/29/2023-08:24:55] [I] Precision: FP32+FP16
[06/29/2023-08:24:55] [I] LayerPrecisions:
[06/29/2023-08:24:55] [I] Calibration:
[06/29/2023-08:24:55] [I] Refit: Disabled
[06/29/2023-08:24:55] [I] Sparsity: Disabled
[06/29/2023-08:24:55] [I] Safe mode: Disabled
[06/29/2023-08:24:55] [I] DirectIO mode: Disabled
[06/29/2023-08:24:55] [I] Restricted mode: Disabled
[06/29/2023-08:24:55] [I] Build only: Disabled
[06/29/2023-08:24:55] [I] Save engine:
[06/29/2023-08:24:55] [I] Load engine:
[06/29/2023-08:24:55] [I] Profiling verbosity: 0
[06/29/2023-08:24:55] [I] Tactic sources: Using default tactic sources
[06/29/2023-08:24:55] [I] timingCacheMode: local
[06/29/2023-08:24:55] [I] timingCacheFile:
[06/29/2023-08:24:55] [I] Heuristic: Disabled
[06/29/2023-08:24:55] [I] Preview Features: Use default preview flags.
[06/29/2023-08:24:55] [I] Input(s)s format: fp32:CHW
[06/29/2023-08:24:55] [I] Output(s)s format: fp32:CHW
[06/29/2023-08:24:55] [I] Input build shape: input=32x3x224x224+32x3x224x224+32x3x224x224
[06/29/2023-08:24:55] [I] Input calibration shapes: model
[06/29/2023-08:24:55] [I] === System Options ===
[06/29/2023-08:24:55] [I] Device: 0
[06/29/2023-08:24:55] [I] DLACore:
[06/29/2023-08:24:55] [I] Plugins:
[06/29/2023-08:24:55] [I] === Inference Options ===
[06/29/2023-08:24:55] [I] Batch: Explicit
[06/29/2023-08:24:55] [I] Input inference shape: input=32x3x224x224
[06/29/2023-08:24:55] [I] Iterations: 2048
[06/29/2023-08:24:55] [I] Duration: 3s (+ 200ms warm up)
[06/29/2023-08:24:55] [I] Sleep time: 0ms
[06/29/2023-08:24:55] [I] Idle time: 0ms
[06/29/2023-08:24:55] [I] Streams: 1
[06/29/2023-08:24:55] [I] ExposeDMA: Disabled
[06/29/2023-08:24:55] [I] Data transfers: Enabled
[06/29/2023-08:24:55] [I] Spin-wait: Disabled
[06/29/2023-08:24:55] [I] Multithreading: Disabled
[06/29/2023-08:24:55] [I] CUDA Graph: Disabled
[06/29/2023-08:24:55] [I] Separate profiling: Disabled
[06/29/2023-08:24:55] [I] Time Deserialize: Disabled
[06/29/2023-08:24:55] [I] Time Refit: Disabled
[06/29/2023-08:24:55] [I] NVTX verbosity: 0
[06/29/2023-08:24:55] [I] Persistent Cache Ratio: 0
[06/29/2023-08:24:55] [I] Inputs:
[06/29/2023-08:24:55] [I] === Reporting Options ===
[06/29/2023-08:24:55] [I] Verbose: Disabled
[06/29/2023-08:24:55] [I] Averages: 10 inferences
[06/29/2023-08:24:55] [I] Percentiles: 99
[06/29/2023-08:24:55] [I] Dump refittable layers:Disabled
[06/29/2023-08:24:55] [I] Dump output: Disabled
[06/29/2023-08:24:55] [I] Profile: Disabled
[06/29/2023-08:24:55] [I] Export timing to JSON file:
[06/29/2023-08:24:55] [I] Export output to JSON file:
[06/29/2023-08:24:55] [I] Export profile to JSON file:
[06/29/2023-08:25:38] [I]
[06/29/2023-08:25:38] [I] === Trace details ===
[06/29/2023-08:25:38] [I] Trace averages of 10 runs:
[06/29/2023-08:25:38] [I] Average on 10 runs - GPU latency: 1.5 ms - Host latency: 2.0 ms (enqueue 0.4 ms)
[06/29/2023-08:25:38] [I] Average on 10 runs - GPU latency: 1.5 ms - Host latency: 2.0 ms (enqueue 0.4 ms)
[06/29/2023-08:25:38] [I]
[06/29/2023-08:25:38] [I] === Performance summary ===
[06/29/2023-08:25:38] [I] Throughput: 1000.00 qps
[06/29/2023-08:25:38] [I] Latency: min = 1.9 ms, max = 2.1 ms, mean = 2.0 ms, median = 2.0 ms, percentile(99%) = 2.0 ms
[06/29/2023-08:25:38] [I] Enqueue Time: min = 0.3 ms, max = 0.3 ms, mean = 0.3 ms, median = 0.3 ms, percentile(99%) = 0.3 ms
[06/29/2023-08:25:38] [I] H2D Latency: min = 0.3 ms, max = 0.3 ms, mean = 0.3 ms, median = 0.3 ms, percentile(99%) = 0.3 ms
[06/29/2023-08:25:38] [I] GPU Compute Time: min = 1.4 ms, max = 1.6 ms, mean = 1.5 ms, median = 1.5 ms, percentile(99%) = 1.5 ms
[06/29/2023-08:25:38] [I] D2H Latency: min = 0.03 ms, max = 0.03 ms, mean = 0.03 ms, median = 0.03 ms, percentile(99%) = 0.03 ms
[06/29/2023-08:25:38] [I] Total Host Walltime: 3.0 s
[06/29/2023-08:25:38] [I] Total GPU Compute Time: 2.9 s
[06/29/2023-08:25:38] [I] Explanations of the performance metrics are printed in the verbose logs.
[06/29/2023-08:25:38] [I]
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment