Unverified Commit 4d85630a authored by guoshzhao, committed by GitHub

Benchmarks: Add Benchmark - Add ONNXRuntime inference benchmark based on ORT python API (#245)

**Description**
Add an ONNXRuntime inference benchmark based on the ORT Python API.

**Major Revision**
- Add `ORTInferenceBenchmark` class to export PyTorch models to ONNX and run inference (sketched below)
- Add tests and an example for the `ort-inference` benchmark
- Update the introduction docs.
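In short, the benchmark exports a pretrained torchvision model to ONNX and then times `InferenceSession.run` on the CUDA execution provider. A minimal sketch of that export-then-infer flow (illustrative only; the model name, file path, and batch size are placeholders, and the real implementation is the `ORTInferenceBenchmark` class in this PR):

```python
import numpy as np
import torch
import torchvision.models
import onnxruntime as ort

# Export a pretrained torchvision model to ONNX with a fixed input shape.
model = torchvision.models.resnet50(pretrained=True).eval().cuda()
dummy_input = torch.randn(32, 3, 224, 224, device='cuda')
torch.onnx.export(model, dummy_input, 'resnet50.float32.onnx', input_names=['input'])

# Run the exported model through ONNXRuntime on the GPU for one batch.
sess = ort.InferenceSession('resnet50.float32.onnx', providers=['CUDAExecutionProvider'])
batch = np.random.randn(32, 3, 224, 224).astype(np.float32)
outputs = sess.run(None, {'input': batch})
```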
parent c2f942cb
......@@ -18,7 +18,7 @@ steps:
echo "##vso[task.prependpath]$HOME/.local/bin"
displayName: Export path
- script: |
python3 -m pip install .[test,nvidia,torch]
python3 -m pip install .[test,nvidia,torch,ort]
make postinstall
displayName: Install dependencies
- script: |
......
......@@ -108,5 +108,5 @@ ADD third_party third_party
RUN make -j -C third_party cuda
ADD . .
RUN python3 -m pip install .[nvidia,torch] && \
RUN python3 -m pip install .[nvidia,torch,ort] && \
make cppbuild
......@@ -104,5 +104,5 @@ RUN mv /root/.local/bin/* /opt/conda/bin/ && \
rm -rf /root/.local
ADD . .
RUN python3 -m pip install .[torch] && \
RUN python3 -m pip install .[torch,ort] && \
make cppbuild
......@@ -99,5 +99,5 @@ ADD third_party third_party
RUN ROCM_VERSION=rocm-4.2.0 make -j -C third_party rocm
ADD . .
RUN python3 -m pip install .[torch] && \
RUN python3 -m pip install .[torch,ort] && \
make cppbuild
......@@ -71,6 +71,12 @@ TODO
#### Introduction
Inference PyTorch/ONNX models on NVIDIA GPUs with [TensorRT](https://developer.nvidia.com/tensorrt).
Currently the following models are supported:
> alexnet, densenet121, densenet169, densenet201, densenet161, googlenet, inception_v3, mnasnet0_5,
> mnasnet1_0, mobilenet_v2, resnet18, resnet34, resnet50, resnet101, resnet152, resnext50_32x4d,
> resnext101_32x8d, wide_resnet50_2, wide_resnet101_2, shufflenet_v2_x0_5, shufflenet_v2_x1_0,
> squeezenet1_0, squeezenet1_1, vgg11, vgg11_bn, vgg13, vgg13_bn, vgg16, vgg16_bn, vgg19_bn, vgg19
#### Metrics
......@@ -83,6 +89,23 @@ Inference PyTorch/ONNX models on NVIDIA GPUs with [TensorRT](https://developer.n
| tensorrt-inference/${model}_end_to_end_time_mean | time (ms) | The mean duration from when the H2D of a query is called to when the D2H of the same query is completed. |
| tensorrt-inference/${model}_end_to_end_time_99 | time (ms) | The P99 duration from when the H2D of a query is called to when the D2H of the same query is completed. |
### `ort-inference`
#### Introduction
Inference performance of torchvision models using ONNXRuntime. Currently the following models are supported:
> alexnet, densenet121, densenet169, densenet201, densenet161, googlenet, inception_v3, mnasnet0_5,
> mnasnet1_0, mobilenet_v2, resnet18, resnet34, resnet50, resnet101, resnet152, resnext50_32x4d,
> resnext101_32x8d, wide_resnet50_2, wide_resnet101_2, shufflenet_v2_x0_5, shufflenet_v2_x1_0,
> squeezenet1_0, squeezenet1_1, vgg11, vgg11_bn, vgg13, vgg13_bn, vgg16, vgg16_bn, vgg19_bn, vgg19
#### Metrics
| Name | Unit | Description |
|-----------------------------------------------|-----------|-----------------------------------------------------------|
| ort-inference/{precision}_{model}_time | time (ms) | The mean latency to execute one batch of inference. |
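The benchmark is registered for the CUDA platform and is disabled by default in the bundled configurations. A minimal sketch of enabling it in a SuperBench config, mirroring the entries added in this PR (assuming the `default_local_mode` anchor defined earlier in the config file):

```yaml
ort-inference:
  <<: *default_local_mode
  enable: true
```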
## Communication Benchmarks
### `mem-bw`
......
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""Micro benchmark example for ONNXRuntime inference performance.
Commands to run:
python3 examples/benchmarks/ort_inference_performance.py
"""
from superbench.benchmarks import BenchmarkRegistry, Platform
from superbench.common.utils import logger
if __name__ == '__main__':
context = BenchmarkRegistry.create_benchmark_context(
'ort-inference', platform=Platform.CUDA, parameters='--pytorch_models resnet50 resnet101 --precision float16'
)
benchmark = BenchmarkRegistry.launch_benchmark(context)
if benchmark:
logger.info(
'benchmark: {}, return code: {}, result: {}'.format(
benchmark.name, benchmark.return_code, benchmark.result
)
)
......@@ -167,12 +167,16 @@ def run(self):
'vcrpy>=4.1.1',
'yapf>=0.30.0',
],
'nvidia': ['py3nvml>=0.2.6'],
'ort': [
'onnx>=1.10.2',
'onnxruntime-gpu>=1.9.0',
],
'torch': [
'torch>=1.7.0a0',
'torchvision>=0.8.0a0',
'transformers>=4.3.3',
],
'nvidia': ['py3nvml>=0.2.6']
},
include_package_data=True,
entry_points={
......
......@@ -19,6 +19,7 @@
from superbench.benchmarks.micro_benchmarks.ib_loopback_performance import IBLoopbackBenchmark
from superbench.benchmarks.micro_benchmarks.ib_validation_performance import IBBenchmark
from superbench.benchmarks.micro_benchmarks.kernel_launch_overhead import KernelLaunch
from superbench.benchmarks.micro_benchmarks.ort_inference_performance import ORTInferenceBenchmark
from superbench.benchmarks.micro_benchmarks.rocm_gemm_flops_performance import RocmGemmFlopsBenchmark
from superbench.benchmarks.micro_benchmarks.rocm_memory_bw_performance import RocmMemBwBenchmark
from superbench.benchmarks.micro_benchmarks.sharding_matmul import ShardingMatmul
......@@ -42,6 +43,7 @@
'MemBwBenchmark',
'MicroBenchmark',
'MicroBenchmarkWithInvoke',
'ORTInferenceBenchmark',
'RocmGemmFlopsBenchmark',
'RocmMemBwBenchmark',
'ShardingMatmul',
......
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""TensorRT inference micro-benchmark."""
import time
import statistics
from pathlib import Path
import torch
import torchvision.models
import numpy as np
from superbench.common.utils import logger
from superbench.benchmarks import BenchmarkRegistry, Platform, Precision
from superbench.benchmarks.micro_benchmarks import MicroBenchmark
class ORTInferenceBenchmark(MicroBenchmark):
"""ONNXRuntime inference micro-benchmark class."""
def __init__(self, name, parameters=''):
"""Constructor.
Args:
name (str): benchmark name.
parameters (str): benchmark parameters.
"""
super().__init__(name, parameters)
self._pytorch_models = [
'resnet50',
'resnet101',
'resnet152',
'densenet169',
'densenet201',
'vgg11',
'vgg13',
'vgg16',
'vgg19',
]
self.__graph_opt_level = None
self.__model_cache_path = Path(torch.hub.get_dir()) / 'checkpoints'
def add_parser_arguments(self):
"""Add the specified arguments."""
super().add_parser_arguments()
self._parser.add_argument(
'--pytorch_models',
type=str,
nargs='+',
default=self._pytorch_models,
help='PyTorch models for ONNXRuntime inference benchmark, e.g., {}.'.format(', '.join(self._pytorch_models)),
)
self._parser.add_argument(
'--precision',
type=Precision,
choices=[Precision.FLOAT32, Precision.FLOAT16, Precision.INT8],
default=Precision.FLOAT16,
required=False,
help='Precision for inference, allow int8, float16, or float32 only.',
)
self._parser.add_argument(
'--graph_opt_level',
type=int,
default=3,
choices=[0, 1, 2, 3],
required=False,
help='ONNXRuntime graph optimization level, 0 for ORT_DISABLE_ALL, 1 for ORT_ENABLE_BASIC, '
'2 for ORT_ENABLE_EXTENDED, 3 for ORT_ENABLE_ALL.',
)
self._parser.add_argument(
'--batch_size',
type=int,
default=32,
required=False,
help='Set batch size for inference.',
)
self._parser.add_argument(
'--num_warmup',
type=int,
default=64,
required=False,
help='The number of warmup steps before benchmarking.',
)
self._parser.add_argument(
'--num_steps',
type=int,
default=256,
required=False,
help='The number of test steps for benchmarking.',
)
def _preprocess(self):
"""Preprocess/preparation operations before the benchmarking.
Return:
True if _preprocess() succeeds.
"""
if not super()._preprocess():
return False
import onnxruntime as ort
self.__graph_opt_level = {
0: ort.GraphOptimizationLevel.ORT_DISABLE_ALL,
1: ort.GraphOptimizationLevel.ORT_ENABLE_BASIC,
2: ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED,
3: ort.GraphOptimizationLevel.ORT_ENABLE_ALL,
}
for model in self._args.pytorch_models:
if hasattr(torchvision.models, model):
data_type = Precision.FLOAT16.value if self._args.precision == Precision.FLOAT16 \
else Precision.FLOAT32.value
model_path = f'{self.__model_cache_path / (model + "." + data_type + ".onnx")}'
torch.onnx.export(
getattr(torchvision.models, model)(pretrained=True).to(dtype=getattr(torch, data_type)).cuda(),
torch.randn(self._args.batch_size, 3, 224, 224, device='cuda', dtype=getattr(torch, data_type)),
model_path,
input_names=['input'],
)
if self._args.precision == Precision.INT8:
file_name = '{model}.{precision}.onnx'.format(model=model, precision=self._args.precision)
# For quantization of ONNXRuntime, refer
# https://onnxruntime.ai/docs/performance/quantization.html#quantization-overview
from onnxruntime.quantization import quantize_dynamic
quantize_dynamic(model_path, f'{self.__model_cache_path / file_name}')
else:
logger.error('Cannot find PyTorch model %s.', model)
return False
return True
def _benchmark(self):
"""Implementation for benchmarking."""
import onnxruntime as ort
precision_metric = {'float16': 'fp16', 'float32': 'fp32', 'int8': 'int8'}
for model in self._args.pytorch_models:
sess_options = ort.SessionOptions()
sess_options.graph_optimization_level = self.__graph_opt_level[self._args.graph_opt_level]
file_name = '{model}.{precision}.onnx'.format(model=model, precision=self._args.precision)
ort_sess = ort.InferenceSession(
f'{self.__model_cache_path / file_name}', sess_options, providers=['CUDAExecutionProvider']
)
elapse_times = self.__inference(ort_sess)
if self._args.precision.value in precision_metric:
precision = precision_metric[self._args.precision.value]
else:
precision = self._args.precision.value
metric = '{}_{}_time'.format(precision, model)
if not self._process_numeric_result(metric, elapse_times):
return False
logger.info(
'ORT Inference - round: {}, name: {}, model: {}, precision: {}, latency: {} ms'.format(
self._curr_run_index, self._name, model, self._args.precision, statistics.mean(elapse_times)
)
)
return True
def __inference(self, ort_sess):
"""Do inference given the ORT inference session.
Args:
ort_sess (InferenceSession): inference session for ORT.
Return:
elapse_times (List[float]): latency of each iteration in milliseconds.
"""
precision = np.float16 if self._args.precision == Precision.FLOAT16 else np.float32
input_tensor = np.random.randn(self._args.batch_size, 3, 224, 224).astype(dtype=precision)
for i in range(self._args.num_warmup):
ort_sess.run(None, {'input': input_tensor})
elapse_times = list()
for i in range(self._args.num_steps):
start = time.time()
ort_sess.run(None, {'input': input_tensor})
end = time.time()
elapse_times.append((end - start) * 1000)
return elapse_times
BenchmarkRegistry.register_benchmark(
'ort-inference',
ORTInferenceBenchmark,
platform=Platform.CUDA,
)
......@@ -88,6 +88,9 @@ superbench:
copy_type:
- sm
- dma
ort-inference:
<<: *default_local_mode
enable: false
ort-models:
enable: false
modes:
......
......@@ -89,6 +89,9 @@ superbench:
copy_type:
- sm
- dma
ort-inference:
<<: *default_local_mode
enable: false
ort-models:
enable: false
modes:
......
......@@ -98,6 +98,9 @@ superbench:
<<: *default_pytorch_mode
computation-communication-overlap:
<<: *default_pytorch_mode
ort-inference:
<<: *default_local_mode
enable: false
gpt_models:
<<: *default_pytorch_mode
models:
......
......@@ -96,6 +96,9 @@ superbench:
<<: *default_pytorch_mode
computation-communication-overlap:
<<: *default_pytorch_mode
ort-inference:
<<: *default_local_mode
enable: false
gpt_models:
<<: *default_pytorch_mode
models:
......
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""Tests for ort-inference benchmark."""
import shutil
from pathlib import Path
from unittest import mock
import torch
import torchvision.models
from tests.helper import decorator
from superbench.benchmarks import BenchmarkRegistry, Platform, Precision, BenchmarkType, ReturnCode
from superbench.benchmarks.micro_benchmarks.ort_inference_performance import ORTInferenceBenchmark
@decorator.cuda_test
@decorator.pytorch_test
@mock.patch('torch.hub.get_dir')
@mock.patch('onnxruntime.InferenceSession.run')
def test_ort_inference_performance(mock_ort_session_run, mock_get_dir):
"""Test ort-inference benchmark."""
benchmark_name = 'ort-inference'
(benchmark_class,
predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
assert (benchmark_class)
mock_get_dir.return_value = '/tmp/superbench/'
benchmark = benchmark_class(
benchmark_name,
parameters='--pytorch_models resnet50 --graph_opt_level 1 --precision float16'
' --batch_size 16 --num_warmup 128 --num_steps 512'
)
assert (isinstance(benchmark, ORTInferenceBenchmark))
assert (benchmark._preprocess())
# Check basic information.
assert (benchmark.name == 'ort-inference')
assert (benchmark.type == BenchmarkType.MICRO)
assert (benchmark._ORTInferenceBenchmark__model_cache_path == Path(torch.hub.get_dir()) / 'checkpoints')
for model in benchmark._args.pytorch_models:
assert (hasattr(torchvision.models, model))
file_name = '{model}.{precision}.onnx'.format(model=model, precision=benchmark._args.precision)
assert ((benchmark._ORTInferenceBenchmark__model_cache_path / file_name).is_file())
# Check parameters specified in BenchmarkContext.
assert (benchmark._args.pytorch_models == ['resnet50'])
assert (benchmark._args.graph_opt_level == 1)
assert (benchmark._args.precision == Precision.FLOAT16)
assert (benchmark._args.batch_size == 16)
assert (benchmark._args.num_warmup == 128)
assert (benchmark._args.num_steps == 512)
# Check results and metrics.
assert (benchmark._benchmark())
shutil.rmtree(benchmark._ORTInferenceBenchmark__model_cache_path)
assert (benchmark.return_code == ReturnCode.SUCCESS)
precision_metric = {'float16': 'fp16', 'float32': 'fp32', 'int8': 'int8'}
for model in benchmark._args.pytorch_models:
if benchmark._args.precision.value in precision_metric:
precision = precision_metric[benchmark._args.precision.value]
else:
precision = benchmark._args.precision.value
metric = '{}_{}_time'.format(precision, model)
assert (metric in benchmark.result)
assert (metric in benchmark.raw_data)