Unverified Commit c77bfe36 authored by one, committed by GitHub

Benchmark: Update ort-inference for ROCm platform (#18)

* Support rocm in ort-inference

* Add tests

* Update dockerfiles for docker 18

* Install onnx, add params to ort-inference

* Update docs
parent 52450b5f
@@ -183,7 +183,7 @@ RUN sed -i '/NCCL_/d' /etc/bash.bashrc && \
RUN --mount=type=bind,from=hyhal,source=/,target=/opt/hyhal \
    python3 -m venv --system-site-packages ${VIRTUAL_ENV} && \
-   python3 -m pip install -i ${SB_PIP_INDEX_URL} --upgrade pip wheel setuptools==65.7 mpi4py && \
+   python3 -m pip install -i ${SB_PIP_INDEX_URL} --upgrade pip wheel setuptools==65.7 mpi4py onnx==1.19.1 && \
    python3 -m pip install -i ${SB_PIP_INDEX_URL} --no-build-isolation .[hgworker] && \
    make cppbuild && \
    make postinstall
ARG BASE_IMAGE=harbor.sourcefind.cn:5443/dcu/admin/base/pytorch:2.7.1-ubuntu22.04-dtk26.04-py3.11
FROM ${BASE_IMAGE}
# Included in the base image:
# - Ubuntu: 22.04
# - Python: 3.11
# - DTK: 26.04
# - AMD SMI: 24.5.3+02cbffb.dirty
# - Torch: 2.7.1+das.opt1.dtk2604
# - Torchvision: 0.22.0+das.opt1.dtk2604.torch271
# - vLLM: 0.11.0+das.opt1.dtk2604.torch271
# - ONNX Runtime: 1.19.2+das.opt1.dtk2604.torch271
# - DeepSpeed: 0.18.2+das.opt1.dtk2604.torch271
# - Apex: 1.7.0+das.opt1.dtk2604.torch271
# - FlashAttention: 2.6.1+das.opt1.dtk2604.torch271
# - Transformer Engine: 2.10.0+das.opt1.dtk2604.torch271
# - Triton: 3.1.0+das.opt1.dtk2604.torch271
# - Megatron Core: 0.15.4+das.opt1.dtk2604.torch271
# - DCU Megatron: 0.15.0+das.opt1.dtk2604.torch271
# - Byte Flux: 1.0.4+das.opt1.dtk2604.torch271
# Added or changed by this Dockerfile:
# - Docker client: 20.10.8
# - UCX: 1.20.0, built with DTK/ROCm support
# - Open MPI: 5.0.9, built with UCX and DTK/ROCm support
# - Intel MLC: v3.12
# - rocblas-bench and hipblaslt-bench command symlinks
# - RCCL topology mapping override for DTK
# - SSH and ulimit configuration
LABEL maintainer="SuperBench"
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && \
apt-get -q install -y --no-install-recommends \
autoconf \
automake \
bc \
build-essential \
curl \
dmidecode \
git \
iproute2 \
jq \
libaio-dev \
libboost-program-options-dev \
libcap2 \
libcurl4-openssl-dev \
libnuma-dev \
libpci-dev \
libssl-dev \
libtinfo5 \
libtool \
lshw \
net-tools \
numactl \
openssh-client \
openssh-server \
pciutils \
python3.11-venv \
rsync \
sudo \
util-linux \
vim \
wget \
&& \
rm -rf /tmp/*
# Install Docker
ENV DOCKER_VERSION=20.10.8
RUN cd /tmp && \
wget -q https://download.docker.com/linux/static/stable/x86_64/docker-${DOCKER_VERSION}.tgz -O docker.tgz && \
tar --extract --file docker.tgz --strip-components 1 --directory /usr/local/bin/ && \
rm docker.tgz
# Update system config
RUN mkdir -p /root/.ssh && \
touch /root/.ssh/authorized_keys && \
mkdir -p /var/run/sshd && \
sed -i "s/[# ]*PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \
sed -i "s/[# ]*PermitUserEnvironment no/PermitUserEnvironment yes/" /etc/ssh/sshd_config && \
sed -i "s/[# ]*Port.*/Port 22/" /etc/ssh/sshd_config && \
echo "* soft nofile 1048576\n* hard nofile 1048576" >> /etc/security/limits.conf && \
echo "root soft nofile 1048576\nroot hard nofile 1048576" >> /etc/security/limits.conf
ENV ROCM_PATH=/opt/dtk
# Docker 18.09 legacy builder cannot use BuildKit-only named contexts or
# RUN --mount. Prepare a local ./hyhal directory in the build context before
# running docker build, then copy it into the image.
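# For example (illustrative paths and tag; adjust to where hyhal lives on the
# build host and to this Dockerfile's actual location):
#   mkdir -p ./hyhal && cp -a /opt/hyhal/. ./hyhal/
#   docker build -f dockerfile/<this-dockerfile> -t superbench:dtk .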
COPY hyhal /opt/hyhal
# Install UCX
ARG UCX_VERSION=1.20.0
ARG UCX_HOME=/opt/ucx
RUN cd /tmp && \
wget https://github.com/openucx/ucx/releases/download/v${UCX_VERSION}/ucx-${UCX_VERSION}.tar.gz && \
tar xzf ucx-${UCX_VERSION}.tar.gz && \
cd ucx-${UCX_VERSION} && \
./contrib/configure-release --prefix=${UCX_HOME} \
--enable-optimizations --enable-tuning \
--enable-cma --enable-mt \
--with-mlx5 --with-rc --with-ud --with-dc --with-dm --with-ib_hw_tm \
--with-verbs=/usr/include --with-rdmacm=/usr \
--with-rocm=${ROCM_PATH} \
--without-knem --without-cuda --without-java && \
make -j $(nproc) && \
rm -rf ${UCX_HOME} && \
make install && \
rm -rf /tmp/ucx-${UCX_VERSION}*
# Install OpenMPI
ENV MPI_HOME=/opt/mpi
ARG OMPI_VERSION=5.0.9
RUN cd /tmp && \
wget https://download.open-mpi.org/release/open-mpi/v${OMPI_VERSION%.*}/openmpi-${OMPI_VERSION}.tar.gz && \
tar xzf openmpi-${OMPI_VERSION}.tar.gz && \
cd openmpi-${OMPI_VERSION} && \
./configure --prefix=${MPI_HOME} \
--with-ucx=${UCX_HOME} \
--with-rocm=${ROCM_PATH} \
--enable-builtin-atomics \
--enable-wrapper-rpath \
--enable-mca-no-build=btl-uct \
--enable-prte-prefix-by-default && \
make -j $(nproc) && \
rm -rf ${MPI_HOME} && \
make install && \
ldconfig && \
cd / && \
rm -rf /tmp/openmpi-${OMPI_VERSION}*
# Install Intel MLC
RUN cd /tmp && \
wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \
tar xzf mlc.tgz Linux/mlc && \
cp ./Linux/mlc /usr/local/bin/ && \
rm -rf ./Linux mlc.tgz
# Add rocblas-bench to path
RUN ln -s ${ROCM_PATH}/lib/rocblas/benchmark_tool/rocblas-bench ${ROCM_PATH}/bin/ && \
chmod +x ${ROCM_PATH}/bin/rocblas-bench && \
ln -s ${ROCM_PATH}/lib/hipblaslt/benchmark_tool/hipblaslt-bench ${ROCM_PATH}/bin/ && \
chmod +x ${ROCM_PATH}/bin/hipblaslt-bench
ENV PATH="${MPI_HOME}/bin:${UCX_HOME}/bin:/opt/superbench/bin:/usr/local/bin/${PATH:+:${PATH}}" \
LD_LIBRARY_PATH="${MPI_HOME}/lib:${UCX_HOME}/lib:/usr/lib/x86_64-linux-gnu/:/usr/local/lib/${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" \
SB_HOME=/opt/superbench \
SB_MICRO_PATH=/opt/superbench \
ANSIBLE_DEPRECATION_WARNINGS=FALSE \
ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections
WORKDIR ${SB_HOME}
COPY third_party third_party
COPY dockerfile/etc/dtk26.04-topo-mapping.xml ${ROCM_PATH}/rccl/lib/topo_mapping_default.xml
RUN make \
RCCL_HOME=${ROCM_PATH}/rccl \
ROCM_PATH=${ROCM_PATH} \
HIP_HOME=${ROCM_PATH}/hip \
MPI_HOME=${MPI_HOME} \
-C third_party \
dtk \
-o cpu_hpl \
-o cpu_stream \
-o megatron_lm \
-o apex_rocm \
-o megatron_deepspeed \
-o rocm_megatron_lm
COPY . .
ARG SB_PIP_INDEX_URL=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
ENV USE_HIP_DATATYPE=1 \
USE_HIPBLAS_COMPUTETYPE=1 \
VIRTUAL_ENV=/opt/superbench-venv
ENV PATH="${VIRTUAL_ENV}/bin:${MPI_HOME}/bin:${UCX_HOME}/bin:/opt/superbench/bin:/usr/local/bin/${PATH:+:${PATH}}"
RUN sed -i '/NCCL_/d' /etc/bash.bashrc && \
echo PATH="$PATH" > /etc/environment && \
echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \
echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment && \
echo VIRTUAL_ENV="$VIRTUAL_ENV" >> /etc/environment
RUN python3 -m venv --system-site-packages ${VIRTUAL_ENV} && \
python3 -m pip install -i ${SB_PIP_INDEX_URL} --upgrade pip wheel setuptools==65.7 mpi4py onnx==1.19.1 && \
python3 -m pip install -i ${SB_PIP_INDEX_URL} --no-build-isolation .[hgworker] && \
make cppbuild && \
make postinstall
@@ -180,7 +180,7 @@ RUN sed -i '/NCCL_/d' /etc/bash.bashrc && \
RUN --mount=type=bind,from=hyhal,source=/,target=/opt/hyhal \
    python3 -m venv --system-site-packages ${VIRTUAL_ENV} && \
-   python3 -m pip install -i ${SB_PIP_INDEX_URL} --upgrade pip wheel setuptools==65.7 mpi4py && \
+   python3 -m pip install -i ${SB_PIP_INDEX_URL} --upgrade pip wheel setuptools==65.7 mpi4py onnx==1.19.1 && \
    python3 -m pip install -i ${SB_PIP_INDEX_URL} --no-build-isolation .[hgworker] && \
    make cppbuild && \
    make postinstall
@@ -181,7 +181,7 @@ RUN sed -i '/NCCL_/d' /etc/bash.bashrc && \
    echo VIRTUAL_ENV="$VIRTUAL_ENV" >> /etc/environment
RUN python3 -m venv --system-site-packages ${VIRTUAL_ENV} && \
-   python3 -m pip install -i ${SB_PIP_INDEX_URL} --upgrade pip wheel setuptools==65.7 mpi4py && \
+   python3 -m pip install -i ${SB_PIP_INDEX_URL} --upgrade pip wheel setuptools==65.7 mpi4py onnx==1.19.1 && \
    python3 -m pip install -i ${SB_PIP_INDEX_URL} --no-build-isolation .[hgworker] && \
    make cppbuild && \
    make postinstall
@@ -152,6 +152,19 @@ Inference performance of the torchvision models using ONNXRuntime. Currently the
The supported percentiles are 50, 90, 95, 99, and 99.9.
#### Parameters
| Parameter | Default | Description |
|------------------------|---------|-----------------------------------------------------------------------------|
| `--pytorch_models` | See above | Torchvision models to export to ONNX and run with ONNX Runtime. |
| `--precision` | `float16` | Inference precision: `float32`, `float16`, or `int8`. |
| `--graph_opt_level` | `3` | ONNX Runtime graph optimization level: `0`, `1`, `2`, or `3`. |
| `--batch_size` | `32` | Batch size of the generated input tensor. |
| `--num_warmup` | `64` | Number of warmup inference iterations excluded from metrics. |
| `--num_steps` | `256` | Number of measured inference iterations. |
| `--execution_provider` | `auto` | ONNX Runtime execution provider: `auto`, `cuda`, `rocm`, `migraphx`, `cpu`, or a full provider name. |
| `--pretrained` | `false` | Use pretrained torchvision weights when exporting ONNX models. |
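
For reference, here is a minimal sketch of launching this benchmark programmatically with the new provider flag. It assumes SuperBench's `BenchmarkRegistry.create_benchmark_context` and `launch_benchmark` entry points; in practice the benchmark is usually driven from the YAML config instead:

```python
from superbench.benchmarks import BenchmarkRegistry, Platform

# Launch ort-inference on the ROCm platform with an explicit execution provider.
context = BenchmarkRegistry.create_benchmark_context(
    'ort-inference',
    platform=Platform.ROCM,
    parameters='--pytorch_models resnet50 --precision float16 --execution_provider rocm --batch_size 1',
)
benchmark = BenchmarkRegistry.launch_benchmark(context)
if benchmark:
    print(benchmark.result)
```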
#### Metrics

| Name | Unit | Description |
@@ -684,6 +697,28 @@ Test the performance of large scale matmul operation with multiple GPUs:
Test the performance of distributed model inference. Supports both a PyTorch implementation and a C++ implementation.
#### Parameters
| Parameter | Default | Description |
|--------------------------|-------------|-----------------------------------------------------------------------------|
| `--use_pytorch` | `false` | Use the PyTorch implementation. If omitted, the C++ implementation is used. |
| `--batch_size` | `64` | Batch size of the generated input tensor. |
| `--input_size` | `1024` | Input dimension of the synthetic model. |
| `--hidden_size` | `1024` | Hidden dimension of the synthetic model. |
| `--alpha` | `1.0` | Alpha coefficient for `D = alpha * (A * B) + beta * C`. |
| `--beta` | `1.0` | Beta coefficient for `D = alpha * (A * B) + beta * C`. |
| `--num_layers` | `1` | Number of repeated compute-communicate-activate layers. |
| `--computation_kernel` | `matmul` | Computation kernel: `addmm`, `matmul`, or `mul`. |
| `--communication_kernel` | `allreduce` | Communication kernel: `allgather`, `allreduce`, or `alltoall`. |
| `--activation_kernel` | `relu` | Activation kernel: `relu`, `sigmoid`, or `tanh`. |
| `--precision` | `float32` | Model precision, such as `float32` or `float16`. |
| `--num_warmup` | `50` | Number of warmup steps excluded from metrics. |
| `--num_steps` | `10000` | Number of measured inference steps. |
| `--distributed_impl` | `ddp` | Distributed implementation for the PyTorch path. |
| `--distributed_backend` | `nccl` | Distributed backend for the PyTorch path. |
| `--use_cuda_graph` | `false` | Launch kernels in CUDA graph mode when supported. |
| `--tune_gemm` | `false` | Tune GEMM performance before measurement in the C++ implementation. |
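
To make the layer structure concrete, here is a schematic of one compute-communicate-activate step with the default kernels (`layer_step` is an illustrative name, not the benchmark's actual code; a `torch.distributed` process group is assumed to be initialized for the communication step):

```python
import torch
import torch.distributed as dist


def layer_step(a, b, c, alpha=1.0, beta=1.0):
    """One layer: D = alpha * (A * B) + beta * C, then allreduce, then ReLU."""
    d = alpha * (a @ b) + beta * c    # computation kernel (addmm-style)
    if dist.is_initialized():
        dist.all_reduce(d)            # communication kernel (allreduce)
    return torch.relu(d)              # activation kernel (relu)
```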
#### Metrics

| Name | Unit | Description |
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

-"""TensorRT inference micro-benchmark."""
+"""ONNX Runtime inference micro-benchmark."""

import time
import statistics
from pathlib import Path
from packaging import version

import torch
import torchvision.models
import numpy as np

from superbench.common.utils import logger
-from superbench.benchmarks import BenchmarkRegistry, Platform, Precision
+from superbench.benchmarks import BenchmarkRegistry, Platform, Precision, ReturnCode
from superbench.benchmarks.micro_benchmarks import MicroBenchmark


class ORTInferenceBenchmark(MicroBenchmark):
    """ONNXRuntime inference micro-benchmark class."""
_execution_provider_map = {
'cuda': 'CUDAExecutionProvider',
'rocm': 'ROCMExecutionProvider',
'migraphx': 'MIGraphXExecutionProvider',
'cpu': 'CPUExecutionProvider',
}
_execution_provider_preference = [
'CUDAExecutionProvider',
'ROCMExecutionProvider',
'MIGraphXExecutionProvider',
'CPUExecutionProvider',
]
    def __init__(self, name, parameters=''):
        """Constructor.
@@ -40,6 +54,7 @@ def __init__(self, name, parameters=''):
        ]
        self.__graph_opt_level = None
        self.__model_cache_path = Path(torch.hub.get_dir()) / 'checkpoints'
self.__execution_provider = None
    def add_parser_arguments(self):
        """Add the specified arguments."""
@@ -50,7 +65,7 @@ def add_parser_arguments(self):
            type=str,
            nargs='+',
            default=self._pytorch_models,
-           help='ONNX models for TensorRT inference benchmark, e.g., {}.'.format(', '.join(self._pytorch_models)),
+           help='ONNX models for ONNX Runtime inference benchmark, e.g., {}.'.format(', '.join(self._pytorch_models)),
        )
        self._parser.add_argument(
@@ -96,6 +111,42 @@ def add_parser_arguments(self):
            help='The number of test step for benchmarking.',
        )
self._parser.add_argument(
'--execution_provider',
type=str,
choices=['auto'] + list(self._execution_provider_map.keys()) + list(self._execution_provider_map.values()),
default='auto',
required=False,
help='ONNX Runtime execution provider. Use auto, cuda, rocm, migraphx, cpu, or the full provider name.',
)
self._parser.add_argument(
'--pretrained',
action='store_true',
default=False,
required=False,
help='Whether to use pretrained torchvision model weights when exporting ONNX models.',
)
def __select_execution_provider(self, available_providers):
"""Select ONNX Runtime execution provider.
Args:
available_providers (List[str]): available ONNX Runtime execution providers.
Return:
str: selected execution provider.
"""
provider = self._args.execution_provider
if provider != 'auto':
return self._execution_provider_map.get(provider, provider)
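        # 'auto': fall back to the first available provider in preference order.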
for preferred_provider in self._execution_provider_preference:
if preferred_provider in available_providers:
return preferred_provider
return ''
    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.
@@ -113,13 +164,28 @@ def _preprocess(self):
            3: ort.GraphOptimizationLevel.ORT_ENABLE_ALL,
        }
available_providers = ort.get_available_providers()
self.__execution_provider = self.__select_execution_provider(available_providers)
if self.__execution_provider not in available_providers:
self._result.set_return_code(ReturnCode.MICROBENCHMARK_UNSUPPORTED_ARCHITECTURE)
logger.error(
'Unsupported ONNX Runtime execution provider - benchmark: {}, provider: {}, available providers: {}.'.
format(self._name, self._args.execution_provider, available_providers)
)
return False
logger.info('Using ONNX Runtime execution provider: %s.', self.__execution_provider)
        for model in self._args.pytorch_models:
            if hasattr(torchvision.models, model):
                data_type = Precision.FLOAT16.value if self._args.precision == Precision.FLOAT16 \
                    else Precision.FLOAT32.value
                model_path = f'{self.__model_cache_path / (model + "." + data_type + ".onnx")}'
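                # torchvision 0.13 replaced the boolean 'pretrained' flag with the 'weights' argument.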
if version.parse(torchvision.__version__) < version.parse('0.13'):
model_args = {'pretrained': self._args.pretrained}
else:
model_args = {'pretrained': True} if self._args.pretrained else {'weights': None}
                torch.onnx.export(
-                   getattr(torchvision.models, model)(pretrained=True).to(dtype=getattr(torch, data_type)).cuda(),
+                   getattr(torchvision.models, model)(**model_args).to(dtype=getattr(torch, data_type)).cuda(),
                    torch.randn(self._args.batch_size, 3, 224, 224, device='cuda', dtype=getattr(torch, data_type)),
                    model_path,
                    input_names=['input'],
@@ -146,7 +212,7 @@ def _benchmark(self):
            sess_options.graph_optimization_level = self.__graph_opt_level[self._args.graph_opt_level]
            file_name = '{model}.{precision}.onnx'.format(model=model, precision=self._args.precision)
            ort_sess = ort.InferenceSession(
-               f'{self.__model_cache_path / file_name}', sess_options, providers=['CUDAExecutionProvider']
+               f'{self.__model_cache_path / file_name}', sess_options, providers=[self.__execution_provider]
            )

            elapse_times = self.__inference(ort_sess)
@@ -197,3 +263,13 @@ def __inference(self, ort_sess):
    ORTInferenceBenchmark,
    platform=Platform.CUDA,
)
BenchmarkRegistry.register_benchmark(
'ort-inference',
ORTInferenceBenchmark,
platform=Platform.ROCM,
)
BenchmarkRegistry.register_benchmark(
'ort-inference',
ORTInferenceBenchmark,
platform=Platform.DTK,
)
@@ -523,3 +523,15 @@ superbench:
        hidden_size: 4096
        input_size: 4096
        batch_size: 1024
ort-inference:
<<: *default_local_mode
parameters:
execution_provider: rocm
pytorch_models:
- resnet50
- resnet152
- resnext50_32x4d
- wide_resnet50_2
- mobilenet_v2
precision: float16
batch_size: 1
@@ -412,3 +412,15 @@ superbench:
        hidden_size: 4096
        input_size: 4096
        batch_size: 1024
ort-inference:
<<: *default_local_mode
parameters:
execution_provider: rocm
pytorch_models:
- resnet50
- resnet152
- resnext50_32x4d
- wide_resnet50_2
- mobilenet_v2
precision: float16
batch_size: 1
@@ -15,6 +15,14 @@
from superbench.benchmarks.micro_benchmarks.ort_inference_performance import ORTInferenceBenchmark
def test_ort_inference_registered_platforms():
"""Test ort-inference benchmark registration on supported platforms."""
for platform in [Platform.CUDA, Platform.ROCM, Platform.DTK]:
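        # __select_benchmark is private; call it through its name-mangled form to query the registry directly.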
(benchmark_class,
predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark('ort-inference', platform)
assert (benchmark_class == ORTInferenceBenchmark)
@decorator.cuda_test
@decorator.pytorch_test
@mock.patch('torch.hub.get_dir')
@@ -49,6 +57,7 @@ def test_ort_inference_performance(mock_ort_session_run, mock_get_dir):
    assert (benchmark._args.pytorch_models == ['resnet50'])
    assert (benchmark._args.graph_opt_level == 1)
    assert (benchmark._args.precision == Precision.FLOAT16)
assert (benchmark._args.pretrained is False)
    assert (benchmark._args.batch_size == 16)
    assert (benchmark._args.num_warmup == 128)
    assert (benchmark._args.num_steps == 512)
@@ -66,3 +75,24 @@ def test_ort_inference_performance(mock_ort_session_run, mock_get_dir):
        metric = '{}_{}_time'.format(precision, model)
        assert (metric in benchmark.result)
        assert (metric in benchmark.raw_data)
@decorator.cuda_test
@decorator.pytorch_test
@mock.patch('torch.hub.get_dir')
@mock.patch('onnxruntime.get_available_providers')
@mock.patch('onnxruntime.InferenceSession')
def test_ort_inference_execution_provider_rocm(mock_ort_session, mock_get_available_providers, mock_get_dir):
"""Test ort-inference execution provider mapping."""
mock_get_dir.return_value = '/tmp/superbench/'
mock_get_available_providers.return_value = ['ROCMExecutionProvider', 'CPUExecutionProvider']
benchmark = ORTInferenceBenchmark(
'ort-inference',
parameters='--pytorch_models resnet50 --precision float16 --execution_provider rocm'
' --batch_size 16 --num_warmup 1 --num_steps 1'
)
assert (benchmark._preprocess())
assert (benchmark._benchmark())
shutil.rmtree(benchmark._ORTInferenceBenchmark__model_cache_path)
assert (mock_ort_session.call_args.kwargs['providers'] == ['ROCMExecutionProvider'])