Unverified commit 7458f83a, authored by Yifan Xiong and committed by GitHub
Browse files

Runner & Executor - Support AMD GPU (#119)

Support both NVIDIA and AMD GPUs, and check the GPU vendor during deployment and execution.

* Add GPU environment check in sb deploy.
* Check GPU vendor in executor.
parent 43620c3f
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""SuperBench devices module."""
from superbench.common.devices.gpu import GPU
# Names exported by the devices package; GPU is imported from superbench.common.devices.gpu above.
__all__ = [
'GPU',
]
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""GPU device module."""
from pathlib import Path
from superbench.common.utils import logger
class GPU():
    """Helper that identifies the GPU vendor from device nodes under /dev."""

    def __init__(self):
        """Initialize the helper by detecting and caching the GPU vendor."""
        self._vendor = self.get_vendor()
        # TODO: check CUDA or ROCm availability accordingly

    def get_vendor(self):
        """Probe well-known device nodes to determine the GPU vendor.

        NVIDIA is checked first, then AMD. When the vendor's control nodes
        exist but no per-GPU node is found, a warning is logged and the
        vendor is still reported.

        Returns:
            str: 'nvidia' or 'amd' when the vendor's control nodes are present,
                None when neither vendor is detected.
        """
        dev_root = Path('/dev')

        # NVIDIA: both control nodes must be character devices.
        nvidia_ctl_nodes = (dev_root / 'nvidiactl', dev_root / 'nvidia-uvm')
        if all(node.is_char_device() for node in nvidia_ctl_nodes):
            has_nvidia_gpu = any(dev_root.glob('nvidia[0-9]*'))
            if not has_nvidia_gpu:
                logger.warning('Cannot find NVIDIA GPU device.')
            return 'nvidia'

        # AMD: /dev/kfd must be a character device and /dev/dri a directory.
        dri_dir = dev_root / 'dri'
        if (dev_root / 'kfd').is_char_device() and dri_dir.is_dir():
            has_amd_gpu = any(dri_dir.glob('card*'))
            if not has_amd_gpu:
                logger.warning('Cannot find AMD GPU device.')
            return 'amd'

        return None

    @property
    def vendor(self):
        """Return the GPU vendor cached at construction time."""
        return self._vendor
......@@ -11,6 +11,7 @@
from superbench.benchmarks import Platform, Framework, BenchmarkRegistry
from superbench.common.utils import SuperBenchLogger, logger, rotate_dir
from superbench.common.devices import GPU
class SuperBenchExecutor():
......@@ -67,8 +68,15 @@ def __get_enabled_benchmarks(self):
def __get_platform(self):
    """Detect the running platform from the GPU vendor found on this node.

    Returns:
        Platform: Platform.CUDA when an NVIDIA GPU is detected,
            Platform.ROCM when an AMD GPU is detected, and Platform.CPU
            when no known vendor is found or detection raises.
    """
    # The stale unconditional `return Platform.CUDA` left over from the
    # pre-change implementation is removed so the detection below is reachable.
    try:
        gpu = GPU()
        if gpu.vendor == 'nvidia':
            return Platform.CUDA
        elif gpu.vendor == 'amd':
            return Platform.ROCM
    except Exception as e:
        # Best effort: a failed device probe is logged and falls back to CPU
        # instead of aborting the executor.
        logger.error(e)
    return Platform.CPU
def __get_arguments(self, parameters):
"""Get command line arguments for argparse.
......
......@@ -19,6 +19,38 @@
comment: superbench
force: no
# Play: probe every host for NVIDIA and AMD GPU device nodes and record the
# outcome as the facts `nvidia_gpu_exist` and `amd_gpu_exist`.
- name: Check GPU Environment
hosts: all
gather_facts: false
tasks:
# Stat the two NVIDIA control nodes; per-item results are registered in `nvidia_dev`.
- name: Checking NVIDIA GPU Environment
stat:
path: '{{ item }}'
with_items:
- /dev/nvidiactl
- /dev/nvidia-uvm
register: nvidia_dev
# Stat the AMD compute node (/dev/kfd) and the DRI directory (/dev/dri);
# per-item results are registered in `amd_dev`.
- name: Checking AMD GPU Environment
stat:
path: '{{ item }}'
with_items:
- /dev/kfd
- /dev/dri
register: amd_dev
# Combine the stat results into one flag per vendor. results[0] and results[1]
# are read as the first and second `with_items` entries above.
# NOTE(review): each attribute is guarded with `is defined` -- presumably because
# stat omits `ischr`/`isdir` when the path does not exist; confirm against the
# stat module's return values.
- name: Set GPU Facts
set_fact:
nvidia_gpu_exist: >-
{{ nvidia_dev.results[0].stat.ischr is defined and nvidia_dev.results[0].stat.ischr and
nvidia_dev.results[1].stat.ischr is defined and nvidia_dev.results[1].stat.ischr }}
amd_gpu_exist: >-
{{ amd_dev.results[0].stat.ischr is defined and amd_dev.results[0].stat.ischr and
amd_dev.results[1].stat.isdir is defined and amd_dev.results[1].stat.isdir }}
# Report the detection outcome for both vendors.
- name: Print GPU Checking Result
debug:
msg:
- "NVIDIA GPU {{ 'detected' if nvidia_gpu_exist else 'not detected' }}"
- "AMD GPU {{ 'detected' if amd_gpu_exist else 'not detected' }}"
- name: Remote Deployment
hosts: all
gather_facts: false
......@@ -65,8 +97,8 @@
docker rm --force {{ container }} ||: && \
docker run -itd --name={{ container }} \
--privileged --net=host --ipc=host \
{{ '--gpus=all' if gpu_vendor == 'nvidia' else '' }} \
{{ '--security-opt seccomp=unconfined --group-add video' if gpu_vendor == 'amd' else '' }} \
{{ '--gpus=all' if nvidia_gpu_exist else '' }} \
{{ '--security-opt seccomp=unconfined --group-add video' if amd_gpu_exist else '' }} \
-w /root -v {{ workspace }}:/root -v /mnt:/mnt \
{{ docker_image }} bash && \
docker exec {{ container }} bash -c \
......
......@@ -131,7 +131,6 @@ def deploy(self): # pragma: no cover
'ssh_port': random.randint(1 << 14, (1 << 15) - 1),
'output_dir': str(self._output_path),
'docker_image': self._docker_config.image,
'gpu_vendor': 'nvidia',
}
if bool(self._docker_config.username) and bool(self._docker_config.password):
extravars.update(
......
......@@ -15,4 +15,3 @@
ssh_port: 12345
output_dir: /tmp/test_ansible
docker_image: superbench/superbench
gpu_vendor: none
......@@ -57,8 +57,10 @@ def test_get_enabled_benchmarks_enable_list(self):
expected_enabled_benchmarks = ['benchmark_alpha', 'benchmark_beta']
self.assertListEqual(self.executor._SuperBenchExecutor__get_enabled_benchmarks(), expected_enabled_benchmarks)
@mock.patch('pathlib.Path.is_char_device')
def test_get_platform(self, mock_is_char_device):
    """Test get platform.

    Args:
        mock_is_char_device: Mock replacing pathlib.Path.is_char_device.
    """
    # Report every probed path as a character device so the NVIDIA check,
    # which runs first in GPU.get_vendor, succeeds and the platform is CUDA.
    mock_is_char_device.return_value = True
    self.assertEqual(self.executor._SuperBenchExecutor__get_platform().value, 'CUDA')
def test_get_arguments(self):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment