Unverified Commit c65ae567 authored by Yuting Jiang's avatar Yuting Jiang Committed by GitHub
Browse files

Benchmarks: micro benchmarks - add --set_ib_devices option to auto-select IB...

Benchmarks: micro benchmarks - add --set_ib_devices option to auto-select IB device by MPI local rank in ib validation (#733)

**Description**
Add a --set_ib_devices option to auto-select the IB device by MPI local rank.


**Major Revision**
- Add a new CLI flag --set_ib_devices to automatically select irregular
IB devices based on the MPI local rank.
- When enabled, the benchmark queries available IB devices via
network.get_ib_devices() and selects the device corresponding to
OMPI_COMM_WORLD_LOCAL_RANK.
- Fall back to existing --ib_dev behavior when the flag is not provided.

**Minor Revision**
- Add an environment variable, IB_DEVICES, checked by
network.get_ib_devices(), that lets the user set the device names
manually as a comma-separated list.
parent 25db1115
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
from superbench.benchmarks import BenchmarkRegistry, ReturnCode from superbench.benchmarks import BenchmarkRegistry, ReturnCode
from superbench.common.devices import GPU from superbench.common.devices import GPU
from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
from superbench.common.utils import network
class IBBenchmark(MicroBenchmarkWithInvoke): class IBBenchmark(MicroBenchmarkWithInvoke):
...@@ -43,6 +44,13 @@ def add_parser_arguments(self): ...@@ -43,6 +44,13 @@ def add_parser_arguments(self):
required=False, required=False,
help='The IB device, e.g., mlx5_0, mlx5_$LOCAL_RANK, mlx5_$((LOCAL_RANK/2)), etc.', help='The IB device, e.g., mlx5_0, mlx5_$LOCAL_RANK, mlx5_$((LOCAL_RANK/2)), etc.',
) )
self._parser.add_argument(
'--set_ib_devices',
action='store_true',
default=False,
help='Set irregular IB devices automatically according to the local rank. \
If IB devices are not able to be probed, use env IB_DEVICES to set them manually.',
)
self._parser.add_argument( self._parser.add_argument(
'--gpu_dev', '--gpu_dev',
type=str, type=str,
...@@ -282,6 +290,16 @@ def __prepare_general_ib_command_params(self, msg_size, device='cpu'): ...@@ -282,6 +290,16 @@ def __prepare_general_ib_command_params(self, msg_size, device='cpu'):
return False return False
# Generate ib command params # Generate ib command params
command_params = f'-F -n {self._args.iters} -d {self._args.ib_dev} {msg_size} {gpu_dev}' command_params = f'-F -n {self._args.iters} -d {self._args.ib_dev} {msg_size} {gpu_dev}'
if self._args.set_ib_devices:
ib_devices = network.get_ib_devices()
local_rank = int(os.getenv('OMPI_COMM_WORLD_LOCAL_RANK', 0))
if local_rank >= len(ib_devices):
self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
logger.error(
f'Local rank {local_rank} exceeds IB devices ({len(ib_devices)}) - benchmark: {self._name}'
)
return False
command_params = f'-F -n {self._args.iters} -d {ib_devices[local_rank].split(":")[0]} {msg_size} {gpu_dev}'
command_params = f'{command_params.strip()} --report_gbits' command_params = f'{command_params.strip()} --report_gbits'
return command_params return command_params
......
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
import socket import socket
import re import re
import os
from pathlib import Path from pathlib import Path
...@@ -31,6 +32,8 @@ def get_ib_devices(): ...@@ -31,6 +32,8 @@ def get_ib_devices():
Return: Return:
ib_devices_port (list): IB devices with available ports in current system. ib_devices_port (list): IB devices with available ports in current system.
""" """
if os.getenv('IB_DEVICES', None):
return os.getenv('IB_DEVICES').split(',')
devices = list(p.name for p in Path('/sys/class/infiniband').glob('*')) devices = list(p.name for p in Path('/sys/class/infiniband').glob('*'))
ib_devices_port_dict = {} ib_devices_port_dict = {}
for device in devices: for device in devices:
......
...@@ -177,6 +177,20 @@ def test_ib_traffic_performance(self, mock_gpu): ...@@ -177,6 +177,20 @@ def test_ib_traffic_performance(self, mock_gpu):
ret = benchmark._preprocess() ret = benchmark._preprocess()
assert (ret is True) assert (ret is True)
os.environ['IB_DEVICES'] = 'mlx5_ibx0,mlx5_ibx1,mlx5_ibx2'
parameters = '--set_ib_devices --iters 2000 --pattern one-to-one --hostfile hostfile'
benchmark = benchmark_class(benchmark_name, parameters=parameters)
ret = benchmark._preprocess()
assert (ret is True)
expect_command = "ib_validation --send_cmd_prefix '" + benchmark._args.bin_dir + \
"/ib_write_bw -F -n 2000 -d mlx5_ibx0 -s 8388608 --report_gbits'" + \
f" --recv_cmd_prefix '{benchmark._args.bin_dir}/ib_write_bw -F -n 2000" + \
" -d mlx5_ibx0 -s 8388608 --report_gbits' " + \
f'--timeout 120 --hostfile hostfile --input_config {os.getcwd()}/config.txt'
command = benchmark._bin_name + benchmark._commands[0].split(benchmark._bin_name)[1]
assert (command == expect_command)
os.environ.pop('IB_DEVICES')
# Generate config # Generate config
parameters = '--ib_dev "$(echo mlx5_0)" --iters 2000 --msg_size 33554432 --hostfile hostfile' parameters = '--ib_dev "$(echo mlx5_0)" --iters 2000 --msg_size 33554432 --hostfile hostfile'
benchmark = benchmark_class(benchmark_name, parameters=parameters) benchmark = benchmark_class(benchmark_name, parameters=parameters)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment