Unverified commit 2c88db90, authored by Yifan Xiong, committed by GitHub
Browse files

Release - SuperBench v0.10.0 (#607)



**Description**

Cherry-pick bug fixes from v0.10.0 to main.

**Major Revisions**

* Benchmarks: Microbenchmark - Support different hipblasLt data types in dist_inference #590
* Benchmarks: Microbenchmark - Support in-place for NCCL/RCCL benchmark #591
* Bug Fix - Fix NUMA Domains Swap Issue in NDv4 Topology File #592
* Benchmarks: Microbenchmark - Add data type option for NCCL and RCCL tests #595
* Benchmarks: Bug Fix - Make metrics of dist-inference-cpp aligned with PyTorch version #596
* CI/CD - Add ndv5 topo file #597
* Benchmarks: Microbenchmark - Improve AMD GPU P2P performance with fine-grained GPU memory #593
* Benchmarks: Build Pipeline - fix nccl and nccl test version to 2.18.3 to resolve hang issue in cuda12.2 docker #599
* Dockerfile - Bug fix for rocm docker build and deploy #598
* Benchmarks: Microbenchmark - Adapt to hipblasLt data type changes #603
* Benchmarks: Micro benchmarks - Update hipblaslt metric unit to tflops #604
* Monitor - Upgrade pyrsmi to amdsmi python library. #601
* Benchmarks: Micro benchmarks - add fp8 and initialization for hipblaslt benchmark #605
* Dockerfile - Add rocm6.0 dockerfile #602
* Bug Fix - Bug fix for latest megatron-lm benchmark #600
* Docs - Upgrade version and release note #606
Co-authored-by: Ziyue Yang <ziyyang@microsoft.com>
Co-authored-by: Yang Wang <yangwang1@microsoft.com>
Co-authored-by: Yuting Jiang <yutingjiang@microsoft.com>
Co-authored-by: guoshzhao <guzhao@microsoft.com>
parent 2c2096ed
...@@ -31,6 +31,14 @@ else() ...@@ -31,6 +31,14 @@ else()
# link hip device lib # link hip device lib
add_executable(dist_inference dist_inference.cpp) add_executable(dist_inference dist_inference.cpp)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -DROCM_USE_FLOAT16=1") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -DROCM_USE_FLOAT16=1")
if(DEFINED ENV{USE_HIPBLASLT_DATATYPE})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_HIPBLASLT_DATATYPE=1")
elseif(DEFINED ENV{USE_HIP_DATATYPE})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_HIP_DATATYPE=1")
endif()
if(DEFINED ENV{USE_HIPBLAS_COMPUTETYPE})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_HIPBLAS_COMPUTETYPE=1")
endif()
target_link_libraries(dist_inference MPI::MPI_CXX rccl hipblaslt hip::device) target_link_libraries(dist_inference MPI::MPI_CXX rccl hipblaslt hip::device)
else() else()
message(FATAL_ERROR "No CUDA or ROCm environment found.") message(FATAL_ERROR "No CUDA or ROCm environment found.")
......
...@@ -45,6 +45,21 @@ ...@@ -45,6 +45,21 @@
#include <hipblaslt/hipblaslt.h> #include <hipblaslt/hipblaslt.h>
#include <rccl/rccl.h> #include <rccl/rccl.h>
using cublasLtHalf = hipblasLtHalf; using cublasLtHalf = hipblasLtHalf;
#if defined(USE_HIPBLASLT_DATATYPE)
#define DIST_INF_HIP_DATATYPE_R_16F HIPBLASLT_R_16F
#define DIST_INF_HIP_DATATYPE_R_32F HIPBLASLT_R_32F
#elif defined(USE_HIP_DATATYPE)
#define DIST_INF_HIP_DATATYPE_R_16F HIP_R_16F
#define DIST_INF_HIP_DATATYPE_R_32F HIP_R_32F
#else
#define DIST_INF_HIP_DATATYPE_R_16F HIPBLAS_R_16F
#define DIST_INF_HIP_DATATYPE_R_32F HIPBLAS_R_32F
#endif
#if defined(USE_HIPBLAS_COMPUTETYPE)
#define DIST_INF_HIP_COMPUTETYPE_F32 HIPBLAS_COMPUTE_32F
#else
#define DIST_INF_HIP_COMPUTETYPE_F32 HIPBLASLT_COMPUTE_F32
#endif
#else #else
#include <cublasLt.h> #include <cublasLt.h>
#include <cuda_fp16.h> #include <cuda_fp16.h>
...@@ -229,16 +244,18 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t ...@@ -229,16 +244,18 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t
CHECK_CUBLASLT_ERROR(hipblasLtCreate(&handle)); CHECK_CUBLASLT_ERROR(hipblasLtCreate(&handle));
CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matA, HIPBLAS_R_16F, k, n, k)); CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matA, DIST_INF_HIP_DATATYPE_R_16F, k, n, k));
CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matB, HIPBLAS_R_16F, m, k, m)); CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matB, DIST_INF_HIP_DATATYPE_R_16F, m, k, m));
CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matC, HIPBLAS_R_16F, m, n, m)); CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matC, DIST_INF_HIP_DATATYPE_R_16F, m, n, m));
CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matD, HIPBLAS_R_16F, m, n, m)); CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matD, DIST_INF_HIP_DATATYPE_R_16F, m, n, m));
CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matE, HIPBLAS_R_16F, k, m, k)); CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matE, DIST_INF_HIP_DATATYPE_R_16F, k, m, k));
CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matF, HIPBLAS_R_16F, k, n, k)); CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matF, DIST_INF_HIP_DATATYPE_R_16F, k, n, k));
CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matG, HIPBLAS_R_16F, k, n, k)); CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matG, DIST_INF_HIP_DATATYPE_R_16F, k, n, k));
CHECK_CUBLASLT_ERROR(hipblasLtMatmulDescCreate(&matmul1, HIPBLASLT_COMPUTE_F32, HIPBLAS_R_32F)); CHECK_CUBLASLT_ERROR(
CHECK_CUBLASLT_ERROR(hipblasLtMatmulDescCreate(&matmul2, HIPBLASLT_COMPUTE_F32, HIPBLAS_R_32F)); hipblasLtMatmulDescCreate(&matmul1, DIST_INF_HIP_COMPUTETYPE_F32, DIST_INF_HIP_DATATYPE_R_32F));
CHECK_CUBLASLT_ERROR(
hipblasLtMatmulDescCreate(&matmul2, DIST_INF_HIP_COMPUTETYPE_F32, DIST_INF_HIP_DATATYPE_R_32F));
hipblasOperation_t trans = HIPBLAS_OP_N; hipblasOperation_t trans = HIPBLAS_OP_N;
CHECK_CUBLASLT_ERROR( CHECK_CUBLASLT_ERROR(
...@@ -336,8 +353,9 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t ...@@ -336,8 +353,9 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t
#endif #endif
std::chrono::steady_clock::time_point start_time, stop_time; std::chrono::steady_clock::time_point start_time, stop_time;
std::vector<double> step_times(num_iters, 0.);
for (int i = 0; i < num_warmups + num_iters; ++i) { for (int i = 0; i < num_warmups + num_iters; ++i) {
if (i == num_warmups) { if (i >= num_warmups) {
start_time = std::chrono::steady_clock::now(); start_time = std::chrono::steady_clock::now();
} }
#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && (CUDART_VERSION >= 11030 || HIP_VERSION >= 50221310) #if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && (CUDART_VERSION >= 11030 || HIP_VERSION >= 50221310)
...@@ -350,11 +368,15 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t ...@@ -350,11 +368,15 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t
model_forward(); model_forward();
#endif #endif
CHECK_CUDA_ERROR(cudaStreamSynchronize(stream)); CHECK_CUDA_ERROR(cudaStreamSynchronize(stream));
if (i >= num_warmups) {
stop_time = std::chrono::steady_clock::now();
double step_time = std::chrono::duration_cast<std::chrono::nanoseconds>(stop_time - start_time).count();
step_times[i - num_warmups] = step_time;
}
}
for (int i = 0; i < num_iters; i++) {
fprintf(stdout, "Latency of step %d: %g ms\n", i, step_times[i] / 1e6);
} }
stop_time = std::chrono::steady_clock::now();
double duration = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count();
fprintf(stdout, "Time: %g ms in total, %g ms per iteration, %g ms per layer\n", duration, duration / num_iters,
duration / num_iters / num_layers);
#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && (CUDART_VERSION >= 11030 || HIP_VERSION >= 50221310) #if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && (CUDART_VERSION >= 11030 || HIP_VERSION >= 50221310)
// Destroy graph // Destroy graph
......
...@@ -27,6 +27,13 @@ else() ...@@ -27,6 +27,13 @@ else()
# link hip device lib # link hip device lib
add_executable(gpu_copy gpu_copy.cpp) add_executable(gpu_copy gpu_copy.cpp)
include(CheckSymbolExists)
check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY)
if(${HIP_UNCACHED_MEMORY})
target_compile_definitions(gpu_copy PRIVATE HIP_UNCACHED_MEMORY)
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
target_link_libraries(gpu_copy numa hip::device) target_link_libraries(gpu_copy numa hip::device)
else() else()
......
...@@ -313,6 +313,25 @@ int SetGpu(int gpu_id) { ...@@ -313,6 +313,25 @@ int SetGpu(int gpu_id) {
return 0; return 0;
} }
#if defined(__HIP_PLATFORM_AMD__)
// Tell whether the data buffer for a sub-benchmark should be allocated as fine-grained memory.
// Yields true only when both `is_src_dev_gpu` and `is_dst_dev_gpu` are set and the source and
// destination GPU ids differ; every other combination yields false.
bool UseFineGrained(const SubBenchArgs &args) {
    // Either endpoint not flagged as a GPU device: no fine-grained allocation.
    if (!args.is_src_dev_gpu || !args.is_dst_dev_gpu) {
        return false;
    }
    // Both endpoints are GPUs: fine-grained only when they are different GPUs.
    return args.src_gpu_id != args.dst_gpu_id;
}
// Allocate a GPU data buffer of `size` bytes and store its address in `*ptr`.
// With `use_fine_grained` false the buffer comes from a plain cudaMalloc. Otherwise it is
// requested through hipExtMallocWithFlags, passing hipDeviceMallocUncached when the build
// defines HIP_UNCACHED_MEMORY and hipDeviceMallocFinegrained when it does not.
// Returns the status code reported by whichever allocation call was made.
cudaError_t GpuMallocDataBuf(uint8_t **ptr, uint64_t size, bool use_fine_grained) {
    if (!use_fine_grained) {
        return cudaMalloc(ptr, size);
    }
#if defined(HIP_UNCACHED_MEMORY)
    const unsigned int alloc_flags = hipDeviceMallocUncached;
#else
    const unsigned int alloc_flags = hipDeviceMallocFinegrained;
#endif
    return hipExtMallocWithFlags(reinterpret_cast<void **>(ptr), size, alloc_flags);
}
#else
// Allocate a GPU data buffer of `size` bytes with cudaMalloc, storing its address in `*ptr`.
// Returns the status code reported by cudaMalloc.
cudaError_t GpuMallocDataBuf(uint8_t **ptr, uint64_t size) {
    const cudaError_t status = cudaMalloc(ptr, size);
    return status;
}
#endif
// Prepare data buffers and streams to be used. // Prepare data buffers and streams to be used.
int PrepareBufAndStream(BenchArgs *args) { int PrepareBufAndStream(BenchArgs *args) {
cudaError_t cuda_err = cudaSuccess; cudaError_t cuda_err = cudaSuccess;
...@@ -346,7 +365,11 @@ int PrepareBufAndStream(BenchArgs *args) { ...@@ -346,7 +365,11 @@ int PrepareBufAndStream(BenchArgs *args) {
return -1; return -1;
} }
*(host_buf_ptrs[j]) = nullptr; *(host_buf_ptrs[j]) = nullptr;
cuda_err = cudaMalloc(gpu_buf_ptrs[j], args->size); #if defined(__HIP_PLATFORM_AMD__)
cuda_err = GpuMallocDataBuf(gpu_buf_ptrs[j], args->size, UseFineGrained(sub));
#else
cuda_err = GpuMallocDataBuf(gpu_buf_ptrs[j], args->size);
#endif
if (cuda_err != cudaSuccess) { if (cuda_err != cudaSuccess) {
fprintf(stderr, "PrepareBufAndStream::cudaMalloc error: %d\n", cuda_err); fprintf(stderr, "PrepareBufAndStream::cudaMalloc error: %d\n", cuda_err);
return -1; return -1;
...@@ -876,7 +899,11 @@ int RunAllToAllBench(const Opts &opts, int gpu_count, int src_rank, int dst_rank ...@@ -876,7 +899,11 @@ int RunAllToAllBench(const Opts &opts, int gpu_count, int src_rank, int dst_rank
} }
// Prepare source buffers // Prepare source buffers
cuda_err = cudaMalloc(&(src_buffers_gpu[rank]), opts.size); #if defined(__HIP_PLATFORM_AMD__)
cuda_err = GpuMallocDataBuf(&(src_buffers_gpu[rank]), opts.size, true);
#else
cuda_err = GpuMallocDataBuf(&(src_buffers_gpu[rank]), opts.size);
#endif
if (cuda_err != cudaSuccess) { if (cuda_err != cudaSuccess) {
fprintf(stderr, "RunAllToAllBench::cudaMalloc for src_buffers_gpu[%d] error: %d\n", cuda_err, rank); fprintf(stderr, "RunAllToAllBench::cudaMalloc for src_buffers_gpu[%d] error: %d\n", cuda_err, rank);
return -1; return -1;
...@@ -893,7 +920,11 @@ int RunAllToAllBench(const Opts &opts, int gpu_count, int src_rank, int dst_rank ...@@ -893,7 +920,11 @@ int RunAllToAllBench(const Opts &opts, int gpu_count, int src_rank, int dst_rank
} }
// Prepare destination buffers // Prepare destination buffers
cuda_err = cudaMalloc(&(dst_buffers_gpu[rank]), opts.size); #if defined(__HIP_PLATFORM_AMD__)
cuda_err = GpuMallocDataBuf(&(dst_buffers_gpu[rank]), opts.size, true);
#else
cuda_err = GpuMallocDataBuf(&(dst_buffers_gpu[rank]), opts.size);
#endif
if (cuda_err != cudaSuccess) { if (cuda_err != cudaSuccess) {
fprintf(stderr, "RunAllToAllBench::cudaMalloc for dst_buffers_gpu[%d] error: %d\n", cuda_err, rank); fprintf(stderr, "RunAllToAllBench::cudaMalloc for dst_buffers_gpu[%d] error: %d\n", cuda_err, rank);
return -1; return -1;
......
...@@ -4,7 +4,6 @@ ...@@ -4,7 +4,6 @@
"""Module of the hipBlasLt GEMM benchmark.""" """Module of the hipBlasLt GEMM benchmark."""
import os import os
import re
from superbench.common.utils import logger from superbench.common.utils import logger
from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
...@@ -23,11 +22,12 @@ class HipBlasLtBenchmark(BlasLtBaseBenchmark): ...@@ -23,11 +22,12 @@ class HipBlasLtBenchmark(BlasLtBaseBenchmark):
super().__init__(name, parameters) super().__init__(name, parameters)
self._bin_name = 'hipblaslt-bench' self._bin_name = 'hipblaslt-bench'
self._in_types = ['fp32', 'fp16', 'bf16'] self._in_types = ['fp32', 'fp16', 'bf16', 'fp8']
self._in_type_map = { self._in_type_map = {
'fp16': '--a_type f16_r --b_type f16_r --c_type f16_r --d_type f16_r --compute_type f32_r', 'fp16': '--a_type f16_r --b_type f16_r --c_type f16_r --d_type f16_r --compute_type f32_r',
'fp32': '--a_type f32_r --b_type f32_r --c_type f32_r --d_type f32_r --compute_type f32_r', 'fp32': '--a_type f32_r --b_type f32_r --c_type f32_r --d_type f32_r --compute_type f32_r',
'bf16': '--a_type bf16_r --b_type bf16_r --c_type bf16_r --d_type bf16_r --compute_type f32_r', 'bf16': '--a_type bf16_r --b_type bf16_r --c_type bf16_r --d_type bf16_r --compute_type f32_r',
'fp8': '--a_type f8_r --b_type f8_r --c_type f8_r --d_type f8_r --compute_type f32_r',
} }
def add_parser_arguments(self): def add_parser_arguments(self):
...@@ -42,6 +42,30 @@ class HipBlasLtBenchmark(BlasLtBaseBenchmark): ...@@ -42,6 +42,30 @@ class HipBlasLtBenchmark(BlasLtBaseBenchmark):
required=False, required=False,
help='List of input data types, support {}.'.format(' '.join(self._in_types)), help='List of input data types, support {}.'.format(' '.join(self._in_types)),
) )
self._parser.add_argument(
'--initialization',
type=str,
default='rand_int',
choices=['trig_float', 'rand_int', 'hpl'],
required=False,
help='Initialize matrix data.',
)
self._parser.add_argument(
'--transA',
type=str,
default='N',
choices=['N', 'T', 'C'],
required=False,
help='Transpose matrix A.',
)
self._parser.add_argument(
'--transB',
type=str,
default='N',
choices=['N', 'T', 'C'],
required=False,
help='Transpose matrix B.',
)
def _preprocess(self): def _preprocess(self):
"""Preprocess/preparation operations before the benchmarking. """Preprocess/preparation operations before the benchmarking.
...@@ -58,7 +82,9 @@ class HipBlasLtBenchmark(BlasLtBaseBenchmark): ...@@ -58,7 +82,9 @@ class HipBlasLtBenchmark(BlasLtBaseBenchmark):
self._precision_in_commands = [] self._precision_in_commands = []
for (_m, _n, _k, _b, _in_type) in self._shapes_to_run: for (_m, _n, _k, _b, _in_type) in self._shapes_to_run:
command = f'{self.__bin_path} -m {_m} -n {_n} -k {_k} -j {self._args.num_warmup}' + \ command = f'{self.__bin_path} -m {_m} -n {_n} -k {_k} -j {self._args.num_warmup}' + \
f' -i {self._args.num_steps} {self._in_type_map[_in_type]}' f' -i {self._args.num_steps} {self._in_type_map[_in_type]}' + \
f' --transA {self._args.transA} --transB {self._args.transB}' + \
f' --initialization {self._args.initialization}'
command = command + f' -b {str(_b)}' if _b > 0 else command command = command + f' -b {str(_b)}' if _b > 0 else command
logger.info(command) logger.info(command)
self._commands.append(command) self._commands.append(command)
...@@ -97,13 +123,12 @@ class HipBlasLtBenchmark(BlasLtBaseBenchmark): ...@@ -97,13 +123,12 @@ class HipBlasLtBenchmark(BlasLtBaseBenchmark):
fields = lines[index + 1].strip().split(',') fields = lines[index + 1].strip().split(',')
# Check the number of fields and the format of the first two fields # Check the number of fields and the format of the first two fields
if len(fields) != 23 or not all( if len(fields) != 23:
re.match(r'\d*\.\d*$', item.strip()) or item.strip().isdigit() for item in fields[-2:]
):
raise ValueError('Invalid result') raise ValueError('Invalid result')
self._result.add_result( self._result.add_result(
f'{self._precision_in_commands[cmd_idx]}_{fields[3]}_{"_".join(fields[4:7])}_flops', float(fields[-2]) f'{self._precision_in_commands[cmd_idx]}_{fields[3]}_{"_".join(fields[4:7])}_flops',
float(fields[-2]) / 1000
) )
except BaseException as e: except BaseException as e:
self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE) self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
......
...@@ -45,8 +45,7 @@ message(STATUS "CMAKE HIP ARCHITECTURES: ${CMAKE_HIP_ARCHITECTURES}") ...@@ -45,8 +45,7 @@ message(STATUS "CMAKE HIP ARCHITECTURES: ${CMAKE_HIP_ARCHITECTURES}")
if(EXISTS ${HIP_PATH}) if(EXISTS ${HIP_PATH})
# Search for hip in common locations # Search for hip in common locations
list(APPEND CMAKE_PREFIX_PATH ${HIP_PATH} ${ROCM_PATH}) list(APPEND CMAKE_PREFIX_PATH ${HIP_PATH} ${ROCM_PATH} ${ROCM_PATH}/hsa ${ROCM_PATH}/hip ${ROCM_PATH}/share/rocm/cmake/)
set(CMAKE_PREFIX_PATH /opt/rocm ROCM_PATH)
set(CMAKE_CXX_COMPILER "${HIP_PATH}/bin/hipcc") set(CMAKE_CXX_COMPILER "${HIP_PATH}/bin/hipcc")
set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
set(CMAKE_MODULE_PATH "${HIP_PATH}/lib/cmake/hip" ${CMAKE_MODULE_PATH}) set(CMAKE_MODULE_PATH "${HIP_PATH}/lib/cmake/hip" ${CMAKE_MODULE_PATH})
......
...@@ -116,6 +116,9 @@ class MegatronGPT(ModelBenchmark): ...@@ -116,6 +116,9 @@ class MegatronGPT(ModelBenchmark):
self._parser.add_argument('--data_home', type=str, default='/tmp', help='Data home.') self._parser.add_argument('--data_home', type=str, default='/tmp', help='Data home.')
self._parser.add_argument('--vocab_path', type=str, default='/tmp/gpt2-vocab.json', help='Vocab path.') self._parser.add_argument('--vocab_path', type=str, default='/tmp/gpt2-vocab.json', help='Vocab path.')
self._parser.add_argument('--merge_path', type=str, default='/tmp/gpt2-merges.txt', help='Merge path.') self._parser.add_argument('--merge_path', type=str, default='/tmp/gpt2-merges.txt', help='Merge path.')
self._parser.add_argument(
'--split', type=str, default='949,50,1', help='Split dataset ratio for train/val/test.'
)
self._parser.add_argument('--prescale_grad', action='store_true', help='Prescale grad.') self._parser.add_argument('--prescale_grad', action='store_true', help='Prescale grad.')
self._parser.add_argument( self._parser.add_argument(
'--hostfile', type=str, default=None, help='Hostfile to run the mutli-node benchmark.' '--hostfile', type=str, default=None, help='Hostfile to run the mutli-node benchmark.'
...@@ -128,6 +131,13 @@ class MegatronGPT(ModelBenchmark): ...@@ -128,6 +131,13 @@ class MegatronGPT(ModelBenchmark):
def _preprocess(self): def _preprocess(self):
if not super()._preprocess(): if not super()._preprocess():
return False return False
if not self._args.code_base:
if self._args.deepspeed:
self._args.code_base = os.path.join(
os.getenv('SB_MICRO_PATH'), 'third_party/Megatron/Megatron-DeepSpeed/'
)
else:
self._args.code_base = os.path.join(os.getenv('SB_MICRO_PATH'), 'third_party/Megatron/Megatron-LM')
if not os.path.exists(self._args.code_base) or \ if not os.path.exists(self._args.code_base) or \
not os.path.exists(os.path.join(self._args.code_base, 'pretrain_gpt.py')): not os.path.exists(os.path.join(self._args.code_base, 'pretrain_gpt.py')):
...@@ -156,35 +166,35 @@ class MegatronGPT(ModelBenchmark): ...@@ -156,35 +166,35 @@ class MegatronGPT(ModelBenchmark):
def _parse_log(self, output): def _parse_log(self, output):
"""Parse log output and get the performance.""" """Parse log output and get the performance."""
tflops_pattern = re.compile(r'TFLOPs: (\d+\.\d+)') tflops_pattern = re.compile(r'(TFLOPs|TFLOP/s/GPU\)): (\d+\.\d+)')
elapsed_time_pattern = re.compile(r'elapsed time per iteration \(ms\): (\d+\.\d+)') elapsed_time_pattern = re.compile(r'elapsed time per iteration \(ms\): (\d+\.\d+)')
mem_allocated_pattern = re.compile(r'MemAllocated=([\d.]+)[KMGTPEZY]?B') mem_allocated_pattern = re.compile(r'allocated: (\d+\.\d+)')
max_mem_allocated_pattern = re.compile(r'MaxMemAllocated=([\d.]+)[KMGTPEZY]?B') max_mem_allocated_pattern = re.compile(r'max allocated: (\d+\.\d+)')
lines = output.splitlines() lines = output.splitlines()
tflops = [] tflops = []
mem_allocated = [] mem_allocated = []
max_mem_allocated = [] max_mem_allocated = []
iteration_times = [] iteration_times = []
for line in lines: for line in lines:
if 'TFLOPs' in line: if 'elapsed time per iteration' in line:
tflops_matches = tflops_pattern.search(line) tflops_matches = tflops_pattern.search(line)
elapsed_time_match = elapsed_time_pattern.search(line) elapsed_time_match = elapsed_time_pattern.search(line)
if tflops_matches: if tflops_matches:
tflops_values = float(tflops_matches.group(1)) tflops_values = float(tflops_matches.group(2))
tflops.append(tflops_values) tflops.append(tflops_values)
if elapsed_time_match: if elapsed_time_match:
elapsed_time_value = float(elapsed_time_match.group(1)) elapsed_time_value = float(elapsed_time_match.group(1))
iteration_times.append(elapsed_time_value) iteration_times.append(elapsed_time_value)
if 'MaxMemAllocated' in line: if 'max allocated' in line:
mem_allocated_match = mem_allocated_pattern.search(line) mem_allocated_match = mem_allocated_pattern.search(line)
max_mem_allocated_match = max_mem_allocated_pattern.search(line) max_mem_allocated_match = max_mem_allocated_pattern.search(line)
if mem_allocated_match: if mem_allocated_match:
mem_allocated_value = float(mem_allocated_match.group(1)) mem_allocated_value = float(mem_allocated_match.group(1)) / 1024
mem_allocated.append(mem_allocated_value) mem_allocated.append(mem_allocated_value)
if max_mem_allocated_match: if max_mem_allocated_match:
max_mem_allocated_value = float(max_mem_allocated_match.group(1)) max_mem_allocated_value = float(max_mem_allocated_match.group(1)) / 1024
max_mem_allocated.append(max_mem_allocated_value) max_mem_allocated.append(max_mem_allocated_value)
return iteration_times, tflops, mem_allocated, max_mem_allocated return iteration_times, tflops, mem_allocated, max_mem_allocated
...@@ -224,7 +234,9 @@ class MegatronGPT(ModelBenchmark): ...@@ -224,7 +234,9 @@ class MegatronGPT(ModelBenchmark):
--deepspeed \ --deepspeed \
--deepspeed_config {self._config_json_path} \ --deepspeed_config {self._config_json_path} \
--zero-stage {self._args.zero_stage} \ --zero-stage {self._args.zero_stage} \
--pipeline-model-parallel-size {self._args.pipeline_model_parallel_size}' --pipeline-model-parallel-size {self._args.pipeline_model_parallel_size} \
--train-tokens {self._args.train_tokens} \
--data-impl {self._args.data_impl}'
if self._args.pipeline_model_parallel_size <= 1: if self._args.pipeline_model_parallel_size <= 1:
deepspeed_options = f'{deepspeed_options} --no-pipeline-parallel' deepspeed_options = f'{deepspeed_options} --no-pipeline-parallel'
...@@ -255,11 +267,10 @@ class MegatronGPT(ModelBenchmark): ...@@ -255,11 +267,10 @@ class MegatronGPT(ModelBenchmark):
--num-attention-heads {self._args.num_attn_heads} \ --num-attention-heads {self._args.num_attn_heads} \
--seq-length {self._args.seq_len} \ --seq-length {self._args.seq_len} \
--max-position-embeddings {self._args.seq_len} \ --max-position-embeddings {self._args.seq_len} \
--train-tokens {self._args.train_tokens} \
--train-samples {self._args.num_steps * self._args.batch_size} \ --train-samples {self._args.num_steps * self._args.batch_size} \
--lr {self._args.lr} \ --lr {self._args.lr} \
--min-lr {self._args.min_lr} \ --min-lr {self._args.min_lr} \
--split 949,50,1 \ --split {self._args.split} \
--log-interval {self._args.log_interval} \ --log-interval {self._args.log_interval} \
--eval-interval {self._args.eval_interval} \ --eval-interval {self._args.eval_interval} \
--eval-iters {self._args.eval_iters} \ --eval-iters {self._args.eval_iters} \
...@@ -273,7 +284,8 @@ class MegatronGPT(ModelBenchmark): ...@@ -273,7 +284,8 @@ class MegatronGPT(ModelBenchmark):
--optimizer adam \ --optimizer adam \
--use-distributed-optimizer \ --use-distributed-optimizer \
{precision_megatron} \ {precision_megatron} \
--seed {self._args.seed}' --seed {self._args.seed} \
--log-throughput'
if self._args.sequence_parallel: if self._args.sequence_parallel:
megatron_options = f'{megatron_options} --sequence-parallel' megatron_options = f'{megatron_options} --sequence-parallel'
...@@ -298,6 +310,8 @@ class MegatronGPT(ModelBenchmark): ...@@ -298,6 +310,8 @@ class MegatronGPT(ModelBenchmark):
script_path = os.path.join(self._args.code_base, 'pretrain_gpt.py') script_path = os.path.join(self._args.code_base, 'pretrain_gpt.py')
if self._args.deepspeed: if self._args.deepspeed:
deepspeed_option = self.__prepare_deespeed_config(precision_megatron.lstrip('--')) deepspeed_option = self.__prepare_deespeed_config(precision_megatron.lstrip('--'))
# No --log-throughput in Megatron-DeepSpeed by 20231219
megatron_options = megatron_options.replace('--log-throughput', '').strip()
if self._num_nodes > 1: if self._num_nodes > 1:
command = f'torchrun {self._distributed_args} ' + \ command = f'torchrun {self._distributed_args} ' + \
f'{script_path} {megatron_options} {self._data_options} {deepspeed_option}' f'{script_path} {megatron_options} {self._data_options} {deepspeed_option}'
...@@ -379,6 +393,7 @@ class MegatronGPT(ModelBenchmark): ...@@ -379,6 +393,7 @@ class MegatronGPT(ModelBenchmark):
return False return False
self._num_nodes = int(os.getenv('OMPI_COMM_WORLD_SIZE')) // int(os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE')) self._num_nodes = int(os.getenv('OMPI_COMM_WORLD_SIZE')) // int(os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE'))
master_addr = 'localhost'
if self._num_nodes > 1: if self._num_nodes > 1:
if not self._args.hostfile: if not self._args.hostfile:
sb_hostfile = os.path.join(os.environ.get('SB_WORKSPACE', '.'), 'hostfile') sb_hostfile = os.path.join(os.environ.get('SB_WORKSPACE', '.'), 'hostfile')
...@@ -395,12 +410,13 @@ class MegatronGPT(ModelBenchmark): ...@@ -395,12 +410,13 @@ class MegatronGPT(ModelBenchmark):
if self._num_nodes != len(hosts): if self._num_nodes != len(hosts):
logger.error('MPI init failed since hostfile not match the MPI setting.') logger.error('MPI init failed since hostfile not match the MPI setting.')
return False return False
master_addr = hosts[0].split()[0]
addr = os.getenv('MASTER_ADDR', hosts[0].split()[0]) addr = os.getenv('MASTER_ADDR', master_addr)
port = os.getenv('MASTER_PORT', '29500') port = os.getenv('MASTER_PORT', '29500')
node_rank = int(os.environ['OMPI_COMM_WORLD_RANK']) // int(os.environ['OMPI_COMM_WORLD_LOCAL_SIZE']) node_rank = int(os.environ['OMPI_COMM_WORLD_RANK']) // int(os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'])
self._distributed_args = f'--nproc_per_node {self._args.num_gpus} --nnodes {self._num_nodes} ' + \ self._distributed_args = f'--nproc_per_node {self._args.num_gpus} --nnodes {self._num_nodes} ' + \
f'--node_rank {node_rank} --master_addr {addr} --master_port {port}' f'--node_rank {node_rank} --master_addr {addr} --master_port {port}'
return True return True
def _generate_dataset(self): def _generate_dataset(self):
...@@ -448,8 +464,7 @@ class MegatronGPT(ModelBenchmark): ...@@ -448,8 +464,7 @@ class MegatronGPT(ModelBenchmark):
self._data_options = f'\ self._data_options = f'\
--vocab-file {self._vocab_path} \ --vocab-file {self._vocab_path} \
--merge-file {self._merges_path} \ --merge-file {self._merges_path} \
--data-path {self._data_path} \ --data-path {self._data_path}'
--data-impl {self._args.data_impl}'
logger.info('Dataset preparation successfully.') logger.info('Dataset preparation successfully.')
return True return True
......
...@@ -265,8 +265,8 @@ class ModelBenchmark(Benchmark): ...@@ -265,8 +265,8 @@ class ModelBenchmark(Benchmark):
# The unit of step time should be millisecond. # The unit of step time should be millisecond.
step_times = self._train_step(precision) step_times = self._train_step(precision)
if isinstance(step_times, tuple): if isinstance(step_times, tuple):
step_times = step_times[0]
info = step_times[1] info = step_times[1]
step_times = step_times[0]
self._process_info(ModelAction.TRAIN, precision, info) self._process_info(ModelAction.TRAIN, precision, info)
step_times = self.__process_model_result(ModelAction.TRAIN, precision, step_times) step_times = self.__process_model_result(ModelAction.TRAIN, precision, step_times)
if not step_times: if not step_times:
......
...@@ -13,7 +13,7 @@ gpu = GPU() ...@@ -13,7 +13,7 @@ gpu = GPU()
if gpu.vendor == 'nvidia' or gpu.vendor == 'nvidia-graphics': if gpu.vendor == 'nvidia' or gpu.vendor == 'nvidia-graphics':
import py3nvml.py3nvml as nvml import py3nvml.py3nvml as nvml
elif gpu.vendor == 'amd' or gpu.vendor == 'amd-graphics': elif gpu.vendor == 'amd' or gpu.vendor == 'amd-graphics':
from pyrsmi import rocml import amdsmi as rocml
class DeviceManager: class DeviceManager:
...@@ -150,7 +150,7 @@ class NvidiaDeviceManager(DeviceManager): ...@@ -150,7 +150,7 @@ class NvidiaDeviceManager(DeviceManager):
try: try:
cap = nvml.nvmlDeviceGetCudaComputeCapability(self._device_handlers[0]) cap = nvml.nvmlDeviceGetCudaComputeCapability(self._device_handlers[0])
except Exception as err: except Exception as err:
logger.error('Get device compute capability failed: {}'.format(str(err))) logger.warning('Get device compute capability failed: {}'.format(str(err)))
return None return None
return cap return cap
...@@ -166,7 +166,7 @@ class NvidiaDeviceManager(DeviceManager): ...@@ -166,7 +166,7 @@ class NvidiaDeviceManager(DeviceManager):
try: try:
util = nvml.nvmlDeviceGetUtilizationRates(self._device_handlers[idx]) util = nvml.nvmlDeviceGetUtilizationRates(self._device_handlers[idx])
except Exception as err: except Exception as err:
logger.error('Get device utilization failed: {}'.format(str(err))) logger.warning('Get device utilization failed: {}'.format(str(err)))
return None return None
return util.gpu return util.gpu
...@@ -182,7 +182,7 @@ class NvidiaDeviceManager(DeviceManager): ...@@ -182,7 +182,7 @@ class NvidiaDeviceManager(DeviceManager):
try: try:
temp = nvml.nvmlDeviceGetTemperature(self._device_handlers[idx], nvml.NVML_TEMPERATURE_GPU) temp = nvml.nvmlDeviceGetTemperature(self._device_handlers[idx], nvml.NVML_TEMPERATURE_GPU)
except Exception as err: except Exception as err:
logger.error('Get device temperature failed: {}'.format(str(err))) logger.warning('Get device temperature failed: {}'.format(str(err)))
temp = None temp = None
return temp return temp
...@@ -198,7 +198,7 @@ class NvidiaDeviceManager(DeviceManager): ...@@ -198,7 +198,7 @@ class NvidiaDeviceManager(DeviceManager):
try: try:
power = nvml.nvmlDeviceGetPowerUsage(self._device_handlers[idx]) power = nvml.nvmlDeviceGetPowerUsage(self._device_handlers[idx])
except Exception as err: except Exception as err:
logger.error('Get device power failed: {}'.format(str(err))) logger.warning('Get device power failed: {}'.format(str(err)))
return None return None
return int(int(power) / 1000) return int(int(power) / 1000)
...@@ -214,7 +214,7 @@ class NvidiaDeviceManager(DeviceManager): ...@@ -214,7 +214,7 @@ class NvidiaDeviceManager(DeviceManager):
try: try:
powerlimit = nvml.nvmlDeviceGetPowerManagementLimit(self._device_handlers[idx]) powerlimit = nvml.nvmlDeviceGetPowerManagementLimit(self._device_handlers[idx])
except Exception as err: except Exception as err:
logger.error('Get device power limitation failed: {}'.format(str(err))) logger.warning('Get device power limitation failed: {}'.format(str(err)))
return None return None
return int(int(powerlimit) / 1000) return int(int(powerlimit) / 1000)
...@@ -231,7 +231,7 @@ class NvidiaDeviceManager(DeviceManager): ...@@ -231,7 +231,7 @@ class NvidiaDeviceManager(DeviceManager):
try: try:
mem = nvml.nvmlDeviceGetMemoryInfo(self._device_handlers[idx]) mem = nvml.nvmlDeviceGetMemoryInfo(self._device_handlers[idx])
except Exception as err: except Exception as err:
logger.error('Get device memory failed: {}'.format(str(err))) logger.warning('Get device memory failed: {}'.format(str(err)))
return None, None return None, None
return mem.used, mem.total return mem.used, mem.total
...@@ -304,7 +304,7 @@ class NvidiaDeviceManager(DeviceManager): ...@@ -304,7 +304,7 @@ class NvidiaDeviceManager(DeviceManager):
except nvml.NVMLError: except nvml.NVMLError:
pass pass
except Exception as err: except Exception as err:
logger.error('Get device ECC information failed: {}'.format(str(err))) logger.warning('Get device ECC information failed: {}'.format(str(err)))
return None, None return None, None
try: try:
...@@ -316,7 +316,7 @@ class NvidiaDeviceManager(DeviceManager): ...@@ -316,7 +316,7 @@ class NvidiaDeviceManager(DeviceManager):
except nvml.NVMLError: except nvml.NVMLError:
pass pass
except Exception as err: except Exception as err:
logger.error('Get device ECC information failed: {}'.format(str(err))) logger.warning('Get device ECC information failed: {}'.format(str(err)))
return None, None return None, None
return corrected_ecc, uncorrected_ecc return corrected_ecc, uncorrected_ecc
...@@ -326,12 +326,13 @@ class AmdDeviceManager(DeviceManager): ...@@ -326,12 +326,13 @@ class AmdDeviceManager(DeviceManager):
"""Device management module for AMD.""" """Device management module for AMD."""
def __init__(self): def __init__(self):
"""Constructor.""" """Constructor."""
rocml.smi_initialize() rocml.amdsmi_init()
self._device_handlers = rocml.amdsmi_get_processor_handles()
super().__init__() super().__init__()
def __del__(self): def __del__(self):
"""Destructor.""" """Destructor."""
rocml.smi_shutdown() rocml.amdsmi_shut_down()
def get_device_count(self): def get_device_count(self):
"""Get the number of device. """Get the number of device.
...@@ -339,7 +340,7 @@ class AmdDeviceManager(DeviceManager): ...@@ -339,7 +340,7 @@ class AmdDeviceManager(DeviceManager):
Return: Return:
count (int): count of device. count (int): count of device.
""" """
return rocml.smi_get_device_count() return len(self._device_handlers)
def get_device_utilization(self, idx): def get_device_utilization(self, idx):
"""Get the utilization of device. """Get the utilization of device.
...@@ -351,11 +352,11 @@ class AmdDeviceManager(DeviceManager): ...@@ -351,11 +352,11 @@ class AmdDeviceManager(DeviceManager):
util (int): the utilization of device, None means failed to get the data. util (int): the utilization of device, None means failed to get the data.
""" """
try: try:
util = rocml.smi_get_device_utilization(idx) engine_usage = rocml.amdsmi_get_gpu_activity(self._device_handlers[idx])
except Exception as err: except Exception as err:
logger.error('Get device utilization failed: {}'.format(str(err))) logger.warning('Get device utilization failed: {}'.format(str(err)))
return None return None
return util return engine_usage['gfx_activity']
def get_device_temperature(self, idx): def get_device_temperature(self, idx):
"""Get the temperature of device, unit: celsius. """Get the temperature of device, unit: celsius.
...@@ -366,8 +367,16 @@ class AmdDeviceManager(DeviceManager): ...@@ -366,8 +367,16 @@ class AmdDeviceManager(DeviceManager):
Return: Return:
temp (int): the temperature of device, None means failed to get the data. temp (int): the temperature of device, None means failed to get the data.
""" """
# Currently no API provided in rocml. try:
return None temp = rocml.amdsmi_get_temp_metric(
self._device_handlers[idx], rocml.AmdSmiTemperatureType.EDGE, rocml.AmdSmiTemperatureMetric.CURRENT
)
except (rocml.AmdSmiLibraryException, rocml.AmdSmiParameterException):
pass
except Exception as err:
logger.warning('Get device temperature failed: {}'.format(str(err)))
temp = None
return temp
def get_device_power(self, idx): def get_device_power(self, idx):
"""Get the realtime power of device, unit: watt. """Get the realtime power of device, unit: watt.
...@@ -379,11 +388,11 @@ class AmdDeviceManager(DeviceManager): ...@@ -379,11 +388,11 @@ class AmdDeviceManager(DeviceManager):
temp (int): the realtime power of device, None means failed to get the data. temp (int): the realtime power of device, None means failed to get the data.
""" """
try: try:
power = rocml.smi_get_device_average_power(idx) power_measure = rocml.amdsmi_get_power_info(self._device_handlers[idx])
except Exception as err: except Exception as err:
logger.error('Get device power failed: {}'.format(str(err))) logger.warning('Get device power failed: {}'.format(str(err)))
return None return None
return int(int(power) / 1000) return int(power_measure['average_socket_power'])
def get_device_power_limit(self, idx): def get_device_power_limit(self, idx):
"""Get the power management limit of device, unit: watt. """Get the power management limit of device, unit: watt.
...@@ -394,8 +403,12 @@ class AmdDeviceManager(DeviceManager): ...@@ -394,8 +403,12 @@ class AmdDeviceManager(DeviceManager):
Return: Return:
temp (int): the power management limit of device, None means failed to get the data. temp (int): the power management limit of device, None means failed to get the data.
""" """
# Currently no API provided in rocml. try:
return None power_measure = rocml.amdsmi_get_power_info(self._device_handlers[idx])
except Exception as err:
logger.warning('Get device power limit failed: {}'.format(str(err)))
return None
return int(power_measure['power_limit'])
def get_device_memory(self, idx): def get_device_memory(self, idx):
"""Get the memory information of device, unit: byte. """Get the memory information of device, unit: byte.
...@@ -408,10 +421,10 @@ class AmdDeviceManager(DeviceManager): ...@@ -408,10 +421,10 @@ class AmdDeviceManager(DeviceManager):
total (int): the total device memory in bytes, None means failed to get the data. total (int): the total device memory in bytes, None means failed to get the data.
""" """
try: try:
mem_used = rocml.smi_get_device_memory_used(idx) mem_used = rocml.amdsmi_get_gpu_memory_usage(self._device_handlers[idx], rocml.AmdSmiMemoryType.VRAM)
mem_total = rocml.smi_get_device_memory_total(idx) mem_total = rocml.amdsmi_get_gpu_memory_total(self._device_handlers[idx], rocml.AmdSmiMemoryType.VRAM)
except Exception as err: except Exception as err:
logger.error('Get device memory failed: {}'.format(str(err))) logger.warning('Get device memory failed: {}'.format(str(err)))
return None, None return None, None
return mem_used, mem_total return mem_used, mem_total
...@@ -425,8 +438,19 @@ class AmdDeviceManager(DeviceManager): ...@@ -425,8 +438,19 @@ class AmdDeviceManager(DeviceManager):
corrected_ecc (int) : the count of single bit ecc error. corrected_ecc (int) : the count of single bit ecc error.
uncorrected_ecc (int): the count of double bit ecc error. uncorrected_ecc (int): the count of double bit ecc error.
""" """
# Currently no API provided in rocml. corrected_ecc = 0
return None, None uncorrected_ecc = 0
for block in rocml.AmdSmiGpuBlock:
try:
ecc_count = rocml.amdsmi_get_gpu_ecc_count(self._device_handlers[idx], block)
corrected_ecc += ecc_count['correctable_count']
uncorrected_ecc += ecc_count['uncorrectable_count']
except (rocml.AmdSmiLibraryException, rocml.AmdSmiParameterException):
pass
except Exception as err:
logger.info('Get device ECC information failed: {}'.format(str(err)))
return corrected_ecc, uncorrected_ecc
device_manager: Optional[DeviceManager] = DeviceManager() device_manager: Optional[DeviceManager] = DeviceManager()
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
# Server: # Server:
# - Product: HPE Apollo 6500 # - Product: HPE Apollo 6500
version: v0.9 version: v0.10
superbench: superbench:
enable: null enable: null
var: var:
......
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
# - Product: G482-Z53 # - Product: G482-Z53
# - Link: https://www.gigabyte.cn/FileUpload/Global/MicroSite/553/G482-Z53.html # - Link: https://www.gigabyte.cn/FileUpload/Global/MicroSite/553/G482-Z53.html
version: v0.9 version: v0.10
superbench: superbench:
enable: null enable: null
var: var:
......
version: v0.9 version: v0.10
superbench: superbench:
enable: null enable: null
monitor: monitor:
......
version: v0.9 version: v0.10
superbench: superbench:
enable: null enable: null
monitor: monitor:
......
version: v0.9 version: v0.10
superbench: superbench:
enable: null enable: null
monitor: monitor:
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
# Azure NDm A100 v4 # Azure NDm A100 v4
# reference: https://docs.microsoft.com/en-us/azure/virtual-machines/ndm-a100-v4-series # reference: https://docs.microsoft.com/en-us/azure/virtual-machines/ndm-a100-v4-series
version: v0.9 version: v0.10
superbench: superbench:
enable: null enable: null
monitor: monitor:
......
# SuperBench Config # SuperBench Config
version: v0.9 version: v0.10
superbench: superbench:
enable: null enable: null
monitor: monitor:
......
# SuperBench Config # SuperBench Config
version: v0.9 version: v0.10
superbench: superbench:
enable: null enable: null
monitor: monitor:
......
...@@ -100,7 +100,7 @@ ...@@ -100,7 +100,7 @@
docker run -itd --name={{ container }} \ docker run -itd --name={{ container }} \
--privileged --net=host --ipc=host \ --privileged --net=host --ipc=host \
{{ '--gpus=all' if nvidia_gpu_exist else '' }} \ {{ '--gpus=all' if nvidia_gpu_exist else '' }} \
{{ '--security-opt seccomp=unconfined --group-add video' if amd_gpu_exist else '' }} \ {{ '--security-opt seccomp=unconfined --group-add video --device=/dev/kfd --device=/dev/dri --cap-add=SYS_PTRACE --shm-size=16G' if amd_gpu_exist else '' }} \
-w /root -v {{ workspace }}:/root -v /mnt:/mnt \ -w /root -v {{ workspace }}:/root -v /mnt:/mnt \
-v /var/run/docker.sock:/var/run/docker.sock \ -v /var/run/docker.sock:/var/run/docker.sock \
--entrypoint /bin/bash {{ docker_image }} && \ --entrypoint /bin/bash {{ docker_image }} && \
......
...@@ -66,6 +66,8 @@ class CudaNcclBwBenchmarkTest(BenchmarkTestCase, unittest.TestCase): ...@@ -66,6 +66,8 @@ class CudaNcclBwBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
assert (benchmark._args.iters == 20) assert (benchmark._args.iters == 20)
assert (benchmark._args.warmup_iters == 5) assert (benchmark._args.warmup_iters == 5)
assert (benchmark._args.graph_iters == 0) assert (benchmark._args.graph_iters == 0)
assert (benchmark._args.in_place is False)
assert (benchmark._args.data_type == 'float')
# Check command list # Check command list
bin_names = [ bin_names = [
...@@ -74,7 +76,7 @@ class CudaNcclBwBenchmarkTest(BenchmarkTestCase, unittest.TestCase): ...@@ -74,7 +76,7 @@ class CudaNcclBwBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
] ]
command = bin_names[0] + benchmark._commands[0].split(bin_names[0])[1] command = bin_names[0] + benchmark._commands[0].split(bin_names[0])[1]
expected_command = '{} -b 8 -e 8G -f 2 -g 8 -c 0 -n 20 -w 5 -G 0'.format(bin_names[0]) expected_command = '{} -b 8 -e 8G -f 2 -g 8 -c 0 -n 20 -w 5 -G 0 -d float'.format(bin_names[0])
assert (command == expected_command) assert (command == expected_command)
# Check results and metrics. # Check results and metrics.
...@@ -91,6 +93,11 @@ class CudaNcclBwBenchmarkTest(BenchmarkTestCase, unittest.TestCase): ...@@ -91,6 +93,11 @@ class CudaNcclBwBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
'alltoall': alltoall, 'alltoall': alltoall,
} }
if 'SB_MODE_SERIAL_INDEX' in os.environ:
os.environ.pop('SB_MODE_SERIAL_INDEX')
if 'SB_MODE_PARALLEL_INDEX' in os.environ:
os.environ.pop('SB_MODE_PARALLEL_INDEX')
for op in raw_output.keys(): for op in raw_output.keys():
benchmark._args.operation = op benchmark._args.operation = op
assert (benchmark._process_raw_result(0, raw_output[op])) assert (benchmark._process_raw_result(0, raw_output[op]))
...@@ -131,3 +138,48 @@ class CudaNcclBwBenchmarkTest(BenchmarkTestCase, unittest.TestCase): ...@@ -131,3 +138,48 @@ class CudaNcclBwBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
assert (benchmark.result['alltoall_0_0:8589934592_time'][0] == 33508.0) assert (benchmark.result['alltoall_0_0:8589934592_time'][0] == 33508.0)
assert (benchmark.result['alltoall_0_0:8589934592_algbw'][0] == 256.36) assert (benchmark.result['alltoall_0_0:8589934592_algbw'][0] == 256.36)
assert (benchmark.result['alltoall_0_0:8589934592_busbw'][0] == 224.31) assert (benchmark.result['alltoall_0_0:8589934592_busbw'][0] == 224.31)
@decorator.load_data('tests/data/nccl_allreduce.log')
@decorator.load_data('tests/data/nccl_alltoall.log')
def test_nccl_bw_performance_in_place_parsing(self, allreduce, alltoall):
"""Test nccl-bw benchmark in-place parsing."""
benchmark_name = 'nccl-bw'
(benchmark_class,
predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
assert (benchmark_class)
benchmark = benchmark_class(benchmark_name, parameters='--ngpus 8 --in_place')
ret = benchmark._preprocess()
assert (ret is True)
assert (benchmark.return_code == ReturnCode.SUCCESS)
assert (benchmark._args.in_place is True)
# Case with valid raw_output
raw_output = {
'allreduce': allreduce,
'alltoall': alltoall,
}
if 'SB_MODE_SERIAL_INDEX' in os.environ:
os.environ.pop('SB_MODE_SERIAL_INDEX')
if 'SB_MODE_PARALLEL_INDEX' in os.environ:
os.environ.pop('SB_MODE_PARALLEL_INDEX')
for op in raw_output.keys():
benchmark._args.operation = op
assert (benchmark._process_raw_result(0, raw_output[op]))
for name in ['time', 'algbw', 'busbw']:
for size in ['8589934592', '4294967296', '2147483648', '1073741824', '536870912', '32']:
metric = op + '_' + size + '_' + name
assert (metric in benchmark.result)
assert (len(benchmark.result[metric]) == 1)
assert (isinstance(benchmark.result[metric][0], numbers.Number))
assert (benchmark.result['allreduce_8589934592_time'][0] == 63959.0)
assert (benchmark.result['allreduce_8589934592_algbw'][0] == 134.30)
assert (benchmark.result['allreduce_8589934592_busbw'][0] == 235.03)
assert (benchmark.result['alltoall_8589934592_time'][0] == 33234.0)
assert (benchmark.result['alltoall_8589934592_algbw'][0] == 258.47)
assert (benchmark.result['alltoall_8589934592_busbw'][0] == 226.16)
...@@ -3,7 +3,6 @@ ...@@ -3,7 +3,6 @@
"""Tests for distributed inference benchmark.""" """Tests for distributed inference benchmark."""
import numbers
import unittest import unittest
from tests.helper import decorator from tests.helper import decorator
...@@ -209,19 +208,17 @@ class DistInferenceCppImplTest(BenchmarkTestCase, unittest.TestCase): ...@@ -209,19 +208,17 @@ class DistInferenceCppImplTest(BenchmarkTestCase, unittest.TestCase):
# step_times # step_times
assert (len(benchmark.raw_data) == 2) assert (len(benchmark.raw_data) == 2)
# return code + (avg, 50th, 90th, 95th, 99th, 99.9th) # return code + (avg, 50th, 90th, 95th, 99th, 99.9th)
test_latency = float(test_raw_output.splitlines()[-1].split(' ms per iteration')[0].split()[-1])
assert (7 == len(benchmark.result)) assert (7 == len(benchmark.result))
for output_key in benchmark.result: assert (benchmark.result['return_code'] == [0])
if output_key == 'return_code': assert (benchmark.result['step_times'] == [1.9052048])
assert (benchmark.result[output_key] == [0]) assert (benchmark.result['step_times_50'] == [1.851])
else: assert (benchmark.result['step_times_90'] == [1.89637])
assert (output_key.startswith('step_times')) assert (benchmark.result['step_times_95'] == [2.12037])
assert (len(benchmark.result[output_key]) == 1) assert (benchmark.result['step_times_99'] == [2.67155])
assert (isinstance(benchmark.result[output_key][0], numbers.Number)) assert (benchmark.result['step_times_99.9'] == [4.4198])
assert (test_latency == benchmark.result[output_key][0])
# Negative case - invalid raw output. # Negative case - invalid raw output.
assert (benchmark._process_raw_result(1, 'Invalid raw output') is False) assert (benchmark._process_raw_result(1, 'Latency of step: xxx ms') is False)
assert (benchmark.return_code == ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE) assert (benchmark.return_code == ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
@decorator.cuda_test @decorator.cuda_test
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment