Unverified commit 2c88db90, authored by Yifan Xiong, committed by GitHub
Browse files

Release - SuperBench v0.10.0 (#607)



**Description**

Cherry-pick bug fixes from v0.10.0 to main.

**Major Revisions**

* Benchmarks: Microbenchmark - Support different hipblasLt data types in dist_inference #590
* Benchmarks: Microbenchmark - Support in-place for NCCL/RCCL benchmark #591
* Bug Fix - Fix NUMA Domains Swap Issue in NDv4 Topology File #592
* Benchmarks: Microbenchmark - Add data type option for NCCL and RCCL tests #595
* Benchmarks: Bug Fix - Make metrics of dist-inference-cpp aligned with PyTorch version #596
* CI/CD - Add ndv5 topo file #597
* Benchmarks: Microbenchmark - Improve AMD GPU P2P performance with fine-grained GPU memory #593
* Benchmarks: Build Pipeline - fix nccl and nccl test version to 2.18.3 to resolve hang issue in cuda12.2 docker #599
* Dockerfile - Bug fix for rocm docker build and deploy #598
* Benchmarks: Microbenchmark - Adapt to hipblasLt data type changes #603
* Benchmarks: Micro benchmarks - Update hipblaslt metric unit to tflops #604
* Monitor - Upgrade pyrsmi to amdsmi python library. #601
* Benchmarks: Micro benchmarks - add fp8 and initialization for hipblaslt benchmark #605
* Dockerfile - Add rocm6.0 dockerfile #602
* Bug Fix - Bug fix for latest megatron-lm benchmark #600
* Docs - Upgrade version and release note #606
Co-authored-by: Ziyue Yang <ziyyang@microsoft.com>
Co-authored-by: Yang Wang <yangwang1@microsoft.com>
Co-authored-by: Yuting Jiang <yutingjiang@microsoft.com>
Co-authored-by: guoshzhao <guzhao@microsoft.com>
parent 2c2096ed
...@@ -31,6 +31,14 @@ else() ...@@ -31,6 +31,14 @@ else()
# link hip device lib # link hip device lib
add_executable(dist_inference dist_inference.cpp) add_executable(dist_inference dist_inference.cpp)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -DROCM_USE_FLOAT16=1") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -DROCM_USE_FLOAT16=1")
if(DEFINED ENV{USE_HIPBLASLT_DATATYPE})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_HIPBLASLT_DATATYPE=1")
elseif(DEFINED ENV{USE_HIP_DATATYPE})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_HIP_DATATYPE=1")
endif()
if(DEFINED ENV{USE_HIPBLAS_COMPUTETYPE})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_HIPBLAS_COMPUTETYPE=1")
endif()
target_link_libraries(dist_inference MPI::MPI_CXX rccl hipblaslt hip::device) target_link_libraries(dist_inference MPI::MPI_CXX rccl hipblaslt hip::device)
else() else()
message(FATAL_ERROR "No CUDA or ROCm environment found.") message(FATAL_ERROR "No CUDA or ROCm environment found.")
......
...@@ -45,6 +45,21 @@ ...@@ -45,6 +45,21 @@
#include <hipblaslt/hipblaslt.h> #include <hipblaslt/hipblaslt.h>
#include <rccl/rccl.h> #include <rccl/rccl.h>
using cublasLtHalf = hipblasLtHalf; using cublasLtHalf = hipblasLtHalf;
#if defined(USE_HIPBLASLT_DATATYPE)
#define DIST_INF_HIP_DATATYPE_R_16F HIPBLASLT_R_16F
#define DIST_INF_HIP_DATATYPE_R_32F HIPBLASLT_R_32F
#elif defined(USE_HIP_DATATYPE)
#define DIST_INF_HIP_DATATYPE_R_16F HIP_R_16F
#define DIST_INF_HIP_DATATYPE_R_32F HIP_R_32F
#else
#define DIST_INF_HIP_DATATYPE_R_16F HIPBLAS_R_16F
#define DIST_INF_HIP_DATATYPE_R_32F HIPBLAS_R_32F
#endif
#if defined(USE_HIPBLAS_COMPUTETYPE)
#define DIST_INF_HIP_COMPUTETYPE_F32 HIPBLAS_COMPUTE_32F
#else
#define DIST_INF_HIP_COMPUTETYPE_F32 HIPBLASLT_COMPUTE_F32
#endif
#else #else
#include <cublasLt.h> #include <cublasLt.h>
#include <cuda_fp16.h> #include <cuda_fp16.h>
...@@ -229,16 +244,18 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t ...@@ -229,16 +244,18 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t
CHECK_CUBLASLT_ERROR(hipblasLtCreate(&handle)); CHECK_CUBLASLT_ERROR(hipblasLtCreate(&handle));
CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matA, HIPBLAS_R_16F, k, n, k)); CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matA, DIST_INF_HIP_DATATYPE_R_16F, k, n, k));
CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matB, HIPBLAS_R_16F, m, k, m)); CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matB, DIST_INF_HIP_DATATYPE_R_16F, m, k, m));
CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matC, HIPBLAS_R_16F, m, n, m)); CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matC, DIST_INF_HIP_DATATYPE_R_16F, m, n, m));
CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matD, HIPBLAS_R_16F, m, n, m)); CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matD, DIST_INF_HIP_DATATYPE_R_16F, m, n, m));
CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matE, HIPBLAS_R_16F, k, m, k)); CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matE, DIST_INF_HIP_DATATYPE_R_16F, k, m, k));
CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matF, HIPBLAS_R_16F, k, n, k)); CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matF, DIST_INF_HIP_DATATYPE_R_16F, k, n, k));
CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matG, HIPBLAS_R_16F, k, n, k)); CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matG, DIST_INF_HIP_DATATYPE_R_16F, k, n, k));
CHECK_CUBLASLT_ERROR(hipblasLtMatmulDescCreate(&matmul1, HIPBLASLT_COMPUTE_F32, HIPBLAS_R_32F)); CHECK_CUBLASLT_ERROR(
CHECK_CUBLASLT_ERROR(hipblasLtMatmulDescCreate(&matmul2, HIPBLASLT_COMPUTE_F32, HIPBLAS_R_32F)); hipblasLtMatmulDescCreate(&matmul1, DIST_INF_HIP_COMPUTETYPE_F32, DIST_INF_HIP_DATATYPE_R_32F));
CHECK_CUBLASLT_ERROR(
hipblasLtMatmulDescCreate(&matmul2, DIST_INF_HIP_COMPUTETYPE_F32, DIST_INF_HIP_DATATYPE_R_32F));
hipblasOperation_t trans = HIPBLAS_OP_N; hipblasOperation_t trans = HIPBLAS_OP_N;
CHECK_CUBLASLT_ERROR( CHECK_CUBLASLT_ERROR(
...@@ -336,8 +353,9 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t ...@@ -336,8 +353,9 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t
#endif #endif
std::chrono::steady_clock::time_point start_time, stop_time; std::chrono::steady_clock::time_point start_time, stop_time;
std::vector<double> step_times(num_iters, 0.);
for (int i = 0; i < num_warmups + num_iters; ++i) { for (int i = 0; i < num_warmups + num_iters; ++i) {
if (i == num_warmups) { if (i >= num_warmups) {
start_time = std::chrono::steady_clock::now(); start_time = std::chrono::steady_clock::now();
} }
#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && (CUDART_VERSION >= 11030 || HIP_VERSION >= 50221310) #if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && (CUDART_VERSION >= 11030 || HIP_VERSION >= 50221310)
...@@ -350,11 +368,15 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t ...@@ -350,11 +368,15 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t
model_forward(); model_forward();
#endif #endif
CHECK_CUDA_ERROR(cudaStreamSynchronize(stream)); CHECK_CUDA_ERROR(cudaStreamSynchronize(stream));
if (i >= num_warmups) {
stop_time = std::chrono::steady_clock::now();
double step_time = std::chrono::duration_cast<std::chrono::nanoseconds>(stop_time - start_time).count();
step_times[i - num_warmups] = step_time;
}
}
for (int i = 0; i < num_iters; i++) {
fprintf(stdout, "Latency of step %d: %g ms\n", i, step_times[i] / 1e6);
} }
stop_time = std::chrono::steady_clock::now();
double duration = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count();
fprintf(stdout, "Time: %g ms in total, %g ms per iteration, %g ms per layer\n", duration, duration / num_iters,
duration / num_iters / num_layers);
#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && (CUDART_VERSION >= 11030 || HIP_VERSION >= 50221310) #if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && (CUDART_VERSION >= 11030 || HIP_VERSION >= 50221310)
// Destroy graph // Destroy graph
......
...@@ -27,6 +27,13 @@ else() ...@@ -27,6 +27,13 @@ else()
# link hip device lib # link hip device lib
add_executable(gpu_copy gpu_copy.cpp) add_executable(gpu_copy gpu_copy.cpp)
include(CheckSymbolExists)
check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY)
if(${HIP_UNCACHED_MEMORY})
target_compile_definitions(gpu_copy PRIVATE HIP_UNCACHED_MEMORY)
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
target_link_libraries(gpu_copy numa hip::device) target_link_libraries(gpu_copy numa hip::device)
else() else()
......
...@@ -313,6 +313,25 @@ int SetGpu(int gpu_id) { ...@@ -313,6 +313,25 @@ int SetGpu(int gpu_id) {
return 0; return 0;
} }
#if defined(__HIP_PLATFORM_AMD__)
bool UseFineGrained(const SubBenchArgs &args) {
return args.is_src_dev_gpu && args.is_dst_dev_gpu && args.src_gpu_id != args.dst_gpu_id;
}
cudaError_t GpuMallocDataBuf(uint8_t **ptr, uint64_t size, bool use_fine_grained) {
if (use_fine_grained) {
#if defined(HIP_UNCACHED_MEMORY)
return hipExtMallocWithFlags((void **)ptr, size, hipDeviceMallocUncached);
#else
return hipExtMallocWithFlags((void **)ptr, size, hipDeviceMallocFinegrained);
#endif
} else {
return cudaMalloc(ptr, size);
}
}
#else
cudaError_t GpuMallocDataBuf(uint8_t **ptr, uint64_t size) { return cudaMalloc(ptr, size); }
#endif
// Prepare data buffers and streams to be used. // Prepare data buffers and streams to be used.
int PrepareBufAndStream(BenchArgs *args) { int PrepareBufAndStream(BenchArgs *args) {
cudaError_t cuda_err = cudaSuccess; cudaError_t cuda_err = cudaSuccess;
...@@ -346,7 +365,11 @@ int PrepareBufAndStream(BenchArgs *args) { ...@@ -346,7 +365,11 @@ int PrepareBufAndStream(BenchArgs *args) {
return -1; return -1;
} }
*(host_buf_ptrs[j]) = nullptr; *(host_buf_ptrs[j]) = nullptr;
cuda_err = cudaMalloc(gpu_buf_ptrs[j], args->size); #if defined(__HIP_PLATFORM_AMD__)
cuda_err = GpuMallocDataBuf(gpu_buf_ptrs[j], args->size, UseFineGrained(sub));
#else
cuda_err = GpuMallocDataBuf(gpu_buf_ptrs[j], args->size);
#endif
if (cuda_err != cudaSuccess) { if (cuda_err != cudaSuccess) {
fprintf(stderr, "PrepareBufAndStream::cudaMalloc error: %d\n", cuda_err); fprintf(stderr, "PrepareBufAndStream::cudaMalloc error: %d\n", cuda_err);
return -1; return -1;
...@@ -876,7 +899,11 @@ int RunAllToAllBench(const Opts &opts, int gpu_count, int src_rank, int dst_rank ...@@ -876,7 +899,11 @@ int RunAllToAllBench(const Opts &opts, int gpu_count, int src_rank, int dst_rank
} }
// Prepare source buffers // Prepare source buffers
cuda_err = cudaMalloc(&(src_buffers_gpu[rank]), opts.size); #if defined(__HIP_PLATFORM_AMD__)
cuda_err = GpuMallocDataBuf(&(src_buffers_gpu[rank]), opts.size, true);
#else
cuda_err = GpuMallocDataBuf(&(src_buffers_gpu[rank]), opts.size);
#endif
if (cuda_err != cudaSuccess) { if (cuda_err != cudaSuccess) {
fprintf(stderr, "RunAllToAllBench::cudaMalloc for src_buffers_gpu[%d] error: %d\n", cuda_err, rank); fprintf(stderr, "RunAllToAllBench::cudaMalloc for src_buffers_gpu[%d] error: %d\n", cuda_err, rank);
return -1; return -1;
...@@ -893,7 +920,11 @@ int RunAllToAllBench(const Opts &opts, int gpu_count, int src_rank, int dst_rank ...@@ -893,7 +920,11 @@ int RunAllToAllBench(const Opts &opts, int gpu_count, int src_rank, int dst_rank
} }
// Prepare destination buffers // Prepare destination buffers
cuda_err = cudaMalloc(&(dst_buffers_gpu[rank]), opts.size); #if defined(__HIP_PLATFORM_AMD__)
cuda_err = GpuMallocDataBuf(&(dst_buffers_gpu[rank]), opts.size, true);
#else
cuda_err = GpuMallocDataBuf(&(dst_buffers_gpu[rank]), opts.size);
#endif
if (cuda_err != cudaSuccess) { if (cuda_err != cudaSuccess) {
fprintf(stderr, "RunAllToAllBench::cudaMalloc for dst_buffers_gpu[%d] error: %d\n", cuda_err, rank); fprintf(stderr, "RunAllToAllBench::cudaMalloc for dst_buffers_gpu[%d] error: %d\n", cuda_err, rank);
return -1; return -1;
......
...@@ -4,7 +4,6 @@ ...@@ -4,7 +4,6 @@
"""Module of the hipBlasLt GEMM benchmark.""" """Module of the hipBlasLt GEMM benchmark."""
import os import os
import re
from superbench.common.utils import logger from superbench.common.utils import logger
from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
...@@ -23,11 +22,12 @@ def __init__(self, name, parameters=''): ...@@ -23,11 +22,12 @@ def __init__(self, name, parameters=''):
super().__init__(name, parameters) super().__init__(name, parameters)
self._bin_name = 'hipblaslt-bench' self._bin_name = 'hipblaslt-bench'
self._in_types = ['fp32', 'fp16', 'bf16'] self._in_types = ['fp32', 'fp16', 'bf16', 'fp8']
self._in_type_map = { self._in_type_map = {
'fp16': '--a_type f16_r --b_type f16_r --c_type f16_r --d_type f16_r --compute_type f32_r', 'fp16': '--a_type f16_r --b_type f16_r --c_type f16_r --d_type f16_r --compute_type f32_r',
'fp32': '--a_type f32_r --b_type f32_r --c_type f32_r --d_type f32_r --compute_type f32_r', 'fp32': '--a_type f32_r --b_type f32_r --c_type f32_r --d_type f32_r --compute_type f32_r',
'bf16': '--a_type bf16_r --b_type bf16_r --c_type bf16_r --d_type bf16_r --compute_type f32_r', 'bf16': '--a_type bf16_r --b_type bf16_r --c_type bf16_r --d_type bf16_r --compute_type f32_r',
'fp8': '--a_type f8_r --b_type f8_r --c_type f8_r --d_type f8_r --compute_type f32_r',
} }
def add_parser_arguments(self): def add_parser_arguments(self):
...@@ -42,6 +42,30 @@ def add_parser_arguments(self): ...@@ -42,6 +42,30 @@ def add_parser_arguments(self):
required=False, required=False,
help='List of input data types, support {}.'.format(' '.join(self._in_types)), help='List of input data types, support {}.'.format(' '.join(self._in_types)),
) )
self._parser.add_argument(
'--initialization',
type=str,
default='rand_int',
choices=['trig_float', 'rand_int', 'hpl'],
required=False,
help='Initialize matrix data.',
)
self._parser.add_argument(
'--transA',
type=str,
default='N',
choices=['N', 'T', 'C'],
required=False,
help='Transpose matrix A.',
)
self._parser.add_argument(
'--transB',
type=str,
default='N',
choices=['N', 'T', 'C'],
required=False,
help='Transpose matrix B.',
)
def _preprocess(self): def _preprocess(self):
"""Preprocess/preparation operations before the benchmarking. """Preprocess/preparation operations before the benchmarking.
...@@ -58,7 +82,9 @@ def _preprocess(self): ...@@ -58,7 +82,9 @@ def _preprocess(self):
self._precision_in_commands = [] self._precision_in_commands = []
for (_m, _n, _k, _b, _in_type) in self._shapes_to_run: for (_m, _n, _k, _b, _in_type) in self._shapes_to_run:
command = f'{self.__bin_path} -m {_m} -n {_n} -k {_k} -j {self._args.num_warmup}' + \ command = f'{self.__bin_path} -m {_m} -n {_n} -k {_k} -j {self._args.num_warmup}' + \
f' -i {self._args.num_steps} {self._in_type_map[_in_type]}' f' -i {self._args.num_steps} {self._in_type_map[_in_type]}' + \
f' --transA {self._args.transA} --transB {self._args.transB}' + \
f' --initialization {self._args.initialization}'
command = command + f' -b {str(_b)}' if _b > 0 else command command = command + f' -b {str(_b)}' if _b > 0 else command
logger.info(command) logger.info(command)
self._commands.append(command) self._commands.append(command)
...@@ -97,13 +123,12 @@ def _process_raw_result(self, cmd_idx, raw_output): ...@@ -97,13 +123,12 @@ def _process_raw_result(self, cmd_idx, raw_output):
fields = lines[index + 1].strip().split(',') fields = lines[index + 1].strip().split(',')
# Check the number of fields and the format of the first two fields # Check the number of fields and the format of the first two fields
if len(fields) != 23 or not all( if len(fields) != 23:
re.match(r'\d*\.\d*$', item.strip()) or item.strip().isdigit() for item in fields[-2:]
):
raise ValueError('Invalid result') raise ValueError('Invalid result')
self._result.add_result( self._result.add_result(
f'{self._precision_in_commands[cmd_idx]}_{fields[3]}_{"_".join(fields[4:7])}_flops', float(fields[-2]) f'{self._precision_in_commands[cmd_idx]}_{fields[3]}_{"_".join(fields[4:7])}_flops',
float(fields[-2]) / 1000
) )
except BaseException as e: except BaseException as e:
self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE) self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
......
...@@ -45,8 +45,7 @@ message(STATUS "CMAKE HIP ARCHITECTURES: ${CMAKE_HIP_ARCHITECTURES}") ...@@ -45,8 +45,7 @@ message(STATUS "CMAKE HIP ARCHITECTURES: ${CMAKE_HIP_ARCHITECTURES}")
if(EXISTS ${HIP_PATH}) if(EXISTS ${HIP_PATH})
# Search for hip in common locations # Search for hip in common locations
list(APPEND CMAKE_PREFIX_PATH ${HIP_PATH} ${ROCM_PATH}) list(APPEND CMAKE_PREFIX_PATH ${HIP_PATH} ${ROCM_PATH} ${ROCM_PATH}/hsa ${ROCM_PATH}/hip ${ROCM_PATH}/share/rocm/cmake/)
set(CMAKE_PREFIX_PATH /opt/rocm ROCM_PATH)
set(CMAKE_CXX_COMPILER "${HIP_PATH}/bin/hipcc") set(CMAKE_CXX_COMPILER "${HIP_PATH}/bin/hipcc")
set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
set(CMAKE_MODULE_PATH "${HIP_PATH}/lib/cmake/hip" ${CMAKE_MODULE_PATH}) set(CMAKE_MODULE_PATH "${HIP_PATH}/lib/cmake/hip" ${CMAKE_MODULE_PATH})
......
...@@ -116,6 +116,9 @@ def add_parser_arguments(self): ...@@ -116,6 +116,9 @@ def add_parser_arguments(self):
self._parser.add_argument('--data_home', type=str, default='/tmp', help='Data home.') self._parser.add_argument('--data_home', type=str, default='/tmp', help='Data home.')
self._parser.add_argument('--vocab_path', type=str, default='/tmp/gpt2-vocab.json', help='Vocab path.') self._parser.add_argument('--vocab_path', type=str, default='/tmp/gpt2-vocab.json', help='Vocab path.')
self._parser.add_argument('--merge_path', type=str, default='/tmp/gpt2-merges.txt', help='Merge path.') self._parser.add_argument('--merge_path', type=str, default='/tmp/gpt2-merges.txt', help='Merge path.')
self._parser.add_argument(
'--split', type=str, default='949,50,1', help='Split dataset ratio for train/val/test.'
)
self._parser.add_argument('--prescale_grad', action='store_true', help='Prescale grad.') self._parser.add_argument('--prescale_grad', action='store_true', help='Prescale grad.')
self._parser.add_argument( self._parser.add_argument(
'--hostfile', type=str, default=None, help='Hostfile to run the mutli-node benchmark.' '--hostfile', type=str, default=None, help='Hostfile to run the mutli-node benchmark.'
...@@ -128,6 +131,13 @@ def add_parser_arguments(self): ...@@ -128,6 +131,13 @@ def add_parser_arguments(self):
def _preprocess(self): def _preprocess(self):
if not super()._preprocess(): if not super()._preprocess():
return False return False
if not self._args.code_base:
if self._args.deepspeed:
self._args.code_base = os.path.join(
os.getenv('SB_MICRO_PATH'), 'third_party/Megatron/Megatron-DeepSpeed/'
)
else:
self._args.code_base = os.path.join(os.getenv('SB_MICRO_PATH'), 'third_party/Megatron/Megatron-LM')
if not os.path.exists(self._args.code_base) or \ if not os.path.exists(self._args.code_base) or \
not os.path.exists(os.path.join(self._args.code_base, 'pretrain_gpt.py')): not os.path.exists(os.path.join(self._args.code_base, 'pretrain_gpt.py')):
...@@ -156,35 +166,35 @@ def _preprocess(self): ...@@ -156,35 +166,35 @@ def _preprocess(self):
def _parse_log(self, output): def _parse_log(self, output):
"""Parse log output and get the performance.""" """Parse log output and get the performance."""
tflops_pattern = re.compile(r'TFLOPs: (\d+\.\d+)') tflops_pattern = re.compile(r'(TFLOPs|TFLOP/s/GPU\)): (\d+\.\d+)')
elapsed_time_pattern = re.compile(r'elapsed time per iteration \(ms\): (\d+\.\d+)') elapsed_time_pattern = re.compile(r'elapsed time per iteration \(ms\): (\d+\.\d+)')
mem_allocated_pattern = re.compile(r'MemAllocated=([\d.]+)[KMGTPEZY]?B') mem_allocated_pattern = re.compile(r'allocated: (\d+\.\d+)')
max_mem_allocated_pattern = re.compile(r'MaxMemAllocated=([\d.]+)[KMGTPEZY]?B') max_mem_allocated_pattern = re.compile(r'max allocated: (\d+\.\d+)')
lines = output.splitlines() lines = output.splitlines()
tflops = [] tflops = []
mem_allocated = [] mem_allocated = []
max_mem_allocated = [] max_mem_allocated = []
iteration_times = [] iteration_times = []
for line in lines: for line in lines:
if 'TFLOPs' in line: if 'elapsed time per iteration' in line:
tflops_matches = tflops_pattern.search(line) tflops_matches = tflops_pattern.search(line)
elapsed_time_match = elapsed_time_pattern.search(line) elapsed_time_match = elapsed_time_pattern.search(line)
if tflops_matches: if tflops_matches:
tflops_values = float(tflops_matches.group(1)) tflops_values = float(tflops_matches.group(2))
tflops.append(tflops_values) tflops.append(tflops_values)
if elapsed_time_match: if elapsed_time_match:
elapsed_time_value = float(elapsed_time_match.group(1)) elapsed_time_value = float(elapsed_time_match.group(1))
iteration_times.append(elapsed_time_value) iteration_times.append(elapsed_time_value)
if 'MaxMemAllocated' in line: if 'max allocated' in line:
mem_allocated_match = mem_allocated_pattern.search(line) mem_allocated_match = mem_allocated_pattern.search(line)
max_mem_allocated_match = max_mem_allocated_pattern.search(line) max_mem_allocated_match = max_mem_allocated_pattern.search(line)
if mem_allocated_match: if mem_allocated_match:
mem_allocated_value = float(mem_allocated_match.group(1)) mem_allocated_value = float(mem_allocated_match.group(1)) / 1024
mem_allocated.append(mem_allocated_value) mem_allocated.append(mem_allocated_value)
if max_mem_allocated_match: if max_mem_allocated_match:
max_mem_allocated_value = float(max_mem_allocated_match.group(1)) max_mem_allocated_value = float(max_mem_allocated_match.group(1)) / 1024
max_mem_allocated.append(max_mem_allocated_value) max_mem_allocated.append(max_mem_allocated_value)
return iteration_times, tflops, mem_allocated, max_mem_allocated return iteration_times, tflops, mem_allocated, max_mem_allocated
...@@ -224,7 +234,9 @@ def __prepare_deespeed_config(self, precision_megatron): ...@@ -224,7 +234,9 @@ def __prepare_deespeed_config(self, precision_megatron):
--deepspeed \ --deepspeed \
--deepspeed_config {self._config_json_path} \ --deepspeed_config {self._config_json_path} \
--zero-stage {self._args.zero_stage} \ --zero-stage {self._args.zero_stage} \
--pipeline-model-parallel-size {self._args.pipeline_model_parallel_size}' --pipeline-model-parallel-size {self._args.pipeline_model_parallel_size} \
--train-tokens {self._args.train_tokens} \
--data-impl {self._args.data_impl}'
if self._args.pipeline_model_parallel_size <= 1: if self._args.pipeline_model_parallel_size <= 1:
deepspeed_options = f'{deepspeed_options} --no-pipeline-parallel' deepspeed_options = f'{deepspeed_options} --no-pipeline-parallel'
...@@ -255,11 +267,10 @@ def _megatron_command(self, precision): # noqa: C901 ...@@ -255,11 +267,10 @@ def _megatron_command(self, precision): # noqa: C901
--num-attention-heads {self._args.num_attn_heads} \ --num-attention-heads {self._args.num_attn_heads} \
--seq-length {self._args.seq_len} \ --seq-length {self._args.seq_len} \
--max-position-embeddings {self._args.seq_len} \ --max-position-embeddings {self._args.seq_len} \
--train-tokens {self._args.train_tokens} \
--train-samples {self._args.num_steps * self._args.batch_size} \ --train-samples {self._args.num_steps * self._args.batch_size} \
--lr {self._args.lr} \ --lr {self._args.lr} \
--min-lr {self._args.min_lr} \ --min-lr {self._args.min_lr} \
--split 949,50,1 \ --split {self._args.split} \
--log-interval {self._args.log_interval} \ --log-interval {self._args.log_interval} \
--eval-interval {self._args.eval_interval} \ --eval-interval {self._args.eval_interval} \
--eval-iters {self._args.eval_iters} \ --eval-iters {self._args.eval_iters} \
...@@ -273,7 +284,8 @@ def _megatron_command(self, precision): # noqa: C901 ...@@ -273,7 +284,8 @@ def _megatron_command(self, precision): # noqa: C901
--optimizer adam \ --optimizer adam \
--use-distributed-optimizer \ --use-distributed-optimizer \
{precision_megatron} \ {precision_megatron} \
--seed {self._args.seed}' --seed {self._args.seed} \
--log-throughput'
if self._args.sequence_parallel: if self._args.sequence_parallel:
megatron_options = f'{megatron_options} --sequence-parallel' megatron_options = f'{megatron_options} --sequence-parallel'
...@@ -298,6 +310,8 @@ def _megatron_command(self, precision): # noqa: C901 ...@@ -298,6 +310,8 @@ def _megatron_command(self, precision): # noqa: C901
script_path = os.path.join(self._args.code_base, 'pretrain_gpt.py') script_path = os.path.join(self._args.code_base, 'pretrain_gpt.py')
if self._args.deepspeed: if self._args.deepspeed:
deepspeed_option = self.__prepare_deespeed_config(precision_megatron.lstrip('--')) deepspeed_option = self.__prepare_deespeed_config(precision_megatron.lstrip('--'))
# No --log-throughput in Megatron-DeepSpeed by 20231219
megatron_options = megatron_options.replace('--log-throughput', '').strip()
if self._num_nodes > 1: if self._num_nodes > 1:
command = f'torchrun {self._distributed_args} ' + \ command = f'torchrun {self._distributed_args} ' + \
f'{script_path} {megatron_options} {self._data_options} {deepspeed_option}' f'{script_path} {megatron_options} {self._data_options} {deepspeed_option}'
...@@ -379,6 +393,7 @@ def _init_distributed_setting(self): ...@@ -379,6 +393,7 @@ def _init_distributed_setting(self):
return False return False
self._num_nodes = int(os.getenv('OMPI_COMM_WORLD_SIZE')) // int(os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE')) self._num_nodes = int(os.getenv('OMPI_COMM_WORLD_SIZE')) // int(os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE'))
master_addr = 'localhost'
if self._num_nodes > 1: if self._num_nodes > 1:
if not self._args.hostfile: if not self._args.hostfile:
sb_hostfile = os.path.join(os.environ.get('SB_WORKSPACE', '.'), 'hostfile') sb_hostfile = os.path.join(os.environ.get('SB_WORKSPACE', '.'), 'hostfile')
...@@ -395,12 +410,13 @@ def _init_distributed_setting(self): ...@@ -395,12 +410,13 @@ def _init_distributed_setting(self):
if self._num_nodes != len(hosts): if self._num_nodes != len(hosts):
logger.error('MPI init failed since hostfile not match the MPI setting.') logger.error('MPI init failed since hostfile not match the MPI setting.')
return False return False
master_addr = hosts[0].split()[0]
addr = os.getenv('MASTER_ADDR', hosts[0].split()[0]) addr = os.getenv('MASTER_ADDR', master_addr)
port = os.getenv('MASTER_PORT', '29500') port = os.getenv('MASTER_PORT', '29500')
node_rank = int(os.environ['OMPI_COMM_WORLD_RANK']) // int(os.environ['OMPI_COMM_WORLD_LOCAL_SIZE']) node_rank = int(os.environ['OMPI_COMM_WORLD_RANK']) // int(os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'])
self._distributed_args = f'--nproc_per_node {self._args.num_gpus} --nnodes {self._num_nodes} ' + \ self._distributed_args = f'--nproc_per_node {self._args.num_gpus} --nnodes {self._num_nodes} ' + \
f'--node_rank {node_rank} --master_addr {addr} --master_port {port}' f'--node_rank {node_rank} --master_addr {addr} --master_port {port}'
return True return True
def _generate_dataset(self): def _generate_dataset(self):
...@@ -448,8 +464,7 @@ def _generate_dataset(self): ...@@ -448,8 +464,7 @@ def _generate_dataset(self):
self._data_options = f'\ self._data_options = f'\
--vocab-file {self._vocab_path} \ --vocab-file {self._vocab_path} \
--merge-file {self._merges_path} \ --merge-file {self._merges_path} \
--data-path {self._data_path} \ --data-path {self._data_path}'
--data-impl {self._args.data_impl}'
logger.info('Dataset preparation successfully.') logger.info('Dataset preparation successfully.')
return True return True
......
...@@ -265,8 +265,8 @@ def __train(self, precision): ...@@ -265,8 +265,8 @@ def __train(self, precision):
# The unit of step time should be millisecond. # The unit of step time should be millisecond.
step_times = self._train_step(precision) step_times = self._train_step(precision)
if isinstance(step_times, tuple): if isinstance(step_times, tuple):
step_times = step_times[0]
info = step_times[1] info = step_times[1]
step_times = step_times[0]
self._process_info(ModelAction.TRAIN, precision, info) self._process_info(ModelAction.TRAIN, precision, info)
step_times = self.__process_model_result(ModelAction.TRAIN, precision, step_times) step_times = self.__process_model_result(ModelAction.TRAIN, precision, step_times)
if not step_times: if not step_times:
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
if gpu.vendor == 'nvidia' or gpu.vendor == 'nvidia-graphics': if gpu.vendor == 'nvidia' or gpu.vendor == 'nvidia-graphics':
import py3nvml.py3nvml as nvml import py3nvml.py3nvml as nvml
elif gpu.vendor == 'amd' or gpu.vendor == 'amd-graphics': elif gpu.vendor == 'amd' or gpu.vendor == 'amd-graphics':
from pyrsmi import rocml import amdsmi as rocml
class DeviceManager: class DeviceManager:
...@@ -150,7 +150,7 @@ def get_device_compute_capability(self): ...@@ -150,7 +150,7 @@ def get_device_compute_capability(self):
try: try:
cap = nvml.nvmlDeviceGetCudaComputeCapability(self._device_handlers[0]) cap = nvml.nvmlDeviceGetCudaComputeCapability(self._device_handlers[0])
except Exception as err: except Exception as err:
logger.error('Get device compute capability failed: {}'.format(str(err))) logger.warning('Get device compute capability failed: {}'.format(str(err)))
return None return None
return cap return cap
...@@ -166,7 +166,7 @@ def get_device_utilization(self, idx): ...@@ -166,7 +166,7 @@ def get_device_utilization(self, idx):
try: try:
util = nvml.nvmlDeviceGetUtilizationRates(self._device_handlers[idx]) util = nvml.nvmlDeviceGetUtilizationRates(self._device_handlers[idx])
except Exception as err: except Exception as err:
logger.error('Get device utilization failed: {}'.format(str(err))) logger.warning('Get device utilization failed: {}'.format(str(err)))
return None return None
return util.gpu return util.gpu
...@@ -182,7 +182,7 @@ def get_device_temperature(self, idx): ...@@ -182,7 +182,7 @@ def get_device_temperature(self, idx):
try: try:
temp = nvml.nvmlDeviceGetTemperature(self._device_handlers[idx], nvml.NVML_TEMPERATURE_GPU) temp = nvml.nvmlDeviceGetTemperature(self._device_handlers[idx], nvml.NVML_TEMPERATURE_GPU)
except Exception as err: except Exception as err:
logger.error('Get device temperature failed: {}'.format(str(err))) logger.warning('Get device temperature failed: {}'.format(str(err)))
temp = None temp = None
return temp return temp
...@@ -198,7 +198,7 @@ def get_device_power(self, idx): ...@@ -198,7 +198,7 @@ def get_device_power(self, idx):
try: try:
power = nvml.nvmlDeviceGetPowerUsage(self._device_handlers[idx]) power = nvml.nvmlDeviceGetPowerUsage(self._device_handlers[idx])
except Exception as err: except Exception as err:
logger.error('Get device power failed: {}'.format(str(err))) logger.warning('Get device power failed: {}'.format(str(err)))
return None return None
return int(int(power) / 1000) return int(int(power) / 1000)
...@@ -214,7 +214,7 @@ def get_device_power_limit(self, idx): ...@@ -214,7 +214,7 @@ def get_device_power_limit(self, idx):
try: try:
powerlimit = nvml.nvmlDeviceGetPowerManagementLimit(self._device_handlers[idx]) powerlimit = nvml.nvmlDeviceGetPowerManagementLimit(self._device_handlers[idx])
except Exception as err: except Exception as err:
logger.error('Get device power limitation failed: {}'.format(str(err))) logger.warning('Get device power limitation failed: {}'.format(str(err)))
return None return None
return int(int(powerlimit) / 1000) return int(int(powerlimit) / 1000)
...@@ -231,7 +231,7 @@ def get_device_memory(self, idx): ...@@ -231,7 +231,7 @@ def get_device_memory(self, idx):
try: try:
mem = nvml.nvmlDeviceGetMemoryInfo(self._device_handlers[idx]) mem = nvml.nvmlDeviceGetMemoryInfo(self._device_handlers[idx])
except Exception as err: except Exception as err:
logger.error('Get device memory failed: {}'.format(str(err))) logger.warning('Get device memory failed: {}'.format(str(err)))
return None, None return None, None
return mem.used, mem.total return mem.used, mem.total
...@@ -304,7 +304,7 @@ def get_device_ecc_error(self, idx): ...@@ -304,7 +304,7 @@ def get_device_ecc_error(self, idx):
except nvml.NVMLError: except nvml.NVMLError:
pass pass
except Exception as err: except Exception as err:
logger.error('Get device ECC information failed: {}'.format(str(err))) logger.warning('Get device ECC information failed: {}'.format(str(err)))
return None, None return None, None
try: try:
...@@ -316,7 +316,7 @@ def get_device_ecc_error(self, idx): ...@@ -316,7 +316,7 @@ def get_device_ecc_error(self, idx):
except nvml.NVMLError: except nvml.NVMLError:
pass pass
except Exception as err: except Exception as err:
logger.error('Get device ECC information failed: {}'.format(str(err))) logger.warning('Get device ECC information failed: {}'.format(str(err)))
return None, None return None, None
return corrected_ecc, uncorrected_ecc return corrected_ecc, uncorrected_ecc
...@@ -326,12 +326,13 @@ class AmdDeviceManager(DeviceManager): ...@@ -326,12 +326,13 @@ class AmdDeviceManager(DeviceManager):
"""Device management module for AMD.""" """Device management module for AMD."""
def __init__(self): def __init__(self):
"""Constructor.""" """Constructor."""
rocml.smi_initialize() rocml.amdsmi_init()
self._device_handlers = rocml.amdsmi_get_processor_handles()
super().__init__() super().__init__()
def __del__(self): def __del__(self):
"""Destructor.""" """Destructor."""
rocml.smi_shutdown() rocml.amdsmi_shut_down()
def get_device_count(self): def get_device_count(self):
"""Get the number of device. """Get the number of device.
...@@ -339,7 +340,7 @@ def get_device_count(self): ...@@ -339,7 +340,7 @@ def get_device_count(self):
Return: Return:
count (int): count of device. count (int): count of device.
""" """
return rocml.smi_get_device_count() return len(self._device_handlers)
def get_device_utilization(self, idx): def get_device_utilization(self, idx):
"""Get the utilization of device. """Get the utilization of device.
...@@ -351,11 +352,11 @@ def get_device_utilization(self, idx): ...@@ -351,11 +352,11 @@ def get_device_utilization(self, idx):
util (int): the utilization of device, None means failed to get the data. util (int): the utilization of device, None means failed to get the data.
""" """
try: try:
util = rocml.smi_get_device_utilization(idx) engine_usage = rocml.amdsmi_get_gpu_activity(self._device_handlers[idx])
except Exception as err: except Exception as err:
logger.error('Get device utilization failed: {}'.format(str(err))) logger.warning('Get device utilization failed: {}'.format(str(err)))
return None return None
return util return engine_usage['gfx_activity']
def get_device_temperature(self, idx): def get_device_temperature(self, idx):
"""Get the temperature of device, unit: celsius. """Get the temperature of device, unit: celsius.
...@@ -366,8 +367,16 @@ def get_device_temperature(self, idx): ...@@ -366,8 +367,16 @@ def get_device_temperature(self, idx):
Return: Return:
temp (int): the temperature of device, None means failed to get the data. temp (int): the temperature of device, None means failed to get the data.
""" """
# Currently no API provided in rocml. try:
return None temp = rocml.amdsmi_get_temp_metric(
self._device_handlers[idx], rocml.AmdSmiTemperatureType.EDGE, rocml.AmdSmiTemperatureMetric.CURRENT
)
except (rocml.AmdSmiLibraryException, rocml.AmdSmiParameterException):
pass
except Exception as err:
logger.warning('Get device temperature failed: {}'.format(str(err)))
temp = None
return temp
def get_device_power(self, idx): def get_device_power(self, idx):
"""Get the realtime power of device, unit: watt. """Get the realtime power of device, unit: watt.
...@@ -379,11 +388,11 @@ def get_device_power(self, idx): ...@@ -379,11 +388,11 @@ def get_device_power(self, idx):
temp (int): the realtime power of device, None means failed to get the data. temp (int): the realtime power of device, None means failed to get the data.
""" """
try: try:
power = rocml.smi_get_device_average_power(idx) power_measure = rocml.amdsmi_get_power_info(self._device_handlers[idx])
except Exception as err: except Exception as err:
logger.error('Get device power failed: {}'.format(str(err))) logger.warning('Get device power failed: {}'.format(str(err)))
return None return None
return int(int(power) / 1000) return int(power_measure['average_socket_power'])
def get_device_power_limit(self, idx): def get_device_power_limit(self, idx):
"""Get the power management limit of device, unit: watt. """Get the power management limit of device, unit: watt.
...@@ -394,8 +403,12 @@ def get_device_power_limit(self, idx): ...@@ -394,8 +403,12 @@ def get_device_power_limit(self, idx):
Return: Return:
temp (int): the power management limit of device, None means failed to get the data. temp (int): the power management limit of device, None means failed to get the data.
""" """
# Currently no API provided in rocml. try:
return None power_measure = rocml.amdsmi_get_power_info(self._device_handlers[idx])
except Exception as err:
logger.warning('Get device power limit failed: {}'.format(str(err)))
return None
return int(power_measure['power_limit'])
def get_device_memory(self, idx): def get_device_memory(self, idx):
"""Get the memory information of device, unit: byte. """Get the memory information of device, unit: byte.
...@@ -408,10 +421,10 @@ def get_device_memory(self, idx): ...@@ -408,10 +421,10 @@ def get_device_memory(self, idx):
total (int): the total device memory in bytes, None means failed to get the data. total (int): the total device memory in bytes, None means failed to get the data.
""" """
try: try:
mem_used = rocml.smi_get_device_memory_used(idx) mem_used = rocml.amdsmi_get_gpu_memory_usage(self._device_handlers[idx], rocml.AmdSmiMemoryType.VRAM)
mem_total = rocml.smi_get_device_memory_total(idx) mem_total = rocml.amdsmi_get_gpu_memory_total(self._device_handlers[idx], rocml.AmdSmiMemoryType.VRAM)
except Exception as err: except Exception as err:
logger.error('Get device memory failed: {}'.format(str(err))) logger.warning('Get device memory failed: {}'.format(str(err)))
return None, None return None, None
return mem_used, mem_total return mem_used, mem_total
...@@ -425,8 +438,19 @@ def get_device_ecc_error(self, idx): ...@@ -425,8 +438,19 @@ def get_device_ecc_error(self, idx):
corrected_ecc (int) : the count of single bit ecc error. corrected_ecc (int) : the count of single bit ecc error.
uncorrected_ecc (int): the count of double bit ecc error. uncorrected_ecc (int): the count of double bit ecc error.
""" """
# Currently no API provided in rocml. corrected_ecc = 0
return None, None uncorrected_ecc = 0
for block in rocml.AmdSmiGpuBlock:
try:
ecc_count = rocml.amdsmi_get_gpu_ecc_count(self._device_handlers[idx], block)
corrected_ecc += ecc_count['correctable_count']
uncorrected_ecc += ecc_count['uncorrectable_count']
except (rocml.AmdSmiLibraryException, rocml.AmdSmiParameterException):
pass
except Exception as err:
logger.info('Get device ECC information failed: {}'.format(str(err)))
return corrected_ecc, uncorrected_ecc
device_manager: Optional[DeviceManager] = DeviceManager() device_manager: Optional[DeviceManager] = DeviceManager()
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
# Server: # Server:
# - Product: HPE Apollo 6500 # - Product: HPE Apollo 6500
version: v0.9 version: v0.10
superbench: superbench:
enable: null enable: null
var: var:
......
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
# - Product: G482-Z53 # - Product: G482-Z53
# - Link: https://www.gigabyte.cn/FileUpload/Global/MicroSite/553/G482-Z53.html # - Link: https://www.gigabyte.cn/FileUpload/Global/MicroSite/553/G482-Z53.html
version: v0.9 version: v0.10
superbench: superbench:
enable: null enable: null
var: var:
......
version: v0.9 version: v0.10
superbench: superbench:
enable: null enable: null
monitor: monitor:
......
version: v0.9 version: v0.10
superbench: superbench:
enable: null enable: null
monitor: monitor:
......
version: v0.9 version: v0.10
superbench: superbench:
enable: null enable: null
monitor: monitor:
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
# Azure NDm A100 v4 # Azure NDm A100 v4
# reference: https://docs.microsoft.com/en-us/azure/virtual-machines/ndm-a100-v4-series # reference: https://docs.microsoft.com/en-us/azure/virtual-machines/ndm-a100-v4-series
version: v0.9 version: v0.10
superbench: superbench:
enable: null enable: null
monitor: monitor:
......
# SuperBench Config # SuperBench Config
version: v0.9 version: v0.10
superbench: superbench:
enable: null enable: null
monitor: monitor:
......
# SuperBench Config # SuperBench Config
version: v0.9 version: v0.10
superbench: superbench:
enable: null enable: null
monitor: monitor:
......
...@@ -100,7 +100,7 @@ ...@@ -100,7 +100,7 @@
docker run -itd --name={{ container }} \ docker run -itd --name={{ container }} \
--privileged --net=host --ipc=host \ --privileged --net=host --ipc=host \
{{ '--gpus=all' if nvidia_gpu_exist else '' }} \ {{ '--gpus=all' if nvidia_gpu_exist else '' }} \
{{ '--security-opt seccomp=unconfined --group-add video' if amd_gpu_exist else '' }} \ {{ '--security-opt seccomp=unconfined --group-add video --device=/dev/kfd --device=/dev/dri --cap-add=SYS_PTRACE --shm-size=16G' if amd_gpu_exist else '' }} \
-w /root -v {{ workspace }}:/root -v /mnt:/mnt \ -w /root -v {{ workspace }}:/root -v /mnt:/mnt \
-v /var/run/docker.sock:/var/run/docker.sock \ -v /var/run/docker.sock:/var/run/docker.sock \
--entrypoint /bin/bash {{ docker_image }} && \ --entrypoint /bin/bash {{ docker_image }} && \
......
...@@ -66,6 +66,8 @@ def test_nccl_bw_performance(self, allgather, allreduce, reduce, broadcast, redu ...@@ -66,6 +66,8 @@ def test_nccl_bw_performance(self, allgather, allreduce, reduce, broadcast, redu
assert (benchmark._args.iters == 20) assert (benchmark._args.iters == 20)
assert (benchmark._args.warmup_iters == 5) assert (benchmark._args.warmup_iters == 5)
assert (benchmark._args.graph_iters == 0) assert (benchmark._args.graph_iters == 0)
assert (benchmark._args.in_place is False)
assert (benchmark._args.data_type == 'float')
# Check command list # Check command list
bin_names = [ bin_names = [
...@@ -74,7 +76,7 @@ def test_nccl_bw_performance(self, allgather, allreduce, reduce, broadcast, redu ...@@ -74,7 +76,7 @@ def test_nccl_bw_performance(self, allgather, allreduce, reduce, broadcast, redu
] ]
command = bin_names[0] + benchmark._commands[0].split(bin_names[0])[1] command = bin_names[0] + benchmark._commands[0].split(bin_names[0])[1]
expected_command = '{} -b 8 -e 8G -f 2 -g 8 -c 0 -n 20 -w 5 -G 0'.format(bin_names[0]) expected_command = '{} -b 8 -e 8G -f 2 -g 8 -c 0 -n 20 -w 5 -G 0 -d float'.format(bin_names[0])
assert (command == expected_command) assert (command == expected_command)
# Check results and metrics. # Check results and metrics.
...@@ -91,6 +93,11 @@ def test_nccl_bw_performance(self, allgather, allreduce, reduce, broadcast, redu ...@@ -91,6 +93,11 @@ def test_nccl_bw_performance(self, allgather, allreduce, reduce, broadcast, redu
'alltoall': alltoall, 'alltoall': alltoall,
} }
if 'SB_MODE_SERIAL_INDEX' in os.environ:
os.environ.pop('SB_MODE_SERIAL_INDEX')
if 'SB_MODE_PARALLEL_INDEX' in os.environ:
os.environ.pop('SB_MODE_PARALLEL_INDEX')
for op in raw_output.keys(): for op in raw_output.keys():
benchmark._args.operation = op benchmark._args.operation = op
assert (benchmark._process_raw_result(0, raw_output[op])) assert (benchmark._process_raw_result(0, raw_output[op]))
...@@ -131,3 +138,48 @@ def test_nccl_bw_performance(self, allgather, allreduce, reduce, broadcast, redu ...@@ -131,3 +138,48 @@ def test_nccl_bw_performance(self, allgather, allreduce, reduce, broadcast, redu
assert (benchmark.result['alltoall_0_0:8589934592_time'][0] == 33508.0) assert (benchmark.result['alltoall_0_0:8589934592_time'][0] == 33508.0)
assert (benchmark.result['alltoall_0_0:8589934592_algbw'][0] == 256.36) assert (benchmark.result['alltoall_0_0:8589934592_algbw'][0] == 256.36)
assert (benchmark.result['alltoall_0_0:8589934592_busbw'][0] == 224.31) assert (benchmark.result['alltoall_0_0:8589934592_busbw'][0] == 224.31)
@decorator.load_data('tests/data/nccl_allreduce.log')
@decorator.load_data('tests/data/nccl_alltoall.log')
def test_nccl_bw_performance_in_place_parsing(self, allreduce, alltoall):
"""Test nccl-bw benchmark in-place parsing."""
benchmark_name = 'nccl-bw'
(benchmark_class,
predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
assert (benchmark_class)
benchmark = benchmark_class(benchmark_name, parameters='--ngpus 8 --in_place')
ret = benchmark._preprocess()
assert (ret is True)
assert (benchmark.return_code == ReturnCode.SUCCESS)
assert (benchmark._args.in_place is True)
# Case with valid raw_output
raw_output = {
'allreduce': allreduce,
'alltoall': alltoall,
}
if 'SB_MODE_SERIAL_INDEX' in os.environ:
os.environ.pop('SB_MODE_SERIAL_INDEX')
if 'SB_MODE_PARALLEL_INDEX' in os.environ:
os.environ.pop('SB_MODE_PARALLEL_INDEX')
for op in raw_output.keys():
benchmark._args.operation = op
assert (benchmark._process_raw_result(0, raw_output[op]))
for name in ['time', 'algbw', 'busbw']:
for size in ['8589934592', '4294967296', '2147483648', '1073741824', '536870912', '32']:
metric = op + '_' + size + '_' + name
assert (metric in benchmark.result)
assert (len(benchmark.result[metric]) == 1)
assert (isinstance(benchmark.result[metric][0], numbers.Number))
assert (benchmark.result['allreduce_8589934592_time'][0] == 63959.0)
assert (benchmark.result['allreduce_8589934592_algbw'][0] == 134.30)
assert (benchmark.result['allreduce_8589934592_busbw'][0] == 235.03)
assert (benchmark.result['alltoall_8589934592_time'][0] == 33234.0)
assert (benchmark.result['alltoall_8589934592_algbw'][0] == 258.47)
assert (benchmark.result['alltoall_8589934592_busbw'][0] == 226.16)
...@@ -3,7 +3,6 @@ ...@@ -3,7 +3,6 @@
"""Tests for distributed inference benchmark.""" """Tests for distributed inference benchmark."""
import numbers
import unittest import unittest
from tests.helper import decorator from tests.helper import decorator
...@@ -209,19 +208,17 @@ def _test_dist_inference_result_parsing(self, platform, test_raw_output): ...@@ -209,19 +208,17 @@ def _test_dist_inference_result_parsing(self, platform, test_raw_output):
# step_times # step_times
assert (len(benchmark.raw_data) == 2) assert (len(benchmark.raw_data) == 2)
# return code + (avg, 50th, 90th, 95th, 99th, 99.9th) # return code + (avg, 50th, 90th, 95th, 99th, 99.9th)
test_latency = float(test_raw_output.splitlines()[-1].split(' ms per iteration')[0].split()[-1])
assert (7 == len(benchmark.result)) assert (7 == len(benchmark.result))
for output_key in benchmark.result: assert (benchmark.result['return_code'] == [0])
if output_key == 'return_code': assert (benchmark.result['step_times'] == [1.9052048])
assert (benchmark.result[output_key] == [0]) assert (benchmark.result['step_times_50'] == [1.851])
else: assert (benchmark.result['step_times_90'] == [1.89637])
assert (output_key.startswith('step_times')) assert (benchmark.result['step_times_95'] == [2.12037])
assert (len(benchmark.result[output_key]) == 1) assert (benchmark.result['step_times_99'] == [2.67155])
assert (isinstance(benchmark.result[output_key][0], numbers.Number)) assert (benchmark.result['step_times_99.9'] == [4.4198])
assert (test_latency == benchmark.result[output_key][0])
# Negative case - invalid raw output. # Negative case - invalid raw output.
assert (benchmark._process_raw_result(1, 'Invalid raw output') is False) assert (benchmark._process_raw_result(1, 'Latency of step: xxx ms') is False)
assert (benchmark.return_code == ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE) assert (benchmark.return_code == ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
@decorator.cuda_test @decorator.cuda_test
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment