Unverified commit 2c88db90, authored by Yifan Xiong, committed by GitHub
Browse files

Release - SuperBench v0.10.0 (#607)



**Description**

Cherry-pick bug fixes from v0.10.0 to main.

**Major Revisions**

* Benchmarks: Microbenchmark - Support different hipblasLt data types in dist_inference #590
* Benchmarks: Microbenchmark - Support in-place for NCCL/RCCL benchmark #591
* Bug Fix - Fix NUMA Domains Swap Issue in NDv4 Topology File #592
* Benchmarks: Microbenchmark - Add data type option for NCCL and RCCL tests #595
* Benchmarks: Bug Fix - Make metrics of dist-inference-cpp aligned with PyTorch version #596
* CI/CD - Add ndv5 topo file #597
* Benchmarks: Microbenchmark - Improve AMD GPU P2P performance with fine-grained GPU memory #593
* Benchmarks: Build Pipeline - fix nccl and nccl test version to 2.18.3 to resolve hang issue in cuda12.2 docker #599
* Dockerfile - Bug fix for rocm docker build and deploy #598
* Benchmarks: Microbenchmark - Adapt to hipblasLt data type changes #603
* Benchmarks: Micro benchmarks - Update hipblaslt metric unit to tflops #604
* Monitor - Upgrade pyrsmi to amdsmi python library. #601
* Benchmarks: Micro benchmarks - add fp8 and initialization for hipblaslt benchmark #605
* Dockerfile - Add rocm6.0 dockerfile #602
* Bug Fix - Bug fix for latest megatron-lm benchmark #600
* Docs - Upgrade version and release note #606
Co-authored-by: Ziyue Yang <ziyyang@microsoft.com>
Co-authored-by: Yang Wang <yangwang1@microsoft.com>
Co-authored-by: Yuting Jiang <yutingjiang@microsoft.com>
Co-authored-by: guoshzhao <guzhao@microsoft.com>
parent 2c2096ed
...@@ -31,6 +31,14 @@ else() ...@@ -31,6 +31,14 @@ else()
# link hip device lib # link hip device lib
add_executable(dist_inference dist_inference.cpp) add_executable(dist_inference dist_inference.cpp)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -DROCM_USE_FLOAT16=1") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -DROCM_USE_FLOAT16=1")
if(DEFINED ENV{USE_HIPBLASLT_DATATYPE})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_HIPBLASLT_DATATYPE=1")
elseif(DEFINED ENV{USE_HIP_DATATYPE})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_HIP_DATATYPE=1")
endif()
if(DEFINED ENV{USE_HIPBLAS_COMPUTETYPE})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_HIPBLAS_COMPUTETYPE=1")
endif()
target_link_libraries(dist_inference MPI::MPI_CXX rccl hipblaslt hip::device) target_link_libraries(dist_inference MPI::MPI_CXX rccl hipblaslt hip::device)
else() else()
message(FATAL_ERROR "No CUDA or ROCm environment found.") message(FATAL_ERROR "No CUDA or ROCm environment found.")
......
...@@ -45,6 +45,21 @@ ...@@ -45,6 +45,21 @@
#include <hipblaslt/hipblaslt.h> #include <hipblaslt/hipblaslt.h>
#include <rccl/rccl.h> #include <rccl/rccl.h>
using cublasLtHalf = hipblasLtHalf; using cublasLtHalf = hipblasLtHalf;
#if defined(USE_HIPBLASLT_DATATYPE)
#define DIST_INF_HIP_DATATYPE_R_16F HIPBLASLT_R_16F
#define DIST_INF_HIP_DATATYPE_R_32F HIPBLASLT_R_32F
#elif defined(USE_HIP_DATATYPE)
#define DIST_INF_HIP_DATATYPE_R_16F HIP_R_16F
#define DIST_INF_HIP_DATATYPE_R_32F HIP_R_32F
#else
#define DIST_INF_HIP_DATATYPE_R_16F HIPBLAS_R_16F
#define DIST_INF_HIP_DATATYPE_R_32F HIPBLAS_R_32F
#endif
#if defined(USE_HIPBLAS_COMPUTETYPE)
#define DIST_INF_HIP_COMPUTETYPE_F32 HIPBLAS_COMPUTE_32F
#else
#define DIST_INF_HIP_COMPUTETYPE_F32 HIPBLASLT_COMPUTE_F32
#endif
#else #else
#include <cublasLt.h> #include <cublasLt.h>
#include <cuda_fp16.h> #include <cuda_fp16.h>
...@@ -229,16 +244,18 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t ...@@ -229,16 +244,18 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t
CHECK_CUBLASLT_ERROR(hipblasLtCreate(&handle)); CHECK_CUBLASLT_ERROR(hipblasLtCreate(&handle));
CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matA, HIPBLAS_R_16F, k, n, k)); CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matA, DIST_INF_HIP_DATATYPE_R_16F, k, n, k));
CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matB, HIPBLAS_R_16F, m, k, m)); CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matB, DIST_INF_HIP_DATATYPE_R_16F, m, k, m));
CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matC, HIPBLAS_R_16F, m, n, m)); CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matC, DIST_INF_HIP_DATATYPE_R_16F, m, n, m));
CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matD, HIPBLAS_R_16F, m, n, m)); CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matD, DIST_INF_HIP_DATATYPE_R_16F, m, n, m));
CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matE, HIPBLAS_R_16F, k, m, k)); CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matE, DIST_INF_HIP_DATATYPE_R_16F, k, m, k));
CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matF, HIPBLAS_R_16F, k, n, k)); CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matF, DIST_INF_HIP_DATATYPE_R_16F, k, n, k));
CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matG, HIPBLAS_R_16F, k, n, k)); CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matG, DIST_INF_HIP_DATATYPE_R_16F, k, n, k));
CHECK_CUBLASLT_ERROR(hipblasLtMatmulDescCreate(&matmul1, HIPBLASLT_COMPUTE_F32, HIPBLAS_R_32F)); CHECK_CUBLASLT_ERROR(
CHECK_CUBLASLT_ERROR(hipblasLtMatmulDescCreate(&matmul2, HIPBLASLT_COMPUTE_F32, HIPBLAS_R_32F)); hipblasLtMatmulDescCreate(&matmul1, DIST_INF_HIP_COMPUTETYPE_F32, DIST_INF_HIP_DATATYPE_R_32F));
CHECK_CUBLASLT_ERROR(
hipblasLtMatmulDescCreate(&matmul2, DIST_INF_HIP_COMPUTETYPE_F32, DIST_INF_HIP_DATATYPE_R_32F));
hipblasOperation_t trans = HIPBLAS_OP_N; hipblasOperation_t trans = HIPBLAS_OP_N;
CHECK_CUBLASLT_ERROR( CHECK_CUBLASLT_ERROR(
...@@ -336,8 +353,9 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t ...@@ -336,8 +353,9 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t
#endif #endif
std::chrono::steady_clock::time_point start_time, stop_time; std::chrono::steady_clock::time_point start_time, stop_time;
std::vector<double> step_times(num_iters, 0.);
for (int i = 0; i < num_warmups + num_iters; ++i) { for (int i = 0; i < num_warmups + num_iters; ++i) {
if (i == num_warmups) { if (i >= num_warmups) {
start_time = std::chrono::steady_clock::now(); start_time = std::chrono::steady_clock::now();
} }
#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && (CUDART_VERSION >= 11030 || HIP_VERSION >= 50221310) #if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && (CUDART_VERSION >= 11030 || HIP_VERSION >= 50221310)
...@@ -350,11 +368,15 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t ...@@ -350,11 +368,15 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t
model_forward(); model_forward();
#endif #endif
CHECK_CUDA_ERROR(cudaStreamSynchronize(stream)); CHECK_CUDA_ERROR(cudaStreamSynchronize(stream));
if (i >= num_warmups) {
stop_time = std::chrono::steady_clock::now();
double step_time = std::chrono::duration_cast<std::chrono::nanoseconds>(stop_time - start_time).count();
step_times[i - num_warmups] = step_time;
}
}
for (int i = 0; i < num_iters; i++) {
fprintf(stdout, "Latency of step %d: %g ms\n", i, step_times[i] / 1e6);
} }
stop_time = std::chrono::steady_clock::now();
double duration = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count();
fprintf(stdout, "Time: %g ms in total, %g ms per iteration, %g ms per layer\n", duration, duration / num_iters,
duration / num_iters / num_layers);
#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && (CUDART_VERSION >= 11030 || HIP_VERSION >= 50221310) #if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && (CUDART_VERSION >= 11030 || HIP_VERSION >= 50221310)
// Destroy graph // Destroy graph
......
...@@ -27,6 +27,13 @@ else() ...@@ -27,6 +27,13 @@ else()
# link hip device lib # link hip device lib
add_executable(gpu_copy gpu_copy.cpp) add_executable(gpu_copy gpu_copy.cpp)
include(CheckSymbolExists)
check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY)
if(${HIP_UNCACHED_MEMORY})
target_compile_definitions(gpu_copy PRIVATE HIP_UNCACHED_MEMORY)
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
target_link_libraries(gpu_copy numa hip::device) target_link_libraries(gpu_copy numa hip::device)
else() else()
......
...@@ -313,6 +313,25 @@ int SetGpu(int gpu_id) { ...@@ -313,6 +313,25 @@ int SetGpu(int gpu_id) {
return 0; return 0;
} }
#if defined(__HIP_PLATFORM_AMD__)
bool UseFineGrained(const SubBenchArgs &args) {
return args.is_src_dev_gpu && args.is_dst_dev_gpu && args.src_gpu_id != args.dst_gpu_id;
}
cudaError_t GpuMallocDataBuf(uint8_t **ptr, uint64_t size, bool use_fine_grained) {
if (use_fine_grained) {
#if defined(HIP_UNCACHED_MEMORY)
return hipExtMallocWithFlags((void **)ptr, size, hipDeviceMallocUncached);
#else
return hipExtMallocWithFlags((void **)ptr, size, hipDeviceMallocFinegrained);
#endif
} else {
return cudaMalloc(ptr, size);
}
}
#else
cudaError_t GpuMallocDataBuf(uint8_t **ptr, uint64_t size) { return cudaMalloc(ptr, size); }
#endif
// Prepare data buffers and streams to be used. // Prepare data buffers and streams to be used.
int PrepareBufAndStream(BenchArgs *args) { int PrepareBufAndStream(BenchArgs *args) {
cudaError_t cuda_err = cudaSuccess; cudaError_t cuda_err = cudaSuccess;
...@@ -346,7 +365,11 @@ int PrepareBufAndStream(BenchArgs *args) { ...@@ -346,7 +365,11 @@ int PrepareBufAndStream(BenchArgs *args) {
return -1; return -1;
} }
*(host_buf_ptrs[j]) = nullptr; *(host_buf_ptrs[j]) = nullptr;
cuda_err = cudaMalloc(gpu_buf_ptrs[j], args->size); #if defined(__HIP_PLATFORM_AMD__)
cuda_err = GpuMallocDataBuf(gpu_buf_ptrs[j], args->size, UseFineGrained(sub));
#else
cuda_err = GpuMallocDataBuf(gpu_buf_ptrs[j], args->size);
#endif
if (cuda_err != cudaSuccess) { if (cuda_err != cudaSuccess) {
fprintf(stderr, "PrepareBufAndStream::cudaMalloc error: %d\n", cuda_err); fprintf(stderr, "PrepareBufAndStream::cudaMalloc error: %d\n", cuda_err);
return -1; return -1;
...@@ -876,7 +899,11 @@ int RunAllToAllBench(const Opts &opts, int gpu_count, int src_rank, int dst_rank ...@@ -876,7 +899,11 @@ int RunAllToAllBench(const Opts &opts, int gpu_count, int src_rank, int dst_rank
} }
// Prepare source buffers // Prepare source buffers
cuda_err = cudaMalloc(&(src_buffers_gpu[rank]), opts.size); #if defined(__HIP_PLATFORM_AMD__)
cuda_err = GpuMallocDataBuf(&(src_buffers_gpu[rank]), opts.size, true);
#else
cuda_err = GpuMallocDataBuf(&(src_buffers_gpu[rank]), opts.size);
#endif
if (cuda_err != cudaSuccess) { if (cuda_err != cudaSuccess) {
fprintf(stderr, "RunAllToAllBench::cudaMalloc for src_buffers_gpu[%d] error: %d\n", cuda_err, rank); fprintf(stderr, "RunAllToAllBench::cudaMalloc for src_buffers_gpu[%d] error: %d\n", cuda_err, rank);
return -1; return -1;
...@@ -893,7 +920,11 @@ int RunAllToAllBench(const Opts &opts, int gpu_count, int src_rank, int dst_rank ...@@ -893,7 +920,11 @@ int RunAllToAllBench(const Opts &opts, int gpu_count, int src_rank, int dst_rank
} }
// Prepare destination buffers // Prepare destination buffers
cuda_err = cudaMalloc(&(dst_buffers_gpu[rank]), opts.size); #if defined(__HIP_PLATFORM_AMD__)
cuda_err = GpuMallocDataBuf(&(dst_buffers_gpu[rank]), opts.size, true);
#else
cuda_err = GpuMallocDataBuf(&(dst_buffers_gpu[rank]), opts.size);
#endif
if (cuda_err != cudaSuccess) { if (cuda_err != cudaSuccess) {
fprintf(stderr, "RunAllToAllBench::cudaMalloc for dst_buffers_gpu[%d] error: %d\n", cuda_err, rank); fprintf(stderr, "RunAllToAllBench::cudaMalloc for dst_buffers_gpu[%d] error: %d\n", cuda_err, rank);
return -1; return -1;
......
...@@ -4,7 +4,6 @@ ...@@ -4,7 +4,6 @@
"""Module of the hipBlasLt GEMM benchmark.""" """Module of the hipBlasLt GEMM benchmark."""
import os import os
import re
from superbench.common.utils import logger from superbench.common.utils import logger
from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
...@@ -23,11 +22,12 @@ def __init__(self, name, parameters=''): ...@@ -23,11 +22,12 @@ def __init__(self, name, parameters=''):
super().__init__(name, parameters) super().__init__(name, parameters)
self._bin_name = 'hipblaslt-bench' self._bin_name = 'hipblaslt-bench'
self._in_types = ['fp32', 'fp16', 'bf16'] self._in_types = ['fp32', 'fp16', 'bf16', 'fp8']
self._in_type_map = { self._in_type_map = {
'fp16': '--a_type f16_r --b_type f16_r --c_type f16_r --d_type f16_r --compute_type f32_r', 'fp16': '--a_type f16_r --b_type f16_r --c_type f16_r --d_type f16_r --compute_type f32_r',
'fp32': '--a_type f32_r --b_type f32_r --c_type f32_r --d_type f32_r --compute_type f32_r', 'fp32': '--a_type f32_r --b_type f32_r --c_type f32_r --d_type f32_r --compute_type f32_r',
'bf16': '--a_type bf16_r --b_type bf16_r --c_type bf16_r --d_type bf16_r --compute_type f32_r', 'bf16': '--a_type bf16_r --b_type bf16_r --c_type bf16_r --d_type bf16_r --compute_type f32_r',
'fp8': '--a_type f8_r --b_type f8_r --c_type f8_r --d_type f8_r --compute_type f32_r',
} }
def add_parser_arguments(self): def add_parser_arguments(self):
...@@ -42,6 +42,30 @@ def add_parser_arguments(self): ...@@ -42,6 +42,30 @@ def add_parser_arguments(self):
required=False, required=False,
help='List of input data types, support {}.'.format(' '.join(self._in_types)), help='List of input data types, support {}.'.format(' '.join(self._in_types)),
) )
self._parser.add_argument(
'--initialization',
type=str,
default='rand_int',
choices=['trig_float', 'rand_int', 'hpl'],
required=False,
help='Initialize matrix data.',
)
self._parser.add_argument(
'--transA',
type=str,
default='N',
choices=['N', 'T', 'C'],
required=False,
help='Transpose matrix A.',
)
self._parser.add_argument(
'--transB',
type=str,
default='N',
choices=['N', 'T', 'C'],
required=False,
help='Transpose matrix B.',
)
def _preprocess(self): def _preprocess(self):
"""Preprocess/preparation operations before the benchmarking. """Preprocess/preparation operations before the benchmarking.
...@@ -58,7 +82,9 @@ def _preprocess(self): ...@@ -58,7 +82,9 @@ def _preprocess(self):
self._precision_in_commands = [] self._precision_in_commands = []
for (_m, _n, _k, _b, _in_type) in self._shapes_to_run: for (_m, _n, _k, _b, _in_type) in self._shapes_to_run:
command = f'{self.__bin_path} -m {_m} -n {_n} -k {_k} -j {self._args.num_warmup}' + \ command = f'{self.__bin_path} -m {_m} -n {_n} -k {_k} -j {self._args.num_warmup}' + \
f' -i {self._args.num_steps} {self._in_type_map[_in_type]}' f' -i {self._args.num_steps} {self._in_type_map[_in_type]}' + \
f' --transA {self._args.transA} --transB {self._args.transB}' + \
f' --initialization {self._args.initialization}'
command = command + f' -b {str(_b)}' if _b > 0 else command command = command + f' -b {str(_b)}' if _b > 0 else command
logger.info(command) logger.info(command)
self._commands.append(command) self._commands.append(command)
...@@ -97,13 +123,12 @@ def _process_raw_result(self, cmd_idx, raw_output): ...@@ -97,13 +123,12 @@ def _process_raw_result(self, cmd_idx, raw_output):
fields = lines[index + 1].strip().split(',') fields = lines[index + 1].strip().split(',')
# Check the number of fields and the format of the first two fields # Check the number of fields and the format of the first two fields
if len(fields) != 23 or not all( if len(fields) != 23:
re.match(r'\d*\.\d*$', item.strip()) or item.strip().isdigit() for item in fields[-2:]
):
raise ValueError('Invalid result') raise ValueError('Invalid result')
self._result.add_result( self._result.add_result(
f'{self._precision_in_commands[cmd_idx]}_{fields[3]}_{"_".join(fields[4:7])}_flops', float(fields[-2]) f'{self._precision_in_commands[cmd_idx]}_{fields[3]}_{"_".join(fields[4:7])}_flops',
float(fields[-2]) / 1000
) )
except BaseException as e: except BaseException as e:
self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE) self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
......
...@@ -45,8 +45,7 @@ message(STATUS "CMAKE HIP ARCHITECTURES: ${CMAKE_HIP_ARCHITECTURES}") ...@@ -45,8 +45,7 @@ message(STATUS "CMAKE HIP ARCHITECTURES: ${CMAKE_HIP_ARCHITECTURES}")
if(EXISTS ${HIP_PATH}) if(EXISTS ${HIP_PATH})
# Search for hip in common locations # Search for hip in common locations
list(APPEND CMAKE_PREFIX_PATH ${HIP_PATH} ${ROCM_PATH}) list(APPEND CMAKE_PREFIX_PATH ${HIP_PATH} ${ROCM_PATH} ${ROCM_PATH}/hsa ${ROCM_PATH}/hip ${ROCM_PATH}/share/rocm/cmake/)
set(CMAKE_PREFIX_PATH /opt/rocm ROCM_PATH)
set(CMAKE_CXX_COMPILER "${HIP_PATH}/bin/hipcc") set(CMAKE_CXX_COMPILER "${HIP_PATH}/bin/hipcc")
set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
set(CMAKE_MODULE_PATH "${HIP_PATH}/lib/cmake/hip" ${CMAKE_MODULE_PATH}) set(CMAKE_MODULE_PATH "${HIP_PATH}/lib/cmake/hip" ${CMAKE_MODULE_PATH})
......
...@@ -116,6 +116,9 @@ def add_parser_arguments(self): ...@@ -116,6 +116,9 @@ def add_parser_arguments(self):
self._parser.add_argument('--data_home', type=str, default='/tmp', help='Data home.') self._parser.add_argument('--data_home', type=str, default='/tmp', help='Data home.')
self._parser.add_argument('--vocab_path', type=str, default='/tmp/gpt2-vocab.json', help='Vocab path.') self._parser.add_argument('--vocab_path', type=str, default='/tmp/gpt2-vocab.json', help='Vocab path.')
self._parser.add_argument('--merge_path', type=str, default='/tmp/gpt2-merges.txt', help='Merge path.') self._parser.add_argument('--merge_path', type=str, default='/tmp/gpt2-merges.txt', help='Merge path.')
self._parser.add_argument(
'--split', type=str, default='949,50,1', help='Split dataset ratio for train/val/test.'
)
self._parser.add_argument('--prescale_grad', action='store_true', help='Prescale grad.') self._parser.add_argument('--prescale_grad', action='store_true', help='Prescale grad.')
self._parser.add_argument( self._parser.add_argument(
'--hostfile', type=str, default=None, help='Hostfile to run the mutli-node benchmark.' '--hostfile', type=str, default=None, help='Hostfile to run the mutli-node benchmark.'
...@@ -128,6 +131,13 @@ def add_parser_arguments(self): ...@@ -128,6 +131,13 @@ def add_parser_arguments(self):
def _preprocess(self): def _preprocess(self):
if not super()._preprocess(): if not super()._preprocess():
return False return False
if not self._args.code_base:
if self._args.deepspeed:
self._args.code_base = os.path.join(
os.getenv('SB_MICRO_PATH'), 'third_party/Megatron/Megatron-DeepSpeed/'
)
else:
self._args.code_base = os.path.join(os.getenv('SB_MICRO_PATH'), 'third_party/Megatron/Megatron-LM')
if not os.path.exists(self._args.code_base) or \ if not os.path.exists(self._args.code_base) or \
not os.path.exists(os.path.join(self._args.code_base, 'pretrain_gpt.py')): not os.path.exists(os.path.join(self._args.code_base, 'pretrain_gpt.py')):
...@@ -156,35 +166,35 @@ def _preprocess(self): ...@@ -156,35 +166,35 @@ def _preprocess(self):
def _parse_log(self, output): def _parse_log(self, output):
"""Parse log output and get the performance.""" """Parse log output and get the performance."""
tflops_pattern = re.compile(r'TFLOPs: (\d+\.\d+)') tflops_pattern = re.compile(r'(TFLOPs|TFLOP/s/GPU\)): (\d+\.\d+)')
elapsed_time_pattern = re.compile(r'elapsed time per iteration \(ms\): (\d+\.\d+)') elapsed_time_pattern = re.compile(r'elapsed time per iteration \(ms\): (\d+\.\d+)')
mem_allocated_pattern = re.compile(r'MemAllocated=([\d.]+)[KMGTPEZY]?B') mem_allocated_pattern = re.compile(r'allocated: (\d+\.\d+)')
max_mem_allocated_pattern = re.compile(r'MaxMemAllocated=([\d.]+)[KMGTPEZY]?B') max_mem_allocated_pattern = re.compile(r'max allocated: (\d+\.\d+)')
lines = output.splitlines() lines = output.splitlines()
tflops = [] tflops = []
mem_allocated = [] mem_allocated = []
max_mem_allocated = [] max_mem_allocated = []
iteration_times = [] iteration_times = []
for line in lines: for line in lines:
if 'TFLOPs' in line: if 'elapsed time per iteration' in line:
tflops_matches = tflops_pattern.search(line) tflops_matches = tflops_pattern.search(line)
elapsed_time_match = elapsed_time_pattern.search(line) elapsed_time_match = elapsed_time_pattern.search(line)
if tflops_matches: if tflops_matches:
tflops_values = float(tflops_matches.group(1)) tflops_values = float(tflops_matches.group(2))
tflops.append(tflops_values) tflops.append(tflops_values)
if elapsed_time_match: if elapsed_time_match:
elapsed_time_value = float(elapsed_time_match.group(1)) elapsed_time_value = float(elapsed_time_match.group(1))
iteration_times.append(elapsed_time_value) iteration_times.append(elapsed_time_value)
if 'MaxMemAllocated' in line: if 'max allocated' in line:
mem_allocated_match = mem_allocated_pattern.search(line) mem_allocated_match = mem_allocated_pattern.search(line)
max_mem_allocated_match = max_mem_allocated_pattern.search(line) max_mem_allocated_match = max_mem_allocated_pattern.search(line)
if mem_allocated_match: if mem_allocated_match:
mem_allocated_value = float(mem_allocated_match.group(1)) mem_allocated_value = float(mem_allocated_match.group(1)) / 1024
mem_allocated.append(mem_allocated_value) mem_allocated.append(mem_allocated_value)
if max_mem_allocated_match: if max_mem_allocated_match:
max_mem_allocated_value = float(max_mem_allocated_match.group(1)) max_mem_allocated_value = float(max_mem_allocated_match.group(1)) / 1024
max_mem_allocated.append(max_mem_allocated_value) max_mem_allocated.append(max_mem_allocated_value)
return iteration_times, tflops, mem_allocated, max_mem_allocated return iteration_times, tflops, mem_allocated, max_mem_allocated
...@@ -224,7 +234,9 @@ def __prepare_deespeed_config(self, precision_megatron): ...@@ -224,7 +234,9 @@ def __prepare_deespeed_config(self, precision_megatron):
--deepspeed \ --deepspeed \
--deepspeed_config {self._config_json_path} \ --deepspeed_config {self._config_json_path} \
--zero-stage {self._args.zero_stage} \ --zero-stage {self._args.zero_stage} \
--pipeline-model-parallel-size {self._args.pipeline_model_parallel_size}' --pipeline-model-parallel-size {self._args.pipeline_model_parallel_size} \
--train-tokens {self._args.train_tokens} \
--data-impl {self._args.data_impl}'
if self._args.pipeline_model_parallel_size <= 1: if self._args.pipeline_model_parallel_size <= 1:
deepspeed_options = f'{deepspeed_options} --no-pipeline-parallel' deepspeed_options = f'{deepspeed_options} --no-pipeline-parallel'
...@@ -255,11 +267,10 @@ def _megatron_command(self, precision): # noqa: C901 ...@@ -255,11 +267,10 @@ def _megatron_command(self, precision): # noqa: C901
--num-attention-heads {self._args.num_attn_heads} \ --num-attention-heads {self._args.num_attn_heads} \
--seq-length {self._args.seq_len} \ --seq-length {self._args.seq_len} \
--max-position-embeddings {self._args.seq_len} \ --max-position-embeddings {self._args.seq_len} \
--train-tokens {self._args.train_tokens} \
--train-samples {self._args.num_steps * self._args.batch_size} \ --train-samples {self._args.num_steps * self._args.batch_size} \
--lr {self._args.lr} \ --lr {self._args.lr} \
--min-lr {self._args.min_lr} \ --min-lr {self._args.min_lr} \
--split 949,50,1 \ --split {self._args.split} \
--log-interval {self._args.log_interval} \ --log-interval {self._args.log_interval} \
--eval-interval {self._args.eval_interval} \ --eval-interval {self._args.eval_interval} \
--eval-iters {self._args.eval_iters} \ --eval-iters {self._args.eval_iters} \
...@@ -273,7 +284,8 @@ def _megatron_command(self, precision): # noqa: C901 ...@@ -273,7 +284,8 @@ def _megatron_command(self, precision): # noqa: C901
--optimizer adam \ --optimizer adam \
--use-distributed-optimizer \ --use-distributed-optimizer \
{precision_megatron} \ {precision_megatron} \
--seed {self._args.seed}' --seed {self._args.seed} \
--log-throughput'
if self._args.sequence_parallel: if self._args.sequence_parallel:
megatron_options = f'{megatron_options} --sequence-parallel' megatron_options = f'{megatron_options} --sequence-parallel'
...@@ -298,6 +310,8 @@ def _megatron_command(self, precision): # noqa: C901 ...@@ -298,6 +310,8 @@ def _megatron_command(self, precision): # noqa: C901
script_path = os.path.join(self._args.code_base, 'pretrain_gpt.py') script_path = os.path.join(self._args.code_base, 'pretrain_gpt.py')
if self._args.deepspeed: if self._args.deepspeed:
deepspeed_option = self.__prepare_deespeed_config(precision_megatron.lstrip('--')) deepspeed_option = self.__prepare_deespeed_config(precision_megatron.lstrip('--'))
# No --log-throughput in Megatron-DeepSpeed by 20231219
megatron_options = megatron_options.replace('--log-throughput', '').strip()
if self._num_nodes > 1: if self._num_nodes > 1:
command = f'torchrun {self._distributed_args} ' + \ command = f'torchrun {self._distributed_args} ' + \
f'{script_path} {megatron_options} {self._data_options} {deepspeed_option}' f'{script_path} {megatron_options} {self._data_options} {deepspeed_option}'
...@@ -379,6 +393,7 @@ def _init_distributed_setting(self): ...@@ -379,6 +393,7 @@ def _init_distributed_setting(self):
return False return False
self._num_nodes = int(os.getenv('OMPI_COMM_WORLD_SIZE')) // int(os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE')) self._num_nodes = int(os.getenv('OMPI_COMM_WORLD_SIZE')) // int(os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE'))
master_addr = 'localhost'
if self._num_nodes > 1: if self._num_nodes > 1:
if not self._args.hostfile: if not self._args.hostfile:
sb_hostfile = os.path.join(os.environ.get('SB_WORKSPACE', '.'), 'hostfile') sb_hostfile = os.path.join(os.environ.get('SB_WORKSPACE', '.'), 'hostfile')
...@@ -395,12 +410,13 @@ def _init_distributed_setting(self): ...@@ -395,12 +410,13 @@ def _init_distributed_setting(self):
if self._num_nodes != len(hosts): if self._num_nodes != len(hosts):
logger.error('MPI init failed since hostfile not match the MPI setting.') logger.error('MPI init failed since hostfile not match the MPI setting.')
return False return False
master_addr = hosts[0].split()[0]
addr = os.getenv('MASTER_ADDR', hosts[0].split()[0]) addr = os.getenv('MASTER_ADDR', master_addr)
port = os.getenv('MASTER_PORT', '29500') port = os.getenv('MASTER_PORT', '29500')
node_rank = int(os.environ['OMPI_COMM_WORLD_RANK']) // int(os.environ['OMPI_COMM_WORLD_LOCAL_SIZE']) node_rank = int(os.environ['OMPI_COMM_WORLD_RANK']) // int(os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'])
self._distributed_args = f'--nproc_per_node {self._args.num_gpus} --nnodes {self._num_nodes} ' + \ self._distributed_args = f'--nproc_per_node {self._args.num_gpus} --nnodes {self._num_nodes} ' + \
f'--node_rank {node_rank} --master_addr {addr} --master_port {port}' f'--node_rank {node_rank} --master_addr {addr} --master_port {port}'
return True return True
def _generate_dataset(self): def _generate_dataset(self):
...@@ -448,8 +464,7 @@ def _generate_dataset(self): ...@@ -448,8 +464,7 @@ def _generate_dataset(self):
self._data_options = f'\ self._data_options = f'\
--vocab-file {self._vocab_path} \ --vocab-file {self._vocab_path} \
--merge-file {self._merges_path} \ --merge-file {self._merges_path} \
--data-path {self._data_path} \ --data-path {self._data_path}'
--data-impl {self._args.data_impl}'
logger.info('Dataset preparation successfully.') logger.info('Dataset preparation successfully.')
return True return True
......
...@@ -265,8 +265,8 @@ def __train(self, precision): ...@@ -265,8 +265,8 @@ def __train(self, precision):
# The unit of step time should be millisecond. # The unit of step time should be millisecond.
step_times = self._train_step(precision) step_times = self._train_step(precision)
if isinstance(step_times, tuple): if isinstance(step_times, tuple):
step_times = step_times[0]
info = step_times[1] info = step_times[1]
step_times = step_times[0]
self._process_info(ModelAction.TRAIN, precision, info) self._process_info(ModelAction.TRAIN, precision, info)
step_times = self.__process_model_result(ModelAction.TRAIN, precision, step_times) step_times = self.__process_model_result(ModelAction.TRAIN, precision, step_times)
if not step_times: if not step_times:
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
if gpu.vendor == 'nvidia' or gpu.vendor == 'nvidia-graphics': if gpu.vendor == 'nvidia' or gpu.vendor == 'nvidia-graphics':
import py3nvml.py3nvml as nvml import py3nvml.py3nvml as nvml
elif gpu.vendor == 'amd' or gpu.vendor == 'amd-graphics': elif gpu.vendor == 'amd' or gpu.vendor == 'amd-graphics':
from pyrsmi import rocml import amdsmi as rocml
class DeviceManager: class DeviceManager:
...@@ -150,7 +150,7 @@ def get_device_compute_capability(self): ...@@ -150,7 +150,7 @@ def get_device_compute_capability(self):
try: try:
cap = nvml.nvmlDeviceGetCudaComputeCapability(self._device_handlers[0]) cap = nvml.nvmlDeviceGetCudaComputeCapability(self._device_handlers[0])
except Exception as err: except Exception as err:
logger.error('Get device compute capability failed: {}'.format(str(err))) logger.warning('Get device compute capability failed: {}'.format(str(err)))
return None return None
return cap return cap
...@@ -166,7 +166,7 @@ def get_device_utilization(self, idx): ...@@ -166,7 +166,7 @@ def get_device_utilization(self, idx):
try: try:
util = nvml.nvmlDeviceGetUtilizationRates(self._device_handlers[idx]) util = nvml.nvmlDeviceGetUtilizationRates(self._device_handlers[idx])
except Exception as err: except Exception as err:
logger.error('Get device utilization failed: {}'.format(str(err))) logger.warning('Get device utilization failed: {}'.format(str(err)))
return None return None
return util.gpu return util.gpu
...@@ -182,7 +182,7 @@ def get_device_temperature(self, idx): ...@@ -182,7 +182,7 @@ def get_device_temperature(self, idx):
try: try:
temp = nvml.nvmlDeviceGetTemperature(self._device_handlers[idx], nvml.NVML_TEMPERATURE_GPU) temp = nvml.nvmlDeviceGetTemperature(self._device_handlers[idx], nvml.NVML_TEMPERATURE_GPU)
except Exception as err: except Exception as err:
logger.error('Get device temperature failed: {}'.format(str(err))) logger.warning('Get device temperature failed: {}'.format(str(err)))
temp = None temp = None
return temp return temp
...@@ -198,7 +198,7 @@ def get_device_power(self, idx): ...@@ -198,7 +198,7 @@ def get_device_power(self, idx):
try: try:
power = nvml.nvmlDeviceGetPowerUsage(self._device_handlers[idx]) power = nvml.nvmlDeviceGetPowerUsage(self._device_handlers[idx])
except Exception as err: except Exception as err:
logger.error('Get device power failed: {}'.format(str(err))) logger.warning('Get device power failed: {}'.format(str(err)))
return None return None
return int(int(power) / 1000) return int(int(power) / 1000)
...@@ -214,7 +214,7 @@ def get_device_power_limit(self, idx): ...@@ -214,7 +214,7 @@ def get_device_power_limit(self, idx):
try: try:
powerlimit = nvml.nvmlDeviceGetPowerManagementLimit(self._device_handlers[idx]) powerlimit = nvml.nvmlDeviceGetPowerManagementLimit(self._device_handlers[idx])
except Exception as err: except Exception as err:
logger.error('Get device power limitation failed: {}'.format(str(err))) logger.warning('Get device power limitation failed: {}'.format(str(err)))
return None return None
return int(int(powerlimit) / 1000) return int(int(powerlimit) / 1000)
...@@ -231,7 +231,7 @@ def get_device_memory(self, idx): ...@@ -231,7 +231,7 @@ def get_device_memory(self, idx):
try: try:
mem = nvml.nvmlDeviceGetMemoryInfo(self._device_handlers[idx]) mem = nvml.nvmlDeviceGetMemoryInfo(self._device_handlers[idx])
except Exception as err: except Exception as err:
logger.error('Get device memory failed: {}'.format(str(err))) logger.warning('Get device memory failed: {}'.format(str(err)))
return None, None return None, None
return mem.used, mem.total return mem.used, mem.total
...@@ -304,7 +304,7 @@ def get_device_ecc_error(self, idx): ...@@ -304,7 +304,7 @@ def get_device_ecc_error(self, idx):
except nvml.NVMLError: except nvml.NVMLError:
pass pass
except Exception as err: except Exception as err:
logger.error('Get device ECC information failed: {}'.format(str(err))) logger.warning('Get device ECC information failed: {}'.format(str(err)))
return None, None return None, None
try: try:
...@@ -316,7 +316,7 @@ def get_device_ecc_error(self, idx): ...@@ -316,7 +316,7 @@ def get_device_ecc_error(self, idx):
except nvml.NVMLError: except nvml.NVMLError:
pass pass
except Exception as err: except Exception as err:
logger.error('Get device ECC information failed: {}'.format(str(err))) logger.warning('Get device ECC information failed: {}'.format(str(err)))
return None, None return None, None
return corrected_ecc, uncorrected_ecc return corrected_ecc, uncorrected_ecc
...@@ -326,12 +326,13 @@ class AmdDeviceManager(DeviceManager): ...@@ -326,12 +326,13 @@ class AmdDeviceManager(DeviceManager):
"""Device management module for AMD.""" """Device management module for AMD."""
def __init__(self): def __init__(self):
"""Constructor.""" """Constructor."""
rocml.smi_initialize() rocml.amdsmi_init()
self._device_handlers = rocml.amdsmi_get_processor_handles()
super().__init__() super().__init__()
def __del__(self): def __del__(self):
"""Destructor.""" """Destructor."""
rocml.smi_shutdown() rocml.amdsmi_shut_down()
def get_device_count(self): def get_device_count(self):
"""Get the number of device. """Get the number of device.
...@@ -339,7 +340,7 @@ def get_device_count(self): ...@@ -339,7 +340,7 @@ def get_device_count(self):
Return: Return:
count (int): count of device. count (int): count of device.
""" """
return rocml.smi_get_device_count() return len(self._device_handlers)
def get_device_utilization(self, idx): def get_device_utilization(self, idx):
"""Get the utilization of device. """Get the utilization of device.
...@@ -351,11 +352,11 @@ def get_device_utilization(self, idx): ...@@ -351,11 +352,11 @@ def get_device_utilization(self, idx):
util (int): the utilization of device, None means failed to get the data. util (int): the utilization of device, None means failed to get the data.
""" """
try: try:
util = rocml.smi_get_device_utilization(idx) engine_usage = rocml.amdsmi_get_gpu_activity(self._device_handlers[idx])
except Exception as err: except Exception as err:
logger.error('Get device utilization failed: {}'.format(str(err))) logger.warning('Get device utilization failed: {}'.format(str(err)))
return None return None
return util return engine_usage['gfx_activity']
def get_device_temperature(self, idx): def get_device_temperature(self, idx):
"""Get the temperature of device, unit: celsius. """Get the temperature of device, unit: celsius.
...@@ -366,8 +367,16 @@ def get_device_temperature(self, idx): ...@@ -366,8 +367,16 @@ def get_device_temperature(self, idx):
Return: Return:
temp (int): the temperature of device, None means failed to get the data. temp (int): the temperature of device, None means failed to get the data.
""" """
# Currently no API provided in rocml. try:
return None temp = rocml.amdsmi_get_temp_metric(
self._device_handlers[idx], rocml.AmdSmiTemperatureType.EDGE, rocml.AmdSmiTemperatureMetric.CURRENT
)
except (rocml.AmdSmiLibraryException, rocml.AmdSmiParameterException):
pass
except Exception as err:
logger.warning('Get device temperature failed: {}'.format(str(err)))
temp = None
return temp
def get_device_power(self, idx): def get_device_power(self, idx):
"""Get the realtime power of device, unit: watt. """Get the realtime power of device, unit: watt.
...@@ -379,11 +388,11 @@ def get_device_power(self, idx): ...@@ -379,11 +388,11 @@ def get_device_power(self, idx):
temp (int): the realtime power of device, None means failed to get the data. temp (int): the realtime power of device, None means failed to get the data.
""" """
try: try:
power = rocml.smi_get_device_average_power(idx) power_measure = rocml.amdsmi_get_power_info(self._device_handlers[idx])
except Exception as err: except Exception as err:
logger.error('Get device power failed: {}'.format(str(err))) logger.warning('Get device power failed: {}'.format(str(err)))
return None return None
return int(int(power) / 1000) return int(power_measure['average_socket_power'])
def get_device_power_limit(self, idx): def get_device_power_limit(self, idx):
"""Get the power management limit of device, unit: watt. """Get the power management limit of device, unit: watt.
...@@ -394,8 +403,12 @@ def get_device_power_limit(self, idx): ...@@ -394,8 +403,12 @@ def get_device_power_limit(self, idx):
Return: Return:
temp (int): the power management limit of device, None means failed to get the data. temp (int): the power management limit of device, None means failed to get the data.
""" """
# Currently no API provided in rocml. try:
return None power_measure = rocml.amdsmi_get_power_info(self._device_handlers[idx])
except Exception as err:
logger.warning('Get device power limit failed: {}'.format(str(err)))
return None
return int(power_measure['power_limit'])
def get_device_memory(self, idx): def get_device_memory(self, idx):
"""Get the memory information of device, unit: byte. """Get the memory information of device, unit: byte.
...@@ -408,10 +421,10 @@ def get_device_memory(self, idx): ...@@ -408,10 +421,10 @@ def get_device_memory(self, idx):
total (int): the total device memory in bytes, None means failed to get the data. total (int): the total device memory in bytes, None means failed to get the data.
""" """
try: try:
mem_used = rocml.smi_get_device_memory_used(idx) mem_used = rocml.amdsmi_get_gpu_memory_usage(self._device_handlers[idx], rocml.AmdSmiMemoryType.VRAM)
mem_total = rocml.smi_get_device_memory_total(idx) mem_total = rocml.amdsmi_get_gpu_memory_total(self._device_handlers[idx], rocml.AmdSmiMemoryType.VRAM)
except Exception as err: except Exception as err:
logger.error('Get device memory failed: {}'.format(str(err))) logger.warning('Get device memory failed: {}'.format(str(err)))
return None, None return None, None
return mem_used, mem_total return mem_used, mem_total
...@@ -425,8 +438,19 @@ def get_device_ecc_error(self, idx): ...@@ -425,8 +438,19 @@ def get_device_ecc_error(self, idx):
corrected_ecc (int) : the count of single bit ecc error. corrected_ecc (int) : the count of single bit ecc error.
uncorrected_ecc (int): the count of double bit ecc error. uncorrected_ecc (int): the count of double bit ecc error.
""" """
# Currently no API provided in rocml. corrected_ecc = 0
return None, None uncorrected_ecc = 0
for block in rocml.AmdSmiGpuBlock:
try:
ecc_count = rocml.amdsmi_get_gpu_ecc_count(self._device_handlers[idx], block)
corrected_ecc += ecc_count['correctable_count']
uncorrected_ecc += ecc_count['uncorrectable_count']
except (rocml.AmdSmiLibraryException, rocml.AmdSmiParameterException):
pass
except Exception as err:
logger.info('Get device ECC information failed: {}'.format(str(err)))
return corrected_ecc, uncorrected_ecc
device_manager: Optional[DeviceManager] = DeviceManager() device_manager: Optional[DeviceManager] = DeviceManager()
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
# Server: # Server:
# - Product: HPE Apollo 6500 # - Product: HPE Apollo 6500
version: v0.9 version: v0.10
superbench: superbench:
enable: null enable: null
var: var:
......
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
# - Product: G482-Z53 # - Product: G482-Z53
# - Link: https://www.gigabyte.cn/FileUpload/Global/MicroSite/553/G482-Z53.html # - Link: https://www.gigabyte.cn/FileUpload/Global/MicroSite/553/G482-Z53.html
version: v0.9 version: v0.10
superbench: superbench:
enable: null enable: null
var: var:
......
version: v0.9 version: v0.10
superbench: superbench:
enable: null enable: null
monitor: monitor:
......
version: v0.9 version: v0.10
superbench: superbench:
enable: null enable: null
monitor: monitor:
......
version: v0.9 version: v0.10
superbench: superbench:
enable: null enable: null
monitor: monitor:
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
# Azure NDm A100 v4 # Azure NDm A100 v4
# reference: https://docs.microsoft.com/en-us/azure/virtual-machines/ndm-a100-v4-series # reference: https://docs.microsoft.com/en-us/azure/virtual-machines/ndm-a100-v4-series
version: v0.9 version: v0.10
superbench: superbench:
enable: null enable: null
monitor: monitor:
......
# SuperBench Config # SuperBench Config
version: v0.9 version: v0.10
superbench: superbench:
enable: null enable: null
monitor: monitor:
......
# SuperBench Config # SuperBench Config
version: v0.9 version: v0.10
superbench: superbench:
enable: null enable: null
monitor: monitor:
......
...@@ -100,7 +100,7 @@ ...@@ -100,7 +100,7 @@
docker run -itd --name={{ container }} \ docker run -itd --name={{ container }} \
--privileged --net=host --ipc=host \ --privileged --net=host --ipc=host \
{{ '--gpus=all' if nvidia_gpu_exist else '' }} \ {{ '--gpus=all' if nvidia_gpu_exist else '' }} \
{{ '--security-opt seccomp=unconfined --group-add video' if amd_gpu_exist else '' }} \ {{ '--security-opt seccomp=unconfined --group-add video --device=/dev/kfd --device=/dev/dri --cap-add=SYS_PTRACE --shm-size=16G' if amd_gpu_exist else '' }} \
-w /root -v {{ workspace }}:/root -v /mnt:/mnt \ -w /root -v {{ workspace }}:/root -v /mnt:/mnt \
-v /var/run/docker.sock:/var/run/docker.sock \ -v /var/run/docker.sock:/var/run/docker.sock \
--entrypoint /bin/bash {{ docker_image }} && \ --entrypoint /bin/bash {{ docker_image }} && \
......
...@@ -66,6 +66,8 @@ def test_nccl_bw_performance(self, allgather, allreduce, reduce, broadcast, redu ...@@ -66,6 +66,8 @@ def test_nccl_bw_performance(self, allgather, allreduce, reduce, broadcast, redu
assert (benchmark._args.iters == 20) assert (benchmark._args.iters == 20)
assert (benchmark._args.warmup_iters == 5) assert (benchmark._args.warmup_iters == 5)
assert (benchmark._args.graph_iters == 0) assert (benchmark._args.graph_iters == 0)
assert (benchmark._args.in_place is False)
assert (benchmark._args.data_type == 'float')
# Check command list # Check command list
bin_names = [ bin_names = [
...@@ -74,7 +76,7 @@ def test_nccl_bw_performance(self, allgather, allreduce, reduce, broadcast, redu ...@@ -74,7 +76,7 @@ def test_nccl_bw_performance(self, allgather, allreduce, reduce, broadcast, redu
] ]
command = bin_names[0] + benchmark._commands[0].split(bin_names[0])[1] command = bin_names[0] + benchmark._commands[0].split(bin_names[0])[1]
expected_command = '{} -b 8 -e 8G -f 2 -g 8 -c 0 -n 20 -w 5 -G 0'.format(bin_names[0]) expected_command = '{} -b 8 -e 8G -f 2 -g 8 -c 0 -n 20 -w 5 -G 0 -d float'.format(bin_names[0])
assert (command == expected_command) assert (command == expected_command)
# Check results and metrics. # Check results and metrics.
...@@ -91,6 +93,11 @@ def test_nccl_bw_performance(self, allgather, allreduce, reduce, broadcast, redu ...@@ -91,6 +93,11 @@ def test_nccl_bw_performance(self, allgather, allreduce, reduce, broadcast, redu
'alltoall': alltoall, 'alltoall': alltoall,
} }
if 'SB_MODE_SERIAL_INDEX' in os.environ:
os.environ.pop('SB_MODE_SERIAL_INDEX')
if 'SB_MODE_PARALLEL_INDEX' in os.environ:
os.environ.pop('SB_MODE_PARALLEL_INDEX')
for op in raw_output.keys(): for op in raw_output.keys():
benchmark._args.operation = op benchmark._args.operation = op
assert (benchmark._process_raw_result(0, raw_output[op])) assert (benchmark._process_raw_result(0, raw_output[op]))
...@@ -131,3 +138,48 @@ def test_nccl_bw_performance(self, allgather, allreduce, reduce, broadcast, redu ...@@ -131,3 +138,48 @@ def test_nccl_bw_performance(self, allgather, allreduce, reduce, broadcast, redu
assert (benchmark.result['alltoall_0_0:8589934592_time'][0] == 33508.0) assert (benchmark.result['alltoall_0_0:8589934592_time'][0] == 33508.0)
assert (benchmark.result['alltoall_0_0:8589934592_algbw'][0] == 256.36) assert (benchmark.result['alltoall_0_0:8589934592_algbw'][0] == 256.36)
assert (benchmark.result['alltoall_0_0:8589934592_busbw'][0] == 224.31) assert (benchmark.result['alltoall_0_0:8589934592_busbw'][0] == 224.31)
@decorator.load_data('tests/data/nccl_allreduce.log')
@decorator.load_data('tests/data/nccl_alltoall.log')
def test_nccl_bw_performance_in_place_parsing(self, allreduce, alltoall):
"""Test nccl-bw benchmark in-place parsing."""
benchmark_name = 'nccl-bw'
(benchmark_class,
predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
assert (benchmark_class)
benchmark = benchmark_class(benchmark_name, parameters='--ngpus 8 --in_place')
ret = benchmark._preprocess()
assert (ret is True)
assert (benchmark.return_code == ReturnCode.SUCCESS)
assert (benchmark._args.in_place is True)
# Case with valid raw_output
raw_output = {
'allreduce': allreduce,
'alltoall': alltoall,
}
if 'SB_MODE_SERIAL_INDEX' in os.environ:
os.environ.pop('SB_MODE_SERIAL_INDEX')
if 'SB_MODE_PARALLEL_INDEX' in os.environ:
os.environ.pop('SB_MODE_PARALLEL_INDEX')
for op in raw_output.keys():
benchmark._args.operation = op
assert (benchmark._process_raw_result(0, raw_output[op]))
for name in ['time', 'algbw', 'busbw']:
for size in ['8589934592', '4294967296', '2147483648', '1073741824', '536870912', '32']:
metric = op + '_' + size + '_' + name
assert (metric in benchmark.result)
assert (len(benchmark.result[metric]) == 1)
assert (isinstance(benchmark.result[metric][0], numbers.Number))
assert (benchmark.result['allreduce_8589934592_time'][0] == 63959.0)
assert (benchmark.result['allreduce_8589934592_algbw'][0] == 134.30)
assert (benchmark.result['allreduce_8589934592_busbw'][0] == 235.03)
assert (benchmark.result['alltoall_8589934592_time'][0] == 33234.0)
assert (benchmark.result['alltoall_8589934592_algbw'][0] == 258.47)
assert (benchmark.result['alltoall_8589934592_busbw'][0] == 226.16)
...@@ -3,7 +3,6 @@ ...@@ -3,7 +3,6 @@
"""Tests for distributed inference benchmark.""" """Tests for distributed inference benchmark."""
import numbers
import unittest import unittest
from tests.helper import decorator from tests.helper import decorator
...@@ -209,19 +208,17 @@ def _test_dist_inference_result_parsing(self, platform, test_raw_output): ...@@ -209,19 +208,17 @@ def _test_dist_inference_result_parsing(self, platform, test_raw_output):
# step_times # step_times
assert (len(benchmark.raw_data) == 2) assert (len(benchmark.raw_data) == 2)
# return code + (avg, 50th, 90th, 95th, 99th, 99.9th) # return code + (avg, 50th, 90th, 95th, 99th, 99.9th)
test_latency = float(test_raw_output.splitlines()[-1].split(' ms per iteration')[0].split()[-1])
assert (7 == len(benchmark.result)) assert (7 == len(benchmark.result))
for output_key in benchmark.result: assert (benchmark.result['return_code'] == [0])
if output_key == 'return_code': assert (benchmark.result['step_times'] == [1.9052048])
assert (benchmark.result[output_key] == [0]) assert (benchmark.result['step_times_50'] == [1.851])
else: assert (benchmark.result['step_times_90'] == [1.89637])
assert (output_key.startswith('step_times')) assert (benchmark.result['step_times_95'] == [2.12037])
assert (len(benchmark.result[output_key]) == 1) assert (benchmark.result['step_times_99'] == [2.67155])
assert (isinstance(benchmark.result[output_key][0], numbers.Number)) assert (benchmark.result['step_times_99.9'] == [4.4198])
assert (test_latency == benchmark.result[output_key][0])
# Negative case - invalid raw output. # Negative case - invalid raw output.
assert (benchmark._process_raw_result(1, 'Invalid raw output') is False) assert (benchmark._process_raw_result(1, 'Latency of step: xxx ms') is False)
assert (benchmark.return_code == ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE) assert (benchmark.return_code == ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
@decorator.cuda_test @decorator.cuda_test
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment