Unverified commit 2c88db90 authored by Yifan Xiong, committed by GitHub

Release - SuperBench v0.10.0 (#607)

**Description**

Cherry-pick bug fixes from v0.10.0 to main.

**Major Revisions**

* Benchmarks: Microbenchmark - Support different hipblasLt data types in dist_inference #590
* Benchmarks: Microbenchmark - Support in-place for NCCL/RCCL benchmark #591
* Bug Fix - Fix NUMA Domains Swap Issue in NDv4 Topology File #592
* Benchmarks: Microbenchmark - Add data type option for NCCL and RCCL tests #595
* Benchmarks: Bug Fix - Make metrics of dist-inference-cpp aligned with PyTorch version #596
* CI/CD - Add ndv5 topo file #597
* Benchmarks: Microbenchmark - Improve AMD GPU P2P performance with fine-grained GPU memory #593
* Benchmarks: Build Pipeline - Fix NCCL and NCCL-tests version to 2.18.3 to resolve hang issue in CUDA 12.2 docker #599
* Dockerfile - Bug fix for rocm docker build and deploy #598
* Benchmarks: Microbenchmark - Adapt to hipblasLt data type changes #603
* Benchmarks: Micro benchmarks - Update hipblaslt metric unit to tflops #604
* Monitor - U...
parent 2c2096ed
@@ -55,7 +55,7 @@ def test_hipblaslt_gemm_command_generation(self):
         self.assertFalse(benchmark._preprocess())
         benchmark = benchmark_cls(
             self.benchmark_name,
-            parameters='--shapes 2:4,4:8,8:32 2:4,4:8,8:32:+4 --in_types fp16 fp32 bf16',
+            parameters='--shapes 2:4,4:8,8:32 2:4,4:8,8:32:+4 --in_types fp16 fp32 bf16 fp8',
         )
         self.assertTrue(benchmark._preprocess())
         self.assertEqual((2 * 2 * 3 + 2 * 2 * 7) * len(benchmark._args.in_types), len(benchmark._commands))
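For context on the command-count assertion: each `--shapes` range expands per dimension, with plain `start:stop` doubling and `start:stop:+step` stepping additively, so `2:4,4:8,8:32` yields 2 * 2 * 3 = 12 (m, n, k) combinations and `2:4,4:8,8:32:+4` yields 2 * 2 * 7 = 28. A minimal sketch of that expansion (the helper is illustrative, not the benchmark's actual parser):

```python
def expand(dim_spec):
    """Expand 'start:stop' by doubling, or 'start:stop:+step' additively."""
    parts = dim_spec.split(':')
    start, stop = int(parts[0]), int(parts[1])
    if len(parts) == 3 and parts[2].startswith('+'):
        return list(range(start, stop + 1, int(parts[2][1:])))
    vals = []
    while start <= stop:
        vals.append(start)
        start *= 2
    return vals

for shape in ('2:4,4:8,8:32', '2:4,4:8,8:32:+4'):
    m, n, k = (expand(s) for s in shape.split(','))
    print(shape, '->', len(m) * len(n) * len(k))  # 12 and 28

# (12 + 28) * 4 data types = 160 commands, matching the assertion above.
```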
@@ -63,12 +63,16 @@ def test_hipblaslt_gemm_command_generation(self):
         def cmd(t, b, m, n, k):
             if b == 0:
                 return f'{benchmark._HipBlasLtBenchmark__bin_path} ' + \
-                    f'-m {m} -n {n} -k {k} -j 20 -i 50 {benchmark._in_type_map[t]}'
+                    f'-m {m} -n {n} -k {k} -j 20 -i 50 {benchmark._in_type_map[t]}' + \
+                    f' --transA {benchmark._args.transA} --transB {benchmark._args.transB}' + \
+                    f' --initialization {benchmark._args.initialization}'
             else:
                 return f'{benchmark._HipBlasLtBenchmark__bin_path} ' + \
-                    f'-m {m} -n {n} -k {k} -j 20 -i 50 {benchmark._in_type_map[t]} -b {b}'
+                    f'-m {m} -n {n} -k {k} -j 20 -i 50 {benchmark._in_type_map[t]} -b {b}' + \
+                    f' --transA {benchmark._args.transA} --transB {benchmark._args.transB}' + \
+                    f' --initialization {benchmark._args.initialization}'

-        for _t in ['fp16', 'fp32', 'bf16']:
+        for _t in ['fp16', 'fp32', 'bf16', 'fp8']:
             for _m in [2, 4]:
                 for _n in [4, 8]:
                     for _k in [8, 16, 32]:
@@ -102,7 +106,7 @@ def test_hipblaslt_gemm_result_parsing(self):
         self.assertEqual(ReturnCode.SUCCESS, benchmark.return_code)
         self.assertEqual(2, len(benchmark.result))
-        self.assertEqual(58624.5, benchmark.result['fp16_1_896_896_896_flops'][0])
+        self.assertEqual(58.6245, benchmark.result['fp16_1_896_896_896_flops'][0])

         # Negative case - invalid raw output
         self.assertFalse(benchmark._process_raw_result(1, 'HipBLAS API failed'))
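The expected flops value shrinks by a factor of 1000 because the hipblaslt metric unit changed from GFLOPS to TFLOPS in this release. A one-line sketch of the conversion the parser now presumably applies (an assumption based on the changed assertion, not the parser source):

```python
def gflops_to_tflops(gflops):
    """hipblaslt-bench reports GFLOPS; the metric is now stored as TFLOPS."""
    return gflops / 1000.0

assert gflops_to_tflops(58624.5) == 58.6245
```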
@@ -177,8 +177,7 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
         benchmark._data_options = f'\
 --vocab-file {self._tmp_dir}/gpt2-vocab.json \
 --merge-file {self._tmp_dir}/gpt2-merges.txt \
---data-path {self._tmp_dir}/dataset_text_document \
---data-impl mmap'
+--data-path {self._tmp_dir}/dataset_text_document'

         script_path = str(Path(self._tmp_dir) / 'pretrain_gpt.py')
         expected_command = 'torchrun {distributed_args} {script_path} \
@@ -197,7 +196,6 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
 --num-attention-heads 32 \
 --seq-length 2048 \
 --max-position-embeddings 2048 \
---train-tokens 300000000000 \
 --train-samples 20480 \
 --lr 0.00012 \
 --min-lr 1e-06 \
@@ -215,7 +213,8 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
 --optimizer adam \
 --use-distributed-optimizer \
 {precision} \
---seed 1234 {data_options}'
+--seed 1234 \
+--log-throughput {data_options}'

         precision = Precision.FLOAT32
         command = benchmark._megatron_command(precision)
@@ -262,12 +261,10 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
         benchmark._data_options = f'\
 --vocab-file {self._tmp_dir}/gpt2-vocab.json \
 --merge-file {self._tmp_dir}/gpt2-merges.txt \
---data-path {self._tmp_dir}/dataset_text_document \
---data-impl mmap'
+--data-path {self._tmp_dir}/dataset_text_document'

         command = benchmark._megatron_command(Precision.BFLOAT16)
-        expected_command = 'deepspeed {script_path} \
-            --override-opt_param-scheduler \
+        expected_command = 'deepspeed {script_path} --override-opt_param-scheduler \
 --adam-beta1 0.9 \
 --adam-beta2 0.95 \
 --tensor-model-parallel-size 1 \
@@ -282,7 +279,6 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
 --num-attention-heads 32 \
 --seq-length 2048 \
 --max-position-embeddings 2048 \
---train-tokens 300000000000 \
 --train-samples 20480 \
 --lr 0.00012 \
 --min-lr 1e-06 \
@@ -306,7 +302,9 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
 --deepspeed \
 --deepspeed_config {benchmark._config_json_path} \
 --zero-stage 1 \
---pipeline-model-parallel-size 1 --no-pipeline-parallel'
+--pipeline-model-parallel-size 1 \
+--train-tokens 300000000000 \
+--data-impl mmap --no-pipeline-parallel'

         self.assertEqual(
             command,
@@ -346,12 +344,12 @@ def test_megatron_parse_log(self, raw_output, mock_generate_dataset):
         iteration_times, tflops, mem_allocated, max_mem_allocated = benchmark._parse_log(raw_output)
         assert (statistics.mean(iteration_times) == 75239.24)
         assert (statistics.mean(tflops) == 149.136)
-        assert (statistics.mean(mem_allocated) == 17.54)
-        assert (statistics.mean(max_mem_allocated) == 66.97)
+        assert (statistics.mean(mem_allocated) == 17.535637855529785)
+        assert (statistics.mean(max_mem_allocated) == 66.9744234085083)

         info = {'tflops': tflops, 'mem_allocated': mem_allocated, 'max_mem_allocated': max_mem_allocated}
         benchmark._process_info(ModelAction.TRAIN, Precision.FLOAT16, info)
         assert (benchmark.result is not None)
         assert (benchmark.result['fp16_train_tflops'][0] == 149.136)
-        assert (benchmark.result['fp16_train_mem_allocated'][0] == 17.54)
-        assert (benchmark.result['fp16_train_max_mem_allocated'][0] == 66.97)
+        assert (benchmark.result['fp16_train_mem_allocated'][0] == 17.535637855529785)
+        assert (benchmark.result['fp16_train_max_mem_allocated'][0] == 66.9744234085083)
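The memory assertions switch from two-decimal values to exact floats, which suggests (an assumption from the diff alone) that `_parse_log` stopped rounding the torch-reported numbers and the tests now pin them verbatim:

```python
# The old expectations were simply the rounded forms of the new exact values.
assert round(17.535637855529785, 2) == 17.54
assert round(66.9744234085083, 2) == 66.97
```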
Parameters: m=80, n=128, k=128, alpha=1.000000, beta=1.000000, num_layers=50, num_warmups=20, num_iters=100, use_cuda_graph=0
Time: 173 ms in total, 1.73 ms per iteration, 0.0346 ms per layer
Latency of step 0: 1.8339 ms
Latency of step 1: 1.84222 ms
Latency of step 2: 1.90869 ms
Latency of step 3: 1.85375 ms
Latency of step 4: 1.87192 ms
Latency of step 5: 1.84254 ms
Latency of step 6: 1.91165 ms
Latency of step 7: 1.8214 ms
Latency of step 8: 1.91427 ms
Latency of step 9: 1.89586 ms
Latency of step 10: 1.86816 ms
Latency of step 11: 1.85105 ms
Latency of step 12: 1.84486 ms
Latency of step 13: 1.84915 ms
Latency of step 14: 1.82332 ms
Latency of step 15: 1.91444 ms
Latency of step 16: 1.85073 ms
Latency of step 17: 1.81812 ms
Latency of step 18: 2.67155 ms
Latency of step 19: 1.85119 ms
Latency of step 20: 1.87989 ms
Latency of step 21: 1.83932 ms
Latency of step 22: 1.84041 ms
Latency of step 23: 1.84789 ms
Latency of step 24: 1.85079 ms
Latency of step 25: 1.82229 ms
Latency of step 26: 1.83376 ms
Latency of step 27: 1.851 ms
Latency of step 28: 1.86246 ms
Latency of step 29: 1.8371 ms
Latency of step 30: 1.88932 ms
Latency of step 31: 1.84459 ms
Latency of step 32: 1.82725 ms
Latency of step 33: 1.83566 ms
Latency of step 34: 1.84041 ms
Latency of step 35: 1.87058 ms
Latency of step 36: 1.84038 ms
Latency of step 37: 1.85555 ms
Latency of step 38: 1.85848 ms
Latency of step 39: 2.40561 ms
Latency of step 40: 1.85029 ms
Latency of step 41: 1.84562 ms
Latency of step 42: 1.8351 ms
Latency of step 43: 1.84196 ms
Latency of step 44: 1.86032 ms
Latency of step 45: 1.87147 ms
Latency of step 46: 1.84832 ms
Latency of step 47: 1.85715 ms
Latency of step 48: 1.86012 ms
Latency of step 49: 1.86327 ms
Latency of step 50: 1.84388 ms
Latency of step 51: 1.86396 ms
Latency of step 52: 1.85538 ms
Latency of step 53: 1.85564 ms
Latency of step 54: 1.83979 ms
Latency of step 55: 1.85334 ms
Latency of step 56: 1.85712 ms
Latency of step 57: 1.85284 ms
Latency of step 58: 1.84534 ms
Latency of step 59: 1.86041 ms
Latency of step 60: 1.86305 ms
Latency of step 61: 2.2213 ms
Latency of step 62: 1.83054 ms
Latency of step 63: 4.4198 ms
Latency of step 64: 1.87245 ms
Latency of step 65: 1.83845 ms
Latency of step 66: 1.82047 ms
Latency of step 67: 1.81191 ms
Latency of step 68: 1.83887 ms
Latency of step 69: 1.8463 ms
Latency of step 70: 2.12037 ms
Latency of step 71: 1.85782 ms
Latency of step 72: 1.84939 ms
Latency of step 73: 1.82054 ms
Latency of step 74: 1.8866 ms
Latency of step 75: 1.83937 ms
Latency of step 76: 1.84167 ms
Latency of step 77: 1.89637 ms
Latency of step 78: 1.8392 ms
Latency of step 79: 1.83754 ms
Latency of step 80: 1.84721 ms
Latency of step 81: 1.88112 ms
Latency of step 82: 1.84474 ms
Latency of step 83: 1.84084 ms
Latency of step 84: 1.85134 ms
Latency of step 85: 1.85315 ms
Latency of step 86: 1.83406 ms
Latency of step 87: 1.87803 ms
Latency of step 88: 1.8369 ms
Latency of step 89: 1.85909 ms
Latency of step 90: 1.84519 ms
Latency of step 91: 2.52689 ms
Latency of step 92: 1.86594 ms
Latency of step 93: 1.86974 ms
Latency of step 94: 1.85219 ms
Latency of step 95: 1.86255 ms
Latency of step 96: 1.82652 ms
Latency of step 97: 1.84379 ms
Latency of step 98: 1.84553 ms
Latency of step 99: 1.87082 ms
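Raw output like the above is what the dist-inference test fixture feeds the parser; a minimal sketch of pulling out the per-step latencies and summarizing them (regex and summary choices are illustrative, not the benchmark's actual implementation):

```python
import re
import statistics

def summarize_latencies(raw_output):
    """Parse 'Latency of step N: X ms' lines into summary statistics."""
    latencies = [float(v) for v in re.findall(r'Latency of step \d+: ([\d.]+) ms', raw_output)]
    if not latencies:
        raise ValueError('no latency lines found in raw output')
    ranked = sorted(latencies)
    return {
        'count': len(latencies),
        'mean_ms': statistics.mean(latencies),
        'min_ms': ranked[0],
        'max_ms': ranked[-1],
        'p99_ms': ranked[int(0.99 * (len(ranked) - 1))],
    }
```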
@@ -7,18 +7,20 @@ MPI_HOME ?= /usr/local/mpi
 HIP_HOME ?= /opt/rocm/hip
 RCCL_HOME ?= /opt/rocm/rccl
 HPCX_HOME ?= /opt/hpcx
+ROCM_PATH ?= /opt/rocm
 CUDA_VER ?= $(shell nvcc --version | grep 'release' | awk '{print $$6}' | cut -c2- | cut -d '.' -f1-2)
 ROCBLAS_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3)
 HIPBLASLT_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3)
+ROCM_VER ?= $(shell hipconfig -R | grep -oP '\d+\.\d+\.\d+' || echo "0.0.0")

-.PHONY: all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed
+.PHONY: all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed apex_rocm

 # Build all targets.
 all: cuda rocm
 cuda_with_msccl: cuda cuda_msccl
 cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed
-rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed
+rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm
 cpu: common cpu_perftest
 common: cpu_hpl cpu_stream fio
 directx_amd: directx_amf_encoding_latency
@@ -62,7 +64,7 @@ endif
 cuda_nccl_tests: sb_micro_path
 ifneq (,$(wildcard nccl-tests/Makefile))
 	cd ./nccl-tests && make MPI=1 MPI_HOME=$(MPI_HOME) -j
-	cp -v ./nccl-tests/build/* $(SB_MICRO_PATH)/bin/
+	cp -v -r ./nccl-tests/build/* $(SB_MICRO_PATH)/bin/
 endif

 # Build perftest.
@@ -86,11 +88,11 @@ ifneq (,$(wildcard fio/Makefile))
 	cd ./fio && ./configure --prefix=$(SB_MICRO_PATH) --disable-native && make -j && make install
 endif

-# Build rccl-tests from commit 2a18737 of default branch.
+# Build rccl-tests from commit 46375b1 of default branch.
 rocm_rccl_tests: sb_micro_path
 ifneq (, $(wildcard rccl-tests/Makefile))
-	cd ./rccl-tests && make MPI=1 MPI_HOME=$(MPI_HOME) HIP_HOME=$(HIP_HOME) RCCL_HOME=$(RCCL_HOME) -j
-	cp -v ./rccl-tests/build/* $(SB_MICRO_PATH)/bin/
+	cd ./rccl-tests && make MPI=1 MPI_HOME=$(MPI_HOME) -j
+	cp -v -r ./rccl-tests/build/* $(SB_MICRO_PATH)/bin/
 endif

 # Build rocblas-bench.
@@ -175,42 +177,58 @@ directx_amf_encoding_latency:
 		"C:\temp\BuildTools\MSBuild\Current\Bin\MSBuild.exe" "AMF\amf\public\samples\CPPSamples_vs2019.sln" /t:EncoderLatency /p:Platform=x64 /p:Configuration=Release /p:OutDir="%SB_MICRO_PATH%\bin" \
 	)

-# Install Megatron-LM
+# Install requirements for Megatron-LM
 megatron_lm:
 	if [ ! -d "Megatron/Megatron-LM" ]; then \
 		git clone "https://github.com/NVIDIA/Megatron-LM.git" "Megatron/Megatron-LM"; \
 	fi
 	cd Megatron && \
-	python -m pip install -r requirements.txt
+	apt install -y python3-mpi4py && \
+	python -m pip install --no-cache-dir -r requirements.txt

-# Install Megatron-DeepSpeed
+# Install requirements for Megatron-DeepSpeed
 megatron_deepspeed:
 	if [ ! -d "Megatron/Megatron-DeepSpeed" ]; then \
 		git clone "https://github.com/microsoft/Megatron-DeepSpeed.git" "Megatron/Megatron-DeepSpeed"; \
 	fi
 	cd Megatron && \
-	python -m pip install -r requirements.txt && \
+	apt install -y python3-mpi4py && \
+	python -m pip install --no-cache-dir -r requirements.txt && \
 	python -m pip install DeepSpeed

+# Install ROCm apex, required by Megatron
+apex_rocm:
+	$(eval TORCH_VERSION ?= $(shell python -c "import torch; print(torch.__version__)"))
+	$(eval TORCH_MAJOR_VERSION ?= $(word 1,$(subst ., ,$(TORCH_VERSION))))
+	$(eval TORCH_MINOR_VERSION ?= $(word 2,$(subst ., ,$(TORCH_VERSION))))
+	if [ ! -d "apex" ]; then \
+		git clone https://github.com/ROCmSoftwarePlatform/apex.git ; \
+	fi
+	cd apex && \
+	if [ "$$(expr $(TORCH_MAJOR_VERSION) \> 2)" -eq 1 ] && [ "$$(expr $(TORCH_MINOR_VERSION) \> 1)" -eq 1 ]; then \
+		git checkout master ; \
+	elif [ "$$(expr $(TORCH_MAJOR_VERSION) == 2)" -eq 1 ] && [ "$$(expr $(TORCH_MINOR_VERSION) == 1)" -eq 1 ]; then \
+		git checkout release/1.1.0 ; \
+	elif [ "$$(expr $(TORCH_MAJOR_VERSION) == 2)" -eq 1 ] && [ "$$(expr $(TORCH_MINOR_VERSION) == 0)" -eq 1 ]; then \
+		git checkout release/1.0.0 ; \
+	elif [ "$$(expr $(TORCH_MAJOR_VERSION) == 1)" -eq 1 ]; then \
+		git checkout release/1.0.0 ; \
+	fi
+	pip install -v --disable-pip-version-check --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./apex
+
 # Build MSCCL for CUDA
 cuda_msccl: sb_micro_path
 ifneq (,$(wildcard msccl/executor/msccl-executor-nccl/Makefile))
 	cd ./msccl/executor/msccl-executor-nccl && \
-	make -j4 src.build && \
+	make -j $(shell nproc --ignore=2) src.build && \
 	cd ../../..
 	mkdir -p $(SB_MICRO_PATH)/lib/msccl-executor-nccl && \
 	cp -r -v ./msccl/executor/msccl-executor-nccl/build/* $(SB_MICRO_PATH)/lib/msccl-executor-nccl/
 endif
 ifneq (,$(wildcard msccl/scheduler/msccl-scheduler/Makefile))
 	cd ./msccl/scheduler/msccl-scheduler && \
-	CXX=nvcc BIN_HOME=$(SB_MICRO_PATH)/lib/msccl-executor-nccl SRC_HOME=../../../msccl/executor/msccl-executor-nccl make -j4 && \
+	CXX=nvcc BIN_HOME=$(SB_MICRO_PATH)/lib/msccl-executor-nccl SRC_HOME=../../../msccl/executor/msccl-executor-nccl make -j $(shell nproc --ignore=2) && \
 	cd ../../..
 	mkdir -p $(SB_MICRO_PATH)/lib/msccl-scheduler && \
 	cp -r -v ./msccl/scheduler/msccl-scheduler/build/* $(SB_MICRO_PATH)/lib/msccl-scheduler/
 endif
 ifneq (,$(wildcard msccl/tests/msccl-tests-nccl/Makefile))
 	cd ./msccl/tests/msccl-tests-nccl && \
-	make MPI=1 MPI_HOME=$(MPI_HOME) NCCL_HOME=$(SB_MICRO_PATH)/lib/msccl-executor-nccl -j4 && cd ../../..
+	make MPI=1 MPI_HOME=$(MPI_HOME) NCCL_HOME=$(SB_MICRO_PATH)/lib/msccl-executor-nccl -j $(shell nproc --ignore=2) && cd ../../..
 	mkdir -p $(SB_MICRO_PATH)/bin/msccl-tests-nccl && \
 	cp -r -v ./msccl/tests/msccl-tests-nccl/build/* $(SB_MICRO_PATH)/bin/msccl-tests-nccl/
 endif
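The `apex_rocm` target above selects an apex branch from the detected PyTorch version via chained shell `expr` tests; the same mapping as a small Python sketch (branch names come from the Makefile, the helper itself is illustrative):

```python
def apex_branch(torch_version):
    """Mirror the Makefile's PyTorch-version -> apex-branch selection."""
    major, minor = (int(v) for v in torch_version.split('.')[:2])
    if major > 2 and minor > 1:
        return 'master'
    if (major, minor) == (2, 1):
        return 'release/1.1.0'
    if (major, minor) == (2, 0) or major == 1:
        return 'release/1.0.0'
    return None  # no checkout: build from the clone's default branch

assert apex_branch('2.1.2') == 'release/1.1.0'
assert apex_branch('1.13.1') == 'release/1.0.0'
```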
-Subproject commit 71e8407c98bacacb002823ea587c321fe58b28a6
+Subproject commit 52b7a18a00bced8b3670eededfd58ee0c4bd7d06
diff --git a/megatron/fused_kernels/scaled_softmax_cuda.cu b/megatron/fused_kernels/scaled_softmax_cuda.cu
index 90e1c9f..d217aec 100644
--- a/megatron/fused_kernels/scaled_softmax_cuda.cu
+++ b/megatron/fused_kernels/scaled_softmax_cuda.cu
@@ -4,7 +4,7 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
-#ifndef __HIP_PLATFORM_HCC__
+#ifndef __HIP_PLATFORM_AMD__
#include <cuda_profiler_api.h>
#endif
#include <ATen/cuda/CUDAContext.h>
diff --git a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu
index 74c9f3d..03b5fc8 100644
--- a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu
+++ b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu
@@ -4,7 +4,7 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
-#ifndef __HIP_PLATFORM_HCC__
+#ifndef __HIP_PLATFORM_AMD__
#include <cuda_profiler_api.h>
#endif
#include <ATen/cuda/CUDAContext.h>
@@ -10,4 +10,6 @@ tqdm
 sentencepiece
 wandb
 einops
-typing_extensions==4.5.0
+typing_extensions==4.9.0
+apex
+mpi4py
-Subproject commit 8274cb47b6dc70ce4411e7f114b77173d3892414
+Subproject commit 1292b25553bd0384f2faa2965f9d82b99797a348
-Subproject commit 5fb4f10a7e7827ed15e53c25810a10be279d6e23
+Subproject commit dffd1dd8b8a26dad2634a546e7e4d082dc882fbc
diff --git a/configure.ac b/configure.ac
index 20eceda..c8f0c07 100755
--- a/configure.ac
+++ b/configure.ac
@@ -237,7 +237,7 @@ AC_ARG_WITH([rocm],
],
[AS_CASE([$with_rocm],
[yes|no], [],
- [CPPFLAGS="-I$with_rocm/include $CPPFLAGS"
+ [CPPFLAGS="-I$with_rocm/include -D__HIP_PLATFORM_AMD__=1 $CPPFLAGS"
LDFLAGS="-L$with_rocm/lib64 -Wl,-rpath=$with_rocm/lib64 -L$with_rocm/lib -Wl,-rpath=$with_rocm/lib -lamdhip64 $LDFLAGS"])
])
diff --git a/src/rocm_memory.c b/src/rocm_memory.c
index e9a9136..b6cb23a 100644
--- a/src/rocm_memory.c
+++ b/src/rocm_memory.c
@@ -44,8 +44,8 @@ static int init_rocm(int device_id) {
hipDeviceProp_t prop = {0};
ROCM_CHECK(hipGetDeviceProperties(&prop, device_id));
- printf("Using ROCm Device with ID: %d, Name: %s, PCI Bus ID: 0x%x, GCN Arch: %d\n",
- device_id, prop.name, prop.pciBusID, prop.gcnArch);
+ printf("Using ROCm Device with ID: %d, Name: %s, PCI Bus ID: 0x%x, GCN Arch: %s\n",
+ device_id, prop.name, prop.pciBusID, prop.gcnArchName);
return SUCCESS;
}
-Subproject commit 2a18737dc681e03ce82c046caa71b28db65017b5
+Subproject commit 46375b1c527b2e3afe80fdd6dd136151bd939675
---
slug: release-sb-v0.10
title: Releasing SuperBench v0.10
author: Peng Cheng
author_title: SuperBench Team
author_url: https://github.com/cp5555
author_image_url: https://github.com/cp5555.png
tags: [superbench, announcement, release]
---
We are very happy to announce that **SuperBench 0.10.0** is officially released today!

You can install and try SuperBench by following the [Getting Started Tutorial](https://microsoft.github.io/superbenchmark/docs/getting-started/installation).
## SuperBench 0.10.0 Release Notes
### SuperBench Improvements
- Support monitoring for AMD GPUs.
- Support ROCm 5.7 and ROCm 6.0 dockerfile.
- Add MSCCL support for NVIDIA GPUs.
- Fix NUMA domains swap issue in NDv4 topology file.
- Add NDv5 topo file.
- Fix NCCL and NCCL-tests version to 2.18.3 to resolve the hang issue in CUDA 12.2.
### Micro-benchmark Improvements
- Add HPL random generator to gemm-flops with ROCm.
- Add DirectXGPURenderFPS benchmark to measure the FPS of rendering simple frames.
- Add HWDecoderFPS benchmark to measure the FPS of hardware decoder performance.
- Update Docker image for H100 support.
- Update MLC version to 3.10 for CUDA/ROCm dockerfiles.
- Bug fix for GPU Burn test.
- Support INT8 in the cublaslt benchmark.
- Add hipBLASLt function benchmark.
- Support cpu-gpu and gpu-cpu in ib-validation.
- Support graph mode in NCCL/RCCL benchmarks for latency metrics.
- Support C++ implementation in the distributed inference benchmark.
- Add O2 option for gpu copy ROCm build.
- Support different hipblasLt data types in dist inference.
- Support in-place operations in the NCCL/RCCL benchmark.
- Support data type option in NCCL/RCCL benchmark.
- Improve P2P performance with fine-grained GPU memory in GPU-copy test for AMD GPUs.
- Update hipblaslt GEMM metric unit to tflops.
- Support FP8 for hipblaslt benchmark.
### Model Benchmark Improvements
- Change torch.distributed.launch to torchrun.
- Support Megatron-LM/Megatron-Deepspeed GPT pretrain benchmark.
### Result Analysis
- Support baseline generation from multiple nodes.
@@ -101,7 +101,7 @@ module.exports = {
     announcementBar: {
       id: 'supportus',
       content:
-        '📢 <a href="https://microsoft.github.io/superbenchmark/blog/release-sb-v0.9">v0.9.0</a> has been released! ' +
+        '📢 <a href="https://microsoft.github.io/superbenchmark/blog/release-sb-v0.10">v0.10.0</a> has been released! ' +
        '⭐️ If you like SuperBench, give it a star on <a target="_blank" rel="noopener noreferrer" href="https://github.com/microsoft/superbenchmark">GitHub</a>! ⭐️',
     },
     algolia: {
@@ -1,7 +1,7 @@
 {
   "name": "superbench-website",
-  "version": "0.9.0",
+  "version": "0.10.0",
   "lockfileVersion": 1,
   "requires": true,
   "dependencies": {
@@ -11678,4 +11678,4 @@
       "integrity": "sha512-V50KMwwzqJV0NpZIZFwfOD5/lyny3WlSzRiXgA0G7VUnRlqttta1L6UQIHzd6EuBY/cHGfwTIck7w1yH6Q5zUw=="
     }
   }
-}
\ No newline at end of file
+}
@@ -1,7 +1,7 @@
 {
   "name": "superbench-website",
-  "version": "0.9.0",
+  "version": "0.10.0",
   "private": true,
   "scripts": {
     "docusaurus": "docusaurus",
@@ -38,4 +38,4 @@
       "last 1 safari version"
     ]
   }
-}
\ No newline at end of file
+}