Unverified Commit dfbd70b1 authored by Yifan Xiong, committed by GitHub
Browse files

Release - SuperBench v0.3.0 (#212)



**Description**

Cherry-pick bug fixes from v0.3.0 to main.

**Major Revisions**
* Docs - Upgrade version and release note (#209)
* Benchmarks: Build Pipeline - Update rccl-test git submodule to dc1ad48 (#210)
* Benchmarks: Update - Update benchmarks in configuration file (#208)
* CI/CD - Update GitHub Action VM (#211)
* Benchmarks: Fix Bug - Fix wrong parameters for gpu-sm-copy-bw in configuration examples (#203)
* CI/CD - Fix bug in build image for push event (#205)
* Benchmark: Fix Bug - fix error message of communication-computation-overlap (#204)
* Tool: Fix bug - Fix function naming issue in system info  (#200)
* CI/CD - Push images in GitHub Action (#202)
* Bug - Fix torch.distributed command for single node (#201)
* CLI - Integrate system info for node (#199)
* Benchmarks: Code Revision - Revise CMake files for microbenchmarks. (#196)
* CI/CD - Add ROCm image build in GitHub Actions (#194)
* Bug: Fix bug - fix bug of hipBusBandwidth build (#193)
* Benchmarks: Build Pipeline - Restore rocblas build logic (#197)
* Bug: Fix Bug - Add barrier before 'destroy_process_group' in model benchmarks (#198)
* Bug - Revise 'docker run' in sb deploy (#195)
* Bug - Fix Bug : fix bug of error param operations to operation in rccl-bw of hpe config (#190)
Co-authored-by: Yuting Jiang <v-yujiang@microsoft.com>
Co-authored-by: Guoshuai Zhao <guzhao@microsoft.com>
Co-authored-by: Ziyue Yang <ziyyang@microsoft.com>
parent 37b15db9
...@@ -4,15 +4,32 @@ on: ...@@ -4,15 +4,32 @@ on:
push: push:
branches: branches:
- main - main
- release/*
pull_request: pull_request:
branches: branches:
- main - main
- release/* - release/*
release:
types:
- published
workflow_dispatch:
jobs: jobs:
docker: docker:
name: Docker build name: Docker build ${{ matrix.name }}
runs-on: ubuntu-latest runs-on: ubuntu-latest
permissions:
contents: read
packages: write
strategy:
matrix:
include:
- name: cuda11.1.1
tags: superbench/main:cuda11.1.1,superbench/superbench:latest
- name: rocm4.2-pytorch1.7.0
tags: superbench/main:rocm4.2-pytorch1.7.0
- name: rocm4.0-pytorch1.7.0
tags: superbench/main:rocm4.0-pytorch1.7.0
steps: steps:
- name: Checkout - name: Checkout
uses: actions/checkout@v2 uses: actions/checkout@v2
...@@ -26,18 +43,29 @@ jobs: ...@@ -26,18 +43,29 @@ jobs:
done done
sudo apt-get clean sudo apt-get clean
df -h df -h
echo 'nproc: '$(nproc)
- name: Prepare metadata - name: Prepare metadata
id: metadata id: metadata
run: | run: |
DOCKER_IMAGE=superbench/superbench TAGS=${{ matrix.tags }}
IMAGE_TAG=latest if [[ "${{ github.event_name }}" == "push" ]] && [[ "${{ github.ref }}" == "refs/heads/release/"* ]]; then
TAGS=$(sed "s/main:/release:${GITHUB_REF##*/}-/g" <<< ${TAGS})
fi
if [[ "${{ github.event_name }}" == "pull_request" ]] && [[ "${{ github.base_ref }}" == "release/"* ]]; then
TAGS=$(sed "s/main:/release:${GITHUB_BASE_REF##*/}-/g" <<< ${TAGS})
fi
if [[ "${{ github.event_name }}" == "release" ]]; then
TAGS=$(sed "s/main:/superbench:${GITHUB_REF##*/}-/g" <<< ${TAGS})
GHCR_TAG=$(cut -d, -f1 <<< ${TAGS} | sed "s#superbench/superbench#ghcr.io/${{ github.repository }}/superbench#g")
TAGS="${TAGS},${GHCR_TAG}"
fi
if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
TAGS=$(sed "s/main:/dev:/g" <<< ${TAGS})
fi
DOCKERFILE=dockerfile/${{ matrix.name }}.dockerfile
DOCKERFILE=dockerfile/cuda11.1.1.dockerfile CACHE_FROM="type=registry,ref=$(cut -d, -f1 <<< ${TAGS})"
TAGS="${DOCKER_IMAGE}:${IMAGE_TAG}"
CACHE_FROM="type=registry,ref=${DOCKER_IMAGE}:${IMAGE_TAG}"
CACHE_TO="" CACHE_TO=""
if [ "${{ github.event_name }}" = "push" ]; then if [[ "${{ github.event_name }}" != "pull_request" ]]; then
CACHE_TO="type=inline,mode=max" CACHE_TO="type=inline,mode=max"
fi fi
...@@ -45,16 +73,25 @@ jobs: ...@@ -45,16 +73,25 @@ jobs:
echo ::set-output name=tags::${TAGS} echo ::set-output name=tags::${TAGS}
echo ::set-output name=cache_from::${CACHE_FROM} echo ::set-output name=cache_from::${CACHE_FROM}
echo ::set-output name=cache_to::${CACHE_TO} echo ::set-output name=cache_to::${CACHE_TO}
- name: Echo image tag
run: echo ${{ steps.metadata.outputs.tags }}
- name: Set up QEMU - name: Set up QEMU
uses: docker/setup-qemu-action@v1 uses: docker/setup-qemu-action@v1
- name: Set up Docker Buildx - name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1 uses: docker/setup-buildx-action@v1
- name: Login to Docker Hub - name: Login to Docker Hub
uses: docker/login-action@v1 uses: docker/login-action@v1
if: ${{ github.event_name == 'push' }} if: ${{ github.event_name != 'pull_request' }}
with: with:
username: ${{ secrets.DOCKERHUB_USERNAME }} username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }} password: ${{ secrets.DOCKERHUB_TOKEN }}
- name: Login to the GitHub Container Registry
uses: docker/login-action@v1
if: ${{ github.event_name == 'release' }}
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Build and push - name: Build and push
id: docker_build id: docker_build
uses: docker/build-push-action@v2 uses: docker/build-push-action@v2
...@@ -62,7 +99,7 @@ jobs: ...@@ -62,7 +99,7 @@ jobs:
platforms: linux/amd64 platforms: linux/amd64
context: . context: .
file: ${{ steps.metadata.outputs.dockerfile }} file: ${{ steps.metadata.outputs.dockerfile }}
push: ${{ github.event_name == 'push' }} push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.metadata.outputs.tags }} tags: ${{ steps.metadata.outputs.tags }}
cache-from: ${{ steps.metadata.outputs.cache_from }} cache-from: ${{ steps.metadata.outputs.cache_from }}
cache-to: ${{ steps.metadata.outputs.cache_to }} cache-to: ${{ steps.metadata.outputs.cache_to }}
......
...@@ -9,7 +9,7 @@ on: ...@@ -9,7 +9,7 @@ on:
jobs: jobs:
spelling: spelling:
name: Spelling check name: Spelling check
runs-on: ubuntu-16.04 runs-on: ubuntu-latest
steps: steps:
- name: Checkout - name: Checkout
uses: actions/checkout@v2 uses: actions/checkout@v2
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
__SuperBench__ is a validation and profiling tool for AI infrastructure. __SuperBench__ is a validation and profiling tool for AI infrastructure.
📢 [v0.2.1](https://github.com/microsoft/superbenchmark/releases/tag/v0.2.1) has been released! 📢 [v0.3.0](https://github.com/microsoft/superbenchmark/releases/tag/v0.3.0) has been released!
## _Check [aka.ms/superbench](https://aka.ms/superbench) for more details._ ## _Check [aka.ms/superbench](https://aka.ms/superbench) for more details._
......
...@@ -88,7 +88,7 @@ ENV PATH="${PATH}" \ ...@@ -88,7 +88,7 @@ ENV PATH="${PATH}" \
WORKDIR ${SB_HOME} WORKDIR ${SB_HOME}
ADD third_party third_party ADD third_party third_party
RUN ROCM_VERSION=rocm-4.0.0 make -j -C third_party rocm RUN ROCM_VERSION=rocm-4.0.0 make -j -C third_party -o rocm_rocblas rocm
# Workaround for image having package installed in user path # Workaround for image having package installed in user path
RUN mv /root/.local/bin/* /opt/conda/bin/ && \ RUN mv /root/.local/bin/* /opt/conda/bin/ && \
......
...@@ -36,7 +36,10 @@ docker buildx build \ ...@@ -36,7 +36,10 @@ docker buildx build \
<TabItem value='rocm'> <TabItem value='rocm'>
```bash ```bash
# coming soon export DOCKER_BUILDKIT=1
docker buildx build \
--platform linux/amd64 --cache-to type=inline,mode=max \
--tag superbench-dev --file dockerfile/rocm4.2-pytorch1.7.0.dockerfile .
``` ```
</TabItem> </TabItem>
......
...@@ -57,7 +57,7 @@ You can clone the source from GitHub and build it. ...@@ -57,7 +57,7 @@ You can clone the source from GitHub and build it.
:::note Note :::note Note
You should checkout corresponding tag to use release version, for example, You should checkout corresponding tag to use release version, for example,
`git clone -b v0.2.1 https://github.com/microsoft/superbenchmark` `git clone -b v0.3.0 https://github.com/microsoft/superbenchmark`
::: :::
```bash ```bash
......
...@@ -27,7 +27,7 @@ sb deploy -f remote.ini --host-password [password] ...@@ -27,7 +27,7 @@ sb deploy -f remote.ini --host-password [password]
:::note Note :::note Note
You should deploy corresponding Docker image to use release version, for example, You should deploy corresponding Docker image to use release version, for example,
`sb deploy -f local.ini -i superbench/superbench:v0.2.1-cuda11.1.1` `sb deploy -f local.ini -i superbench/superbench:v0.3.0-cuda11.1.1`
::: :::
## Run ## Run
......
...@@ -66,7 +66,7 @@ superbench: ...@@ -66,7 +66,7 @@ superbench:
<TabItem value='example'> <TabItem value='example'>
```yaml ```yaml
version: v0.2 version: v0.3
superbench: superbench:
enable: benchmark_1 enable: benchmark_1
var: var:
......
...@@ -29,13 +29,17 @@ available tags are listed below for all stable versions. ...@@ -29,13 +29,17 @@ available tags are listed below for all stable versions.
| Tag | Description | | Tag | Description |
| ----------------- | ---------------------------------- | | ----------------- | ---------------------------------- |
| v0.3.0-cuda11.1.1 | SuperBench v0.3.0 with CUDA 11.1.1 |
| v0.2.1-cuda11.1.1 | SuperBench v0.2.1 with CUDA 11.1.1 | | v0.2.1-cuda11.1.1 | SuperBench v0.2.1 with CUDA 11.1.1 |
| v0.2.0-cuda11.1.1 | SuperBench v0.2.0 with CUDA 11.1.1 | | v0.2.0-cuda11.1.1 | SuperBench v0.2.0 with CUDA 11.1.1 |
</TabItem> </TabItem>
<TabItem value='rocm'> <TabItem value='rocm'>
Coming soon. | Tag | Description |
| --------------------------- | ---------------------------------------------- |
| v0.3.0-rocm4.2-pytorch1.7.0 | SuperBench v0.3.0 with ROCm 4.2, PyTorch 1.7.0 |
| v0.3.0-rocm4.0-pytorch1.7.0 | SuperBench v0.3.0 with ROCm 4.0, PyTorch 1.7.0 |
</TabItem> </TabItem>
</Tabs> </Tabs>
...@@ -6,5 +6,5 @@ ...@@ -6,5 +6,5 @@
Provide hardware and software benchmarks for AI systems. Provide hardware and software benchmarks for AI systems.
""" """
__version__ = '0.2.1' __version__ = '0.3.0'
__author__ = 'Microsoft' __author__ = 'Microsoft'
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
# Copyright (c) Microsoft Corporation - All rights reserved # Copyright (c) Microsoft Corporation - All rights reserved
# Licensed under the MIT License # Licensed under the MIT License
set -e
SB_MICRO_PATH="${SB_MICRO_PATH:-/usr/local}" SB_MICRO_PATH="${SB_MICRO_PATH:-/usr/local}"
...@@ -12,6 +13,7 @@ for dir in micro_benchmarks/*/ ; do ...@@ -12,6 +13,7 @@ for dir in micro_benchmarks/*/ ; do
BUILD_ROOT=$dir/build BUILD_ROOT=$dir/build
mkdir -p $BUILD_ROOT mkdir -p $BUILD_ROOT
cmake -DCMAKE_INSTALL_PREFIX=$SB_MICRO_PATH -DCMAKE_BUILD_TYPE=Release -S $SOURCE_DIR -B $BUILD_ROOT cmake -DCMAKE_INSTALL_PREFIX=$SB_MICRO_PATH -DCMAKE_BUILD_TYPE=Release -S $SOURCE_DIR -B $BUILD_ROOT
cmake --build $BUILD_ROOT --target install cmake --build $BUILD_ROOT
cmake --install $BUILD_ROOT
fi fi
done done
...@@ -264,11 +264,7 @@ def _postprocess(self): ...@@ -264,11 +264,7 @@ def _postprocess(self):
torch.distributed.destroy_process_group() torch.distributed.destroy_process_group()
except BaseException as e: except BaseException as e:
self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_DESTROY_FAILURE) self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_DESTROY_FAILURE)
logger.error( logger.error('Post process failed - benchmark: {}, message: {}.'.format(self._name, str(e)))
'Post process failed - benchmark: {}, mode: {}, message: {}.'.format(
self._name, self._args.mode, str(e)
)
)
return False return False
return True return True
......
...@@ -4,9 +4,9 @@ ...@@ -4,9 +4,9 @@
cmake_minimum_required(VERSION 3.18) cmake_minimum_required(VERSION 3.18)
project(cublas_benchmark LANGUAGES CXX) project(cublas_benchmark LANGUAGES CXX)
include(../cuda_common.cmake)
find_package(CUDAToolkit QUIET) find_package(CUDAToolkit QUIET)
if(CUDAToolkit_FOUND) if(CUDAToolkit_FOUND)
include(../cuda_common.cmake)
set(SRC "cublas_helper.cpp" CACHE STRING "source file") set(SRC "cublas_helper.cpp" CACHE STRING "source file")
set(TARGET_NAME "cublas_function" CACHE STRING "target name") set(TARGET_NAME "cublas_function" CACHE STRING "target name")
......
...@@ -6,6 +6,8 @@ if(NOT DEFINED CMAKE_CUDA_STANDARD) ...@@ -6,6 +6,8 @@ if(NOT DEFINED CMAKE_CUDA_STANDARD)
set(CMAKE_CUDA_STANDARD_REQUIRED ON) set(CMAKE_CUDA_STANDARD_REQUIRED ON)
endif() endif()
enable_language(CUDA)
if(NOT DEFINED NVCC_ARCHS_SUPPORTED) if(NOT DEFINED NVCC_ARCHS_SUPPORTED)
# Reference: https://github.com/NVIDIA/cutlass/blob/0e137486498a52954eff239d874ee27ab23358e7/CMakeLists.txt#L89 # Reference: https://github.com/NVIDIA/cutlass/blob/0e137486498a52954eff239d874ee27ab23358e7/CMakeLists.txt#L89
set(NVCC_ARCHS_SUPPORTED "") set(NVCC_ARCHS_SUPPORTED "")
......
...@@ -4,9 +4,9 @@ ...@@ -4,9 +4,9 @@
cmake_minimum_required(VERSION 3.18) cmake_minimum_required(VERSION 3.18)
project(cudnn_benchmark LANGUAGES CXX) project(cudnn_benchmark LANGUAGES CXX)
include(../cuda_common.cmake)
find_package(CUDAToolkit QUIET) find_package(CUDAToolkit QUIET)
if(CUDAToolkit_FOUND) if(CUDAToolkit_FOUND)
include(../cuda_common.cmake)
set(SRC "cudnn_helper.cpp" CACHE STRING "source file") set(SRC "cudnn_helper.cpp" CACHE STRING "source file")
set(TARGET_NAME "cudnn_function" CACHE STRING "target name") set(TARGET_NAME "cudnn_function" CACHE STRING "target name")
......
...@@ -5,22 +5,21 @@ cmake_minimum_required(VERSION 3.18) ...@@ -5,22 +5,21 @@ cmake_minimum_required(VERSION 3.18)
project(gpu_sm_copy LANGUAGES CXX) project(gpu_sm_copy LANGUAGES CXX)
include(../cuda_common.cmake)
find_package(CUDAToolkit QUIET) find_package(CUDAToolkit QUIET)
include(../rocm_common.cmake)
find_package(HIP QUIET)
# Cuda environment # Cuda environment
if(CUDAToolkit_FOUND) if(CUDAToolkit_FOUND)
message(STATUS "Found CUDA: " ${CUDAToolkit_VERSION}) message(STATUS "Found CUDA: " ${CUDAToolkit_VERSION})
enable_language(CUDA)
include(../cuda_common.cmake)
add_executable(gpu_sm_copy gpu_sm_copy.cu) add_executable(gpu_sm_copy gpu_sm_copy.cu)
set_property(TARGET gpu_sm_copy PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED}) set_property(TARGET gpu_sm_copy PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
install(TARGETS gpu_sm_copy RUNTIME DESTINATION bin) install(TARGETS gpu_sm_copy RUNTIME DESTINATION bin)
else()
# ROCm environment # ROCm environment
elseif(HIP_FOUND) include(../rocm_common.cmake)
find_package(HIP QUIET)
if(HIP_FOUND)
message(STATUS "Found ROCm: " ${HIP_VERSION}) message(STATUS "Found ROCm: " ${HIP_VERSION})
# Convert cuda code to hip code inplace # Convert cuda code to hip code inplace
...@@ -33,8 +32,7 @@ elseif(HIP_FOUND) ...@@ -33,8 +32,7 @@ elseif(HIP_FOUND)
hip_add_executable(gpu_sm_copy gpu_sm_copy.cu) hip_add_executable(gpu_sm_copy gpu_sm_copy.cu)
# Install tergets # Install tergets
install(TARGETS gpu_sm_copy RUNTIME DESTINATION bin) install(TARGETS gpu_sm_copy RUNTIME DESTINATION bin)
else()
else()
message(FATAL_ERROR "No CUDA or ROCm environment found.") message(FATAL_ERROR "No CUDA or ROCm environment found.")
endif()
endif() endif()
...@@ -5,22 +5,21 @@ cmake_minimum_required(VERSION 3.18) ...@@ -5,22 +5,21 @@ cmake_minimum_required(VERSION 3.18)
project(kernel_launch_overhead LANGUAGES CXX) project(kernel_launch_overhead LANGUAGES CXX)
include(../cuda_common.cmake)
find_package(CUDAToolkit QUIET) find_package(CUDAToolkit QUIET)
include(../rocm_common.cmake)
find_package(HIP QUIET)
# Cuda environment # Cuda environment
if(CUDAToolkit_FOUND) if(CUDAToolkit_FOUND)
message(STATUS "Found CUDA: " ${CUDAToolkit_VERSION}) message(STATUS "Found CUDA: " ${CUDAToolkit_VERSION})
enable_language(CUDA)
include(../cuda_common.cmake)
add_executable(kernel_launch_overhead kernel_launch.cu) add_executable(kernel_launch_overhead kernel_launch.cu)
set_property(TARGET kernel_launch_overhead PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED}) set_property(TARGET kernel_launch_overhead PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
install(TARGETS kernel_launch_overhead RUNTIME DESTINATION bin) install(TARGETS kernel_launch_overhead RUNTIME DESTINATION bin)
else()
# ROCm environment # ROCm environment
elseif(HIP_FOUND) include(../rocm_common.cmake)
find_package(HIP QUIET)
if(HIP_FOUND)
message(STATUS "Found HIP: " ${HIP_VERSION}) message(STATUS "Found HIP: " ${HIP_VERSION})
# Convert cuda code to hip code inplace # Convert cuda code to hip code inplace
...@@ -33,8 +32,7 @@ elseif(HIP_FOUND) ...@@ -33,8 +32,7 @@ elseif(HIP_FOUND)
hip_add_executable(kernel_launch_overhead kernel_launch.cu) hip_add_executable(kernel_launch_overhead kernel_launch.cu)
# Install tergets # Install tergets
install(TARGETS kernel_launch_overhead RUNTIME DESTINATION bin) install(TARGETS kernel_launch_overhead RUNTIME DESTINATION bin)
else()
else()
message(FATAL_ERROR "No CUDA or ROCm environment found.") message(FATAL_ERROR "No CUDA or ROCm environment found.")
endif()
endif() endif()
...@@ -174,6 +174,7 @@ def _postprocess(self): ...@@ -174,6 +174,7 @@ def _postprocess(self):
try: try:
if self._args.distributed_impl == DistributedImpl.DDP: if self._args.distributed_impl == DistributedImpl.DDP:
torch.distributed.barrier()
torch.distributed.destroy_process_group() torch.distributed.destroy_process_group()
except BaseException as e: except BaseException as e:
self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_DESTROY_FAILURE) self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_DESTROY_FAILURE)
......
...@@ -23,6 +23,8 @@ def load_command_table(self, args): ...@@ -23,6 +23,8 @@ def load_command_table(self, args):
g.command('deploy', 'deploy_command_handler') g.command('deploy', 'deploy_command_handler')
g.command('exec', 'exec_command_handler') g.command('exec', 'exec_command_handler')
g.command('run', 'run_command_handler') g.command('run', 'run_command_handler')
with CommandGroup(self, 'node', 'superbench.cli._node_handler#{}') as g:
g.command('info', 'info_command_handler')
return super().load_command_table(args) return super().load_command_table(args)
def load_arguments(self, command): def load_arguments(self, command):
......
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""SuperBench CLI node subgroup command handler."""
from superbench.tools import SystemInfo
def info_command_handler():
    """Handle the `node info` CLI command.

    Builds a SystemInfo object and returns the result of its get_all() call.

    Returns:
        dict: node info.

    Raises:
        RuntimeError: If building SystemInfo or calling get_all() raises any
            Exception; the original exception is chained as the cause.
    """
    try:
        # Return straight from the guarded region; any failure below is
        # re-raised as a RuntimeError with the original error attached.
        return SystemInfo().get_all()
    except Exception as err:
        raise RuntimeError('Failed to get node info.') from err
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment