Release - SuperBench v0.3.0 (#212)

**Description**

Cherry-pick bug fixes from v0.3.0 to main.

**Major Revisions**

* Docs - Upgrade version and release note (#209)
* Benchmarks: Build Pipeline - Update rccl-test git submodule to dc1ad48 (#210)
* Benchmarks: Update - Update benchmarks in configuration file (#208)
* CI/CD - Update GitHub Action VM (#211)
* Benchmarks: Fix Bug - Fix wrong parameters for gpu-sm-copy-bw in configuration examples (#203)
* CI/CD - Fix bug in build image for push event (#205)
* Benchmark: Fix Bug - fix error message of communication-computation-overlap (#204)
* Tool: Fix bug - Fix function naming issue in system info (#200)
* CI/CD - Push images in GitHub Action (#202)
* Bug - Fix torch.distributed command for single node (#201)
* CLI - Integrate system info for node (#199)
* Benchmarks: Code Revision - Revise CMake files for microbenchmarks. (#196)
* CI/CD - Add ROCm image build in GitHub Actions (#194)
* Bug: Fix bug - fix bug of hipBusBandwidth build (#193)
* Benchmarks: Build Pipeline - Restore rocblas build logic (#197)
* Bug: Fix Bug - Add barrier before 'destroy_process_group' in model benchmarks (#198)
* Bug - Revise 'docker run' in sb deploy (#195)
* Bug - Fix Bug : fix bug of error param operations to operation in rccl-bw of hpe config (#190)

Co-authored-by: Yuting Jiang <v-yujiang@microsoft.com>
Co-authored-by: Guoshuai Zhao <guzhao@microsoft.com>
Co-authored-by: Ziyue Yang <ziyyang@microsoft.com>

Release - SuperBench v0.3.0 (#212)

**Description**

Cherry-pick bug fixes from v0.3.0 to main.

**Major Revisions**

* Docs - Upgrade version and release note (#209)
* Benchmarks: Build Pipeline - Update rccl-test git submodule to dc1ad48 (#210)
* Benchmarks: Update - Update benchmarks in configuration file (#208)
* CI/CD - Update GitHub Action VM (#211)
* Benchmarks: Fix Bug - Fix wrong parameters for gpu-sm-copy-bw in configuration examples (#203)
* CI/CD - Fix bug in build image for push event (#205)
* Benchmark: Fix Bug - fix error message of communication-computation-overlap (#204)
* Tool: Fix bug - Fix function naming issue in system info (#200)
* CI/CD - Push images in GitHub Action (#202)
* Bug - Fix torch.distributed command for single node (#201)
* CLI - Integrate system info for node (#199)
* Benchmarks: Code Revision - Revise CMake files for microbenchmarks. (#196)
* CI/CD - Add ROCm image build in GitHub Actions (#194)
* Bug: Fix bug - fix bug of hipBusBandwidth build (#193)
* Benchmarks: Build Pipeline - Restore rocblas build logic (#197)
* Bug: Fix Bug - Add barrier before 'destroy_process_group' in model benchmarks (#198)
* Bug - Revise 'docker run' in sb deploy (#195)
* Bug - Fix Bug : fix bug of error param operations to operation in rccl-bw of hpe config (#190)

Co-authored-by: Yuting Jiang <v-yujiang@microsoft.com>
Co-authored-by: Guoshuai Zhao <guzhao@microsoft.com>
Co-authored-by: Ziyue Yang <ziyyang@microsoft.com>
dfbd70b1 · Yifan Xiong · GitHub · 37b15db9 · dfbd70b1 · dfbd70b1
Unverified Commit dfbd70b1 authored Sep 26, 2021 by Yifan Xiong Committed by GitHub Sep 26, 2021
20 changed files
--- a/.github/workflows/build-image.yml
+++ b/.github/workflows/build-image.yml
@@ -4,15 +4,32 @@ on:
  push:
    branches:
    - main
+    - release/*
  pull_request:
    branches:
    - main
    - release/*
+  release:
+    types:
+    - published
+  workflow_dispatch:
 jobs:
  docker:
-    name: Docker build
+    name: Docker build ${{ matrix.name }}
    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+    strategy:
+      matrix:
+        include:
+        - name: cuda11.1.1
+          tags: superbench/main:cuda11.1.1,superbench/superbench:latest
+        - name: rocm4.2-pytorch1.7.0
+          tags: superbench/main:rocm4.2-pytorch1.7.0
+        - name: rocm4.0-pytorch1.7.0
+          tags: superbench/main:rocm4.0-pytorch1.7.0
    steps:
      - name: Checkout
        uses: actions/checkout@v2
@@ -26,18 +43,29 @@ jobs:
          done
          sudo apt-get clean
          df -h
-          echo 'nproc: '$(nproc)
      - name: Prepare metadata
        id: metadata
        run: |
-          DOCKER_IMAGE=superbench/superbench
+          TAGS=${{ matrix.tags }}
-          IMAGE_TAG=latest
+          if [[ "${{ github.event_name }}" == "push" ]] && [[ "${{ github.ref }}" == "refs/heads/release/"* ]]; then
+            TAGS=$(sed "s/main:/release:${GITHUB_REF##*/}-/g" <<< ${TAGS})
+          fi
+          if [[ "${{ github.event_name }}" == "pull_request" ]] && [[ "${{ github.base_ref }}" == "release/"* ]]; then
+            TAGS=$(sed "s/main:/release:${GITHUB_BASE_REF##*/}-/g" <<< ${TAGS})
+          fi
+          if [[ "${{ github.event_name }}" == "release" ]]; then
+            TAGS=$(sed "s/main:/superbench:${GITHUB_REF##*/}-/g" <<< ${TAGS})
+            GHCR_TAG=$(cut -d, -f1 <<< ${TAGS} | sed "s#superbench/superbench#ghcr.io/${{ github.repository }}/superbench#g")
+            TAGS="${TAGS},${GHCR_TAG}"
+          fi
+          if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
+            TAGS=$(sed "s/main:/dev:/g" <<< ${TAGS})
+          fi
+          DOCKERFILE=dockerfile/${{ matrix.name }}.dockerfile
-          DOCKERFILE=dockerfile/cuda11.1.1.dockerfile
+          CACHE_FROM="type=registry,ref=$(cut -d, -f1 <<< ${TAGS})"
-          TAGS="${DOCKER_IMAGE}:${IMAGE_TAG}"
-          CACHE_FROM="type=registry,ref=${DOCKER_IMAGE}:${IMAGE_TAG}"
          CACHE_TO=""
-          if [ "${{ github.event_name }}" = "push" ]; then
+          if [[ "${{ github.event_name }}" != "pull_request" ]]; then
            CACHE_TO="type=inline,mode=max"
          fi
@@ -45,16 +73,25 @@ jobs:
          echo ::set-output name=tags::${TAGS}
          echo ::set-output name=cache_from::${CACHE_FROM}
          echo ::set-output name=cache_to::${CACHE_TO}
+      - name: Echo image tag
+        run: echo ${{ steps.metadata.outputs.tags }}
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v1
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v1
      - name: Login to Docker Hub
        uses: docker/login-action@v1
-        if: ${{ github.event_name == 'push' }}
+        if: ${{ github.event_name != 'pull_request' }}
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
+      - name: Login to the GitHub Container Registry
+        uses: docker/login-action@v1
+        if: ${{ github.event_name == 'release' }}
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Build and push
        id: docker_build
        uses: docker/build-push-action@v2
@@ -62,7 +99,7 @@ jobs:
          platforms: linux/amd64
          context: .
          file: ${{ steps.metadata.outputs.dockerfile }}
-          push: ${{ github.event_name == 'push' }}
+          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.metadata.outputs.tags }}
          cache-from: ${{ steps.metadata.outputs.cache_from }}
          cache-to: ${{ steps.metadata.outputs.cache_to }}

--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -9,7 +9,7 @@ on:
 jobs:
  spelling:
    name: Spelling check
-    runs-on: ubuntu-16.04
+    runs-on: ubuntu-latest
    steps:
    - name: Checkout
      uses: actions/checkout@v2

--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@
 __SuperBench__ is a validation and profiling tool for AI infrastructure.
-📢 [v0.2.1](https://github.com/microsoft/superbenchmark/releases/tag/v0.2.1) has been released!
+📢 [v0.3.0](https://github.com/microsoft/superbenchmark/releases/tag/v0.3.0) has been released!
 ## _Check [aka.ms/superbench](https://aka.ms/superbench) for more details._

--- a/dockerfile/rocm4.0-pytorch1.7.0.dockerfile
+++ b/dockerfile/rocm4.0-pytorch1.7.0.dockerfile
@@ -88,7 +88,7 @@ ENV PATH="${PATH}" \
 WORKDIR ${SB_HOME}
 ADD third_party third_party
-RUN ROCM_VERSION=rocm-4.0.0 make -j -C third_party rocm
+RUN ROCM_VERSION=rocm-4.0.0 make -j -C third_party -o rocm_rocblas rocm
 # Workaround for image having package installed in user path
 RUN mv /root/.local/bin/* /opt/conda/bin/ && \

--- a/docs/developer-guides/using-docker.mdx
+++ b/docs/developer-guides/using-docker.mdx
@@ -36,7 +36,10 @@ docker buildx build \
 <TabItem value='rocm'>
 ```bash
-# coming soon
+export DOCKER_BUILDKIT=1
+docker buildx build \
+  --platform linux/amd64 --cache-to type=inline,mode=max \
+  --tag superbench-dev --file dockerfile/rocm4.2-pytorch1.7.0.dockerfile .
 ```
 </TabItem>

--- a/docs/getting-started/installation.md
+++ b/docs/getting-started/installation.md
@@ -57,7 +57,7 @@ You can clone the source from GitHub and build it.
 :::note Note
 You should checkout corresponding tag to use release version, for example,
-`git clone -b v0.2.1 https://github.com/microsoft/superbenchmark`
+`git clone -b v0.3.0 https://github.com/microsoft/superbenchmark`
 :::
 ```bash

--- a/docs/getting-started/run-superbench.md
+++ b/docs/getting-started/run-superbench.md
@@ -27,7 +27,7 @@ sb deploy -f remote.ini --host-password [password]
 :::note Note
 You should deploy corresponding Docker image to use release version, for example,
-`sb deploy -f local.ini -i superbench/superbench:v0.2.1-cuda11.1.1`
+`sb deploy -f local.ini -i superbench/superbench:v0.3.0-cuda11.1.1`
 :::
 ## Run

--- a/docs/superbench-config.mdx
+++ b/docs/superbench-config.mdx
@@ -66,7 +66,7 @@ superbench:
 <TabItem value='example'>
 ```yaml
-version: v0.2
+version: v0.3
 superbench:
  enable: benchmark_1
  var:

--- a/docs/tutorial/container-images.mdx
+++ b/docs/tutorial/container-images.mdx
@@ -29,13 +29,17 @@ available tags are listed below for all stable versions.
 | Tag               | Description                        |
 | ----------------- | ---------------------------------- |
+| v0.3.0-cuda11.1.1 | SuperBench v0.3.0 with CUDA 11.1.1 |
 | v0.2.1-cuda11.1.1 | SuperBench v0.2.1 with CUDA 11.1.1 |
 | v0.2.0-cuda11.1.1 | SuperBench v0.2.0 with CUDA 11.1.1 |
 </TabItem>
 <TabItem value='rocm'>
-  Coming soon.
+| Tag                         | Description                                    |
+| --------------------------- | ---------------------------------------------- |
+| v0.3.0-rocm4.2-pytorch1.7.0 | SuperBench v0.3.0 with ROCm 4.2, PyTorch 1.7.0 |
+| v0.3.0-rocm4.0-pytorch1.7.0 | SuperBench v0.3.0 with ROCm 4.0, PyTorch 1.7.0 |
 </TabItem>
 </Tabs>
--- a/superbench/__init__.py
+++ b/superbench/__init__.py
@@ -6,5 +6,5 @@
 Provide hardware and software benchmarks for AI systems.
 """
-__version__ = '0.2.1'
+__version__ = '0.3.0'
 __author__ = 'Microsoft'
--- a/superbench/benchmarks/build.sh
+++ b/superbench/benchmarks/build.sh
@@ -3,6 +3,7 @@
 # Copyright (c) Microsoft Corporation - All rights reserved
 # Licensed under the MIT License
+set -e
 SB_MICRO_PATH="${SB_MICRO_PATH:-/usr/local}"
@@ -12,6 +13,7 @@ for dir in micro_benchmarks/*/ ; do
        BUILD_ROOT=$dir/build
        mkdir -p $BUILD_ROOT
        cmake -DCMAKE_INSTALL_PREFIX=$SB_MICRO_PATH -DCMAKE_BUILD_TYPE=Release -S $SOURCE_DIR -B $BUILD_ROOT
-        cmake --build $BUILD_ROOT --target install
+        cmake --build $BUILD_ROOT
+        cmake --install $BUILD_ROOT
    fi
 done
--- a/superbench/benchmarks/micro_benchmarks/computation_communication_overlap.py
+++ b/superbench/benchmarks/micro_benchmarks/computation_communication_overlap.py
@@ -264,11 +264,7 @@ class ComputationCommunicationOverlap(MicroBenchmark):
            torch.distributed.destroy_process_group()
        except BaseException as e:
            self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_DESTROY_FAILURE)
-            logger.error(
+            logger.error('Post process failed - benchmark: {}, message: {}.'.format(self._name, str(e)))
-                'Post process failed - benchmark: {}, mode: {}, message: {}.'.format(
-                    self._name, self._args.mode, str(e)
-                )
-            )
            return False
        return True

--- a/superbench/benchmarks/micro_benchmarks/cublas_function/CMakeLists.txt
+++ b/superbench/benchmarks/micro_benchmarks/cublas_function/CMakeLists.txt
@@ -4,9 +4,9 @@
 cmake_minimum_required(VERSION 3.18)
 project(cublas_benchmark LANGUAGES CXX)
-include(../cuda_common.cmake)
 find_package(CUDAToolkit QUIET)
 if(CUDAToolkit_FOUND)
+  include(../cuda_common.cmake)
  set(SRC "cublas_helper.cpp" CACHE STRING "source file")
  set(TARGET_NAME "cublas_function" CACHE STRING "target name")

--- a/superbench/benchmarks/micro_benchmarks/cuda_common.cmake
+++ b/superbench/benchmarks/micro_benchmarks/cuda_common.cmake
@@ -6,6 +6,8 @@ if(NOT DEFINED CMAKE_CUDA_STANDARD)
    set(CMAKE_CUDA_STANDARD_REQUIRED ON)
 endif()
+enable_language(CUDA)
 if(NOT DEFINED NVCC_ARCHS_SUPPORTED)
    # Reference: https://github.com/NVIDIA/cutlass/blob/0e137486498a52954eff239d874ee27ab23358e7/CMakeLists.txt#L89
    set(NVCC_ARCHS_SUPPORTED "")

--- a/superbench/benchmarks/micro_benchmarks/cudnn_function/CMakeLists.txt
+++ b/superbench/benchmarks/micro_benchmarks/cudnn_function/CMakeLists.txt
@@ -4,9 +4,9 @@
 cmake_minimum_required(VERSION 3.18)
 project(cudnn_benchmark LANGUAGES CXX)
-include(../cuda_common.cmake)
 find_package(CUDAToolkit QUIET)
 if(CUDAToolkit_FOUND)
+  include(../cuda_common.cmake)
  set(SRC "cudnn_helper.cpp" CACHE STRING "source file")
  set(TARGET_NAME "cudnn_function" CACHE STRING "target name")

--- a/superbench/benchmarks/micro_benchmarks/gpu_sm_copy_performance/CMakeLists.txt
+++ b/superbench/benchmarks/micro_benchmarks/gpu_sm_copy_performance/CMakeLists.txt
@@ -5,22 +5,21 @@ cmake_minimum_required(VERSION 3.18)
 project(gpu_sm_copy LANGUAGES CXX)
-include(../cuda_common.cmake)
 find_package(CUDAToolkit QUIET)
-include(../rocm_common.cmake)
-find_package(HIP QUIET)
 # Cuda environment
 if(CUDAToolkit_FOUND)
    message(STATUS "Found CUDA: " ${CUDAToolkit_VERSION})
-    enable_language(CUDA)
+    include(../cuda_common.cmake)
    add_executable(gpu_sm_copy gpu_sm_copy.cu)
    set_property(TARGET gpu_sm_copy PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
    install(TARGETS gpu_sm_copy RUNTIME DESTINATION bin)
+else()
-# ROCm environment
+    # ROCm environment
-elseif(HIP_FOUND)
+    include(../rocm_common.cmake)
+    find_package(HIP QUIET)
+    if(HIP_FOUND)
        message(STATUS "Found ROCm: " ${HIP_VERSION})
        # Convert cuda code to hip code inplace
@@ -33,8 +32,7 @@ elseif(HIP_FOUND)
        hip_add_executable(gpu_sm_copy gpu_sm_copy.cu)
        # Install tergets
        install(TARGETS gpu_sm_copy RUNTIME DESTINATION bin)
+    else()
-else()
        message(FATAL_ERROR "No CUDA or ROCm environment found.")
+    endif()
 endif()
--- a/superbench/benchmarks/micro_benchmarks/kernel_launch_overhead/CMakeLists.txt
+++ b/superbench/benchmarks/micro_benchmarks/kernel_launch_overhead/CMakeLists.txt
@@ -5,22 +5,21 @@ cmake_minimum_required(VERSION 3.18)
 project(kernel_launch_overhead LANGUAGES CXX)
-include(../cuda_common.cmake)
 find_package(CUDAToolkit QUIET)
-include(../rocm_common.cmake)
-find_package(HIP QUIET)
 # Cuda environment
 if(CUDAToolkit_FOUND)
    message(STATUS "Found CUDA: " ${CUDAToolkit_VERSION})
-    enable_language(CUDA)
+    include(../cuda_common.cmake)
    add_executable(kernel_launch_overhead kernel_launch.cu)
    set_property(TARGET kernel_launch_overhead PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
    install(TARGETS kernel_launch_overhead RUNTIME DESTINATION bin)
+else()
-# ROCm environment
+    # ROCm environment
-elseif(HIP_FOUND)
+    include(../rocm_common.cmake)
+    find_package(HIP QUIET)
+    if(HIP_FOUND)
        message(STATUS "Found HIP: " ${HIP_VERSION})
        # Convert cuda code to hip code inplace
@@ -33,8 +32,7 @@ elseif(HIP_FOUND)
        hip_add_executable(kernel_launch_overhead kernel_launch.cu)
        # Install tergets
        install(TARGETS kernel_launch_overhead RUNTIME DESTINATION bin)
+    else()
-else()
        message(FATAL_ERROR "No CUDA or ROCm environment found.")
+    endif()
 endif()
--- a/superbench/benchmarks/model_benchmarks/pytorch_base.py
+++ b/superbench/benchmarks/model_benchmarks/pytorch_base.py
@@ -174,6 +174,7 @@ class PytorchBase(ModelBenchmark):
        try:
            if self._args.distributed_impl == DistributedImpl.DDP:
+                torch.distributed.barrier()
                torch.distributed.destroy_process_group()
        except BaseException as e:
            self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_DESTROY_FAILURE)

--- a/superbench/cli/_commands.py
+++ b/superbench/cli/_commands.py
@@ -23,6 +23,8 @@ class SuperBenchCommandsLoader(CLICommandsLoader):
            g.command('deploy', 'deploy_command_handler')
            g.command('exec', 'exec_command_handler')
            g.command('run', 'run_command_handler')
+        with CommandGroup(self, 'node', 'superbench.cli._node_handler#{}') as g:
+            g.command('info', 'info_command_handler')
        return super().load_command_table(args)
    def load_arguments(self, command):

--- a/superbench/cli/_node_handler.py
+++ b/superbench/cli/_node_handler.py
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+"""SuperBench CLI node subgroup command handler."""
+from superbench.tools import SystemInfo
+def info_command_handler():
+    """Get node hardware info.
+    Returns:
+        dict: node info.
+    """
+    try:
+        info = SystemInfo().get_all()
+    except Exception as ex:
+        raise RuntimeError('Failed to get node info.') from ex
+    return info