Unverified Commit 949f9cb4 authored by Yuting Jiang's avatar Yuting Jiang Committed by GitHub
Browse files

Release - SuperBench v0.11.0 (#654)



**Description**
Cherry pick bug fixes from v0.11.0 to main

**Major Revision**
* #645 
* #648 
* #646 
* #647 
* #651 
* #652 
* #650

---------
Co-authored-by: default avatarhongtaozhang <hongtaozhang@microsoft.com>
Co-authored-by: default avatarYifan Xiong <yifan.xiong@microsoft.com>
parent 9f3231e9
version: v0.10 version: v0.11
superbench: superbench:
enable: null enable: null
monitor: monitor:
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
# Azure NDm A100 v4 # Azure NDm A100 v4
# reference: https://docs.microsoft.com/en-us/azure/virtual-machines/ndm-a100-v4-series # reference: https://docs.microsoft.com/en-us/azure/virtual-machines/ndm-a100-v4-series
version: v0.10 version: v0.11
superbench: superbench:
enable: null enable: null
monitor: monitor:
......
# SuperBench Config # SuperBench Config
version: v0.10 version: v0.11
superbench: superbench:
enable: null enable: null
monitor: monitor:
......
# SuperBench Config
# GPU cluster benchmark configuration (NCCL topology references /opt/microsoft/ndv5-topo.xml).
version: v0.11
superbench:
  # null means "run every benchmark defined below"; set a list of names to run a subset.
  enable: null
  monitor:
    enable: true
    sample_duration: 1
    sample_interval: 10
  var:
    # Shared timeout in seconds, referenced via *default_timeout below.
    # NOTE(review): this anchor was referenced by ib-loopback but never defined,
    # which is a YAML parse error (alias to undefined anchor). 600 matches the
    # explicit timeout used by dist-inference.
    default_timeout: &default_timeout 600
    # One process per GPU on a single node, all processes in parallel.
    default_local_mode: &default_local_mode
      enable: true
      modes:
        - name: local
          proc_num: 8
          prefix: CUDA_VISIBLE_DEVICES={proc_rank}
          parallel: true
    # Single-node torch.distributed launch, one rank per GPU.
    default_pytorch_mode: &default_pytorch_mode
      enable: true
      modes:
        - name: torch.distributed
          proc_num: 8
          node_num: 1
      frameworks:
        - pytorch
    # Baseline parameters shared by all model-benchmarks entries;
    # individual benchmarks override fields after the merge key.
    common_model_config: &common_model_config
      duration: 0
      num_warmup: 128
      num_steps: 512
      sample_count: 8192
      batch_size: 128
      precision: [float32, float16]
      model_action: [train]
      pin_memory: true
      num_workers: 0
  benchmarks:
    kernel-launch:
      <<: *default_local_mode
    gemm-flops:
      <<: *default_local_mode
      parameters:
        precision: ["fp64", "fp32", "fp16", "fp64_tc", "tf32_tc", "bf16_tc", "fp16_tc", "int8_tc"]
    cublaslt-gemm:
      <<: *default_local_mode
      parameters:
        in_types: ['fp8e4m3', 'fp8e5m2', 'fp64', 'fp32', 'fp16', 'bf16', 'int8']
        # Each entry is a single m,n,k triple (parsed downstream as one string).
        shapes:
          - 4096,4096,4096
          - 8192,8192,8192
          - 16384,16384,16384
    gpu-burn:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: false
      parameters:
        time: 900
        doubles: true
        tensor_core: true
    nccl-bw:default:
      enable: true
      modes:
        - name: local
          proc_num: 1
          parallel: false
      parameters:
        ngpus: 8
    # GDR-only variant: P2P and SHM transports disabled so traffic goes over IB.
    nccl-bw:gdr-only:
      enable: true
      modes:
        - name: local
          proc_num: 1
          parallel: false
          env:
            NCCL_IB_PCI_RELAXED_ORDERING: '1'
            NCCL_NET_GDR_LEVEL: '5'
            NCCL_P2P_DISABLE: '1'
            NCCL_SHM_DISABLE: '1'
            NCCL_MIN_NCHANNELS: '16'
            NCCL_IB_DISABLE: '0'
      parameters:
        ngpus: 8
    nccl-lat:default:
      enable: true
      modes:
        - name: mpi
          proc_num: 8
          node_num: 1
      parameters:
        maxbytes: 16M
        warmup_iters: 20
        iters: 1000
        graph_iters: 1
    # Two sequential passes covering the even and odd IB devices respectively.
    ib-loopback:
      timeout: *default_timeout
      modes:
        - name: local
          proc_num: 4
          prefix: PROC_RANK={proc_rank} IB_DEVICES=0,2,4,6 NUMA_NODES=0,0,1,1
          parallel: true
        - name: local
          proc_num: 4
          prefix: PROC_RANK={proc_rank} IB_DEVICES=1,3,5,7 NUMA_NODES=0,0,1,1
          parallel: true
    cpu-memory-bw-latency:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: false
      parameters:
        tests:
          - bandwidth_matrix
          - latency_matrix
          - max_bandwidth
    mem-bw:
      enable: true
      modes:
        - name: local
          proc_num: 8
          # Pin each process to the NUMA node of its GPU (2 GPUs per node).
          prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
          parallel: false
    disk-benchmark:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: false
      parameters:
        block_devices:
          - /dev/nvme0n1
          - /dev/nvme1n1
          - /dev/nvme2n1
          - /dev/nvme3n1
          - /dev/nvme4n1
          - /dev/nvme5n1
          - /dev/nvme6n1
          - /dev/nvme7n1
        seq_read_runtime: 60
        seq_write_runtime: 60
        seq_readwrite_runtime: 60
        rand_read_runtime: 60
        rand_write_runtime: 60
        rand_readwrite_runtime: 60
    # Small-size data-validation pass; the perf variant below measures bandwidth.
    gpu-copy-bw:correctness:
      enable: true
      modes:
        - name: local
          parallel: false
      parameters:
        mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all]
        copy_type: [sm, dma]
        size: 4096
        num_warm_up: 0
        num_loops: 1
        check_data: true
    gpu-copy-bw:perf:
      enable: true
      modes:
        - name: local
          parallel: false
      parameters:
        mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all]
        copy_type: [sm, dma]
    cudnn-function:
      <<: *default_local_mode
    cublas-function:
      <<: *default_local_mode
    matmul:
      <<: *default_local_mode
      frameworks:
        - pytorch
    sharding-matmul:
      <<: *default_pytorch_mode
    computation-communication-overlap:
      <<: *default_pytorch_mode
    dist-inference:
      enable: true
      timeout: 600
      modes:
        - name: mpi
          proc_num: 8
          node_num: 1
          env:
            NCCL_TOPO_FILE: '/opt/microsoft/ndv5-topo.xml'
      frameworks:
        - pytorch
      parameters:
        num_layers: 50
        num_warmup: 20
        num_steps: 100
        use_cuda_graph: true
        precision: float16
        hidden_size: 128
        input_size: 128
        batch_size: 1024
    ib-traffic:
      enable: false
      modes:
        - name: mpi
          proc_num: 8
      parameters:
        msg_size: 8388608
        # $LOCAL_RANK is expanded at runtime by the launcher, not by YAML.
        ib_dev: mlx5_$LOCAL_RANK
        gpu_dev: $LOCAL_RANK
        numa_dev: $((LOCAL_RANK/2))
    gpcnet-network-test:
      enable: false
      modes:
        - name: mpi
          proc_num: 1
          mca:
            pml: ucx
            btl: ^uct
            btl_tcp_if_include: eth0
    gpcnet-network-load-test:
      enable: false
      modes:
        - name: mpi
          proc_num: 1
          mca:
            pml: ucx
            btl: ^uct
            btl_tcp_if_include: eth0
    tcp-connectivity:
      enable: false
      modes:
        - name: local
          parallel: false
      parameters:
        port: 22
    ort-inference:
      <<: *default_local_mode
    tensorrt-inference:
      <<: *default_local_mode
      parameters:
        pytorch_models:
          - resnet50
          - resnet101
          - resnet152
          - densenet169
          - densenet201
          - bert-base
          - bert-large
        seq_length: 224
        batch_size: 32
        precision: int8
    model-benchmarks:gpt:
      <<: *default_pytorch_mode
      models:
        - gpt2-small
        - gpt2-large
      parameters:
        <<: *common_model_config
        precision: [float32, float16, fp8_hybrid]
        batch_size: 32
        seq_len: 224
    model-benchmarks:bert:
      <<: *default_pytorch_mode
      models:
        - bert-base
        - bert-large
      parameters:
        <<: *common_model_config
        precision: [float32, float16, fp8_hybrid]
        seq_len: 224
    model-benchmarks:lstm:
      <<: *default_pytorch_mode
      models:
        - lstm
      parameters:
        <<: *common_model_config
        batch_size: 1024
        input_size: 224
        hidden_size: 1000
        seq_len: 32
        pin_memory: false
    model-benchmarks:resnet:
      <<: *default_pytorch_mode
      models:
        - resnet50
        - resnet101
        - resnet152
      parameters:
        <<: *common_model_config
        batch_size: 384
        num_steps: 512
    model-benchmarks:densenet:
      <<: *default_pytorch_mode
      models:
        - densenet169
        - densenet201
      parameters:
        <<: *common_model_config
        pin_memory: false
    model-benchmarks:vgg:
      <<: *default_pytorch_mode
      models:
        - vgg11
        - vgg13
        - vgg16
        - vgg19
      parameters:
        <<: *common_model_config
        pin_memory: false
# SuperBench Config # SuperBench Config
version: v0.10 version: v0.11
superbench: superbench:
enable: null enable: null
monitor: monitor:
......
---
slug: release-sb-v0.11
title: Releasing SuperBench v0.11
author: Peng Cheng
author_title: SuperBench Team
author_url: https://github.com/cp5555
author_image_url: https://github.com/cp5555.png
tags: [superbench, announcement, release]
---
We are very happy to announce that **SuperBench 0.11.0 version** is officially released today!
You can install and try superbench by following [Getting Started Tutorial](https://microsoft.github.io/superbenchmark/docs/getting-started/installation).
## SuperBench 0.11.0 Release Notes
### SuperBench Improvements
- Add CUDA 12.4 dockerfile.
- Upgrade nccl version to v2.23.4 and install ucx v1.16.0 in cuda 12.4 dockerfile.
- Fix MSCCL build error in CUDA12.4 docker build pipeline.
- Add ROCm6.2 dockerfile.
- Update hpcx link in cuda11.1 dockerfile to fix docker build failure.
- Improve document (Fix metrics name and typos in user tutorial, add BibTeX in README and repo).
- Limit protobuf version to be 3.20.x to fix onnxruntime dependency error.
- Update omegaconf version to 2.3.0 and fix issues caused by omegaconf version update.
- Fix MSCCL build error in CUDA12.4 docker build pipeline.
- Update Docker Exec Command for Persistent HPCX Environment.
- Fix cuda 12.2 dockerfile LD_LIBRARY_PATH issue.
- Use types-setuptools to replace types-pkg_resources.
- Add configuration for NDv5 H100 and AMD MI300x.
### Micro-benchmark Improvements
- Add hipblasLt tuning to dist-inference cpp implementation.
- Add support for NVIDIA L4/L40/L40s GPUs in gemm-flops.
- Upgrade mlc to v3.11.
### Model-benchmark Improvements
- Support FP8 transformer model training in ROCm6.2 dockerfile.
### Result Analysis
- Fix bug of failure test and warning of pandas in data diagnosis.
...@@ -101,7 +101,7 @@ module.exports = { ...@@ -101,7 +101,7 @@ module.exports = {
announcementBar: { announcementBar: {
id: 'supportus', id: 'supportus',
content: content:
'📢 <a href="https://microsoft.github.io/superbenchmark/blog/release-sb-v0.10">v0.10.0</a> has been released! ' + '📢 <a href="https://microsoft.github.io/superbenchmark/blog/release-sb-v0.11">v0.11.0</a> has been released! ' +
'⭐️ If you like SuperBench, give it a star on <a target="_blank" rel="noopener noreferrer" href="https://github.com/microsoft/superbenchmark">GitHub</a>! ⭐️', '⭐️ If you like SuperBench, give it a star on <a target="_blank" rel="noopener noreferrer" href="https://github.com/microsoft/superbenchmark">GitHub</a>! ⭐️',
}, },
algolia: { algolia: {
......
{ {
"name": "superbench-website", "name": "superbench-website",
"version": "0.10.0", "version": "0.11.0",
"lockfileVersion": 1, "lockfileVersion": 1,
"requires": true, "requires": true,
"dependencies": { "dependencies": {
......
{ {
"name": "superbench-website", "name": "superbench-website",
"version": "0.10.0", "version": "0.11.0",
"private": true, "private": true,
"scripts": { "scripts": {
"docusaurus": "docusaurus", "docusaurus": "docusaurus",
...@@ -38,4 +38,4 @@ ...@@ -38,4 +38,4 @@
"last 1 safari version" "last 1 safari version"
] ]
} }
} }
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment