Unverified Commit 60189dd6 authored by WenqingLan1's avatar WenqingLan1 Committed by GitHub
Browse files

Dockerfile - add cuda13.0.dockerfile (#739)



Add support for cuda13.0.
Add cuda13.0.dockerfile.
Add cuda13.0 image building task to github pipeline.
Update GPU STREAM to work with cuda13.0.
Fix data type conversion perf bug in GPU stream.
Update nvbandwidth submodule to be v0.8.
Update perftest submodule to be 4bee61f80d9e268fc97eaf40be00409e91d3a19e
(recent master).

---------
Co-authored-by: Ubuntu <dilipreddi@gmail.com>
Co-authored-by: guoshzhao <guzhao@microsoft.com>
parent 93e9d262
......@@ -26,6 +26,18 @@ jobs:
fail-fast: true
matrix:
include:
- name: cuda13.0-arm64
dockerfile: cuda13.0
tags: superbench/main:cuda13.0-arm64
platforms: linux/arm64
runner: [self-hosted, linux/arm64]
build_args: "NUM_MAKE_JOBS=16"
- name: cuda13.0-amd64
dockerfile: cuda13.0
tags: superbench/main:cuda13.0-amd64
platforms: linux/amd64
runner: [self-hosted, linux/amd64]
build_args: "NUM_MAKE_JOBS=16"
- name: cuda12.8-arm64
dockerfile: cuda12.8
tags: superbench/main:cuda12.8-arm64
......
......@@ -20,7 +20,7 @@
url = https://github.com/netbench/GPCNET.git
[submodule "third_party/gpu-burn"]
path = third_party/gpu-burn
url = https://github.com/wilicc/gpu-burn.git
url = https://github.com/WenqingLan1/gpu-burn.git
[submodule "third_party/msccl"]
path = third_party/msccl
url = https://github.com/Azure/msccl
......
FROM nvcr.io/nvidia/pytorch:25.08-py3
# Versions bundled in (or added by) this image:
# OS:
# - Ubuntu: 24.04
# - OpenMPI: 4.1.9a1
# - Docker Client: 20.10.8 (installed in this dockerfile)
# NVIDIA:
# - CUDA: 13.0.0.044
# - cuDNN: 9.12.0.46
# - cuBLAS: 13.0.0.19
# - NCCL: 2.27.7
# - TransformerEngine: v2.5
# - torch: 2.8.0a0+34c6371d24
# Mellanox:
# - MOFED_VERSION: (installed in this dockerfile)
# - HPC-X: 2.24
# Intel:
# - mlc: ??? (amd64 only)
LABEL maintainer="SuperBench"
# NOTE(review): noninteractive persists into the runtime environment via ENV;
# keep if downstream tooling relies on it, otherwise a build-time ARG would be
# the cleaner choice -- confirm before changing.
ENV DEBIAN_FRONTEND=noninteractive
# Install build tools and runtime utilities in a single layer.
# The package list is kept sorted alphabetically for easy diffing; apt lists
# are removed in the same layer so they never persist in the image.
RUN apt-get update && \
apt-get install -y --no-install-recommends \
autoconf \
automake \
bc \
build-essential \
curl \
dmidecode \
ffmpeg \
git \
iproute2 \
jq \
libaio-dev \
libavcodec-dev \
libavformat-dev \
libavutil-dev \
libboost-program-options-dev \
libcap2 \
libcurl4-openssl-dev \
libncurses-dev \
libnuma-dev \
libpci-dev \
libswresample-dev \
libtool \
lshw \
net-tools \
nlohmann-json3-dev \
openssh-client \
openssh-server \
pciutils \
python3-mpi4py \
rsync \
sudo \
util-linux \
vim \
wget \
&& \
apt-get autoremove -y && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/*
# NUM_MAKE_JOBS caps parallel make jobs in later build steps (empty = make default).
ARG NUM_MAKE_JOBS=
ARG TARGETPLATFORM
ARG TARGETARCH
# Install the static Docker client binaries.
# uname -m yields x86_64/aarch64, which matches the download URL layout
# (Docker's TARGETARCH would yield amd64/arm64 and would NOT match).
ENV DOCKER_VERSION=20.10.8
RUN TARGETARCH_HW=$(uname -m) && \
wget -q https://download.docker.com/linux/static/stable/${TARGETARCH_HW}/docker-${DOCKER_VERSION}.tgz -O docker.tgz && \
tar --extract --file docker.tgz --strip-components 1 --directory /usr/local/bin/ && \
rm docker.tgz
# Update system config: enable root SSH login with per-user environments and
# raise open-file limits for benchmark workloads.
# printf is used instead of `echo "a\nb"` because escape expansion in echo is
# shell-dependent (works in dash, not in bash without -e); printf is portable.
RUN mkdir -p /root/.ssh && \
touch /root/.ssh/authorized_keys && \
mkdir -p /var/run/sshd && \
sed -i "s/[# ]*PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \
sed -i "s/[# ]*PermitUserEnvironment no/PermitUserEnvironment yes/" /etc/ssh/sshd_config && \
sed -i "s/[# ]*Port.*/Port 22/" /etc/ssh/sshd_config && \
printf '%s\n' "* soft nofile 1048576" "* hard nofile 1048576" >> /etc/security/limits.conf && \
printf '%s\n' "root soft nofile 1048576" "root hard nofile 1048576" >> /etc/security/limits.conf
# Install Mellanox OFED (user-space libraries only; the container relies on
# the host kernel's drivers, so no firmware update is attempted).
ENV OFED_VERSION=24.10-1.1.4.0
RUN TARGETARCH_HW=$(uname -m) && \
cd /tmp && \
wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu24.04-${TARGETARCH_HW}.tgz && \
tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu24.04-${TARGETARCH_HW}.tgz && \
MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu24.04-${TARGETARCH_HW}/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \
rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}*
# Install HPC-X (CUDA 13 build) into /opt/hpcx; any previous install is
# removed first so the move below cannot collide with an existing directory.
ENV HPCX_VERSION=v2.24.1
RUN TARGETARCH_HW=$(uname -m) && \
cd /opt && \
rm -rf hpcx && \
wget https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}_cuda13/hpcx-${HPCX_VERSION}-gcc-doca_ofed-ubuntu24.04-cuda13-${TARGETARCH_HW}.tbz -O hpcx.tbz && \
tar xf hpcx.tbz && \
mv hpcx-${HPCX_VERSION}-gcc-doca_ofed-ubuntu24.04-cuda13-${TARGETARCH_HW} hpcx && \
rm hpcx.tbz
# Installs specific to the amd64 platform (Intel MLC, AOCC, AMD BLIS are
# published for x86_64 only). Uses apt-get rather than apt: the apt CLI is
# not script-stable (hadolint DL3027).
RUN if [ "$TARGETARCH" = "amd64" ]; then \
# Install Intel MLC
cd /tmp && \
wget -q https://downloadmirror.intel.com/793041/mlc_v3.11.tgz -O mlc.tgz && \
tar xzf mlc.tgz Linux/mlc && \
cp ./Linux/mlc /usr/local/bin/ && \
rm -rf ./Linux mlc.tgz && \
# Install AOCC compiler
wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-4.0.0_1_amd64.deb && \
apt-get install -y ./aocc-compiler-4.0.0_1_amd64.deb && \
rm -rf aocc-compiler-4.0.0_1_amd64.deb && \
# Install AMD BLIS
wget https://download.amd.com/developer/eula/blis/blis-4-0/aocl-blis-linux-aocc-4.0.tar.gz && \
tar xzf aocl-blis-linux-aocc-4.0.tar.gz && \
mv amd-blis /opt/AMD && \
rm -rf aocl-blis-linux-aocc-4.0.tar.gz; \
else \
echo "Skipping Intel MLC, AOCC and AMD Bliss installations for non-amd64 architecture: $TARGETARCH"; \
fi
# Install UCX with multi-threading support.
# NOTE(review): the tarball is fetched from the v1.18.0-rc1 release tag but is
# named ucx-1.18.0.tar.gz -- confirm this pairing is intentional.
# The source tree and tarball are removed in the same layer so they do not
# bloat the image.
ENV UCX_VERSION=1.18.0
RUN cd /tmp && \
wget -q https://github.com/openucx/ucx/releases/download/v${UCX_VERSION}-rc1/ucx-${UCX_VERSION}.tar.gz && \
tar xzf ucx-${UCX_VERSION}.tar.gz && \
cd ucx-${UCX_VERSION} && \
./contrib/configure-release-mt --prefix=/usr/local && \
make -j ${NUM_MAKE_JOBS} && \
make install && \
rm -rf /tmp/ucx-${UCX_VERSION} /tmp/ucx-${UCX_VERSION}.tar.gz
# Runtime environment: library search paths, SuperBench locations, and
# Ansible settings. PATH is inherited unchanged from the base image, so the
# former no-op `ENV PATH="${PATH}"` declaration is dropped.
ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/mpi/lib:${LD_LIBRARY_PATH}" \
SB_HOME=/opt/superbench \
SB_MICRO_PATH=/opt/superbench \
ANSIBLE_DEPRECATION_WARNINGS=FALSE \
ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections
# Persist the environment for SSH sessions and load HPC-X in both interactive
# (bash.bashrc) and login (profile.d) shells.
RUN echo PATH="$PATH" > /etc/environment && \
echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \
echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment && \
echo "source /opt/hpcx/hpcx-init.sh && hpcx_load" | tee -a /etc/bash.bashrc >> /etc/profile.d/10-hpcx.sh
# Add config files
ADD dockerfile/etc /opt/microsoft/
WORKDIR ${SB_HOME}
# Build third-party CUDA benchmarks before copying the full source tree so
# the (slow) third_party layer stays cached across source-only changes.
ADD third_party third_party
RUN make -C third_party cuda
ADD . .
# Install SuperBench with the NVIDIA worker extras, build the C++ helpers,
# and drop the git metadata to keep the image smaller.
RUN python3 -m pip install --upgrade setuptools==78.1.0 && \
python3 -m pip install --no-cache-dir .[nvworker] && \
make cppbuild && \
make postinstall && \
rm -rf .git
......@@ -38,4 +38,11 @@ if(NOT DEFINED NVCC_ARCHS_SUPPORTED)
if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12.8)
list(APPEND NVCC_ARCHS_SUPPORTED 100)
endif()
if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12.9)
list(APPEND NVCC_ARCHS_SUPPORTED 103)
endif()
# CUDA 13.0+ drops support for archs before sm_75, remove them
if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
list(REMOVE_ITEM NVCC_ARCHS_SUPPORTED 53 60 61 70 72)
endif()
endif()
......@@ -171,7 +171,12 @@ void ParseCommandLine(int argc, char *argv[], char *szInputFileName, int &iGpu,
OptimizedNvDecoder *InitOptimizedNvDecoder(int i, const CUdevice &cuDevice, CUcontext &cuContext, bool bSingle,
bool bHost, cudaVideoCodec codec, CUVIDDECODECAPS decodecaps) {
if (!bSingle) {
#if CUDA_VERSION >= 13000
CUctxCreateParams ctxCreateParams = {};
ck(cuCtxCreate(&cuContext, &ctxCreateParams, 0, cuDevice));
#else
ck(cuCtxCreate(&cuContext, 0, cuDevice));
#endif
}
OptimizedNvDecoder *sessionObject = new OptimizedNvDecoder(cuContext, !bHost, codec, decodecaps);
sessionObject->setDecoderSessionID(i);
......@@ -247,7 +252,12 @@ void InitializeContext(std::vector<OptimizedNvDecoder *> &vDec, int iGpu, int nT
std::cout << "GPU in use: " << szDeviceName << std::endl;
CUcontext cuContext = NULL;
#if CUDA_VERSION >= 13000
CUctxCreateParams ctxCreateParams = {};
ck(cuCtxCreate(&cuContext, &ctxCreateParams, 0, cuDevice));
#else
ck(cuCtxCreate(&cuContext, 0, cuDevice));
#endif
CUVIDDECODECAPS decodecaps;
GetDefaultDecoderCaps(decodecaps, codec);
......
......@@ -53,6 +53,7 @@ def __init__(self, name, parameters=''):
# Skip INT4 for Hopper due to no native CUDA/Tensor Cores
self.__kernel_map[9.0] = {k: self.__kernel_map[8.0][k] for k in self.__kernel_map[8.0] if 'int4_tc' not in k}
self.__kernel_map[10.0] = {k: self.__kernel_map[8.0][k] for k in self.__kernel_map[8.0] if 'int4_tc' not in k}
self.__kernel_map[10.3] = {k: self.__kernel_map[8.0][k] for k in self.__kernel_map[8.0] if 'int4_tc' not in k}
self.__parse_logline = [
'gemm,cutlass_simt_dgemm_128x128_8x2', 'gemm,cutlass_simt_sgemm_128x128_8x2',
'gemm,cutlass_simt_hgemm_256x128_8x2', 'gemm,cutlass_tensorop_d884gemm_128x128_16x3',
......
......@@ -9,6 +9,7 @@
#include "gpu_stream.hpp"
#include <cassert>
#include <iostream>
#include <nvml.h>
/**
* @brief Destroys the CUDA events used for benchmarking.
......@@ -103,23 +104,73 @@ template <typename T> int GpuStream::Destroy(std::unique_ptr<BenchArgs<T>> &args
return ret;
}
/**
* @brief Gets the memory clock rate for a CUDA device.
*
* @details This function gets the memory clock rate using the appropriate method
* based on CUDA version: CUDA 12.0+ uses NVML and cudaDeviceGetAttribute as a fallback;
* older CUDA versions use cudaDeviceProp.
*
* @param[in] device_id The ID of the CUDA device.
* @param[in] prop The properties of the CUDA device.
* @return float The memory clock rate in MHz, or -1.0f if retrieval fails.
*/
/**
 * @brief Queries the memory clock rate (in MHz) of the given CUDA device.
 *
 * @details On CUDA 12.0+ the rate is taken from NVML first (actual clock),
 * falling back to cudaDeviceGetAttribute if NVML fails; on older toolkits it
 * comes from the cudaDeviceProp supplied by the caller.
 *
 * @param[in] device_id The ID of the CUDA device.
 * @param[in] prop The properties of the CUDA device (only read for CUDA < 12.0).
 * @return float The memory clock rate in MHz, or -1.0f if retrieval fails.
 */
float GpuStream::GetMemoryClockRate(int device_id, const cudaDeviceProp &prop) {
    // Make the target device current before querying any attributes.
    if (SetGpu(device_id) != 0) {
        return -1.0f;
    }
#if CUDA_VERSION >= 12000
    // Prefer the actual clock reported by NVML.
    float clock_mhz = GetActualMemoryClockRate(device_id);
    if (clock_mhz >= 0.0f) {
        return clock_mhz;
    }
    // NVML failed; fall back to the rated clock from the CUDA runtime.
    int rate_khz = 0;
    cudaError_t err = cudaDeviceGetAttribute(&rate_khz, cudaDevAttrMemoryClockRate, device_id);
    if (err != cudaSuccess) {
        std::cerr << "GetMemoryClockRate::cudaDeviceGetAttribute error: " << err << std::endl;
        return -1.0f;
    }
    return rate_khz / 1000.0f; // kHz -> MHz
#else
    // Older toolkits still expose memoryClockRate (in kHz) via cudaDeviceProp.
    return prop.memoryClockRate / 1000.0f; // kHz -> MHz
#endif
}
/**
* @brief Prints CUDA device information.
*
* @details This function prints the properties of a CUDA device specified by the device ID.
*
* @param[in] device_id The ID of the CUDA device.
* @param[out] prop The properties of the CUDA device.
* @return void
* */
void GpuStream::PrintCudaDeviceInfo(int device_id, const cudaDeviceProp &prop) {
* @param[in] prop The properties of the CUDA device.
* @param[in] memory_clock_mhz The memory clock rate in MHz (or -1.0f if unavailable).
* @param[in] peak_bw The theoretical peak bandwidth in GB/s (or -1.0f if unavailable).
*/
void GpuStream::PrintCudaDeviceInfo(int device_id, const cudaDeviceProp &prop, float memory_clock_mhz, float peak_bw) {
std::cout << "\nDevice " << device_id << ": \"" << prop.name << "\"";
std::cout << " " << prop.multiProcessorCount << " SMs(" << prop.major << "." << prop.minor << ")";
// Compute theoretical bw:
// https://developer.nvidia.com/blog/how-implement-performance-metrics-cuda-cc/#theoretical_bandwidth
std::cout << " Memory: " << prop.memoryClockRate / 1000.0 << "MHz x " << prop.memoryBusWidth
<< "-bit = " << (prop.memoryClockRate / 1000.0) * (prop.memoryBusWidth / 8) * 2 / 1000.0 << " GB/s PEAK ";
if (peak_bw < 0.0f) {
// Bandwidth computation failed (memory_clock_mhz will also be < 0)
std::cout << " Memory: " << prop.memoryBusWidth << "-bit bus, " << prop.totalGlobalMem / (1024 * 1024 * 1024)
<< " GB total (peak BW unavailable)";
} else {
// Both memory_clock_mhz and peak_bw are valid
std::cout << " Memory: " << memory_clock_mhz << "MHz x " << prop.memoryBusWidth << "-bit = " << peak_bw
<< " GB/s PEAK ";
}
std::cout << " ECC is " << (prop.ECCEnabled ? "ON" : "OFF") << std::endl;
}
......@@ -158,11 +209,12 @@ template <typename T> int GpuStream::PrepareValidationBuf(std::unique_ptr<BenchA
}
// Initialize validation buffer
T s = static_cast<T>(scalar);
for (size_t j = 0; j < size; j++) {
args->sub.validation_buf_ptrs[0][j] = static_cast<T>(j % kUInt8Mod);
args->sub.validation_buf_ptrs[1][j] = static_cast<T>(j % kUInt8Mod) * scalar;
args->sub.validation_buf_ptrs[1][j] = static_cast<T>(j % kUInt8Mod) * s;
args->sub.validation_buf_ptrs[2][j] = static_cast<T>(j % kUInt8Mod) + static_cast<T>(j % kUInt8Mod);
args->sub.validation_buf_ptrs[3][j] = static_cast<T>(j % kUInt8Mod) + static_cast<T>(j % kUInt8Mod) * scalar;
args->sub.validation_buf_ptrs[3][j] = static_cast<T>(j % kUInt8Mod) + static_cast<T>(j % kUInt8Mod) * s;
}
return 0;
}
......@@ -404,7 +456,7 @@ int GpuStream::RunStreamKernel(std::unique_ptr<BenchArgs<T>> &args, Kernel kerne
case Kernel::kScale:
ScaleKernel<<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[2].get()),
reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[0].get()), scalar);
reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[0].get()), static_cast<T>(scalar));
args->sub.kernel_name = "SCALE";
break;
case Kernel::kAdd:
......@@ -419,7 +471,7 @@ int GpuStream::RunStreamKernel(std::unique_ptr<BenchArgs<T>> &args, Kernel kerne
TriadKernel<<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[2].get()),
reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[0].get()),
reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[1].get()), scalar);
reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[1].get()), static_cast<T>(scalar));
size_factor = 3;
args->sub.kernel_name = "TRIAD";
break;
......@@ -458,6 +510,38 @@ int GpuStream::RunStreamKernel(std::unique_ptr<BenchArgs<T>> &args, Kernel kerne
return 0;
}
float GpuStream::GetActualMemoryClockRate(int gpu_id) {
nvmlReturn_t result;
nvmlDevice_t device;
unsigned int clock_mhz;
// Initialize NVML
result = nvmlInit();
if (result != NVML_SUCCESS) {
std::cerr << "Failed to initialize NVML: " << nvmlErrorString(result) << std::endl;
return -1.0f;
}
// Get device handle
result = nvmlDeviceGetHandleByIndex(gpu_id, &device);
if (result != NVML_SUCCESS) {
std::cerr << "Failed to get device handle: " << nvmlErrorString(result) << std::endl;
nvmlShutdown();
return -1.0f;
}
// Get memory clock rate
result = nvmlDeviceGetClockInfo(device, NVML_CLOCK_MEM, &clock_mhz);
if (result != NVML_SUCCESS) {
std::cerr << "Failed to get memory clock: " << nvmlErrorString(result) << std::endl;
nvmlShutdown();
return -1.0f;
}
nvmlShutdown();
return static_cast<float>(clock_mhz);
}
/**
* @brief Runs the benchmark for various kernels and processes the results for a BenchArgs config.
*
......@@ -470,13 +554,11 @@ int GpuStream::RunStreamKernel(std::unique_ptr<BenchArgs<T>> &args, Kernel kerne
*
* @return int The status code indicating success or failure of the benchmark execution.
* */
template <typename T> int GpuStream::RunStream(std::unique_ptr<BenchArgs<T>> &args, const std::string &data_type) {
template <typename T>
int GpuStream::RunStream(std::unique_ptr<BenchArgs<T>> &args, const std::string &data_type, float peak_bw) {
int ret = 0;
ret = PrepareBufAndStream<T>(args);
float peak_bw =
args->gpu_device_prop.memoryClockRate / 1000.0 * args->gpu_device_prop.memoryBusWidth / 8 * 2 / 1000.0;
if (ret != 0) {
return DestroyBufAndStream(args);
}
......@@ -508,8 +590,13 @@ template <typename T> int GpuStream::RunStream(std::unique_ptr<BenchArgs<T>> &ar
for (int j = 0; j < args->sub.times_in_ms[i].size(); j++) {
// Calculate and display bandwidth
double bw = args->size * args->num_loops / args->sub.times_in_ms[i][j] / 1e6;
std::cout << tag << "_block_" << kThreadsPerBlock[j] << "\t" << bw << "\t" << std::fixed
<< std::setprecision(2) << bw / peak_bw * 100 << std::endl;
std::cout << tag << "_block_" << kThreadsPerBlock[j] << "\t" << bw << "\t";
if (peak_bw < 0) { // cannot get peak_bw -> prints -1 for efficiency
std::cout << "-1" << std::endl;
} else {
std::cout << std::fixed << std::setprecision(2) << bw / peak_bw * 100 << std::endl;
}
}
}
// cleanup buffer and streams for the curr arg
......@@ -567,7 +654,20 @@ int GpuStream::Run() {
for (auto &variant_args : bench_args_) {
std::visit(
[&](auto &curr_args) {
PrintCudaDeviceInfo(curr_args->gpu_id, curr_args->gpu_device_prop);
// Get memory clock rate once for both bandwidth computation and display
float memory_clock_mhz = GetMemoryClockRate(curr_args->gpu_id, curr_args->gpu_device_prop);
// Compute theoretical bandwidth using the memory clock rate
float peak_bw = -1.0f;
if (memory_clock_mhz > 0.0f) {
// Calculate theoretical bandwidth: memory_clock_mhz * bus_width_bytes * 2 (DDR) / 1000 (convert to
// GB/s)
peak_bw = memory_clock_mhz * (curr_args->gpu_device_prop.memoryBusWidth / 8) * 2 / 1000.0;
}
// Print device info with both the memory clock and peak bandwidth
PrintCudaDeviceInfo(curr_args->gpu_id, curr_args->gpu_device_prop, memory_clock_mhz, peak_bw);
// Set the NUMA node
ret = numa_run_on_node(curr_args->numa_id);
if (ret != 0) {
......@@ -576,11 +676,11 @@ int GpuStream::Run() {
return;
}
// Run the stream benchmark for the configued data
// Run the stream benchmark for the configured data, passing the peak bandwidth
if constexpr (std::is_same_v<std::decay_t<decltype(*curr_args)>, BenchArgs<float>>) {
ret = RunStream<float>(curr_args, "float");
ret = RunStream<float>(curr_args, "float", peak_bw);
} else if constexpr (std::is_same_v<std::decay_t<decltype(*curr_args)>, BenchArgs<double>>) {
ret = RunStream<double>(curr_args, "double");
ret = RunStream<double>(curr_args, "double", peak_bw);
} else {
std::cerr << "Run::Unknown type error" << std::endl;
has_error = true;
......
......@@ -52,10 +52,12 @@ class GpuStream {
// Benchmark functions
template <typename T> int RunStreamKernel(std::unique_ptr<BenchArgs<T>> &, Kernel, int);
template <typename T> int RunStream(std::unique_ptr<BenchArgs<T>> &, const std::string &data_type);
float GetActualMemoryClockRate(int gpu_id);
template <typename T> int RunStream(std::unique_ptr<BenchArgs<T>> &, const std::string &data_type, float peak_bw);
// Helper functions
int GetGpuCount(int *);
int SetGpu(int gpu_id);
void PrintCudaDeviceInfo(int, const cudaDeviceProp &);
float GetMemoryClockRate(int device_id, const cudaDeviceProp &prop);
void PrintCudaDeviceInfo(int device_id, const cudaDeviceProp &prop, float memory_clock_mhz, float peak_bw);
};
\ No newline at end of file
......@@ -85,7 +85,7 @@ __global__ void CopyKernel(double *tgt, const double *src) {
* @param[in] src The source array containing the data to be scaled.
* @param[in] scalar The scalar value used to scale the source data.
*/
__global__ void ScaleKernel(double *tgt, const double *src, const long scalar) {
__global__ void ScaleKernel(double *tgt, const double *src, const double scalar) {
uint64_t index = blockIdx.x * blockDim.x * kNumLoopUnrollAlias + threadIdx.x;
double val[kNumLoopUnrollAlias];
#pragma unroll
......@@ -137,7 +137,7 @@ __global__ void AddKernel(double *tgt, const double *src_a, const double *src_b)
* @param[in] src_b The second source array containing the second set of operands to be multiplied by the scalar.
* @param[in] scalar The scalar value used in the multiply/add operation.
*/
__global__ void TriadKernel(double *tgt, const double *src_a, const double *src_b, const long scalar) {
__global__ void TriadKernel(double *tgt, const double *src_a, const double *src_b, const double scalar) {
uint64_t index = blockIdx.x * blockDim.x * kNumLoopUnrollAlias + threadIdx.x;
double val_a[kNumLoopUnrollAlias];
double val_b[kNumLoopUnrollAlias];
......
......@@ -14,6 +14,6 @@ template <typename T> inline __device__ void Fetch(T &v, const T *p);
template <typename T> inline __device__ void Store(T *p, const T &v);
__global__ void CopyKernel(double *, const double *);
__global__ void ScaleKernel(double *, const double *, const long);
__global__ void ScaleKernel(double *, const double *, const double);
__global__ void AddKernel(double *, const double *, const double *);
__global__ void TriadKernel(double *, const double *, const double *, const long);
\ No newline at end of file
__global__ void TriadKernel(double *, const double *, const double *, const double);
\ No newline at end of file
......@@ -36,7 +36,7 @@ constexpr int kNumBuffers = 3; // Number
constexpr int kNumValidationBuffers = 4; // Number of validation buffers, one for each kernel
constexpr int kUInt8Mod = 256; // Modulo for unsigned long data type
constexpr std::array<int, 4> kBufferBwMultipliers = {2, 2, 3, 3}; // Buffer multiplier for triad, add kernel
constexpr long scalar = 11; // Scalar for scale, triad kernel
constexpr double scalar = 11.0; // Scalar for scale, triad kernel
// Enum for different kernels
enum class Kernel {
......
......@@ -20,6 +20,7 @@ NUM_MAKE_JOBS ?= $(shell nproc --ignore=2)
# Build targets.
all: cuda rocm
# msccl: api change in cudaStreamUpdateCaptureDependencies
cuda_with_msccl: cuda cuda_msccl
cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed nvbandwidth
rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm rocm_megatron_lm
......@@ -40,7 +41,11 @@ sb_micro_path:
# Build cutlass.
# for cuda 12.9 and later Build from commit v3.9 (3.9 release commit) for blackwell support
cuda_cutlass:
ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
ifeq ($(shell echo $(CUDA_VER)">=12.9" | bc -l), 1)
$(eval ARCHS := "100;103")
if [ -d cutlass ]; then rm -rf cutlass; fi
git clone --branch v4.1.0 --depth 1 https://github.com/NVIDIA/cutlass.git && cd cutlass
else ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
$(eval ARCHS := "90;100")
if [ -d cutlass ]; then rm -rf cutlass; fi
git clone --branch v3.9.2 --depth 1 https://github.com/NVIDIA/cutlass.git && cd cutlass
......@@ -108,7 +113,7 @@ ifneq (,$(wildcard nccl-tests/Makefile))
endif
# Build perftest.
# The version we use is the tag v4.5-0.2.
# The commit we use is 4bee61f80d9e268fc97eaf40be00409e91d3a19e.
cuda_perftest:
ifneq (,$(wildcard perftest/autogen.sh))
cd perftest && ./autogen.sh && ./configure CUDA_H_PATH=/usr/local/cuda/include/cuda.h --prefix=$(SB_MICRO_PATH) && make -j && make install
......@@ -264,7 +269,12 @@ apex_rocm:
# Build MSCCL for CUDA
cuda_msccl: sb_micro_path
ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
ifeq ($(shell echo $(CUDA_VER)">=12.9" | bc -l), 1)
$(eval ARCHS := 100 103)
if [ -d msccl ]; then rm -rf msccl; fi; \
git clone --single-branch --branch main https://github.com/Azure/msccl.git \
&& git -C msccl checkout 87048bd && git -C msccl submodule update --recursive --init
else ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
# Get commit 87048bd from msscl to support updated nccl and sm_100
$(eval ARCHS := 75 80 86 89 90 100)
if [ -d msccl ]; then rm -rf msccl; fi; \
......
Subproject commit cab8221b1147e83dd1fea3e42c3fe255254236ff
Subproject commit 565e55b46f9885688ba9737f1600b1f62d47a95e
Subproject commit fb851de841a0b1fb261cbc3a6fe626f17a19ba0f
Subproject commit 66746a3bef61c8c2e12ab34955310da70b9e38cb
Subproject commit dffd1dd8b8a26dad2634a546e7e4d082dc882fbc
Subproject commit 4bee61f80d9e268fc97eaf40be00409e91d3a19e
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment