Commit c4f39919 authored by one's avatar one
Browse files

Update DTK dockerfile and microbenchmarks

- Update rocm_common.cmake for CMake>=3.24
- Prevent build isolation (pip --no-build-isolation)
- Add BabelStream as a submodule
- Update dockerignore
parent 0fdfe4c3
......@@ -15,3 +15,8 @@ outputs/
**/.dockerignore
.github/
.azure-pipelines/
# Build directories
**/build/
**/Build/
**/build-*/
......@@ -33,3 +33,6 @@
[submodule "third_party/nvbandwidth"]
path = third_party/nvbandwidth
url = https://github.com/NVIDIA/nvbandwidth.git
[submodule "third_party/BabelStream"]
path = third_party/BabelStream
url = https://github.com/UoB-HPC/BabelStream.git
......@@ -77,13 +77,14 @@ RUN cd /tmp && \
tar xzf ucx-${UCX_VERSION}.tar.gz && \
cd ucx-${UCX_VERSION} && \
./contrib/configure-release --prefix=${UCX_HOME} \
--enable-optimizations --enable-tuning \
--enable-cma --enable-mt \
--with-mlx5 --with-rc --with-ud --with-dc --with-dm --with-ib_hw_tm \
--with-verbs=/usr/include --with-rdmacm=/usr \
--with-rocm=${ROCM_PATH} \
--without-knem --without-cuda --without-java && \
--enable-optimizations --enable-tuning \
--enable-cma --enable-mt \
--with-mlx5 --with-rc --with-ud --with-dc --with-dm --with-ib_hw_tm \
--with-verbs=/usr/include --with-rdmacm=/usr \
--with-rocm=${ROCM_PATH} \
--without-knem --without-cuda --without-java && \
make -j $(nproc) && \
rm -rf ${UCX_HOME} && \
make install && \
rm -rf /tmp/ucx-${UCX_VERSION}*
......@@ -95,27 +96,40 @@ RUN cd /tmp && \
tar xzf openmpi-${OMPI_VERSION}.tar.gz && \
cd openmpi-${OMPI_VERSION} && \
./configure --prefix=${MPI_HOME} \
--with-ucx=${UCX_HOME} \
--with-rocm=${ROCM_PATH} \
--enable-builtin-atomics \
--enable-wrapper-rpath \
--enable-mca-no-build=btl-uct \
--enable-prte-prefix-by-default && \
--with-ucx=${UCX_HOME} \
--with-rocm=${ROCM_PATH} \
--enable-builtin-atomics \
--enable-wrapper-rpath \
--enable-mca-no-build=btl-uct \
--enable-prte-prefix-by-default && \
make -j $(nproc) && \
rm -rf ${MPI_HOME} && \
make install && \
ldconfig && \
cd / && \
rm -rf /tmp/openmpi-${OMPI_VERSION}*
# Install Intel MLC
# RUN cd /tmp && \
# wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \
# tar xzf mlc.tgz Linux/mlc && \
# cp ./Linux/mlc /usr/local/bin/ && \
# rm -rf ./Linux mlc.tgz
RUN cd /tmp && \
wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \
tar xzf mlc.tgz Linux/mlc && \
cp ./Linux/mlc /usr/local/bin/ && \
rm -rf ./Linux mlc.tgz
# Install AMD SMI Python Library
RUN python3 -m pip install amdsmi==5.7.0
RUN cd /tmp && \
wget -q https://github.com/ROCm/amdsmi/archive/refs/tags/rocm-5.7.0.tar.gz -O amdsmi.tar.gz && \
tar xzf amdsmi.tar.gz --transform 's/amdsmi-rocm-5.7.0/amdsmi/' && \
cd amdsmi && \
cmake -S . -B build && \
cmake --build build -j $(nproc) && \
cmake --install build --prefix ${ROCM_PATH}/ && \
rm -rf amdsmi.tar.gz amdsmi && \
python3 -m pip install amdsmi==5.7.0
# Add rocblas-bench to path
RUN ln -s ${ROCM_PATH}/lib/rocblas/benchmark_tool/rocblas-bench ${ROCM_PATH}/bin/ && \
chmod +x ${ROCM_PATH}/bin/rocblas-bench
ENV PATH="${MPI_HOME}/bin:${UCX_HOME}/bin:/opt/superbench/bin:/usr/local/bin/${PATH:+:${PATH}}" \
LD_LIBRARY_PATH="${MPI_HOME}/lib:${UCX_HOME}/lib:/usr/lib/x86_64-linux-gnu/:/usr/local/lib/${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" \
......@@ -128,17 +142,30 @@ RUN echo PATH="$PATH" > /etc/environment && \
echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \
echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment
RUN python3 -m pip install --upgrade pip wheel setuptools==65.7 mpi4py
WORKDIR ${SB_HOME}
ADD third_party third_party
RUN make RCCL_HOME=${ROCM_PATH}/rccl ROCM_PATH=${ROCM_PATH} HIP_HOME=${ROCM_PATH}/hip MPI_HOME=${MPI_HOME} -C third_party dtk -o cpu_hpl -o cpu_stream -o megatron_lm -o apex_rocm -o megatron_deepspeed -o rocm_megatron_lm
ADD . .
# ENV USE_HIP_DATATYPE=1
# ENV USE_HIPBLAS_COMPUTETYPE=1
RUN python3 -m pip install .[hgworker] && \
CXX=${ROCM_PATH}/bin/hipcc make cppbuild && \
COPY third_party third_party
RUN --mount=type=bind,from=hyhal,source=/,target=/opt/hyhal \
make \
RCCL_HOME=${ROCM_PATH}/rccl \
ROCM_PATH=${ROCM_PATH} \
HIP_HOME=${ROCM_PATH}/hip \
MPI_HOME=${MPI_HOME} \
-C third_party \
dtk \
-o cpu_hpl \
-o cpu_stream \
-o megatron_lm \
-o apex_rocm \
-o megatron_deepspeed \
-o rocm_megatron_lm
COPY . .
ENV USE_HIP_DATATYPE=1
ENV USE_HIPBLAS_COMPUTETYPE=1
RUN --mount=type=bind,from=hyhal,source=/,target=/opt/hyhal \
python3 -m pip install --upgrade pip wheel setuptools==65.7 mpi4py && \
python3 -m pip install --no-build-isolation .[hgworker] && \
make cppbuild && \
make postinstall
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
cmake_minimum_required(VERSION 3.18)
cmake_minimum_required(VERSION 3.24)
project(cpu_copy LANGUAGES CXX)
find_package(CUDAToolkit QUIET)
# Cuda environment
if(CUDAToolkit_FOUND)
message(STATUS "Found CUDA: " ${CUDAToolkit_VERSION})
include(../cuda_common.cmake)
add_executable(cpu_copy cpu_copy.cpp)
set_property(TARGET cpu_copy PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
target_link_libraries(cpu_copy numa)
else()
# ROCm environment
include(../rocm_common.cmake)
find_package(hip QUIET)
if(hip_FOUND)
message(STATUS "Found ROCm: " ${HIP_VERSION})
# Convert cuda code to hip code in cpp
execute_process(COMMAND hipify-perl -print-stats -o cpu_copy.cpp cpu_copy.cu WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/)
# link hip device lib
add_executable(cpu_copy cpu_copy.cpp)
include(CheckSymbolExists)
check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY)
if(${HIP_UNCACHED_MEMORY})
target_compile_definitions(cpu_copy PRIVATE HIP_UNCACHED_MEMORY)
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
target_link_libraries(cpu_copy numa hip::device)
else()
message(FATAL_ERROR "No CUDA or ROCm environment found.")
endif()
endif()
add_executable(cpu_copy cpu_copy.cpp)
target_compile_options(cpu_copy PRIVATE -O2)
target_link_libraries(cpu_copy PRIVATE numa)
install(TARGETS cpu_copy RUNTIME DESTINATION bin)
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
cmake_minimum_required(VERSION 3.18)
cmake_minimum_required(VERSION 3.24)
project(dist_inference LANGUAGES CXX)
......@@ -10,39 +10,31 @@ include_directories(SYSTEM ${MPI_INCLUDE_PATH})
find_package(CUDAToolkit QUIET)
# Cuda environment
if(CUDAToolkit_FOUND)
if(CUDAToolkit_FOUND) # CUDA environment
message(STATUS "Found CUDA: " ${CUDAToolkit_VERSION})
include(../cuda_common.cmake)
add_executable(dist_inference dist_inference.cu)
set_property(TARGET dist_inference PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
target_link_libraries(dist_inference MPI::MPI_CXX nccl cublasLt)
else()
# ROCm environment
target_link_libraries(dist_inference PRIVATE MPI::MPI_CXX nccl cublasLt)
else() # ROCm environment
include(../rocm_common.cmake)
find_package(hip QUIET)
if(hip_FOUND)
message(STATUS "Found ROCm: " ${HIP_VERSION})
# Convert cuda code to hip code in cpp
execute_process(COMMAND hipify-perl -print-stats -o dist_inference.cpp dist_inference.cu WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/)
# link hip device lib
add_executable(dist_inference dist_inference.cpp)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -DROCM_USE_FLOAT16=1")
if(DEFINED ENV{USE_HIPBLASLT_DATATYPE})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_HIPBLASLT_DATATYPE=1")
elseif(DEFINED ENV{USE_HIP_DATATYPE})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_HIP_DATATYPE=1")
endif()
if(DEFINED ENV{USE_HIPBLAS_COMPUTETYPE})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_HIPBLAS_COMPUTETYPE=1")
endif()
target_link_libraries(dist_inference MPI::MPI_CXX rccl hipblaslt hip::device)
else()
message(FATAL_ERROR "No CUDA or ROCm environment found.")
hipify_sources(HIP_FILES dist_inference.cu)
add_executable(dist_inference ${HIP_FILES})
target_compile_options(dist_inference PRIVATE -O2)
target_compile_definitions(dist_inference PRIVATE ROCM_USE_FLOAT16=1)
if(DEFINED ENV{USE_HIPBLASLT_DATATYPE})
target_compile_definitions(dist_inference PRIVATE USE_HIPBLASLT_DATATYPE=1)
elseif(DEFINED ENV{USE_HIP_DATATYPE})
target_compile_definitions(dist_inference PRIVATE USE_HIP_DATATYPE=1)
endif()
if(DEFINED ENV{USE_HIPBLAS_COMPUTETYPE})
target_compile_definitions(dist_inference PRIVATE USE_HIPBLAS_COMPUTETYPE=1)
endif()
target_link_libraries(dist_inference PRIVATE MPI::MPI_CXX rccl hipblaslt)
endif()
install(TARGETS dist_inference RUNTIME DESTINATION bin)
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
cmake_minimum_required(VERSION 3.18)
cmake_minimum_required(VERSION 3.24)
project(gpu_copy LANGUAGES CXX)
find_package(CUDAToolkit QUIET)
# Cuda environment
if(CUDAToolkit_FOUND)
if(CUDAToolkit_FOUND) # CUDA environment
message(STATUS "Found CUDA: " ${CUDAToolkit_VERSION})
include(../cuda_common.cmake)
add_executable(gpu_copy gpu_copy.cu)
set_property(TARGET gpu_copy PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
target_link_libraries(gpu_copy numa)
else()
# ROCm environment
else() # ROCm environment
include(../rocm_common.cmake)
find_package(hip QUIET)
if(hip_FOUND)
message(STATUS "Found ROCm: " ${HIP_VERSION})
# Convert cuda code to hip code in cpp
execute_process(COMMAND hipify-perl -print-stats -o gpu_copy.cpp gpu_copy.cu WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/)
# link hip device lib
add_executable(gpu_copy gpu_copy.cpp)
include(CheckSymbolExists)
check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY)
if(${HIP_UNCACHED_MEMORY})
target_compile_definitions(gpu_copy PRIVATE HIP_UNCACHED_MEMORY)
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
target_link_libraries(gpu_copy numa hip::device)
else()
message(FATAL_ERROR "No CUDA or ROCm environment found.")
hipify_sources(HIP_FILES gpu_copy.cu)
add_executable(gpu_copy ${HIP_FILES})
target_compile_options(gpu_copy PRIVATE -O2)
target_link_libraries(gpu_copy PRIVATE numa)
include(CheckSymbolExists)
check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY)
if(${HIP_UNCACHED_MEMORY})
target_compile_definitions(gpu_copy PRIVATE HIP_UNCACHED_MEMORY)
endif()
endif()
install(TARGETS gpu_copy RUNTIME DESTINATION bin)
\ No newline at end of file
install(TARGETS gpu_copy RUNTIME DESTINATION bin)
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# Build configuration for the gpu_stream microbenchmark.
# Two paths: native CUDA when the CUDA toolkit is found, otherwise a ROCm/HIP
# build that converts the CUDA sources with hipify-perl.
cmake_minimum_required(VERSION 3.18)
project(gpu_stream LANGUAGES CXX)
# C++17 for both host and device compilation.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
find_package(CUDAToolkit QUIET)
# Source files
set(SOURCES
    gpu_stream_test.cpp
    gpu_stream_utils.cpp
    gpu_stream.cu
    gpu_stream_kernels.cu
)
# Cuda environment
if(CUDAToolkit_FOUND)
    message(STATUS "Found CUDA: " ${CUDAToolkit_VERSION})
    include(../cuda_common.cmake)
    add_executable(gpu_stream ${SOURCES})
    # NVCC_ARCHS_SUPPORTED is provided by cuda_common.cmake.
    set_property(TARGET gpu_stream PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
    target_include_directories(gpu_stream PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
    # nvidia-ml is required by the NVML memory-clock query in gpu_stream.cu.
    target_link_libraries(gpu_stream numa nvidia-ml)
else()
    # TODO: test for ROC
    # ROCm environment
    include(../rocm_common.cmake)
    find_package(hip QUIET)
    if(hip_FOUND)
        message(STATUS "Found ROCm: " ${HIP_VERSION})
        # Convert cuda code to hip code in cpp
        # NOTE(review): hipify-perl receives all of ${SOURCES} but a single
        # "-o gpu_stream.cpp" output, and this runs at configure time in the
        # source tree — confirm this produces the intended file(s).
        execute_process(COMMAND hipify-perl -print-stats -o gpu_stream.cpp ${SOURCES} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/)
        # link hip device lib
        add_executable(gpu_stream gpu_stream.cpp)
        include(CheckSymbolExists)
        # hipDeviceMallocUncached only exists on newer ROCm; gate a define on it.
        check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY)
        if(${HIP_UNCACHED_MEMORY})
            target_compile_definitions(gpu_stream PRIVATE HIP_UNCACHED_MEMORY)
        endif()
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
        target_link_libraries(gpu_stream numa hip::device)
    else()
        message(FATAL_ERROR "No CUDA or ROCm environment found.")
    endif()
endif()
install(TARGETS gpu_stream RUNTIME DESTINATION bin)
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
// GPU stream benchmark
// This benchmark is based on the STREAM benchmark, which is a simple benchmark program that measures
// sustainable memory bandwidth (in MB/s) and the corresponding computation rate for simple (COPY, SCALE, ADD, TRIAD)
// kernels.
#include "gpu_stream.hpp"
#include <cassert>
#include <iostream>
#include <nvml.h>
/**
* @brief Destroys the CUDA events used for benchmarking.
*
* @details This function cleans up and releases the resources associated with the CUDA events
* used for benchmarking based on the provided arguments. It ensures that all allocated resources
* are properly freed.
*
* @param[in,out] args A unique pointer to a BenchArgs structure
*
* @return int The status code indicating success or failure of the destruction process.
*/
template <typename T> int GpuStream::DestroyEvent(std::unique_ptr<BenchArgs<T>> &args) {
    // Release the start/end timing events on the GPU that owns them.
    if (SetGpu(args->gpu_id)) {
        return -1;
    }
    for (cudaEvent_t event : {args->sub.start_event, args->sub.end_event}) {
        const cudaError_t status = cudaEventDestroy(event);
        if (status != cudaSuccess) {
            std::cerr << "DestroyEvent::cudaEventDestroy error: " << status << std::endl;
            return -1;
        }
    }
    return 0;
}
/**
* @brief Constructor for the GpuStream class.
*
* This constructor initializes the GpuStream opts with the given parameters.
*
* @param opts parsed command line options..
*/
GpuStream::GpuStream(Opts &opts) noexcept : opts_(opts) {
    // Echo the parsed command-line options so each run is self-describing.
    PrintInputInfo(opts_);
}
/**
* @brief Sets the active GPU.
*
* @details This function sets the active GPU to the specified GPU ID.
*
* @param[in] gpu_id The ID of the GPU to set as active.
*
* @return int The status code indicating success or failure.
*/
// Make gpu_id the active CUDA device for subsequent runtime calls.
// Returns 0 on success, -1 on failure.
int GpuStream::SetGpu(int gpu_id) {
    cudaError_t cuda_err = cudaSetDevice(gpu_id);
    if (cuda_err != cudaSuccess) {
        // Fixed: a missing space glued the device id to "error:" in the message.
        std::cerr << "SetGpu::cudaSetDevice " << gpu_id << " error: " << cuda_err << std::endl;
        return -1;
    }
    return 0;
}
/**
* @brief Retrieves the number of GPUs available.
*
* @details This function retrieves the number of GPUs available on the system and stores the count
* in the provided pointer.
*
* @param[out] gpu_count Pointer to an integer where the GPU count will be stored.
*
* @return int The status code indicating success or failure.
*/
int GpuStream::GetGpuCount(int *gpu_count) {
cudaError_t cuda_err = cudaGetDeviceCount(gpu_count);
if (cuda_err != cudaSuccess) {
std::cerr << "GetGpuCount::cudaGetDeviceCount error: " << cuda_err << std::endl;
return -1;
}
return 0;
}
/**
* @brief destroys buff and stream resources.
*
* @details This method cleans up and releases the resources associated with the buffer and stream
*
* @param[in,out] args A unique pointer to a BenchArgs structure containing the necessary arguments.
*
* @return int The status code indicating success or failure.
*/
template <typename T> int GpuStream::Destroy(std::unique_ptr<BenchArgs<T>> &args) {
    // Tear down buffers/stream first; only destroy the events if that succeeded.
    const int buf_ret = DestroyBufAndStream(args);
    if (buf_ret != 0) {
        return buf_ret;
    }
    return DestroyEvent(args);
}
/**
* @brief Gets the memory clock rate for a CUDA device.
*
* @details This function gets the memory clock rate using the appropriate method
* based on CUDA version: CUDA 12.0+ uses NVML and cudaDeviceGetAttribute as a fallback;
* older CUDA versions use cudaDeviceProp.
*
* @param[in] device_id The ID of the CUDA device.
* @param[in] prop The properties of the CUDA device.
* @return float The memory clock rate in MHz, or -1.0f if retrieval fails.
*/
float GpuStream::GetMemoryClockRate(int device_id, const cudaDeviceProp &prop) {
    float memory_clock_mhz = 0.0f;
    // Set the device before getting attributes
    if (SetGpu(device_id)) {
        return -1.0f;
    }
#if CUDA_VERSION >= 12000
    // For CUDA 12.0+, first try NVML for actual clock rate
    memory_clock_mhz = GetActualMemoryClockRate(device_id);
    // If NVML fails, fall back to cudaDeviceGetAttribute
    // (GetActualMemoryClockRate returns -1.0f on any NVML error)
    if (memory_clock_mhz < 0.0f) {
        int memory_clock_khz = 0;
        cudaError_t cuda_err = cudaDeviceGetAttribute(&memory_clock_khz, cudaDevAttrMemoryClockRate, device_id);
        if (cuda_err != cudaSuccess) {
            std::cerr << "GetMemoryClockRate::cudaDeviceGetAttribute error: " << cuda_err << std::endl;
            return -1.0f;
        }
        // Convert kHz to MHz
        memory_clock_mhz = memory_clock_khz / 1000.0f;
    }
#else
    // For CUDA < 12.0, use memoryClockRate from cudaDeviceProp
    // Note: memoryClockRate is in kHz, need to convert to MHz first
    // (prop is only consulted on this pre-12.0 branch)
    memory_clock_mhz = prop.memoryClockRate / 1000.0f; // Convert kHz to MHz
#endif
    return memory_clock_mhz;
}
/**
* @brief Prints CUDA device information.
*
* @details This function prints the properties of a CUDA device specified by the device ID.
*
* @param[in] device_id The ID of the CUDA device.
* @param[in] prop The properties of the CUDA device.
* @param[in] memory_clock_mhz The memory clock rate in MHz (or -1.0f if unavailable).
* @param[in] peak_bw The theoretical peak bandwidth in GB/s (or -1.0f if unavailable).
*/
void GpuStream::PrintCudaDeviceInfo(int device_id, const cudaDeviceProp &prop, float memory_clock_mhz, float peak_bw) {
    // One-line device summary: name, SM count, compute capability, memory info, ECC.
    std::cout << "\nDevice " << device_id << ": \"" << prop.name << "\"";
    std::cout << " " << prop.multiProcessorCount << " SMs(" << prop.major << "." << prop.minor << ")";
    if (peak_bw < 0.0f) {
        // Bandwidth computation failed (memory_clock_mhz will also be < 0)
        // Integer division: total memory is reported in whole GiB, rounded down.
        std::cout << " Memory: " << prop.memoryBusWidth << "-bit bus, " << prop.totalGlobalMem / (1024 * 1024 * 1024)
                  << " GB total (peak BW unavailable)";
    } else {
        // Both memory_clock_mhz and peak_bw are valid
        std::cout << " Memory: " << memory_clock_mhz << "MHz x " << prop.memoryBusWidth << "-bit = " << peak_bw
                  << " GB/s PEAK ";
    }
    std::cout << " ECC is " << (prop.ECCEnabled ? "ON" : "OFF") << std::endl;
}
/**
* @brief Allocates a buffer on the GPU.
*
* @details This function allocates a buffer of the specified size on the GPU and returns a pointer
* to the allocated memory.
*
* @param[out] ptr Pointer to a pointer where the address of the allocated buffer will be stored.
* @param[in] size The size of the buffer to allocate in bytes.
*
* @return cudaError_t The status code indicating success or failure of the memory allocation.
*/
template <typename T> cudaError_t GpuStream::GpuMallocDataBuf(T **ptr, uint64_t size) {
    // Thin typed wrapper over cudaMalloc; size is in bytes.
    return cudaMalloc(ptr, size);
}
/**
* @brief Prepares validation buffers for GPU stream benchmark.
*
* @details This function allocates and initializes validation buffers for different
* kernels (copy, scale, add, and triad) used in the GPU stream benchmark. The buffer order
* matches the kernel order in the Kernel enum.
*
* @param args A unique pointer to a BenchArgs structure containing the necessary arguments
* for preparing the buffer and stream.
*
* @return int The status code indicating success or failure of the preparation.
*/
// Allocate one host-side reference buffer per kernel (order matches the Kernel
// enum: COPY, SCALE, ADD, TRIAD) and fill each with the output expected from
// the seeded input pattern (element j = j % kUInt8Mod).
// Returns 0 (no failure paths; std::vector::resize throws on OOM).
template <typename T> int GpuStream::PrepareValidationBuf(std::unique_ptr<BenchArgs<T>> &args) {
    args->sub.validation_buf_ptrs.resize(kNumValidationBuffers);
    // args->size is in bytes; convert to an element count for this data type.
    uint64_t size = args->size / sizeof(T);
    for (auto &buf_ptr : args->sub.validation_buf_ptrs) {
        buf_ptr.resize(size);
    }
    // Precompute expected kernel outputs.
    T s = static_cast<T>(scalar);
    for (size_t j = 0; j < size; j++) {
        // Hoisted: the same source value feeds all four expected results
        // (previously recomputed four times per iteration).
        const T v = static_cast<T>(j % kUInt8Mod);
        args->sub.validation_buf_ptrs[0][j] = v;         // COPY:  c = a
        args->sub.validation_buf_ptrs[1][j] = v * s;     // SCALE: c = s * a
        args->sub.validation_buf_ptrs[2][j] = v + v;     // ADD:   c = a + b (b seeded same as a)
        args->sub.validation_buf_ptrs[3][j] = v + v * s; // TRIAD: c = a + s * b
    }
    return 0;
}
/**
* @brief Prepares the buffer and stream for benchmarking.
*
* @details This function prepares the necessary buffer and stream for benchmarking based on the
* provided arguments. It initializes and configures the buffer and stream as required.
*
* @param[in,out] args A unique pointer to a BenchArgs structure containing the necessary arguments
* for preparing the buffer and stream.
*
* @return int The status code indicating success or failure of the preparation.
*/
// Allocate host (NUMA-pinned) and device buffers and a non-blocking stream for
// one benchmark configuration. Seeds the two GPU input buffers and builds the
// host validation buffers when data checking is enabled.
// Returns 0 on success, -1 on any CUDA failure.
template <typename T> int GpuStream::PrepareBufAndStream(std::unique_ptr<BenchArgs<T>> &args) {
    cudaError_t cuda_err = cudaSuccess;
    // args->size is a byte count throughout this class (element loops divide by
    // sizeof(T), memcpys transfer args->size bytes). Fixed: buffers were
    // previously allocated with args->size * sizeof(T) bytes, over-allocating
    // by a factor of sizeof(T) and disagreeing with the
    // numa_free(..., args->size) calls in DestroyBufAndStream.
    if (args->check_data) {
        // Generate data to copy
        args->sub.data_buf = static_cast<T *>(numa_alloc_onnode(args->size, args->numa_id));
        for (uint64_t j = 0; j < args->size / sizeof(T); j++) {
            args->sub.data_buf[j] = static_cast<T>(j % kUInt8Mod);
        }
        // Allocate check buffer (destination for CheckBuf's device->host copy)
        args->sub.check_buf = static_cast<T *>(numa_alloc_onnode(args->size, args->numa_id));
    }
    // Allocate GPU buffers on the device under test
    args->sub.gpu_buf_ptrs.resize(kNumBuffers);
    if (SetGpu(args->gpu_id)) {
        return -1;
    }
    for (auto &buf_ptr : args->sub.gpu_buf_ptrs) {
        T *raw_ptr = nullptr;
        cuda_err = GpuMallocDataBuf(&raw_ptr, args->size);
        if (cuda_err != cudaSuccess) {
            std::cerr << "PrepareBufAndStream::cudaMalloc error: " << cuda_err << std::endl;
            return -1;
        }
        buf_ptr.reset(raw_ptr); // Transfer ownership to the smart pointer
    }
    // Seed both input buffers (a and b) and build the expected-output buffers
    if (args->check_data) {
        cuda_err = cudaMemcpy(args->sub.gpu_buf_ptrs[0].get(), args->sub.data_buf, args->size, cudaMemcpyDefault);
        if (cuda_err != cudaSuccess) {
            std::cerr << "PrepareBufAndStream::cudaMemcpy error: " << cuda_err << std::endl;
            return -1;
        }
        cuda_err = cudaMemcpy(args->sub.gpu_buf_ptrs[1].get(), args->sub.data_buf, args->size, cudaMemcpyDefault);
        if (cuda_err != cudaSuccess) {
            std::cerr << "PrepareBufAndStream::cudaMemcpy error: " << cuda_err << std::endl;
            return -1;
        }
        PrepareValidationBuf<T>(args);
    }
    // Dedicated non-blocking stream so kernel timing is isolated from stream 0
    cuda_err = cudaStreamCreateWithFlags(&(args->sub.stream), cudaStreamNonBlocking);
    if (cuda_err != cudaSuccess) {
        std::cerr << "PrepareBufAndStream::cudaStreamCreate error: " << cuda_err << std::endl;
        return -1;
    }
    return 0;
}
/**
* @brief Prepares CUDA events for benchmarking.
*
* @details This function creates the necessary CUDA events for benchmarking based on the
* provided arguments. It initializes and configures the events as required.
*
* @param[in,out] args A unique pointer to a BenchArgs structure containing the necessary arguments
* for preparing the CUDA events.
*
* @return int The status code indicating success or failure of the preparation.
*/
template <typename T> int GpuStream::PrepareEvent(std::unique_ptr<BenchArgs<T>> &args) {
    // Create the start/end timing events on the GPU that will run the kernels.
    if (SetGpu(args->gpu_id)) {
        return -1;
    }
    for (cudaEvent_t *event : {&(args->sub.start_event), &(args->sub.end_event)}) {
        const cudaError_t status = cudaEventCreate(event);
        if (status != cudaSuccess) {
            std::cerr << "PrepareEvent::cudaEventCreate error: " << status << std::endl;
            return -1;
        }
    }
    return 0;
}
/**
* @brief Validates the result of data transfer.
*
* @details This function checks the buffer to validate the result of a data transfer operation
* based on the provided arguments. It ensures that the data transfer was successful and that
* the buffer contains the expected data.
*
* @param[in,out] args A unique pointer to a BenchArgs structure containing the necessary arguments
* for validating the buffer.
*
* @return int The status code indicating success or failure of the validation.
*/
template <typename T> int GpuStream::CheckBuf(std::unique_ptr<BenchArgs<T>> &args, int kernel_idx) {
    // Pull the kernel's output buffer (gpu_buf_ptrs[2]) back to the host and
    // byte-compare it with the precomputed reference for kernel_idx.
    if (SetGpu(args->gpu_id)) {
        return -1;
    }
    const cudaError_t copy_status =
        cudaMemcpy(args->sub.check_buf, args->sub.gpu_buf_ptrs[2].get(), args->size, cudaMemcpyDefault);
    if (copy_status != cudaSuccess) {
        std::cerr << "CheckBuf::cudaMemcpy error: " << copy_status << std::endl;
        return -1;
    }
    const int diff = memcmp(args->sub.validation_buf_ptrs[kernel_idx].data(), args->sub.check_buf, args->size);
    if (diff != 0) {
        std::cerr << "CheckBuf::Memory check failed for kernel index " << kernel_idx << std::endl;
        return -1;
    }
    return 0;
}
/**
* @brief Destroys the buffer and stream used for benchmarking.
*
* @details This function cleans up and releases the resources associated with the buffer and stream
* used for benchmarking based on the provided arguments. It ensures that all allocated buffers and streams
* resources are properly freed.
*
* @param[in,out] args A unique pointer to a BenchArgs structure containing the necessary arguments
* for destroying the buffer and stream.
*
* @return int The status code indicating success or failure of the destruction process.
*/
template <typename T> int GpuStream::DestroyBufAndStream(std::unique_ptr<BenchArgs<T>> &args) {
    int ret = 0;
    cudaError_t cuda_err = cudaSuccess;
    // Destroy original data buffer and check buffer
    // NOTE(review): PrepareBufAndStream allocates these with
    // numa_alloc_onnode(args->size * sizeof(T), ...) but they are freed here
    // with only args->size bytes — libnuma requires the freed size to match
    // the allocated size; confirm which size is intended.
    if (args->sub.data_buf != nullptr) {
        numa_free(args->sub.data_buf, args->size);
    }
    if (args->sub.check_buf != nullptr) {
        numa_free(args->sub.check_buf, args->size);
    }
    // Set to buffer device for GPU buffer
    // (gpu_buf_ptrs are smart pointers — presumably their deleters release the
    //  device memory when the BenchArgs is destroyed; not freed explicitly here)
    if (SetGpu(args->gpu_id)) {
        return -1;
    }
    cuda_err = cudaStreamDestroy(args->sub.stream);
    if (cuda_err != cudaSuccess) {
        std::cerr << "DestroyBufAndStream::cudaStreamDestroy error: " << cuda_err << std::endl;
        return -1;
    }
    return ret;
}
/**
* @brief Runs the STREAM benchmark.
*
* @details This function runs the STREAM benchmark using the specified kernel and number of threads per block.
* It prepares the necessary arguments and configurations for the benchmark execution.
*
 * @param[in,out] args A unique pointer to a BenchArgs structure containing the
 *                     necessary arguments for the benchmark.
* @param[in] kernel The kernel function to be used for the benchmark.
* @param[in] num_threads_per_block The number of threads per block to be used in the kernel execution.
*
* @return int The status code indicating success or failure of the benchmark execution.
*/
// Launch the requested STREAM kernel num_loops (+ warm-up) times on the
// configuration's stream, timing only the measured iterations with CUDA
// events. The measured time is divided by size_factor (arrays touched per
// element: 2 for COPY/SCALE, 3 for ADD/TRIAD) before being recorded, so the
// caller's bandwidth formula stays uniform across kernels.
// Returns 0 on success, -1 on any failure.
template <typename T>
int GpuStream::RunStreamKernel(std::unique_ptr<BenchArgs<T>> &args, Kernel kernel, int num_threads_per_block) {
    cudaError_t cuda_err = cudaSuccess;
    uint64_t num_thread_blocks;
    int size_factor = 2;
    // Validate data size: each thread block consumes a fixed number of bytes.
    uint64_t num_elements_in_thread_block = kNumLoopUnroll * num_threads_per_block;
    uint64_t num_bytes_in_thread_block = num_elements_in_thread_block * sizeof(T);
    if (args->size % num_bytes_in_thread_block) {
        // Fixed: message previously said "RunCopy:", a stale function name.
        std::cerr << "RunStreamKernel: Data size should be multiple of " << num_bytes_in_thread_block << std::endl;
        return -1;
    }
    num_thread_blocks = args->size / num_bytes_in_thread_block;
    args->sub.times_in_ms.resize(static_cast<int>(Kernel::kCount));
    if (SetGpu(args->gpu_id)) {
        return -1;
    }
    // Launch jobs and collect running time
    for (int i = 0; i < args->num_loops + args->num_warm_up; i++) {
        // Record start event once warm up iterations are done
        if (i == args->num_warm_up) {
            cuda_err = cudaEventRecord(args->sub.start_event, args->sub.stream);
            if (cuda_err != cudaSuccess) {
                std::cerr << "RunStreamKernel::cudaEventRecord error: " << cuda_err << std::endl;
                return -1;
            }
        }
        switch (kernel) {
        case Kernel::kCopy:
            CopyKernel<<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
                reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[2].get()),
                reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[0].get()));
            args->sub.kernel_name = "COPY";
            break;
        case Kernel::kScale:
            ScaleKernel<<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
                reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[2].get()),
                reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[0].get()), static_cast<T>(scalar));
            args->sub.kernel_name = "SCALE";
            break;
        case Kernel::kAdd:
            AddKernel<<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
                reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[2].get()),
                reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[0].get()),
                reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[1].get()));
            size_factor = 3; // three arrays touched per element
            args->sub.kernel_name = "ADD";
            break;
        case Kernel::kTriad:
            TriadKernel<<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
                reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[2].get()),
                reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[0].get()),
                reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[1].get()), static_cast<T>(scalar));
            size_factor = 3; // three arrays touched per element
            args->sub.kernel_name = "TRIAD";
            break;
        default:
            // Fixed: previously printed an error but fell through and timed an
            // empty stream, recording a bogus (near-zero) time. Fail fast, and
            // include the offending value (the old message ended at the colon).
            std::cerr << "RunStreamKernel::Invalid kernel: " << static_cast<int>(kernel) << std::endl;
            return -1;
        }
        // Record end event at the end of iterations
        if (i + 1 == args->num_loops + args->num_warm_up) {
            cuda_err = cudaEventRecord(args->sub.end_event, args->sub.stream);
            if (cuda_err != cudaSuccess) {
                std::cerr << "RunStreamKernel::cudaEventRecord error: " << cuda_err << std::endl;
                return -1;
            }
        }
    }
    // Wait for the stream to finish
    cuda_err = cudaStreamSynchronize(args->sub.stream);
    if (cuda_err != cudaSuccess) {
        std::cerr << "RunStreamKernel::cudaStreamSynchronize error: " << cuda_err << std::endl;
        return -1;
    }
    // Elapsed time across the num_loops measured iterations
    float time_in_ms = 0;
    cuda_err = cudaEventElapsedTime(&time_in_ms, args->sub.start_event, args->sub.end_event);
    if (cuda_err != cudaSuccess) {
        std::cerr << "RunStreamKernel::cudaEventElapsedTime error: " << cuda_err << std::endl;
        return -1;
    }
    args->sub.times_in_ms[static_cast<int>(kernel)].push_back(time_in_ms / size_factor);
    return 0;
}
float GpuStream::GetActualMemoryClockRate(int gpu_id) {
nvmlReturn_t result;
nvmlDevice_t device;
unsigned int clock_mhz;
// Initialize NVML
result = nvmlInit();
if (result != NVML_SUCCESS) {
std::cerr << "Failed to initialize NVML: " << nvmlErrorString(result) << std::endl;
return -1.0f;
}
// Get device handle
result = nvmlDeviceGetHandleByIndex(gpu_id, &device);
if (result != NVML_SUCCESS) {
std::cerr << "Failed to get device handle: " << nvmlErrorString(result) << std::endl;
nvmlShutdown();
return -1.0f;
}
// Get memory clock rate
result = nvmlDeviceGetClockInfo(device, NVML_CLOCK_MEM, &clock_mhz);
if (result != NVML_SUCCESS) {
std::cerr << "Failed to get memory clock: " << nvmlErrorString(result) << std::endl;
nvmlShutdown();
return -1.0f;
}
nvmlShutdown();
return static_cast<float>(clock_mhz);
}
/**
* @brief Runs the benchmark for various kernels and processes the results for a BenchArgs config.
*
* @details This function prepares the necessary buffers and streams, runs the benchmark for each kernel
* with different thread per block configurations, checks the results, and processes the benchmark results.
* It also handles cleanup of resources in case of errors.
*
 * @param[in,out] args A unique pointer to a BenchArgs structure containing the
 *                     necessary arguments for the benchmark.
*
* @return int The status code indicating success or failure of the benchmark execution.
* */
// Run every STREAM kernel at every kThreadsPerBlock configuration for one
// BenchArgs config, optionally validating outputs, then print one tab-separated
// result line per (kernel, block-size) pair:
//   STREAM_<kernel>_<datatype>_gpu_<id>_buffer_<bytes>_block_<threads> <bw> <eff%|-1>
// Returns 0 on success, or the last non-zero failure code.
template <typename T>
int GpuStream::RunStream(std::unique_ptr<BenchArgs<T>> &args, const std::string &data_type, float peak_bw) {
    // Fixed: on Prepare* failure this used to return the cleanup call's status,
    // masking the original error; now the failure code is propagated.
    int ret = PrepareBufAndStream<T>(args);
    if (ret != 0) {
        DestroyBufAndStream(args);
        return ret;
    }
    ret = PrepareEvent(args);
    if (ret != 0) {
        DestroyEvent(args);
        return ret;
    }
    // benchmark over the kThreadsPerBlock array
    for (const int num_threads_in_block : kThreadsPerBlock) {
        // run the stream benchmark over the stream kernels
        for (int i = 0; i < static_cast<int>(Kernel::kCount); ++i) {
            Kernel kernel = static_cast<Kernel>(i);
            // Fixed: `int ret = ...` previously shadowed the outer ret, so
            // kernel or validation failures never reached the caller.
            int run_ret = RunStreamKernel<T>(args, kernel, num_threads_in_block);
            if (run_ret == 0 && args->check_data) {
                // Compare buffer based on the kernel
                run_ret = CheckBuf(args, i);
            }
            if (run_ret != 0) {
                ret = run_ret;
            }
        }
    }
    // output formatted results to stdout
    // Tags are of format:
    // STREAM_<Kernelname>_datatype_gpu_<gpu_id>_buffer_<buffer_size>_block_<block_size>
    for (int i = 0; i < args->sub.times_in_ms.size(); i++) {
        std::string tag = "STREAM_" + KernelToString(i) + "_" + data_type + "_gpu_" + std::to_string(args->gpu_id) +
                          "_buffer_" + std::to_string(args->size);
        for (int j = 0; j < args->sub.times_in_ms[i].size(); j++) {
            // Bandwidth in GB/s; times were pre-divided by the kernel's size
            // factor in RunStreamKernel, so one formula serves all kernels.
            double bw = args->size * args->num_loops / args->sub.times_in_ms[i][j] / 1e6;
            std::cout << tag << "_block_" << kThreadsPerBlock[j] << "\t" << bw << "\t";
            if (peak_bw < 0) { // cannot get peak_bw -> prints -1 for efficiency
                std::cout << "-1" << std::endl;
            } else {
                std::cout << std::fixed << std::setprecision(2) << bw / peak_bw * 100 << std::endl;
            }
        }
    }
    // cleanup buffer and streams for the curr arg
    Destroy(args);
    return ret;
}
/**
* @brief Runs the Stream benchmark.
*
 * @details This function processes the input args, validates and composes the BenchArgs structure for the
 * available GPUs, and runs the benchmark.
*
* @return int The status code indicating success or failure of the benchmark execution.
* */
int GpuStream::Run() {
int ret = 0;
int gpu_count = 0;
// Get number of NUMA nodes
if (numa_available()) {
std::cerr << "main::numa_available error" << std::endl;
return -1;
}
// Get number of GPUs
ret = GetGpuCount(&gpu_count);
if (ret != 0) {
return ret;
}
// find all GPUs and compose the Benchmarking data structure
for (int j = 0; j < gpu_count; j++) {
auto args = std::make_unique<BenchArgs<double>>();
args->numa_id = 0;
args->gpu_id = j;
cudaGetDeviceProperties(&args->gpu_device_prop, j);
args->num_warm_up = opts_.num_warm_up;
args->num_loops = opts_.num_loops;
args->size = opts_.size;
args->check_data = opts_.check_data;
args->numa_id = 0;
args->gpu_id = j;
// add data to vector
bench_args_.emplace_back(std::move(args));
}
bool has_error = false;
// Run the benchmark for all the configured data
for (auto &variant_args : bench_args_) {
std::visit(
[&](auto &curr_args) {
// Get memory clock rate once for both bandwidth computation and display
float memory_clock_mhz = GetMemoryClockRate(curr_args->gpu_id, curr_args->gpu_device_prop);
// Compute theoretical bandwidth using the memory clock rate
float peak_bw = -1.0f;
if (memory_clock_mhz > 0.0f) {
// Calculate theoretical bandwidth: memory_clock_mhz * bus_width_bytes * 2 (DDR) / 1000 (convert to
// GB/s)
peak_bw = memory_clock_mhz * (curr_args->gpu_device_prop.memoryBusWidth / 8) * 2 / 1000.0;
}
// Print device info with both the memory clock and peak bandwidth
PrintCudaDeviceInfo(curr_args->gpu_id, curr_args->gpu_device_prop, memory_clock_mhz, peak_bw);
// Set the NUMA node
ret = numa_run_on_node(curr_args->numa_id);
if (ret != 0) {
std::cerr << "Run::numa_run_on_node error: " << errno << std::endl;
has_error = true;
return;
}
// Run the stream benchmark for the configured data, passing the peak bandwidth
if constexpr (std::is_same_v<std::decay_t<decltype(*curr_args)>, BenchArgs<float>>) {
ret = RunStream<float>(curr_args, "float", peak_bw);
} else if constexpr (std::is_same_v<std::decay_t<decltype(*curr_args)>, BenchArgs<double>>) {
ret = RunStream<double>(curr_args, "double", peak_bw);
} else {
std::cerr << "Run::Unknown type error" << std::endl;
has_error = true;
return;
}
if (ret != 0) {
std::cerr << "Run::RunStream error: " << errno << std::endl;
has_error = true;
}
},
variant_args);
}
if (has_error) {
return -1;
}
return ret;
}
\ No newline at end of file
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#pragma once
#include <getopt.h>
#include <iostream>
#include <memory>
#include <variant>
#include <vector>
#include <cuda.h>
#include <cuda_runtime.h>
#include <numa.h>
#include "gpu_stream_kernels.hpp"
#include "gpu_stream_utils.hpp"
// NOTE(review): using defined() inside a macro replacement list is undefined
// behavior per the C++ standard ([cpp.cond]); an #if/#else defining NON_HIP to
// 0/1 would be portable — confirm all consumers before changing.
#define NON_HIP (!defined(__HIP_PLATFORM_HCC__) && !defined(__HCC__) && !defined(__HIPCC__))
// Pull stream_config names (Opts, BenchArgs, Kernel, kThreadsPerBlock, ...) into scope.
using namespace stream_config;
// Driver class for the GPU STREAM benchmark: owns the parsed options and the
// per-GPU benchmark argument structures, and runs the benchmark on each GPU.
class GpuStream {
  public:
    GpuStream() = delete;            // Options are required; no default construction
    GpuStream(Opts &) noexcept;      // Construct from parsed command-line options
    ~GpuStream() noexcept = default; // Destructor
    // Non-copyable (owns unique_ptr-held benchmark state), but movable.
    GpuStream(const GpuStream &) = delete;
    GpuStream &operator=(const GpuStream &) = delete;
    GpuStream(GpuStream &&) noexcept = default;
    GpuStream &operator=(GpuStream &&) noexcept = default;
    // Entry point: enumerates GPUs, composes BenchArgs, and runs the benchmark.
    // Returns 0 on success, non-zero on failure.
    int Run();

  private:
    // Variant over the supported element types (only BenchArgs<double> is
    // constructed by Run() today).
    using BenchArgsVariant = std::variant<std::unique_ptr<BenchArgs<double>>>;
    std::vector<BenchArgsVariant> bench_args_; // One entry per detected GPU
    Opts opts_;                                // Parsed command-line options
    // Memory management functions
    template <typename T> cudaError_t GpuMallocDataBuf(T **, uint64_t);
    template <typename T> int PrepareValidationBuf(std::unique_ptr<BenchArgs<T>> &);
    template <typename T> int CheckBuf(std::unique_ptr<BenchArgs<T>> &, int);
    template <typename T> int PrepareEvent(std::unique_ptr<BenchArgs<T>> &);
    template <typename T> int PrepareBufAndStream(std::unique_ptr<BenchArgs<T>> &);
    template <typename T> int DestroyEvent(std::unique_ptr<BenchArgs<T>> &);
    template <typename T> int DestroyBufAndStream(std::unique_ptr<BenchArgs<T>> &);
    template <typename T> int Destroy(std::unique_ptr<BenchArgs<T>> &);
    // Benchmark functions
    template <typename T> int RunStreamKernel(std::unique_ptr<BenchArgs<T>> &, Kernel, int);
    float GetActualMemoryClockRate(int gpu_id);
    template <typename T> int RunStream(std::unique_ptr<BenchArgs<T>> &, const std::string &data_type, float peak_bw);
    // Helper functions
    int GetGpuCount(int *);
    int SetGpu(int gpu_id);
    float GetMemoryClockRate(int device_id, const cudaDeviceProp &prop);
    void PrintCudaDeviceInfo(int device_id, const cudaDeviceProp &prop, float memory_clock_mhz, float peak_bw);
};
\ No newline at end of file
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#include "gpu_stream_kernels.hpp"
/**
* @brief Fetches a value from source memory and writes it to a register.
*
* @details This inline device function fetches a value from the specified source memory
* location and writes it to the provided register. The implementation references the following:
* 1) NCCL:
* https://github.com/NVIDIA/nccl/blob/7e515921295adaab72adf56ea71a0fafb0ecb5f3/src/collectives/device/common_kernel.h#L483
* 2) RCCL:
* https://github.com/ROCmSoftwarePlatform/rccl/blob/5c8380ff5b5925cae4bce00b1879a5f930226e8d/src/collectives/device/common_kernel.h#L268
*
* @tparam T The type of the value to fetch.
* @param[out] v The register to write the fetched value to.
* @param[in] p The source memory location to fetch the value from.
*/
template <typename T> inline __device__ void Fetch(T &v, const T *p) {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
    // HIP path: plain dereference; the compiler emits the load.
    v = *p;
#else
    // CUDA path: volatile PTX loads from global memory so the access is
    // actually issued and not optimized away or combined.
    if constexpr (std::is_same<T, float>::value) {
        asm volatile("ld.volatile.global.f32 %0, [%1];" : "=f"(v) : "l"(p) : "memory");
    } else if constexpr (std::is_same<T, double>::value) {
        asm volatile("ld.volatile.global.f64 %0, [%1];" : "=d"(v) : "l"(p) : "memory");
    }
    // NOTE(review): for types other than float/double this compiles to a no-op
    // on the CUDA path — confirm that only float/double are ever instantiated.
#endif
}
/**
* @brief Stores a value from register and writes it to target memory.
*
* @details This inline device function stores a value from the provided register
* and writes it to the specified target memory location. The implementation references the following:
* 1) NCCL:
* https://github.com/NVIDIA/nccl/blob/7e515921295adaab72adf56ea71a0fafb0ecb5f3/src/collectives/device/common_kernel.h#L486
* 2) RCCL:
* https://github.com/ROCmSoftwarePlatform/rccl/blob/5c8380ff5b5925cae4bce00b1879a5f930226e8d/src/collectives/device/common_kernel.h#L276
*
* @tparam T The type of the value to store.
* @param[out] p The target memory location to write the value to.
* @param[in] v The register containing the value to be stored.
*/
template <typename T> inline __device__ void Store(T *p, const T &v) {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
    // HIP path: plain assignment; the compiler emits the store.
    *p = v;
#else
    // CUDA path: volatile PTX stores to global memory so the write is actually
    // issued and not optimized away or combined.
    if constexpr (std::is_same<T, float>::value) {
        asm volatile("st.volatile.global.f32 [%0], %1;" ::"l"(p), "f"(v) : "memory");
    } else if constexpr (std::is_same<T, double>::value) {
        asm volatile("st.volatile.global.f64 [%0], %1;" ::"l"(p), "d"(v) : "memory");
    }
    // NOTE(review): for types other than float/double this compiles to a no-op
    // on the CUDA path — confirm that only float/double are ever instantiated.
#endif
}
/**
* @brief Performs COPY, a simple copy operation from source to target. b = a
*
* @details This CUDA kernel performs a simple copy operation, copying data from the source array
* to the target array. This is used to measure transfer rates without any arithmetic operations.
*
* @param[out] tgt The target array where data will be copied to.
* @param[in] src The source array from which data will be copied.
*/
// STREAM COPY: tgt = src. Pure data movement, no arithmetic, so the measured
// rate reflects raw transfer bandwidth.
__global__ void CopyKernel(double *tgt, const double *src) {
    const uint64_t base = blockIdx.x * blockDim.x * kNumLoopUnrollAlias + threadIdx.x;
    double regs[kNumLoopUnrollAlias];
// Fetch all unrolled elements into registers first, then store them, keeping
// the load and store phases distinct.
#pragma unroll
    for (uint64_t u = 0; u < kNumLoopUnrollAlias; u++) {
        Fetch(regs[u], src + base + u * blockDim.x);
    }
#pragma unroll
    for (uint64_t u = 0; u < kNumLoopUnrollAlias; u++) {
        Store(tgt + base + u * blockDim.x, regs[u]);
    }
}
/**
* @brief Performs SCALE, a scaling operation on the source data. b = x * a
*
* @details This CUDA kernel performs a simple arithmetic operation by scaling the source data
* with a given scalar value and storing the result in the target array.
*
* @param[out] tgt The target array where the scaled data will be stored.
* @param[in] src The source array containing the data to be scaled.
* @param[in] scalar The scalar value used to scale the source data.
*/
// STREAM SCALE: tgt = scalar * src. One multiply per element on top of the copy.
__global__ void ScaleKernel(double *tgt, const double *src, const double scalar) {
    const uint64_t base = blockIdx.x * blockDim.x * kNumLoopUnrollAlias + threadIdx.x;
    double regs[kNumLoopUnrollAlias];
#pragma unroll
    for (uint64_t u = 0; u < kNumLoopUnrollAlias; u++) {
        Fetch(regs[u], src + base + u * blockDim.x);
    }
// Scale in registers, then write back.
#pragma unroll
    for (uint64_t u = 0; u < kNumLoopUnrollAlias; u++) {
        regs[u] *= scalar;
        Store(tgt + base + u * blockDim.x, regs[u]);
    }
}
/**
* @brief Performs ADD, an addition operation on two source arrays. c = a + b
*
* @details This CUDA kernel adds corresponding elements from two source arrays and stores the result
* in the target array. This operation is used to measure transfer rates with a simple arithmetic addition.
*
* @param[out] tgt The target array where the result of the addition will be stored.
* @param[in] src_a The first source array containing the first set of operands.
* @param[in] src_b The second source array containing the second set of operands.
*/
// STREAM ADD: tgt = src_a + src_b. Two reads and one write per element.
__global__ void AddKernel(double *tgt, const double *src_a, const double *src_b) {
    const uint64_t base = blockIdx.x * blockDim.x * kNumLoopUnrollAlias + threadIdx.x;
    double lhs[kNumLoopUnrollAlias];
    double rhs[kNumLoopUnrollAlias];
#pragma unroll
    for (uint64_t u = 0; u < kNumLoopUnrollAlias; u++) {
        Fetch(lhs[u], src_a + base + u * blockDim.x);
        Fetch(rhs[u], src_b + base + u * blockDim.x);
    }
// Sum in registers, then write back.
#pragma unroll
    for (uint64_t u = 0; u < kNumLoopUnrollAlias; u++) {
        lhs[u] += rhs[u];
        Store(tgt + base + u * blockDim.x, lhs[u]);
    }
}
/**
* @brief Performs TRIAD, fused multiply/add operations on source arrays. a = b + x * c
*
* @details This CUDA kernel performs a fused multiply/add operation by multiplying elements from
* the second source array with a scalar value, adding the result to corresponding elements from
* the first source array, and storing the result in the target array.
*
* @param[out] tgt The target array where the result of the fused multiply/add operation will be stored.
* @param[in] src_a The first source array containing the first set of operands.
* @param[in] src_b The second source array containing the second set of operands to be multiplied by the scalar.
* @param[in] scalar The scalar value used in the multiply/add operation.
*/
// STREAM TRIAD: tgt = src_b + scalar * src_a. Fused multiply/add with two
// reads and one write per element.
__global__ void TriadKernel(double *tgt, const double *src_a, const double *src_b, const double scalar) {
    const uint64_t base = blockIdx.x * blockDim.x * kNumLoopUnrollAlias + threadIdx.x;
    double scaled[kNumLoopUnrollAlias];
    double accum[kNumLoopUnrollAlias];
#pragma unroll
    for (uint64_t u = 0; u < kNumLoopUnrollAlias; u++) {
        Fetch(scaled[u], src_a + base + u * blockDim.x);
        Fetch(accum[u], src_b + base + u * blockDim.x);
    }
// Accumulate scalar * src_a into the src_b values, then write back.
#pragma unroll
    for (uint64_t u = 0; u < kNumLoopUnrollAlias; u++) {
        accum[u] += (scaled[u] * scalar);
        Store(tgt + base + u * blockDim.x, accum[u]);
    }
}
\ No newline at end of file
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#pragma once
#include <cuda.h>
#include <cuda_runtime.h>
#include "gpu_stream_utils.hpp"
// Alias so the kernels can use the project-wide unroll depth without the
// stream_config namespace prefix.
constexpr auto kNumLoopUnrollAlias = stream_config::kNumLoopUnroll;
// Function declarations
// Volatile load of *p into register v (PTX on CUDA; plain load on HIP).
template <typename T> inline __device__ void Fetch(T &v, const T *p);
// Volatile store of register v into *p (PTX on CUDA; plain store on HIP).
template <typename T> inline __device__ void Store(T *p, const T &v);
// STREAM kernels: COPY (tgt = src), SCALE (tgt = scalar * src),
// ADD (tgt = src_a + src_b), TRIAD (tgt = src_b + scalar * src_a).
__global__ void CopyKernel(double *, const double *);
__global__ void ScaleKernel(double *, const double *, const double);
__global__ void AddKernel(double *, const double *, const double *);
__global__ void TriadKernel(double *, const double *, const double *, const double);
\ No newline at end of file
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#include "gpu_stream.hpp"
/**
* @brief Main function and entry of gpu stream benchmark
* @details
* params list:
* num_warm_up: warm up count
* num_loops: num of runs for timing
* size: number of bytes to setup for the test
* @param argc argument count
* @param argv argument vector
* @return int
*/
int main(int argc, char **argv) {
    stream_config::Opts opts;
    // Parse command-line arguments; bail out on malformed input.
    const int parse_ret = stream_config::ParseOpts(argc, argv, &opts);
    if (parse_ret != 0) {
        return parse_ret;
    }
    // Run the stream benchmark and report its status as the exit code.
    GpuStream gpu_stream(opts);
    return gpu_stream.Run();
}
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#include "gpu_stream_utils.hpp"
namespace stream_config {
/**
* @brief Converts a kernel index to its corresponding string representation.
*
* @details This function takes an integer representing a kernel index and returns the corresponding
* string representation of the kernel. The mapping between kernel indices and their string representations
* should be defined within the function.
*
* @param[in] kernel_idx The index of the kernel to be converted to a string.
*
* @return std::string The string representation of the kernel.
*/
std::string KernelToString(int kernel_idx) {
    // Display names in Kernel enum order (kCopy=0, kScale, kAdd, kTriad).
    static const char *const kKernelNames[] = {"COPY", "SCALE", "ADD", "TRIAD"};
    constexpr int kNumNames = static_cast<int>(sizeof(kKernelNames) / sizeof(kKernelNames[0]));
    if (kernel_idx >= 0 && kernel_idx < kNumNames) {
        return kKernelNames[kernel_idx];
    }
    // Out-of-range indices map to a sentinel, mirroring the original default case.
    return "UNKNOWN";
}
/**
* @brief Print the usage of this program.
*
 * @details This function prints the usage of this program.
*
* @return void.
* */
// Print the command-line usage string for this benchmark to stdout.
void PrintUsage() {
    std::cout << "Usage: gpu_stream "
                 "--size <size in bytes> "
                 "--num_warm_up <num_warm_up> "
                 "--num_loops <num_loops> "
                 "[--check_data]"
              << std::endl;
}
/**
* @brief Print the user provided inputs info.
*
 * @details This function prints the parsed user-provided inputs of this program.
*
* @param[in] opts The Opts struct that stores the parsed values.
*
* @return void
* */
// Echo the parsed benchmark options to stdout, one per line.
void PrintInputInfo(Opts &opts) {
    std::cout << "STREAM Benchmark" << std::endl
              << "Buffer size(bytes): " << opts.size << std::endl
              << "Number of warm up runs: " << opts.num_warm_up << std::endl
              << "Number of loops: " << opts.num_loops << std::endl
              << "Check data: " << (opts.check_data ? "Yes" : "No") << std::endl;
}
/**
* @brief Parse the command line options.
*
 * @details This function parses the command line options and stores the values in the Opts struct.
*
* @param[in] argc The number of command line options.
* @param[in] argv The command line options.
* @param[out] opts The Opts struct to store the parsed values.
*
* @return int The status code.
* */
int ParseOpts(int argc, char **argv, Opts *opts) {
    enum class OptIdx { kSize, kNumWarmUp, kNumLoops, kEnableCheckData };
    // getopt_long requires the long-option array to be terminated by an
    // all-zero entry; the original array lacked one, so getopt_long could read
    // past the end of the array (undefined behavior).
    const struct option options[] = {{"size", required_argument, nullptr, static_cast<int>(OptIdx::kSize)},
                                     {"num_warm_up", required_argument, nullptr, static_cast<int>(OptIdx::kNumWarmUp)},
                                     {"num_loops", required_argument, nullptr, static_cast<int>(OptIdx::kNumLoops)},
                                     {"check_data", no_argument, nullptr, static_cast<int>(OptIdx::kEnableCheckData)},
                                     {nullptr, 0, nullptr, 0}};
    int getopt_ret = 0;
    int opt_idx = 0;
    // NOTE(review): size_specified starts true, so --size is effectively
    // optional (Opts::size has a default), unlike num_warm_up/num_loops.
    // Confirm this is intended, since the usage text lists --size unbracketed.
    bool size_specified = true;
    bool num_warm_up_specified = false;
    bool num_loops_specified = false;
    bool parse_err = false;
    while (true) {
        getopt_ret = getopt_long(argc, argv, "", options, &opt_idx);
        if (getopt_ret == -1) {
            // End of options: verify all required options were supplied.
            if (!size_specified || !num_warm_up_specified || !num_loops_specified) {
                parse_err = true;
            }
            break;
        } else if (getopt_ret == '?') {
            // Unknown option or missing required argument.
            parse_err = true;
            break;
        }
        // Dispatch on the returned option value (`val`), not the array index:
        // the two coincide today, but `opt_idx` would silently break if the
        // options table were ever reordered.
        switch (getopt_ret) {
            case static_cast<int>(OptIdx::kSize):
                if (1 != sscanf(optarg, "%lu", &(opts->size))) {
                    std::cerr << "Invalid size: " << optarg << std::endl;
                    parse_err = true;
                } else {
                    size_specified = true;
                }
                break;
            case static_cast<int>(OptIdx::kNumWarmUp):
                if (1 != sscanf(optarg, "%lu", &(opts->num_warm_up))) {
                    std::cerr << "Invalid num_warm_up: " << optarg << std::endl;
                    parse_err = true;
                } else {
                    num_warm_up_specified = true;
                }
                break;
            case static_cast<int>(OptIdx::kNumLoops):
                if (1 != sscanf(optarg, "%lu", &(opts->num_loops))) {
                    std::cerr << "Invalid num_loops: " << optarg << std::endl;
                    parse_err = true;
                } else {
                    num_loops_specified = true;
                }
                break;
            case static_cast<int>(OptIdx::kEnableCheckData):
                opts->check_data = true;
                break;
            default:
                parse_err = true;
        }
        if (parse_err) {
            break;
        }
    }
    if (parse_err) {
        PrintUsage();
        return -1;
    }
    return 0;
}
} // namespace stream_config
// Return the current wall-clock time as microseconds since the Unix epoch.
unsigned long long getCurrentTimestampInMicroseconds() {
    using namespace std::chrono;
    // system_clock -> epoch duration -> microseconds, in one expression.
    const auto us = duration_cast<microseconds>(system_clock::now().time_since_epoch());
    return static_cast<unsigned long long>(us.count());
}
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#pragma once
#include <array>
#include <chrono>
#include <getopt.h>
#include <iomanip>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include <cuda.h>
#include <cuda_runtime.h>
#include <numa.h>
#include <nvml.h>
// Custom deleter for GPU buffers.
// Used with std::unique_ptr so device allocations are released via cudaFree
// when the owning pointer goes out of scope.
struct GpuBufferDeleter {
    template <typename T> void operator()(T *ptr) const {
        if (ptr) {
            // The cudaFree return status is not checked here.
            cudaFree(ptr);
        }
    }
};
unsigned long long getCurrentTimestampInMicroseconds();
namespace stream_config {
// Benchmark configuration constants.
constexpr std::array<int, 4> kThreadsPerBlock = {128, 256, 512, 1024}; // Threads-per-block configs to sweep
constexpr uint64_t kDefaultBufferSizeInBytes = 4294967296;             // Default buffer size 4GB
constexpr int kNumLoopUnroll = 2;                                      // Unroll depth in SM copy kernel
constexpr int kNumBuffers = 3;                                         // Number of buffers for triad, add kernel
constexpr int kNumValidationBuffers = 4; // Number of validation buffers, one for each kernel
constexpr int kUInt8Mod = 256;           // Modulo (2^8) — presumably wraps values into byte range; confirm usage
constexpr std::array<int, 4> kBufferBwMultipliers = {2, 2, 3, 3}; // Buffers touched per kernel: copy/scale 2, add/triad 3
constexpr double scalar = 11.0;                                   // Scalar operand for the scale and triad kernels
// Enum identifying each STREAM kernel.
enum class Kernel {
    kCopy,
    kScale,
    kAdd,
    kTriad,
    kCount // Add a count to keep track of the number of enums. Helpful for iterating over enums.
};
// Arguments for each sub benchmark run.
template <typename T> struct SubBenchArgs {
    // Unique pointer type for GPU buffers (released via cudaFree through GpuBufferDeleter).
    using GpuBufferUniquePtr = std::unique_ptr<T, GpuBufferDeleter>;
    // Original data buffer.
    T *data_buf = nullptr;
    // Buffer to validate the correctness of data transfer.
    T *check_buf = nullptr;
    // GPU pointer of the data buffer on source devices.
    std::vector<GpuBufferUniquePtr> gpu_buf_ptrs;
    // Pointer of the validation buffers for each kernel. Order is same as Kernel enum.
    std::vector<std::vector<T>> validation_buf_ptrs;
    // CUDA stream to be used.
    cudaStream_t stream;
    // CUDA event to record start time.
    cudaEvent_t start_event;
    // CUDA event to record end time.
    cudaEvent_t end_event;
    // Measured kernel times in ms, indexed per kernel and per threads-per-block
    // configuration. (The original comment here was a copy-paste of the event one.)
    std::vector<std::vector<float>> times_in_ms;
    // Stream Kernel name.
    std::string kernel_name;
};
// Arguments for each benchmark run.
template <typename T> struct BenchArgs {
    // NUMA node under which the benchmark is done.
    uint64_t numa_id = 0;
    // GPU ID for device.
    int gpu_id = 0;
    // GPU device info.
    cudaDeviceProp gpu_device_prop;
    // Data buffer size used.
    uint64_t size = kDefaultBufferSizeInBytes;
    // Number of warm up rounds to run.
    uint64_t num_warm_up = 0;
    // Number of loops to run.
    uint64_t num_loops = 1;
    // Whether check data after copy.
    bool check_data = false;
    // Sub-benchmarks in parallel.
    SubBenchArgs<T> sub;
};
// Options accepted by this program.
struct Opts {
    // Data buffer size for copy benchmark.
    uint64_t size = kDefaultBufferSizeInBytes;
    // Number of warm up rounds to run.
    uint64_t num_warm_up = 0;
    // Number of loops to run.
    uint64_t num_loops = 0;
    // Whether check data after copy.
    bool check_data = false;
};
std::string KernelToString(int); // Convert a kernel index (Kernel enum order) to its display name
int ParseOpts(int, char **, Opts *);
void PrintInputInfo(Opts &);
void PrintUsage();
} // namespace stream_config
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

# Diff residue left duplicated cmake_minimum_required/if/else lines and two
# conflicting ROCm build paths; this is the reconstructed post-update file
# using hipify_sources() from rocm_common.cmake.
cmake_minimum_required(VERSION 3.24)

project(kernel_launch_overhead LANGUAGES CXX)

find_package(CUDAToolkit QUIET)

if(CUDAToolkit_FOUND) # CUDA environment
    message(STATUS "Found CUDA: " ${CUDAToolkit_VERSION})
    include(../cuda_common.cmake)
    add_executable(kernel_launch_overhead kernel_launch.cu)
    set_property(TARGET kernel_launch_overhead PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
else() # ROCm environment
    include(../rocm_common.cmake)
    # hipify_sources() (defined in rocm_common.cmake) translates the CUDA
    # source to HIP at build time and returns the generated file list.
    hipify_sources(HIP_FILES kernel_launch.cu)
    add_executable(kernel_launch_overhead ${HIP_FILES})
endif()

install(TARGETS kernel_launch_overhead RUNTIME DESTINATION bin)
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
enable_language(HIP)
# Set ROCM_PATH
if(NOT DEFINED ENV{ROCM_PATH})
# Run hipconfig -p to get ROCm path
# (diff view: unchanged lines of rocm_common.cmake elided here)
set(HIP_PATH $ENV{HIP_PATH})
endif()
# Turn off CMAKE_HIP_ARCHITECTURES Feature if cmake version is 3.21+
if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.21.0)
set(CMAKE_HIP_ARCHITECTURES OFF)
endif()
message(STATUS "CMAKE HIP ARCHITECTURES: ${CMAKE_HIP_ARCHITECTURES}")
if(EXISTS ${HIP_PATH})
# Search for hip in common locations
list(APPEND CMAKE_PREFIX_PATH ${HIP_PATH} ${ROCM_PATH} ${ROCM_PATH}/hsa ${ROCM_PATH}/hip ${ROCM_PATH}/share/rocm/cmake/)
set(CMAKE_CXX_COMPILER "${HIP_PATH}/bin/hipcc")
set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
set(CMAKE_MODULE_PATH "${HIP_PATH}/lib/cmake/hip" ${CMAKE_MODULE_PATH})
list(APPEND CMAKE_MODULE_PATH
"${HIP_PATH}/cmake"
"${HIP_PATH}/lib/cmake/hip"
)
endif()
# hipify_sources(<output_var> <source>...)
#
# Translates the given CUDA sources to HIP at configure-defined build rules
# using hipify-perl and returns the list of generated files in <output_var>.
# ".cu" files are renamed to ".hip"; other extensions are preserved. Generated
# HIP-language sources get relaxed warning options, since hipify output is not
# warning-clean.
function(hipify_sources OUTPUT_VAR_NAME)
    # Locate hipify-perl once; find_program caches the result.
    if(NOT HIPIFY_TOOL)
        find_program(HIPIFY_TOOL hipify-perl PATHS $ENV{ROCM_PATH}/bin)
        if(NOT HIPIFY_TOOL)
            message(FATAL_ERROR "hipify-perl not found! Cannot translate CUDA to HIP.")
        endif()
    endif()
    set(HIP_SOURCE_EXTS ".hip" ".cpp" ".cc" ".cxx")
    set(GENERATED_HIP_FILES "")
    foreach(SRC_FILE ${ARGN})
        get_filename_component(FILE_ABS ${SRC_FILE} ABSOLUTE)
        get_filename_component(FILE_NAME_WE ${SRC_FILE} NAME_WE)
        get_filename_component(FILE_EXT ${SRC_FILE} EXT)
        # Map .cu -> .hip so CMake can treat the output as a HIP source.
        if(FILE_EXT STREQUAL ".cu")
            set(OUT_EXT ".hip")
        else()
            set(OUT_EXT ${FILE_EXT})
        endif()
        # Generate into the binary dir; never write to the source tree.
        set(OUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/${FILE_NAME_WE}${OUT_EXT}")
        add_custom_command(
            OUTPUT ${OUT_FILE}
            COMMAND ${HIPIFY_TOOL} -print-stats -o ${OUT_FILE} ${FILE_ABS}
            DEPENDS ${FILE_ABS}
            COMMENT "Auto-hipifying ${SRC_FILE}..."
            # VERBATIM guarantees portable, correctly-escaped command lines
            # (the original omitted it, making escaping platform-dependent).
            VERBATIM
        )
        if(OUT_EXT IN_LIST HIP_SOURCE_EXTS)
            set_source_files_properties(${OUT_FILE} PROPERTIES
                COMPILE_OPTIONS "-Wno-unused-result;-Wno-return-type"
                LANGUAGE HIP
            )
        endif()
        list(APPEND GENERATED_HIP_FILES ${OUT_FILE})
    endforeach()
    # Return the generated file list to the caller.
    set(${OUTPUT_VAR_NAME} ${GENERATED_HIP_FILES} PARENT_SCOPE)
endfunction()
Subproject commit f6ae48de899408cf50c24079417dc71a03dbb5a8
......@@ -16,7 +16,7 @@ ROCM_VER ?= $(shell hipconfig -R | grep -oP '\d+\.\d+\.\d+' || echo "0.0.0")
NUM_MAKE_JOBS ?= $(shell nproc --ignore=2)
.PHONY: all cuda_with_msccl cuda rocm dtk common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed apex_rocm nvbandwidth rocm_megatron_lm
.PHONY: all cuda_with_msccl cuda rocm dtk common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt rocm_babelstream_hip megatron_lm megatron_deepspeed apex_rocm nvbandwidth rocm_megatron_lm
# Build targets.
all: cuda rocm
......@@ -24,7 +24,7 @@ all: cuda rocm
cuda_with_msccl: cuda cuda_msccl
cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed nvbandwidth
rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm rocm_megatron_lm
dtk: common rocm_perftest rocm_rccl_tests megatron_deepspeed apex_rocm rocm_megatron_lm
dtk: common rocm_perftest rocm_rccl_tests rocm_babelstream_hip megatron_deepspeed apex_rocm rocm_megatron_lm
cpu: common cpu_perftest
common: fio cpu_stream
......@@ -180,6 +180,18 @@ rocm_bandwidthTest: sb_micro_path
cd ./HIP/samples/1_Utils/hipBusBandwidth/ && mkdir -p build && cd build && cmake .. && make
cp -v ./HIP/samples/1_Utils/hipBusBandwidth/build/hipBusBandwidth $(SB_MICRO_PATH)/bin/
# Build BabelStream hip-stream from submodule tag v5.0.
rocm_babelstream_hip: sb_micro_path
ifneq (,$(wildcard BabelStream/CMakeLists.txt))
cd ./BabelStream && \
cmake -S . -B build \
-DMODEL=hip \
-DCMAKE_CXX_COMPILER=hipcc \
-DCXX_EXTRA_FLAGS="--gpu-max-threads-per-block=1024" && \
cmake --build build -j $(NUM_MAKE_JOBS)
cp -v ./BabelStream/build/hip-stream $(SB_MICRO_PATH)/bin/
endif
# Build GPCNET from commit c56fd9.
gpcnet: sb_micro_path
bash -c "source ${HPCX_HOME}/hpcx-init.sh && hpcx_load && make CC=mpicc -C GPCNET all && hpcx_unload"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment