"include/git@developer.sourcefind.cn:yangql/googletest.git" did not exist on "814a5e9310bbc8aeb0b985c1dcb66496835bf73a"
Unverified commit 3c95714f authored by Hongtao Zhang, committed by GitHub
Browse files

Bugfix - gpu_stream: remove ROCm build support, require CUDA with NVML (#789)



Summary

The gpu_stream benchmark has NVIDIA-specific dependencies that prevent
it from compiling on ROCm 6.3+. This change makes it CUDA-only,
gracefully skipping the build with a warning in non-NVIDIA
environments.

  Problem

The gpu_stream benchmark fails to compile on ROCm 6.3+ due to multiple
NVIDIA-specific dependencies:

1. nvml.h — NVIDIA Management Library header, used for querying actual
memory clock rates. No HIP equivalent. Referenced in gpu_stream.cu and
gpu_stream_utils.hpp.
2. cuda.h in headers — Three .hpp files (gpu_stream.hpp,
gpu_stream_kernels.hpp, gpu_stream_utils.hpp) directly include <cuda.h>
and <cuda_runtime.h>. These headers are not processed by hipify-perl
(only
  .cu source files are), so they fail to resolve on ROCm.
3. Deprecated hipDeviceProp_t struct fields — The code accesses
memoryBusWidth, memoryClockRate, and ECCEnabled from the device
properties struct. These fields were removed from hipDeviceProp_t in
ROCm
    6.3, causing compilation errors after hipification.

The existing ROCm path was marked as incomplete (# TODO: test for ROC)
and was never fully functional on recent ROCm versions.

  Changes

- Removed the non-functional ROCm/HIP build path from
gpu_stream/CMakeLists.txt
- When CUDA is not found, prints a warning and returns gracefully
instead of attempting a broken hipify build or raising FATAL_ERROR
- No changes to the NVIDIA/CUDA build path — it continues to work as
before

  Impact

   - NVIDIA builds: No change — gpu_stream builds and installs normally
- ROCm builds: gpu_stream is skipped with a warning message. Previously
it would fail the entire make cppbuild step, blocking the Docker image
build
- Other benchmarks: Unaffected — build.sh continues to the next
benchmark after gpu_stream returns
Co-authored-by: Hongtao Zhang <hongtaozhang@microsoft.com>
parent 8c7e2be0
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
import os import os
from superbench.common.utils import logger from superbench.common.utils import logger
from superbench.benchmarks import BenchmarkRegistry, ReturnCode from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
...@@ -116,4 +116,4 @@ def _process_raw_result(self, cmd_idx, raw_output): ...@@ -116,4 +116,4 @@ def _process_raw_result(self, cmd_idx, raw_output):
return True return True
BenchmarkRegistry.register_benchmark('gpu-stream', GpuStreamBenchmark) BenchmarkRegistry.register_benchmark('gpu-stream', GpuStreamBenchmark, platform=Platform.CUDA)
...@@ -13,6 +13,20 @@ set(CMAKE_CUDA_STANDARD_REQUIRED ON) ...@@ -13,6 +13,20 @@ set(CMAKE_CUDA_STANDARD_REQUIRED ON)
find_package(CUDAToolkit QUIET) find_package(CUDAToolkit QUIET)
# gpu_stream is NVIDIA-only: it needs the CUDA toolkit and NVML to query
# memory clock rates. If either is missing, warn and return() so the parent
# build continues with the remaining benchmarks instead of failing.
if(NOT CUDAToolkit_FOUND)
    message(WARNING "gpu_stream: CUDA not found, skipping build (requires NVIDIA GPU with NVML)")
    return()
endif()

# Check for NVML (nvidia-ml) library, required for querying memory clock rates.
# PATH_SUFFIXES stubs also searches the toolkit's stubs/ directory, where the
# library may live on build machines.
find_library(NVML_LIBRARY nvidia-ml PATHS ${CUDAToolkit_LIBRARY_DIR} PATH_SUFFIXES stubs)
if(NOT NVML_LIBRARY)
    message(WARNING "gpu_stream: NVML (nvidia-ml) not found, skipping build")
    return()
endif()

# Interpolate the version into a single quoted string rather than passing it
# as a separate unquoted message() argument.
message(STATUS "Found CUDA: ${CUDAToolkit_VERSION}")
# Source files # Source files
set(SOURCES set(SOURCES
gpu_stream_test.cpp gpu_stream_test.cpp
...@@ -21,40 +35,10 @@ set(SOURCES ...@@ -21,40 +35,10 @@ set(SOURCES
gpu_stream_kernels.cu gpu_stream_kernels.cu
) )
# Cuda environment include(../cuda_common.cmake)
if(CUDAToolkit_FOUND) add_executable(gpu_stream ${SOURCES})
message(STATUS "Found CUDA: " ${CUDAToolkit_VERSION}) set_property(TARGET gpu_stream PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
target_include_directories(gpu_stream PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
include(../cuda_common.cmake) target_link_libraries(gpu_stream numa ${NVML_LIBRARY})
add_executable(gpu_stream ${SOURCES})
set_property(TARGET gpu_stream PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
target_include_directories(gpu_stream PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
target_link_libraries(gpu_stream numa nvidia-ml)
else()
# TODO: test for ROC
# ROCm environment
include(../rocm_common.cmake)
find_package(hip QUIET)
if(hip_FOUND)
message(STATUS "Found ROCm: " ${HIP_VERSION})
# Convert cuda code to hip code in cpp
execute_process(COMMAND hipify-perl -print-stats -o gpu_stream.cpp ${SOURCES} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/)
# link hip device lib
add_executable(gpu_stream gpu_stream.cpp)
include(CheckSymbolExists)
check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY)
if(${HIP_UNCACHED_MEMORY})
target_compile_definitions(gpu_stream PRIVATE HIP_UNCACHED_MEMORY)
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
target_link_libraries(gpu_stream numa hip::device)
else()
message(FATAL_ERROR "No CUDA or ROCm environment found.")
endif()
endif()
install(TARGETS gpu_stream RUNTIME DESTINATION bin) install(TARGETS gpu_stream RUNTIME DESTINATION bin)
...@@ -63,11 +63,6 @@ def test_gpu_stream_command_generation_cuda(self): ...@@ -63,11 +63,6 @@ def test_gpu_stream_command_generation_cuda(self):
"""Test gpu-stream benchmark command generation, CUDA case.""" """Test gpu-stream benchmark command generation, CUDA case."""
self._test_gpu_stream_command_generation(Platform.CUDA) self._test_gpu_stream_command_generation(Platform.CUDA)
@decorator.rocm_test
def test_gpu_stream_command_generation_rocm(self):
"""Test gpu-stream benchmark command generation, ROCm case."""
self._test_gpu_stream_command_generation(Platform.ROCM)
@decorator.load_data('tests/data/gpu_stream.log') @decorator.load_data('tests/data/gpu_stream.log')
def _test_gpu_stream_result_parsing(self, platform, test_raw_output): def _test_gpu_stream_result_parsing(self, platform, test_raw_output):
"""Test gpu-stream benchmark result parsing.""" """Test gpu-stream benchmark result parsing."""
...@@ -115,8 +110,3 @@ def _test_gpu_stream_result_parsing(self, platform, test_raw_output): ...@@ -115,8 +110,3 @@ def _test_gpu_stream_result_parsing(self, platform, test_raw_output):
def test_gpu_stream_result_parsing_cuda(self): def test_gpu_stream_result_parsing_cuda(self):
"""Test gpu-stream benchmark result parsing, CUDA case.""" """Test gpu-stream benchmark result parsing, CUDA case."""
self._test_gpu_stream_result_parsing(Platform.CUDA) self._test_gpu_stream_result_parsing(Platform.CUDA)
@decorator.rocm_test
def test_gpu_stream_result_parsing_rocm(self):
"""Test gpu-stream benchmark result parsing, ROCm case."""
self._test_gpu_stream_result_parsing(Platform.ROCM)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment