Unverified Commit c76c0645 authored by crapromer, committed by GitHub
Browse files

issue/367 - Fix compile bug on cuda 13.0



* fix compile bug on cuda 13.0

* issue/367 - clang format code on ubuntu

---------
Co-authored-by: root <root@Crapromer>
parent f796aaa8
......@@ -54,7 +54,11 @@ __device__ void logSoftmaxKernel(
}
}
}
#if CUDART_VERSION >= 12090
max_val = BlockReduce(temp_storage).Reduce(max_val, ::cuda::maximum());
#else
max_val = BlockReduce(temp_storage).Reduce(max_val, cub::Max());
#endif
if (tid == 0) {
shared_max_val = max_val;
}
......
......@@ -89,9 +89,13 @@ utils::Result<size_t> calculateWorkspace(size_t n_) {
nullptr, size_inclusive_sum,
nullptr, n,
nullptr));
#if CUDART_VERSION >= 12090
size_random += ::cuda::maximum()(size_radix_sort, size_inclusive_sum);
return utils::Result<size_t>(::cuda::maximum()(argmax, size_random));
#else
size_random += cub::Max()(size_radix_sort, size_inclusive_sum);
return utils::Result<size_t>(cub::Max()(argmax, size_random));
#endif
}
// ↑↑↑ 计算 workspace
......@@ -161,8 +165,13 @@ static __global__ void randomSampleKernel(
const Tidx *__restrict__ indices_out,
size_t n,
float random, float topp, size_t topk) {
#if CUDART_VERSION >= 12090
topk = ::cuda::minimum()(topk, n);
auto p = (Tval)(random * ::cuda::minimum()(topp * (float)sorted[n - 1], (float)sorted[topk - 1]));
#else
topk = cub::Min()(topk, n);
auto p = (Tval)(random * cub::Min()(topp * (float)sorted[n - 1], (float)sorted[topk - 1]));
#endif
for (size_t i = 0;; ++i) {
if ((sorted[i]) >= p) {
*result = indices_out[i];
......@@ -228,8 +237,11 @@ struct Algo {
workspace_ = reinterpret_cast<void *>(workspace);
workspace_size = workspace_end - workspace;
#if CUDART_VERSION >= 12090
auto block = ::cuda::minimum()((size_t)block_size, n);
#else
auto block = cub::Min()((size_t)block_size, n);
#endif
auto grid = (n + block - 1) / block;
// sort
fillIndices<<<static_cast<unsigned int>(grid), static_cast<unsigned int>(block), 0, stream>>>(indices, static_cast<int>(n));
......
......@@ -55,7 +55,11 @@ __global__ void softmax_topk_row_kernel(float *values_topk, // 输出数据, 形
{
__shared__ typename BlockReduce::TempStorage temp_storage_max;
#if CUDART_VERSION >= 12090
T value_max = BlockReduce(temp_storage_max).Reduce(thread_max, ::cuda::maximum());
#else
T value_max = BlockReduce(temp_storage_max).Reduce(thread_max, cub::Max());
#endif
if (tid == 0) {
shared_max = value_max;
}
......
......@@ -54,8 +54,12 @@ __device__ __forceinline__ Tdata max(const Tdata *data_ptr, size_t count) {
for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) {
#ifdef ENABLE_HYGON_API
max_ = (data_ptr[i] > max_) ? data_ptr[i] : max_;
#else
#if CUDART_VERSION >= 12090
max_ = ::cuda::maximum()(max_, data_ptr[i]);
#else
max_ = cub::Max()(max_, data_ptr[i]);
#endif
#endif
}
......@@ -65,9 +69,13 @@ __device__ __forceinline__ Tdata max(const Tdata *data_ptr, size_t count) {
#ifdef ENABLE_HYGON_API
return BlockReduce(temp_storage).Reduce(
max_, [](const Tdata &a, const Tdata &b) { return (a > b) ? a : b; }, BLOCK_SIZE);
#else
#if CUDART_VERSION >= 12090
return BlockReduce(temp_storage).Reduce(max_, ::cuda::maximum(), BLOCK_SIZE);
#else
return BlockReduce(temp_storage).Reduce(max_, cub::Max(), BLOCK_SIZE);
#endif
#endif
}
} // namespace op::common_cuda::reduce_op
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment