Unverified commit 3ea1fabf authored by PanZezhong1725, committed by GitHub

Merge pull request #105 from PanZezhong1725/issue/6

issue/6: Add a generic CUDA reduce module and implement the RMSNorm CUDA operator
parents e950d3fd 650bc975
@@ -28,6 +28,7 @@ typedef enum {
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED = 5,
INFINI_STATUS_DEVICE_NOT_FOUND = 6,
INFINI_STATUS_DEVICE_NOT_INITIALIZED = 7,
+INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED = 8,
// Op Errors
INFINI_STATUS_BAD_TENSOR_DTYPE = 10,
INFINI_STATUS_BAD_TENSOR_SHAPE = 11,
...
#ifndef __INFINIOP_CUDA_COMMON_CUH__
#define __INFINIOP_CUDA_COMMON_CUH__
+#include "../../reduce/cuda/reduce.cuh"
#include "cuda_handle.cuh"
#include "infinicore.h"
+#ifdef ENABLE_SUGON_CUDA_API
+#define INFINIOP_CUDA_KERNEL __launch_bounds__(512) __global__ void
+#else
+#define INFINIOP_CUDA_KERNEL __global__ void
+#endif
+// Possible maximum numbers of threads per block across CUDA architectures,
+// used for picking the correct kernel launch configuration
+#define CUDA_BLOCK_SIZE_1024 1024
+#define CUDA_BLOCK_SIZE_512 512
namespace device::cuda {
cudnnDataType_t getCudnnDtype(infiniDtype_t dt);
...
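The CUDA_BLOCK_SIZE_* constants above are compared against the device's reported per-block thread limit in Descriptor::calculate below. Handle::Internal is not part of this diff, but its maxThreadsPerBlock() is presumably backed by the CUDA runtime; a minimal sketch (queryMaxThreadsPerBlock is an illustrative name, not this codebase's API):

#include <cuda_runtime.h>

inline int queryMaxThreadsPerBlock(int device_id) {
    int max_threads = 0;
    // cudaDevAttrMaxThreadsPerBlock reports the hardware limit on threads per block
    // (1024 on mainstream NVIDIA GPUs; 512 on some platforms, consistent with the
    // __launch_bounds__(512) Sugon branch above)
    cudaDeviceGetAttribute(&max_threads, cudaDevAttrMaxThreadsPerBlock, device_id);
    return max_threads;
}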
#include "../../../devices/cuda/cuda_common.cuh"
#include "rms_norm_cuda.cuh"
#include "rms_norm_kernel.cuh"
#include <memory>
#include <stdint.h>
namespace op::rms_norm::cuda {
struct Descriptor::Opaque {
std::shared_ptr<device::cuda::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t w_desc,
float epsilon) {
RMSNormInfo info;
CHECK_STATUS(createRMSNormInfo(&info, y_desc, x_desc, w_desc, epsilon));
// Only a contiguous last dimension is supported: e.g. strides [dim, 1] for
// shape [batch, dim] pass, while a transposed view with strides [1, batch] is rejected
if (info.x_strides[1] != 1 || info.y_strides[1] != 1) {
return INFINI_STATUS_BAD_TENSOR_STRIDES;
}
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::cuda::Handle *>(handle)->internal()},
info, 0, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
// Launch the kernel for each supported combination of data and weight dtypes
template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(
uint32_t batch_size, size_t dim,
void *y, infiniDtype_t atype, ptrdiff_t stride_y,
const void *x, ptrdiff_t stride_x,
const void *w, infiniDtype_t wtype,
float epsilon,
cudaStream_t cuda_stream) {
#define LAUNCH_KERNEL(Tdata, Tweight, Tcompute) \
rmsnormBlock<BLOCK_SIZE, Tdata, Tweight, Tcompute><<<batch_size, BLOCK_SIZE, 0, cuda_stream>>>( \
reinterpret_cast<Tdata *>(y), \
stride_y, \
reinterpret_cast<const Tdata *>(x), \
stride_x, \
reinterpret_cast<const Tweight *>(w), \
dim, \
epsilon)
if (atype == INFINI_DTYPE_F16 && wtype == INFINI_DTYPE_F16) {
LAUNCH_KERNEL(half, half, float);
} else if (atype == INFINI_DTYPE_F16 && wtype == INFINI_DTYPE_F32) {
LAUNCH_KERNEL(half, float, float);
} else if (atype == INFINI_DTYPE_F32 && wtype == INFINI_DTYPE_F32) {
LAUNCH_KERNEL(float, float, float);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
#undef LAUNCH_KERNEL
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
void *y, const void *x, const void *w, void *stream) {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
auto stride_x = _info.x_strides[0];
auto stride_y = _info.y_strides[0];
auto dim = _info.dim();
uint32_t batch_size = static_cast<uint32_t>(_info.shape[0]);
auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
// Pick the kernel block size matching the device's max threads per block
if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_1024>(batch_size, dim, y, _info.atype, stride_y, x, stride_x, w, _info.wtype, _info.epsilon, cuda_stream));
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) {
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_512>(batch_size, dim, y, _info.atype, stride_y, x, stride_x, w, _info.wtype, _info.epsilon, cuda_stream));
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::rms_norm::cuda
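CHECK_STATUS is used throughout this file but defined elsewhere in the tree; an assumed minimal definition (a sketch, not necessarily the project's exact macro) early-returns any non-success status:

#define CHECK_STATUS(call)                     \
    do {                                       \
        infiniStatus_t _status = (call);       \
        if (_status != INFINI_STATUS_SUCCESS) { \
            return _status;                    \
        }                                      \
    } while (0)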
#ifndef __RMS_NORM_CUDA_H__
#define __RMS_NORM_CUDA_H__
#include "../rms_norm.h"
DESCRIPTOR(cuda)
#endif
#ifndef __RMS_NORM_CUDA_KERNEL_H__
#define __RMS_NORM_CUDA_KERNEL_H__
#include "../../../devices/cuda/cuda_common.cuh"
#include <cub/block/block_reduce.cuh>
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tweight, typename Tcompute>
INFINIOP_CUDA_KERNEL rmsnormBlock(
Tdata *__restrict__ y,
ptrdiff_t stride_y,
const Tdata *__restrict__ x,
ptrdiff_t stride_x,
const Tweight *__restrict__ w,
size_t dim,
float epsilon) {
// Each block handles one row of contiguous data of length dim
// Each thread processes every BLOCK_SIZE-th element of the row
auto y_ptr = y + blockIdx.x * stride_y;
auto x_ptr = x + blockIdx.x * stride_x;
auto w_ptr = w;
// Block-reduce sum of x^2
Tcompute ss = op::common_cuda::reduce_op::sumSquared<BLOCK_SIZE, Tdata, Tcompute>(x_ptr, dim);
// Thread 0 computes rms = 1/sqrt(ss/dim + epsilon) and stores it in shared memory
__shared__ Tcompute rms;
if (threadIdx.x == 0) {
// Keep the result in Tcompute; casting through Tdata (e.g. half) would lose precision
rms = Tcompute(rsqrtf(ss / Tcompute(dim) + epsilon));
}
__syncthreads();
for (size_t i = threadIdx.x; i < dim; i += BLOCK_SIZE) {
y_ptr[i] = Tdata(Tcompute(x_ptr[i]) * Tcompute(w_ptr[i]) * rms);
}
}
#endif
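For reference, the transformation rmsnormBlock applies to each row b of x (length d = dim) is, in LaTeX:

y_{b,i} = w_i \cdot x_{b,i} \cdot \left( \frac{1}{d} \sum_{j=0}^{d-1} x_{b,j}^2 + \epsilon \right)^{-1/2}

with the sum of squares and the normalizer kept in Tcompute (float here) regardless of the storage type Tdata.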
@@ -5,6 +5,9 @@
#ifdef ENABLE_CPU_API
#include "cpu/rms_norm_cpu.h"
#endif
+#ifdef ENABLE_CUDA_API
+#include "cuda/rms_norm_cuda.cuh"
+#endif
__C infiniStatus_t infiniopCreateRMSNormDescriptor(
infiniopHandle_t handle,
@@ -28,10 +31,8 @@ __C infiniStatus_t infiniopCreateRMSNormDescriptor(
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu)
#endif
-#ifdef ENABLE_NV_GPU
-case DevNvGpu: {
-return cudaCreateRMSNormDescriptor((CudaHandle_t)handle, (RMSNormCudaDescriptor_t *)desc_ptr, y_desc, x_desc, w_desc, epsilon);
-}
+#ifdef ENABLE_CUDA_API
+CREATE(INFINI_DEVICE_NVIDIA, cuda)
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
@@ -76,11 +77,8 @@ __C infiniStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t d
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu)
#endif
-#ifdef ENABLE_NV_GPU
-case DevNvGpu: {
-return cudaGetRMSNormWorkspaceSize((RMSNormCudaDescriptor_t)desc, size);
-}
+#ifdef ENABLE_CUDA_API
+GET(INFINI_DEVICE_NVIDIA, cuda)
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
@@ -122,11 +120,8 @@ __C infiniStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *works
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu)
#endif
-#ifdef ENABLE_NV_GPU
-case DevNvGpu: {
-return cudaRMSNorm((RMSNormCudaDescriptor_t)desc, workspace, workspace_size, y, x, w, stream);
-}
+#ifdef ENABLE_CUDA_API
+CALCULATE(INFINI_DEVICE_NVIDIA, cuda)
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
@@ -172,11 +167,8 @@ __C infiniStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_t
#ifdef ENABLE_CPU_API
DESTROY(INFINI_DEVICE_CPU, cpu)
#endif
-#ifdef ENABLE_NV_GPU
-case DevNvGpu: {
-return cudaDestroyRMSNormDescriptor((RMSNormCudaDescriptor_t)desc);
-}
+#ifdef ENABLE_CUDA_API
+DESTROY(INFINI_DEVICE_NVIDIA, cuda)
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
...
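Taken together, the dispatch above yields the usual create/query/run/destroy flow. A hedged caller sketch (handle and tensor-descriptor setup are outside this diff and elided; y and x are [batch, dim] device buffers, w is [dim], and the epsilon value is an arbitrary example):

infiniopRMSNormDescriptor_t desc = nullptr;
size_t workspace_size = 0;
CHECK_STATUS(infiniopCreateRMSNormDescriptor(handle, &desc, y_desc, x_desc, w_desc, 1e-5f));
CHECK_STATUS(infiniopGetRMSNormWorkspaceSize(desc, &workspace_size));
// The CUDA descriptor is created with a workspace size of 0 above, so no device allocation is needed here
CHECK_STATUS(infiniopRMSNorm(desc, /*workspace=*/nullptr, workspace_size, y, x, w, stream));
CHECK_STATUS(infiniopDestroyRMSNormDescriptor(desc));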
@@ -51,6 +51,10 @@ inline infiniStatus_t createRMSNormInfo(RMSNormInfo *info, infiniopTensorDescrip
return INFINI_STATUS_BAD_TENSOR_SHAPE;
}
+if (w_desc->stride(0) != 1) {
+return INFINI_STATUS_BAD_TENSOR_STRIDES;
+}
info->shape = std::move(y_desc->shape());
info->y_strides = std::move(y_desc->strides());
info->x_strides = std::move(x_desc->strides());
...
#ifndef __INFINIOP_REDUCE_CUDA_H__
#define __INFINIOP_REDUCE_CUDA_H__
#include <cub/block/block_reduce.cuh>
namespace op::common_cuda::reduce_op {
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
__device__ __forceinline__ Tcompute sumSquared(const Tdata *data_ptr, size_t count) {
Tcompute ss = 0;
// Each thread computes its partial sum
for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) {
// Promote to Tcompute before squaring so low-precision types (e.g. half) don't overflow
Tcompute val = Tcompute(data_ptr[i]);
ss += val * val;
}
// Use CUB block-level reduction; the aggregate is only valid in thread 0
using BlockReduce = cub::BlockReduce<Tcompute, BLOCK_SIZE>;
__shared__ typename BlockReduce::TempStorage temp_storage;
return BlockReduce(temp_storage).Sum(ss);
}
} // namespace op::common_cuda::reduce_op
#endif
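Since the commit introduces this as a generic reduce module, sumSquared composes with other one-block-per-row kernels as well. A minimal sketch (rowSumSquared is a hypothetical kernel, assuming reduce.cuh is on the include path) that writes each row's sum of squares:

template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
__global__ void rowSumSquared(Tcompute *out, const Tdata *in, size_t dim) {
    // One block per contiguous row, mirroring the rmsnormBlock launch layout
    Tcompute ss = op::common_cuda::reduce_op::sumSquared<BLOCK_SIZE, Tdata, Tcompute>(
        in + blockIdx.x * dim, dim);
    // cub::BlockReduce's aggregate is only valid in thread 0
    if (threadIdx.x == 0) {
        out[blockIdx.x] = ss;
    }
}

Launched as rowSumSquared<256, half, float><<<batch, 256, 0, stream>>>(out, in, dim), each of the batch blocks reduces one row; as in rmsnormBlock, the block size template parameter must match the launch configuration.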