issue/291/build: nvidia 和 metax 共用 softmax kernel 实现

Signed-off-by: YdrMaster <ydrml@hotmail.com>

issue/291/build: nvidia 和 metax 共用 softmax kernel 实现
Signed-off-by: YdrMaster <ydrml@hotmail.com>
f0c5a569 · YdrMaster · c6a3e4c7 · f0c5a569 · c6a3e4c7 · f0c5a569
Commit f0c5a569 authored Jul 01, 2025 by YdrMaster
16 changed files
--- a/src/infiniop/ops/causal_softmax/cuda/causal_softmax_kernel.cuh
+++ b/src/infiniop/ops/causal_softmax/cuda/causal_softmax_kernel.cuh
-#ifndef __CAUSAL_SOFTMAX_KERNEL_CUH__
+#ifndef __CAUSAL_SOFTMAX_KERNEL_CUH__
 #define __CAUSAL_SOFTMAX_KERNEL_CUH__

-#include "../../../devices/cuda/cuda_kernel_common.cuh"
-#include "../../../reduce/cuda/reduce.cuh"
-
 template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
-INFINIOP_CUDA_KERNEL causalSoftmax(
+__device__ void causalSoftmaxKernel( // vendor-neutral device body; the NVIDIA and MetaX causalSoftmax kernels forward to it
    Tdata *y_, const Tdata *x_,
    size_t batch, size_t height, size_t width,
    ptrdiff_t y_stride_b, ptrdiff_t y_stride_h,
@@ -32,11 +29,11 @@ INFINIOP_CUDA_KERNEL causalSoftmax(
        //          2 | * * * ... * * * |
        //  height: 3  col_id->
-        if (width + blockIdx.x >= threadIdx.x + height) {
+        if (width + blockIdx.x >= col + height) { // test the column being written; threadIdx.x equals col only on the first pass of the BLOCK_SIZE-strided loop, so wider rows were left partly unmasked
-#ifdef ENABLE_NVIDIA_API
-            y[col] = exp_(x[col] - max_);
-#else
+            if constexpr (std::is_same_v<Tdata, half>) {
+                y[col] = hexp(x[col] - max_); // half is routed to the half-precision exp intrinsic
+            } else {
                y[col] = exp(x[col] - max_);
-#endif
+            }
        } else {
            y[col] = Tdata(0); // column lies outside this row's allowed prefix (see diagram above)
        }

--- a/src/infiniop/ops/causal_softmax/maca/causal_softmax_kernel.h
+++ b/src/infiniop/ops/causal_softmax/maca/causal_softmax_kernel.h
-#ifndef __CAUSAL_SOFTMAX_KERNEL_H__
-#define __CAUSAL_SOFTMAX_KERNEL_H__
-
-#include "../../../devices/maca/maca_kernel_common.h"
-#include "../../../reduce/maca/reduce.h"
-
-template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute> // Tdata: storage type of x/y; Tcompute: accumulator type for the row sum
-INFINIOP_MACA_KERNEL causalSoftmax( // MACA copy of the causal softmax kernel: each block handles the row selected by (blockIdx.y, blockIdx.x)
-    Tdata *y_, const Tdata *x_,
-    size_t batch, size_t height, size_t width,
-    ptrdiff_t y_stride_b, ptrdiff_t y_stride_h,
-    ptrdiff_t x_stride_b, ptrdiff_t x_stride_h) {
-
-    Tdata *y = y_                       // threadIdx.x for col_id
-             + blockIdx.y * y_stride_b  // gridDim.y for batch_id
-             + blockIdx.x * y_stride_h; // gridDim.x for row_id
-    const Tdata *x = x_ + blockIdx.y * x_stride_b + blockIdx.x * x_stride_h; // same batch/row offset applied to the input
-
-    // [Reduce] Find max value in each row and store in shared memory
-    __shared__ Tdata max_;
-    Tdata max_0 = op::common_maca::reduce_op::max<BLOCK_SIZE, Tdata>(x, width - height + 1 + blockIdx.x); // only the first (width - height + 1 + row) columns take part
-    if (threadIdx.x == 0) { // the reduce helpers only guarantee the result on thread 0, so it publishes the value
-        max_ = max_0;
-    }
-    __syncthreads();
-
-    // [Elementwise] Exponentiate (x - max) for each element and apply causal mask
-    for (size_t col = threadIdx.x; col < width; col += BLOCK_SIZE) { // each thread visits columns threadIdx.x, threadIdx.x + BLOCK_SIZE, ...
-        //   row_id ↓ |<-     width   ->|
-        //          0 | * * * ... *     |
-        //          1 | * * * ... * *   |
-        //          2 | * * * ... * * * |
-        //  height: 3  col_id->
-        if (width + blockIdx.x >= threadIdx.x + height) { // NOTE(review): tests threadIdx.x, not col; looks wrong once width > BLOCK_SIZE — confirm
-#ifdef ENABLE_MACA_API // NOTE(review): xmake.lua defines ENABLE_METAX_API; this guard is presumably never true — confirm
-            y[col] = exp_(x[col] - max_);
-#else
-            y[col] = exp(x[col] - max_);
-#endif
-        } else {
-            y[col] = Tdata(0); // masked column
-        }
-    }
-    __syncthreads();
-
-    // [Reduce] Find the sum of each updated row and store in shared memory
-    __shared__ Tcompute sum_;
-    Tcompute sum_0 = op::common_maca::reduce_op::sum<BLOCK_SIZE, Tdata, Tcompute>(y, width); // sums the whole row of y, including the zeroed columns
-    if (threadIdx.x == 0) { // thread 0 publishes the sum, as for the max above
-        sum_ = sum_0;
-    }
-    __syncthreads();
-
-    // [Elementwise] Divide each element of y by the row sum, in place
-    for (size_t col = threadIdx.x; col < width; col += BLOCK_SIZE) {
-        y[col] /= Tdata(sum_); // the sum is converted back to Tdata before the division
-    }
-}
-
-#endif // __CAUSAL_SOFTMAX_KERNEL_H__
--- a/src/infiniop/ops/causal_softmax/maca/causal_softmax_maca.h
+++ b/src/infiniop/ops/causal_softmax/maca/causal_softmax_maca.h
-#ifndef __CAUSAL_SOFTMAX_MACA_H__
-#define __CAUSAL_SOFTMAX_MACA_H__
+#ifndef __CAUSAL_SOFTMAX_METAX_H__
+#define __CAUSAL_SOFTMAX_METAX_H__

 #include "../causal_softmax.h"

-DESCRIPTOR(maca)
+DESCRIPTOR(metax)

 #endif
--- a/src/infiniop/ops/causal_softmax/maca/causal_softmax_maca.maca
+++ b/src/infiniop/ops/causal_softmax/maca/causal_softmax_maca.maca
 #include "../../../devices/maca/common_maca.h"
-#include "causal_softmax_kernel.h"
-#include "causal_softmax_maca.h"
+#include "../../../devices/maca/maca_kernel_common.h"
+#include "causal_softmax_metax.h"

-namespace op::causal_softmax::maca {
+#include <hccub/block/block_reduce.cuh>
+
+#include "../../../reduce/cuda/reduce.cuh"
+
+#include "../cuda/kernel.cuh"
+
+// MetaX launch entry point for causal softmax.
+// Carries no logic of its own: every argument is handed, unchanged and in the
+// same order, to the causalSoftmaxKernel device function.
+template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
+INFINIOP_MACA_KERNEL causalSoftmax(
+    Tdata *y, const Tdata *x,
+    size_t batch, size_t height, size_t width,
+    ptrdiff_t y_stride_b, ptrdiff_t y_stride_h,
+    ptrdiff_t x_stride_b, ptrdiff_t x_stride_h) {
+    causalSoftmaxKernel<BLOCK_SIZE, Tdata, Tcompute>(
+        y, x,
+        batch, height, width,
+        y_stride_b, y_stride_h,
+        x_stride_b, x_stride_h);
+}
+
+namespace op::causal_softmax::metax {

 struct Descriptor::Opaque {
    std::shared_ptr<device::maca::Handle::Internal> internal;
@@ -75,4 +90,4 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
    return INFINI_STATUS_SUCCESS;
 }

-} // namespace op::causal_softmax::maca
+} // namespace op::causal_softmax::metax
--- a/src/infiniop/ops/causal_softmax/cuda/causal_softmax_cuda.cu
+++ b/src/infiniop/ops/causal_softmax/cuda/causal_softmax_cuda.cu
 #include "../../../devices/cuda/cuda_common.cuh"
-#include "causal_softmax_cuda.cuh"
-#include "causal_softmax_kernel.cuh"
+#include "../../../devices/cuda/cuda_kernel_common.cuh"
+#include "causal_softmax_nvidia.cuh"

-namespace op::causal_softmax::cuda {
+#include <cub/block/block_reduce.cuh>
+
+#include "../../../reduce/cuda/reduce.cuh"
+
+#include "../cuda/kernel.cuh"
+
+// NVIDIA launch entry point for causal softmax.
+// A pure forwarder: the output/input pointers, the three extents and the four
+// strides are passed straight through to causalSoftmaxKernel.
+template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
+INFINIOP_CUDA_KERNEL causalSoftmax(
+    Tdata *y, const Tdata *x,
+    size_t batch, size_t height, size_t width,
+    ptrdiff_t y_stride_b, ptrdiff_t y_stride_h,
+    ptrdiff_t x_stride_b, ptrdiff_t x_stride_h) {
+    causalSoftmaxKernel<BLOCK_SIZE, Tdata, Tcompute>(
+        y, x,
+        batch, height, width,
+        y_stride_b, y_stride_h,
+        x_stride_b, x_stride_h);
+}
+
+namespace op::causal_softmax::nvidia {

 struct Descriptor::Opaque {
    std::shared_ptr<device::cuda::Handle::Internal> internal;
@@ -79,4 +94,4 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
    return INFINI_STATUS_SUCCESS;
 }

-} // namespace op::causal_softmax::cuda
+} // namespace op::causal_softmax::nvidia
--- a/src/infiniop/ops/causal_softmax/cuda/causal_softmax_cuda.cuh
+++ b/src/infiniop/ops/causal_softmax/cuda/causal_softmax_cuda.cuh
-#ifndef __CAUSAL_SOFTMAX_CUDA_H__
-#define __CAUSAL_SOFTMAX_CUDA_H__
+#ifndef __CAUSAL_SOFTMAX_NVIDIA_H__
+#define __CAUSAL_SOFTMAX_NVIDIA_H__

 #include "../causal_softmax.h"

-DESCRIPTOR(cuda)
+DESCRIPTOR(nvidia)

 #endif
--- a/src/infiniop/ops/causal_softmax/operator.cc
+++ b/src/infiniop/ops/causal_softmax/operator.cc
@@ -6,10 +6,10 @@
 #include "cpu/causal_softmax_cpu.h"
 #endif
 #ifdef ENABLE_NVIDIA_API
-#include "cuda/causal_softmax_cuda.cuh"
+#include "nvidia/causal_softmax_nvidia.cuh"
 #endif
 #ifdef ENABLE_METAX_API
-#include "maca/causal_softmax_maca.h"
+#include "metax/causal_softmax_metax.h"
 #endif
 #ifdef ENABLE_ASCEND_API
 #include "ascend/causal_softmax_ascend.h"
@@ -34,10 +34,13 @@ __C infiniStatus_t infiniopCreateCausalSoftmaxDescriptor(
        CREATE(INFINI_DEVICE_CPU, cpu)
 #endif
 #ifdef ENABLE_NVIDIA_API
-        CREATE(INFINI_DEVICE_NVIDIA, cuda)
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia)
 #endif
 #ifdef ENABLE_METAX_API
-        CREATE(INFINI_DEVICE_METAX, maca)
+        CREATE(INFINI_DEVICE_METAX, metax)
+#endif
+#ifdef ENABLE_ASCEND_API
+        CREATE(INFINI_DEVICE_ASCEND, ascend)
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
    case DevCambriconMlu: {
@@ -45,14 +48,6 @@ __C infiniStatus_t infiniopCreateCausalSoftmaxDescriptor(
        // return cnnlCreateCausalSoftmaxDescriptor((BangHandle_t) handle, (CausalSoftmaxCnnlDescriptor_t *) desc_ptr, y_desc);
    }
 #endif
-#ifdef ENABLE_ASCEND_API
-        CREATE(INFINI_DEVICE_ASCEND, ascend)
-#endif
-#ifdef ENABLE_METAX_GPU
-    case DevMetaxGpu: {
-        return macaCreateCausalSoftmaxDescriptor((MacaHandle_t)handle, (CausalSoftmaxMacaDescriptor_t *)desc_ptr, y_desc);
-    }
-#endif
 #ifdef ENABLE_MTHREADS_GPU
    case DevMthreadsGpu: {
        return musaCreateCausalSoftmaxDescriptor((MusaHandle_t)handle, (CausalSoftmaxMusaDescriptor_t *)desc_ptr, y_desc);
@@ -74,7 +69,13 @@ __C infiniStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDe
        GET(INFINI_DEVICE_CPU, cpu)
 #endif
 #ifdef ENABLE_NVIDIA_API
-        GET(INFINI_DEVICE_NVIDIA, cuda)
+        GET(INFINI_DEVICE_NVIDIA, nvidia)
+#endif
+#ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax)
+#endif
+#ifdef ENABLE_ASCEND_API
+        GET(INFINI_DEVICE_ASCEND, ascend)
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
    case DevCambriconMlu: {
@@ -83,17 +84,6 @@ __C infiniStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDe
    }

 #endif
-#ifdef ENABLE_ASCEND_API
-        GET(INFINI_DEVICE_ASCEND, ascend)
-#endif
-#ifdef ENABLE_METAX_API
-        GET(INFINI_DEVICE_METAX, maca)
-#endif
-#ifdef ENABLE_METAX_GPU
-    case DevMetaxGpu: {
-        return macaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxMacaDescriptor_t)desc, size);
-    }
-#endif
 #ifdef ENABLE_MTHREADS_GPU
    case DevMthreadsGpu: {
        return musaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxMusaDescriptor_t)desc, size);
@@ -120,10 +110,13 @@ __C infiniStatus_t infiniopCausalSoftmax(
        CALCULATE(INFINI_DEVICE_CPU, cpu)
 #endif
 #ifdef ENABLE_NVIDIA_API
-        CALCULATE(INFINI_DEVICE_NVIDIA, cuda)
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia)
 #endif
 #ifdef ENABLE_METAX_API
-        CALCULATE(INFINI_DEVICE_METAX, maca)
+        CALCULATE(INFINI_DEVICE_METAX, metax)
+#endif
+#ifdef ENABLE_ASCEND_API
+        CALCULATE(INFINI_DEVICE_ASCEND, ascend)
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
    case DevCambriconMlu: {
@@ -131,14 +124,6 @@ __C infiniStatus_t infiniopCausalSoftmax(
        // return cnnlCausalSoftmax((CausalSoftmaxCnnlDescriptor_t) desc, workspace, workspace_size, data, stream);
    }
 #endif
-#ifdef ENABLE_ASCEND_API
-        CALCULATE(INFINI_DEVICE_ASCEND, ascend)
-#endif
-#ifdef ENABLE_METAX_GPU
-    case DevMetaxGpu: {
-        return macaCausalSoftmax((CausalSoftmaxMacaDescriptor_t)desc, workspace, workspace_size, data, stream);
-    }
-#endif
 #ifdef ENABLE_MTHREADS_GPU
    case DevMthreadsGpu: {
        return musaCausalSoftmax((CausalSoftmaxMusaDescriptor_t)desc, workspace, workspace_size, data, stream);
@@ -160,10 +145,13 @@ __C infiniStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxD
        DESTROY(INFINI_DEVICE_CPU, cpu)
 #endif
 #ifdef ENABLE_NVIDIA_API
-        DESTROY(INFINI_DEVICE_NVIDIA, cuda)
+        DESTROY(INFINI_DEVICE_NVIDIA, nvidia)
 #endif
 #ifdef ENABLE_METAX_API
-        DESTROY(INFINI_DEVICE_METAX, maca)
+        DESTROY(INFINI_DEVICE_METAX, metax)
+#endif
+#ifdef ENABLE_ASCEND_API
+        DESTROY(INFINI_DEVICE_ASCEND, ascend)
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
    case DevCambriconMlu: {
@@ -171,14 +159,6 @@ __C infiniStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxD
        // return cnnlDestroyCausalSoftmaxDescriptor((CausalSoftmaxCnnlDescriptor_t) desc);
    }
 #endif
-#ifdef ENABLE_ASCEND_API
-        DESTROY(INFINI_DEVICE_ASCEND, ascend)
-#endif
-#ifdef ENABLE_METAX_GPU
-    case DevMetaxGpu: {
-        return macaDestroyCausalSoftmaxDescriptor((CausalSoftmaxMacaDescriptor_t)desc);
-    }
-#endif
 #ifdef ENABLE_MTHREADS_GPU
    case DevMthreadsGpu:
        return musaDestroyCausalSoftmaxDescriptor((CausalSoftmaxMusaDescriptor_t)desc);

--- a/src/infiniop/ops/rms_norm/cuda/rms_norm_kernel.cuh
+++ b/src/infiniop/ops/rms_norm/cuda/rms_norm_kernel.cuh
@@ -2,6 +2,9 @@
 #define __RMS_NORM_CUDA_KERNEL_H__

 #include "../../../devices/cuda/cuda_kernel_common.cuh"
+
+#include <cub/block/block_reduce.cuh>
+
 #include "../../../reduce/cuda/reduce.cuh"

 template <unsigned int BLOCK_SIZE, typename Tdata, typename Tweight, typename Tcompute>

--- a/src/infiniop/ops/rms_norm/maca/rms_norm_maca.cuh
+++ b/src/infiniop/ops/rms_norm/maca/rms_norm_maca.cuh
--- a/src/infiniop/ops/rms_norm/maca/rms_norm_maca.maca
+++ b/src/infiniop/ops/rms_norm/maca/rms_norm_maca.maca
 #include "../../../devices/maca/common_maca.h"
 #include "../cuda/rms_norm_kernel.cuh"
-#include "rms_norm_maca.cuh"
+#include "rms_norm_metax.cuh"

 namespace op::rms_norm::maca {


--- a/src/infiniop/ops/rms_norm/operator.cc
+++ b/src/infiniop/ops/rms_norm/operator.cc
@@ -12,7 +12,7 @@
 #include "ascend/rms_norm_aclnn.h"
 #endif
 #ifdef ENABLE_METAX_API
-#include "maca/rms_norm_maca.cuh"
+#include "metax/rms_norm_metax.cuh"
 #endif
 #ifdef ENABLE_MOORE_API
 #include "musa/rms_norm_musa.cuh"

--- a/src/infiniop/reduce/cuda/reduce.cuh
+++ b/src/infiniop/reduce/cuda/reduce.cuh
 #ifndef __INFINIOP_REDUCE_CUDA_H__
 #define __INFINIOP_REDUCE_CUDA_H__

-#include <cub/block/block_reduce.cuh>
-
 /*
 * Device functions for reduction operations on CUDA.
 *

--- a/src/infiniop/reduce/maca/reduce.h
+++ b/src/infiniop/reduce/maca/reduce.h
-#ifndef __INFINIOP_REDUCE_MACA_H__
-#define __INFINIOP_REDUCE_MACA_H__
-
-#include <hccub/block/block_reduce.cuh>
-
-/*
- * Device functions for reduction operations on MACA.
- *
- * Note: Only the local result on thread 0 is guaranteed to be correct.
- *       A manual broadcast is needed for other threads.
- */
-namespace op::common_maca::reduce_op { // block-level reduction helpers built on cub::BlockReduce
-
-// Sum(x^2) on contiguous data of length count
-template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
-__device__ __forceinline__ Tcompute sumSquared(const Tdata *data_ptr, size_t count) {
-    Tcompute ss = 0; // per-thread partial, accumulated in Tcompute
-
-    // Each thread computes its partial sum
-    for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) { // thread t visits t, t + BLOCK_SIZE, ...
-        ss += Tcompute(data_ptr[i]) * Tcompute(data_ptr[i]); // each factor is converted to Tcompute before multiplying
-    }
-
-    // Use CUB block-level reduction
-    using BlockReduce = cub::BlockReduce<Tcompute, BLOCK_SIZE>;
-    __shared__ typename BlockReduce::TempStorage temp_storage;
-
-    return BlockReduce(temp_storage).Sum(ss); // per the header note, only thread 0's return value is guaranteed
-}
-
-// Sum(x) on contiguous data of length count
-template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
-__device__ __forceinline__ Tcompute sum(const Tdata *data_ptr, size_t count) {
-    Tcompute s = 0; // per-thread partial, accumulated in Tcompute
-
-    for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) { // same BLOCK_SIZE-strided walk as sumSquared
-        s += Tcompute(data_ptr[i]);
-    }
-
-    using BlockReduce = cub::BlockReduce<Tcompute, BLOCK_SIZE>;
-    __shared__ typename BlockReduce::TempStorage temp_storage;
-
-    return BlockReduce(temp_storage).Sum(s); // only thread 0's return value is guaranteed
-}
-
-// Max(x) on contiguous data of length count
-template <unsigned int BLOCK_SIZE, typename Tdata>
-__device__ __forceinline__ Tdata max(const Tdata *data_ptr, size_t count) {
-    Tdata max_ = data_ptr[0]; // seeded from element 0, which is read even if count == 0; callers presumably pass count >= 1 — TODO confirm
-
-    for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) { // threads with threadIdx.x >= count keep the seed value
-        max_ = cub::Max()(max_, data_ptr[i]);
-    }
-
-    using BlockReduce = cub::BlockReduce<Tdata, BLOCK_SIZE>;
-    __shared__ typename BlockReduce::TempStorage temp_storage;
-
-    return BlockReduce(temp_storage).Reduce(max_, cub::Max(), BLOCK_SIZE); // reduced in Tdata itself; the trailing BLOCK_SIZE is presumably CUB's num_valid count — confirm
-}
-
-} // namespace op::common_maca::reduce_op
-
-#endif
--- a/xmake.lua
+++ b/xmake.lua
@@ -108,7 +108,7 @@ option_end()

 if has_config("metax-gpu") then
    add_defines("ENABLE_METAX_API")
-    includes("xmake/maca.lua")
+    includes("xmake/metax.lua")
 end

 -- 摩尔线程

--- a/xmake/cuda.lua
+++ b/xmake/cuda.lua
@@ -46,7 +46,7 @@ target("infiniop-cuda")
    add_cuflags("-Xcompiler=-Wno-error=deprecated-declarations")

    set_languages("cxx17")
-    add_files("../src/infiniop/devices/cuda/*.cu", "../src/infiniop/ops/*/cuda/*.cu", "../build/ninetoothed/*.c")
+    add_files("../src/infiniop/devices/cuda/*.cu", "../src/infiniop/ops/*/cuda/*.cu", "../src/infiniop/ops/*/nvidia/*.cu", "../build/ninetoothed/*.c")
 target_end()

 target("infinirt-cuda")

--- a/xmake/maca.lua
+++ b/xmake/maca.lua
@@ -35,7 +35,7 @@ target("infiniop-metax")
    set_warnings("all", "error")
    add_cxflags("-lstdc++", "-fPIC", "-Wno-defaulted-function-deleted", "-Wno-strict-aliasing")
    add_files("../src/infiniop/devices/maca/*.cc", "../src/infiniop/ops/*/maca/*.cc")
-    add_files("../src/infiniop/ops/*/maca/*.maca", {rule = "maca"})
+    add_files("../src/infiniop/ops/*/maca/*.maca", "../src/infiniop/ops/*/metax/*.maca", {rule = "maca"})
 target_end()

 target("infinirt-metax")