Commit 46a15a1a authored by zhangyue

Support float16 and bfloat16 data types for elementwise ops

parent 3a7633ba
@@ -4,6 +4,7 @@
// This header file will only be included by .xpu files
#include "xpu/runtime.h"
#include <xpu/kernel/xtdk.h>
#include <xpu/kernel/xtdk_bf16.h>
#include <xpu/kernel/xtdk_math.h>
#include <xpu/kernel/xtdk_simd.h>
...
@@ -10,6 +10,10 @@ private:
    inline __device__ T sigmoid(T x) const {
        return 1.0f / (1.0f + exp(-x));
    }
// float version of sigmoid
inline __device__ float sigmoidf(float x) const {
return 1.0f / (1.0f + exp(-x));
}
public:
    // This static number must be set in other Ops
@@ -21,9 +25,20 @@ public:
        T out = gate * sigmoid(gate) * up;
        return out;
    }
    // bfloat16 specialization (uses float compute precision)
inline __device__ bfloat16_t operator()(const bfloat16_t *inputs) const {
float up_f = __bfloat162float(inputs[0]);
float gate_f = __bfloat162float(inputs[1]);
float out_f = gate_f * sigmoidf(gate_f) * up_f;
return __float2bfloat16(out_f);
}
} SwiGLUOp;
// __global__ template function instantiation
INSTANTIATE_ELEMENTWISE_KERNEL(SwiGLUOp::num_inputs, SwiGLUOp, float);
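// half resolves to the templated operator(); bfloat16_t picks the float-precision specialization above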
INSTANTIATE_ELEMENTWISE_KERNEL(SwiGLUOp::num_inputs, SwiGLUOp, half);
INSTANTIATE_ELEMENTWISE_KERNEL(SwiGLUOp::num_inputs, SwiGLUOp, bfloat16_t);
} // namespace op::elementwise::kunlun

namespace op::swiglu::kunlun {
@@ -45,7 +60,7 @@ infiniStatus_t Descriptor::create(
    const auto &up_shape = up_desc->shape();
    const auto &gate_shape = gate_desc->shape();
-    CHECK_DTYPE(dtype, INFINI_DTYPE_F32);
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F32, INFINI_DTYPE_F16, INFINI_DTYPE_BF16);
    CHECK_SAME_SHAPE(out_shape, up_shape, gate_shape);
    // create KUNLUN elementwise descriptor
@@ -68,6 +83,10 @@ infiniStatus_t Descriptor::calculate(
    switch (_dtype) {
    case INFINI_DTYPE_F32:
        return _device_info->calculate<8, op::elementwise::kunlun::SwiGLUOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F16:
return _device_info->calculate<8, op::elementwise::kunlun::SwiGLUOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<8, op::elementwise::kunlun::SwiGLUOp, bfloat16_t>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
...
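A note on the numeric pattern in the bfloat16 specialization: bfloat16 carries only 8 mantissa bits, so the kernel widens both operands to float, evaluates sigmoid and the products at float precision, and narrows the result once at the end. The host-side sketch below illustrates that round-trip; the helper names and the truncating bit-shift conversions are stand-ins for the xtdk device intrinsics __bfloat162float/__float2bfloat16, not the actual implementation.

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Truncating float -> bfloat16: keep the top 16 bits of the float32 pattern
// (round-to-nearest-even omitted for brevity).
static std::uint16_t float2bf16(float f) {
    std::uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    return static_cast<std::uint16_t>(bits >> 16);
}

// bfloat16 -> float: widen by zero-filling the low 16 mantissa bits.
static float bf162float(std::uint16_t h) {
    std::uint32_t bits = static_cast<std::uint32_t>(h) << 16;
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

int main() {
    std::uint16_t up = float2bf16(0.5f);
    std::uint16_t gate = float2bf16(1.25f);
    // Same pattern as the kernel: widen, compute SwiGLU in float, narrow once.
    float up_f = bf162float(up);
    float gate_f = bf162float(gate);
    float out_f = gate_f * (1.0f / (1.0f + std::exp(-gate_f))) * up_f;
    std::printf("swiglu bf16 result: %f\n", bf162float(float2bf16(out_f)));
    return 0;
}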