支持p800上编译手写算子，重构elementwise 算子组件

f7e7c7ba · zhangyue · 7d3ca92d · f7e7c7ba · f7e7c7ba · f7e7c7ba
Commit f7e7c7ba authored Aug 06, 2025 by zhangyue
16 changed files
--- a/src/infiniop/devices/kunlun/kunlun_common.cc
+++ b/src/infiniop/devices/kunlun/kunlun_common.cc
+#include "kunlun_common.h"
+#include "../../../utils.h"
+#include <functional>
+
+namespace device::kunlun {
+
+/**
+ * @brief Runs `f` with an xdnn context bound to `stream`.
+ *
+ * A context is taken from the pool, or created when the pool has none, and is
+ * handed back to the pool once `f` has returned.
+ *
+ * @param stream Stream set on the context before `f` is invoked.
+ * @param f      Callback that receives the context.
+ * @return INFINI_STATUS_SUCCESS when `f` succeeds; otherwise the failure is
+ *         reported through CHECK_STATUS.
+ */
+infiniStatus_t Handle::Internal::useXdnn(kunlunStream_t stream, const Fn<xdnnHandle_t> &f) const {
+    auto handle = dnn_handles.pop();
+    if (!handle) {
+        // Assign to the holder itself: writing through `*handle` while it is
+        // empty would dereference a holder that has no value.
+        handle = xdnn::create_context();
+    }
+    (*handle)->set_stream(stream);
+    // Hand the context back to the pool before checking the status so that a
+    // failing callback does not leak it.
+    infiniStatus_t status = f(*handle);
+    dnn_handles.push(std::move(*handle));
+    CHECK_STATUS(status);
+    return INFINI_STATUS_SUCCESS;
+}
+
+// Allocates a Handle for `device_id` with `new` and stores it in `*handle_ptr`.
+// Always returns INFINI_STATUS_SUCCESS; nothing in this function releases the
+// allocated object.
+infiniStatus_t Handle::create(InfiniopHandle **handle_ptr, int device_id) {
+    *handle_ptr = new Handle(device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace device::kunlun
\ No newline at end of file
--- a/src/infiniop/devices/kunlun/kunlun_common.h
+++ b/src/infiniop/devices/kunlun/kunlun_common.h
+#include "../pool.h"
+#include "kunlun_handle.h"
+#include <xpu/runtime.h>
+#include <xpu/runtime_ex.h>
+#include <xpu/xdnn.h>
+
+namespace xdnn = baidu::xpu::api;
+
+typedef XPUStream kunlunStream_t;
+typedef XPUEvent kunlunEvent_t;
+typedef xdnn::Context *xdnnHandle_t;
+
+#define CHECK_KUNLUN(API) CHECK_INTERNAL(API, XPU_SUCCESS)
+
+namespace device::kunlun {
+
+// Internal state of the kunlun Handle: a pool of xdnn contexts plus the
+// accessor that lends one context to a callback.
+class Handle::Internal {
+    // Pool of xdnn contexts that useXdnn pops from and pushes back to.
+    Pool<xdnnHandle_t> dnn_handles;
+    // Callback type: receives a value of type T and returns an infiniStatus_t.
+    template <typename T>
+    using Fn = std::function<infiniStatus_t(T)>;
+
+public:
+    // Invokes `f` with an xdnn context whose stream has been set to `stream`.
+    infiniStatus_t useXdnn(kunlunStream_t stream, const Fn<xdnnHandle_t> &f) const;
+};
+
+} // namespace device::kunlun
--- a/src/infiniop/devices/kunlun/kunlun_handle.cc
+++ b/src/infiniop/devices/kunlun/kunlun_handle.cc
-#include "kunlun_handle.h"
+#include "kunlun_common.h"

 namespace device::kunlun {

@@ -10,20 +10,4 @@ auto Handle::internal() const -> const std::shared_ptr<Internal> & {
    return _internal;
 }

-infiniStatus_t Handle::Internal::useXdnn(kunlunStream_t stream, const Fn<xdnnHandle_t> &f) const {
-    auto handle = dnn_handles.pop();
-    if (!handle) {
-        *handle = xdnn::create_context();
-    }
-    (*handle)->set_stream(stream);
-    CHECK_STATUS(f(*handle));
-    dnn_handles.push(std::move(*handle));
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Handle::create(InfiniopHandle **handle_ptr, int device_id) {
-    *handle_ptr = new Handle(device_id);
-    return INFINI_STATUS_SUCCESS;
-}
-
 } // namespace device::kunlun
--- a/src/infiniop/devices/kunlun/kunlun_handle.h
+++ b/src/infiniop/devices/kunlun/kunlun_handle.h
 #ifndef __INFINIOP_KUNLUN_HANDLE_H__
 #define __INFINIOP_KUNLUN_HANDLE_H__

-#include "../../../utils.h"
 #include "../../handle.h"
-#include "../pool.h"
-#include <functional>
 #include <memory>
-#include <xpu/runtime.h>
-#include <xpu/runtime_ex.h>
-#include <xpu/xdnn.h>
-
-namespace xdnn = baidu::xpu::api;
-
-typedef XPUStream kunlunStream_t;
-typedef XPUEvent kunlunEvent_t;
-typedef xdnn::Context *xdnnHandle_t;
-
-#define CHECK_KUNLUN(API) CHECK_INTERNAL(API, XPU_SUCCESS)

 namespace device::kunlun {

@@ -33,15 +19,6 @@ public:
    static infiniStatus_t create(InfiniopHandle **handle_ptr, int device_id);
 };

-class Handle::Internal {
-    Pool<xdnnHandle_t> dnn_handles;
-    template <typename T>
-    using Fn = std::function<infiniStatus_t(T)>;
-
-public:
-    infiniStatus_t useXdnn(kunlunStream_t stream, const Fn<xdnnHandle_t> &f) const;
-};
-
 } // namespace device::kunlun

 #endif // __INFINIOP_KUNLUN_HANDLE_H__
--- a/src/infiniop/devices/kunlun/kunlun_kernel_common.h
+++ b/src/infiniop/devices/kunlun/kunlun_kernel_common.h
@@ -2,13 +2,25 @@
 #define __INFINIOP_KUNLUN_KERNEL_COMMON_H__

 // This header file will only be include by .xpu file
-#include "kunlun_kernel_dtype.h"
-#include "xpu/kernel/xtdk.h"
-#include "xpu/kernel/xtdk_math.h"
-#include "xpu/kernel/xtdk_simd.h"
 #include "xpu/runtime.h"
+#include <xpu/kernel/xtdk.h>
+#include <xpu/kernel/xtdk_io.h>
+#include <xpu/kernel/xtdk_math.h>
+#include <xpu/kernel/xtdk_simd.h>

 namespace device::kunlun::kernel {
+
+// Element type of the stride arrays read by device code.
+// NOTE(review): the padding member presumably keeps each element as wide as
+// the corresponding host-side element when the array is copied from the
+// host - confirm.
+typedef struct _ptrdiff_t {
+    ptrdiff_t value;   // 32 bit
+    ptrdiff_t padding; // 32 bit
+} _ptrdiff_t;
+
+// Element type of the shape arrays read by device code. Laid out like
+// _ptrdiff_t: a value member followed by a padding member.
+typedef struct _size_t {
+    size_t value;
+    size_t padding;
+} _size_t;
+
 // Get mask for kunlun xpu 512bit register calculation
 // if data is not enough to 512bit, padding zero and use
 // mask to identify real data
@@ -28,37 +40,50 @@ inline __device__ void atomicAddF32(__shared_ptr__ float *ptr, float value) {
    }
 }

-inline __device__ size_t indexToReducedOffset(
-    size_t flat_index,
-    size_t ndim,
-    const _ptrdiff_t *broadcasted_strides,
-    const _ptrdiff_t *target_strides) {
+/**
+ * @brief Maps a flat output index to the element offset of a broadcasted input.
+ *
+ * For each dimension, from 0 to ndim - 1, the quotient of the remaining index
+ * by broadcasted_strides[i] is multiplied by target_strides[i] and added to
+ * the result; the remainder is carried into the next dimension.
+ *
+ * @param flat_index          Flattened index into the output tensor.
+ * @param ndim                Number of dimensions of the output tensor.
+ * @param broadcasted_strides Strides of the output tensor (ndim entries).
+ * @param target_strides      Strides of the input tensor (ndim entries).
+ * @return Element offset into the input tensor.
+ */
+inline __device__ int indexToReducedOffset(
+    int flat_index,                        // output flatten index
+    int ndim,                              // output dims
+    const _ptrdiff_t *broadcasted_strides, // output strides
+    const _ptrdiff_t *target_strides) {    // strides of inputs

-    size_t res = 0;
-    for (size_t i = 0; i < ndim; ++i) {
+    int res = 0;
+    for (int i = 0; i < ndim; ++i) {
        res += flat_index / broadcasted_strides[i].value * target_strides[i].value;
        flat_index %= broadcasted_strides[i].value;
-        mfence();
    }
    return res;
 }

-inline __device__ size_t indexToOffset(
-    size_t flat_index,
-    size_t ndim,
+/**
+ * @brief Maps a flat index to the element offset of a strided tensor.
+ *
+ * Dimensions are visited from the last to the first: the index modulo
+ * shape[i] is multiplied by strides[i] and added to the result, then the
+ * index is divided by shape[i].
+ *
+ * @param flat_index Flattened index into the tensor.
+ * @param ndim       Number of dimensions of the tensor.
+ * @param shape      Shape of the tensor (ndim entries).
+ * @param strides    Strides of the tensor (ndim entries).
+ * @return Element offset into the tensor.
+ */
+inline __device__ int indexToOffset(
+    int flat_index,
+    int ndim,
    const _size_t *shape,
    const _ptrdiff_t *strides) {

-    size_t res = 0;
-    for (size_t i = ndim; i-- > 0;) {
+    int res = 0;
+    for (int i = ndim; i-- > 0;) {
        res += (flat_index % shape[i].value) * strides[i].value;
        flat_index /= shape[i].value;
-        mfence();
    }
    return res;
 }

 } // namespace device::kunlun::kernel
+
+#endif // __INFINIOP_KUNLUN_KERNEL_COMMON_H__
 // TODO: atomicAddF16
 // TODO: atomicAddI8
-#endif
--- a/src/infiniop/devices/kunlun/kunlun_kernel_dtype.h
+++ b/src/infiniop/devices/kunlun/kunlun_kernel_dtype.h
-#ifndef __INFINIOP_KUNLUN_DTYPE_H__
-#define __INFINIOP_KUNLUN_DTYPE_H__
-
-#include "xpu/kernel/xtdk.h"
-#include "xpu/kernel/xtdk_math.h"
-#include "xpu/kernel/xtdk_simd.h"
-#include "xpu/runtime.h"
-
-// kunlun ptrdiff_t* is used to save ptrdiff_t array
-// copied from host
-typedef struct _ptrdiff_t {
-    long value;   // 32 bit
-    long padding; // 32 bit
-} _ptrdiff_t;
-
-// same as ptrdiff
-typedef struct _size_t {
-    size_t value;
-    size_t padding;
-} _size_t;
-
-#endif
--- a/src/infiniop/elementwise/kunlun/elementwise_kunlun.h
+++ b/src/infiniop/elementwise/kunlun/elementwise_kunlun.h
-#ifndef __INFINIOP_ELEMENTWISE_KUNLUN_H__
-#define __INFINIOP_ELEMENTWISE_KUNLUN_H__
+#ifndef __INFINIOP_ELEMENTWISE_KUNLUN_XPU__
+#define __INFINIOP_ELEMENTWISE_KUNLUN_XPU__

 #include "../../../utils.h"
-#include "../../devices/kunlun/kunlun_handle.h"
+#include "../../devices/kunlun/kunlun_common.h"
+#include "../../devices/kunlun/kunlun_kernel_common.h"
 #include "elementwise_kunlun_api.h"

 namespace op::elementwise::kunlun {

+using namespace device::kunlun::kernel;
+
+// Reinterprets an untyped input pointer as a pointer to const T.
+template <typename T>
+__device__ const T *typedInputPtr(const void *ptr) {
+    return reinterpret_cast<const T *>(ptr);
+}
+
+/**
+ * @brief Computes the element offset of one input for a given flat output index.
+ *
+ * Calling the indexer with an input id returns:
+ *  - idx unchanged when that input is flagged contiguous;
+ *  - indexToReducedOffset(...) when that input is flagged broadcasted;
+ *  - indexToOffset(...) otherwise.
+ * Per-input shapes and strides are read at offset input_id * ndim.
+ */
+struct InputIndexer {
+    int idx;                          // flat output index
+    int ndim;                         // number of dimensions
+    const bool *input_contiguous;     // one flag per input
+    const bool *input_broadcasted;    // one flag per input
+    const _size_t *input_shapes;      // indexed as input_id * ndim + dim
+    const _ptrdiff_t *input_strides;  // indexed as input_id * ndim + dim
+    const _ptrdiff_t *output_strides; // ndim entries
+
+    inline __device__ int operator()(int input_id) const {
+        return input_contiguous[input_id]
+                 ? idx
+                 : (input_broadcasted[input_id]
+                        ? indexToReducedOffset(idx, ndim, output_strides, input_strides + input_id * ndim)
+                        : indexToOffset(idx, ndim, input_shapes + input_id * ndim, input_strides + input_id * ndim));
+    }
+};
+
+/**
+ * @brief Computes the output index in memory, accounting for strides if non-contiguous.
+ *
+ * Returns idx unchanged for a contiguous output; otherwise delegates to
+ * indexToOffset.
+ *
+ * @param idx            Linear index.
+ * @param is_contiguous  Whether the output tensor is contiguous.
+ * @param ndim           Number of dimensions.
+ * @param shape          Shape of the output tensor.
+ * @param strides        Strides of the output tensor.
+ * @return               Memory offset index.
+ */
+inline __device__ int
+getOutputIndex(int idx,
+               bool is_contiguous,
+               int ndim,
+               const _size_t *shape,
+               const _ptrdiff_t *strides) {
+    return is_contiguous ? idx : indexToOffset(idx, ndim, shape, strides);
+}
+
+/**
+ * @brief Applies Op to one element: gathers the N operands, computes, stores the result.
+ *
+ * @tparam N     Number of inputs; must equal Op::num_inputs.
+ * @tparam Op    Functor invoked as Op{}(inputs_buf, args...).
+ * @tparam Tdata Element type of the inputs and the output.
+ *
+ * @param typed_inputs  Array of N input base pointers.
+ * @param output        Output base pointer.
+ * @param inputs_buf    Buffer with room for N operands.
+ * @param input_indexes Element offset of the operand within each input (N entries).
+ * @param output_index  Element offset of the result within the output.
+ * @param args          Extra arguments forwarded to Op.
+ */
+template <int N, typename Op, typename Tdata, typename... Args>
+__device__ void launchOp(
+    __global_ptr__ Tdata **typed_inputs, // gm pointer
+    __global_ptr__ Tdata *output,        // gm pointer output
+    Tdata *inputs_buf,                   // local mem buffer
+    int *input_indexes,
+    int output_index,
+    Args... args) {
+
+    static_assert(N == Op::num_inputs, "template N is not equal to Op::num_inputs!\n");
+
+#pragma unroll
+    // Copy inputs to buf
+    for (int i = 0; i < N; i++) {
+        auto gm = typed_inputs[i] + input_indexes[i];
+        auto lm = inputs_buf + i;
+        GM2LM_ASYNC(gm, lm, 1 * sizeof(Tdata));
+    }
+    // Fence after the async copies, before the operands are read below.
+    mfence();
+
+    // Calculate elementwise
+    // Inputs save all operands
+    Tdata out = Op{}(inputs_buf, args...);
+
+    // Copy out to gm
+    LM2GM_ASYNC(&out, output + output_index, 1 * sizeof(Tdata));
+    mfence();
+}
+
+/**
+ * @brief Generic elementwise kernel: applies Op to every output element.
+ *
+ * Flags, shapes, strides and the input pointer array are first copied into
+ * __local__ buffers. Each core then walks the flat output index range in
+ * chunks of len_per_loop elements, advancing by nthreads * len_per_loop per
+ * iteration, and calls launchOp once per element.
+ *
+ * @tparam N     Number of inputs.
+ * @tparam Op    Elementwise functor passed on to launchOp.
+ * @tparam Tdata Element type of the inputs and the output.
+ *
+ * @param output_size          Number of output elements.
+ * @param ndim                 Number of dimensions.
+ * @param output_contiguous    Whether the output is contiguous.
+ * @param input_contiguous_gm  Per-input contiguous flags (N entries).
+ * @param input_broadcasted_gm Per-input broadcasted flags (N entries).
+ * @param output_shape_gm      Output shape, read as ndim _size_t entries.
+ * @param input_shapes_gm      Input shapes, read as N * ndim _size_t entries.
+ * @param output_strides_gm    Output strides, read as ndim _ptrdiff_t entries.
+ * @param input_strides_gm     Input strides, read as N * ndim _ptrdiff_t entries.
+ * @param output               Output base pointer.
+ * @param inputs               Array of N input base pointers.
+ * @param args                 Extra arguments forwarded to Op.
+ */
+template <int N, typename Op, typename Tdata, typename... Args>
+__global__ void elementwiseKernel(
+    int output_size,
+    int ndim,
+    bool output_contiguous,
+    const bool *input_contiguous_gm,
+    const bool *input_broadcasted_gm,
+    const void *output_shape_gm,
+    const void *input_shapes_gm,
+    const void *output_strides_gm,
+    const void *input_strides_gm,
+    Tdata *output,
+    const void *const *inputs,
+    Args... args) {
+
+    int cid = core_id();
+    int ncores = core_num();
+    if (cid >= ncores) {
+        return;
+    }
+    // Global index of this core across all clusters, and the total core count.
+    int thread_id = ncores * cluster_id() + cid;
+    int nthreads = ncores * cluster_num();
+
+    // Cast input gm pointer type
+    auto typed_inputs = reinterpret_cast<const __global_ptr__ Tdata *const __global_ptr__ *>(inputs);
+
+    // Upper bound on the number of elements one core handles per outer-loop step.
+    const int BUFF_SIZE = 64;
+    // Input data cache
+    __local__ Tdata inputs_buf[N];
+    // Input contiguous/broadcasted flags
+    __local__ bool input_contiguous[N];
+    __local__ bool input_broadcasted[N];
+    // Input shape/strides
+    __local__ _size_t input_shapes[N * ndim];
+    __local__ _ptrdiff_t input_strides[N * ndim];
+    // Output shape/strides
+    __local__ _size_t output_shape[ndim];
+    __local__ _ptrdiff_t output_strides[ndim];
+    // Inputs gm ptr buf
+    __local__ __global_ptr__ Tdata *typed_inputs_ptr[N];
+
+    // Load from gm
+    GM2LM_ASYNC(input_contiguous_gm, input_contiguous, N * sizeof(bool));
+    GM2LM_ASYNC(input_broadcasted_gm, input_broadcasted, N * sizeof(bool));
+    GM2LM_ASYNC(input_shapes_gm, input_shapes, N * ndim * sizeof(_size_t));
+    GM2LM_ASYNC(input_strides_gm, input_strides, N * ndim * sizeof(_ptrdiff_t));
+    GM2LM_ASYNC(output_shape_gm, output_shape, ndim * sizeof(_size_t));
+    GM2LM_ASYNC(output_strides_gm, output_strides, ndim * sizeof(_ptrdiff_t));
+    GM2LM_ASYNC(typed_inputs, typed_inputs_ptr, N * sizeof(__global_ptr__ Tdata *));
+    // Fence after the async loads, before the local buffers are read below.
+    mfence();
+
+    int len_per_loop = min(BUFF_SIZE, roundup_div(output_size, nthreads));
+
+    for (int start = thread_id * len_per_loop; start < output_size; start += nthreads * len_per_loop) {
+        // The last chunk may be shorter than len_per_loop.
+        int read_len = min(len_per_loop, output_size - start);
+        for (int idx = start; idx < start + read_len; ++idx) {
+            int out_idx = getOutputIndex(idx, output_contiguous,
+                                         ndim, output_shape, output_strides);
+            InputIndexer indexer{idx, ndim, input_contiguous, input_broadcasted,
+                                 input_shapes, input_strides, output_strides};
+            // Get index offset for every operand
+            int indexes[N];
+            for (int i = 0; i < N; i++) {
+                indexes[i] = indexer(i);
+            }
+            // Launch operator
+            launchOp<N, Op, Tdata>(&typed_inputs_ptr[0], output, inputs_buf, indexes, out_idx, args...);
+        }
+    }
+    sync_cluster();
+}
+
 struct DeviceImpl::Opaque {
    std::shared_ptr<device::kunlun::Handle::Internal> internal;

    Opaque(const std::shared_ptr<device::kunlun::Handle::Internal> &internal_)
        : internal(internal_) {}

-    template <size_t N, typename Op, typename Tdata, typename... Args>
+    /**
+     * @brief Runs the elementwise operation Op on inputs that all have type Tdata.
+     *
+     * Casts `output` to Tdata* and forwards everything to
+     * launchElementwiseKernel together with the matching elementwiseKernel
+     * instantiation.
+     *
+     * @tparam BLOCK_SIZE  Forwarded to launchElementwiseKernel.
+     * @tparam N           Number of inputs.
+     * @tparam Op          Elementwise functor.
+     * @tparam Tdata       Element type of the inputs and the output.
+     *
+     * @param info         Elementwise operation metadata.
+     * @param workspace    Device workspace for the metadata and input pointers.
+     * @param output       Output buffer.
+     * @param inputs       Input buffers.
+     * @param stream       KUNLUN stream used for the copies and the launch.
+     * @param args         Extra arguments forwarded to the kernel.
+     * @return Status returned by launchElementwiseKernel.
+     */
+    template <uint32_t BLOCK_SIZE, int N, typename Op, typename Tdata, typename... Args>
    infiniStatus_t calculateImpl(const op::elementwise::ElementwiseInfo &info,
                                 void *workspace,
                                 void *output,
                                 const std::vector<const void *> &inputs,
                                 kunlunStream_t stream,
                                 Args &&...args) {
-
-        auto output_size = info.getOutputSize();
-        if (output_size == 0) {
-            return INFINI_STATUS_SUCCESS;
-        }
-
-        // Device pointers
-        const void **d_inputs_arr = nullptr;
-        const bool *d_input_contiguous = nullptr;
-        const bool *d_input_broadcasted = nullptr;
-        const size_t *d_output_shape = nullptr;
-        const ptrdiff_t *d_output_strides = nullptr;
-        const size_t *d_input_shapes = nullptr;
-        const ptrdiff_t *d_input_strides = nullptr;
-
-        CHECK_STATUS(infoToDevice<N>(info, workspace, inputs.data(), d_inputs_arr,
-                                     d_input_contiguous, d_input_broadcasted,
-                                     d_output_shape, d_output_strides,
-                                     d_input_shapes, d_input_strides));
-
-        Op::template launch<Tdata>(
-            output_size,
-            info.getNdim(),
-            info.isOutputContiguous(),
-            reinterpret_cast<const void *>(d_input_contiguous),
-            reinterpret_cast<const void *>(d_input_broadcasted),
-            reinterpret_cast<const void *>(d_output_shape),
-            reinterpret_cast<const void *>(d_input_shapes),
-            reinterpret_cast<const void *>(d_output_strides),
-            reinterpret_cast<const void *>(d_input_strides),
-            output,
-            reinterpret_cast<const void *const *>(d_inputs_arr),
+        return launchElementwiseKernel<BLOCK_SIZE, N>(
+            info,
+            workspace,
+            reinterpret_cast<Tdata *>(output),
+            inputs,
+            elementwiseKernel<N, Op, Tdata, Args...>,
            stream,
-            args...);
-
-        return INFINI_STATUS_SUCCESS;
+            std::forward<Args>(args)...);
    }

 private:
-    template <size_t N>
+    /**
+     * @brief Transfers elementwise operation metadata and input pointers from host to device memory.
+     *
+     * The workspace receives the N input pointers first, followed by the
+     * metadata block; the output references are then pointed at the matching
+     * regions of the workspace.
+     *
+     * @tparam N                     Number of input tensors.
+     *
+     * @param info                   Elementwise operation metadata (shapes, strides, flags, etc.).
+     * @param workspace              Pointer to device workspace memory for storing metadata and input pointers.
+     * @param h_inputs_arr           Host array of input tensor pointers.
+     * @param d_inputs_arr           Output reference to device array of input tensor pointers.
+     * @param d_input_contiguous     Output reference to device array indicating whether each input is contiguous.
+     * @param d_input_broadcasted    Output reference to device array indicating whether each input is broadcasted.
+     * @param d_output_shape         Output reference to device array holding the output tensor shape.
+     * @param d_output_strides       Output reference to device array holding output tensor strides.
+     * @param d_input_shapes         Output reference to flattened input tensor shapes (N * ndim).
+     * @param d_input_strides        Output reference to flattened input tensor strides (N * ndim).
+     * @param stream                 KUNLUN stream used for asynchronous memory transfer.
+     * @return infiniStatus_t        Status indicating success or failure of the memory transfer and setup.
+     */
+    template <int N>
    infiniStatus_t infoToDevice(
        const op::elementwise::ElementwiseInfo &info,
        void *workspace,
        const void *const *h_inputs_arr,
-        const void **&d_inputs_arr,
-        const bool *&d_input_contiguous,
-        const bool *&d_input_broadcasted,
-        const size_t *&d_output_shape,
-        const ptrdiff_t *&d_output_strides,
-        const size_t *&d_input_shapes,
-        const ptrdiff_t *&d_input_strides) const {
+        __global_ptr__ const void **&d_inputs_arr,
+        __global_ptr__ const bool *&d_input_contiguous,
+        __global_ptr__ const bool *&d_input_broadcasted,
+        __global_ptr__ const size_t *&d_output_shape,
+        __global_ptr__ const ptrdiff_t *&d_output_strides,
+        __global_ptr__ const size_t *&d_input_shapes,
+        __global_ptr__ const ptrdiff_t *&d_input_strides,
+        kunlunStream_t stream) const {

        constexpr auto input_size = N;
        const auto ndim = info.getNdim();
        constexpr auto input_arr_size = N * sizeof(*h_inputs_arr);
-        const int8_t *info_meta_start = info.getMetaStart();
-        const int8_t *d_meta_start = reinterpret_cast<int8_t *>(workspace) + input_arr_size;
+        auto info_meta_start = info.getMetaStart(); // host meta pointer
+
+        auto d_meta_start = reinterpret_cast<__global_ptr__ int8_t *>(workspace)
+                          + input_arr_size; // device meta pointer

        // copy the input pointer array and meta to device
-        CHECK_KUNLUN(xpu_memcpy(workspace, h_inputs_arr, input_arr_size, XPU_HOST_TO_DEVICE));
-        CHECK_KUNLUN(xpu_memcpy((void *)d_meta_start, info_meta_start, info.getMetaMemSize(), XPU_HOST_TO_DEVICE));
+        CHECK_KUNLUN(xpu_memcpy_async(workspace, h_inputs_arr, input_arr_size, XPU_HOST_TO_DEVICE, stream));
+        CHECK_KUNLUN(xpu_memcpy_async((void *)d_meta_start, info_meta_start, info.getMetaMemSize(), XPU_HOST_TO_DEVICE, stream));
+
+        // Block until both copies have completed; the host source buffers
+        // belong to the caller. A failed wait is reported instead of ignored.
+        CHECK_KUNLUN(xpu_wait(stream));

        // offset/assign the pointers
-        d_inputs_arr = reinterpret_cast<const void **>(workspace);
-        d_output_shape = reinterpret_cast<const size_t *>(d_meta_start);
-        d_output_strides = reinterpret_cast<const ptrdiff_t *>(d_output_shape + ndim);
-        d_input_shapes = reinterpret_cast<const size_t *>(d_output_strides + ndim);
-        d_input_strides = reinterpret_cast<const ptrdiff_t *>(d_input_shapes + input_size * ndim);
+        d_inputs_arr = reinterpret_cast<__global_ptr__ const void **>(workspace);
+        d_output_shape = reinterpret_cast<__global_ptr__ const size_t *>(d_meta_start);
+        d_output_strides = reinterpret_cast<__global_ptr__ const ptrdiff_t *>(d_output_shape + ndim);
+        d_input_shapes = reinterpret_cast<__global_ptr__ const size_t *>(d_output_strides + ndim);
+        d_input_strides = reinterpret_cast<__global_ptr__ const ptrdiff_t *>(d_input_shapes + input_size * ndim);
        d_input_contiguous = reinterpret_cast<const bool *>(d_input_strides + input_size * ndim);
        d_input_broadcasted = reinterpret_cast<const bool *>(d_input_contiguous + input_size);

+        return INFINI_STATUS_SUCCESS;
+    }
+
+    /**
+     * @brief Launch elementwise kernel
+     *
+     * Returns immediately when the output is empty. Otherwise copies the
+     * metadata and input pointers into the workspace via infoToDevice and
+     * launches `kernel_func` on `stream` with launch configuration
+     * <<<BLOCK_SIZE, 64, stream>>>.
+     *
+     * @tparam BLOCK_SIZE   First launch-configuration value.
+     * @tparam N            Number of inputs.
+     * @tparam KernelFunc   Type of the kernel being launched.
+     * @tparam Tout         Element type of the output.
+     *
+     * @param info          Elementwise operation metadata.
+     * @param workspace     Device workspace for the metadata and input pointers.
+     * @param output        Output buffer.
+     * @param inputs        Input buffers.
+     * @param kernel_func   Kernel to launch.
+     * @param stream        KUNLUN stream used for the copies and the launch.
+     * @param args          Extra arguments passed to the kernel.
+     * @return INFINI_STATUS_SUCCESS, or the failure reported by infoToDevice.
+     */
+    template <uint32_t BLOCK_SIZE, int N, typename KernelFunc, typename Tout, typename... Args>
+    infiniStatus_t launchElementwiseKernel(
+        const op::elementwise::ElementwiseInfo &info,
+        void *workspace,
+        Tout *output,
+        const std::vector<const void *> &inputs,
+        KernelFunc kernel_func,
+        kunlunStream_t stream,
+        Args &&...args) {
+
+        auto output_size = info.getOutputSize();
+        if (output_size == 0) {
+            return INFINI_STATUS_SUCCESS;
+        }
+
+        // Device pointers
+        __global_ptr__ const void **d_inputs_arr = nullptr;
+        __global_ptr__ const bool *d_input_contiguous = nullptr;
+        __global_ptr__ const bool *d_input_broadcasted = nullptr;
+        __global_ptr__ const size_t *d_output_shape = nullptr;
+        __global_ptr__ const ptrdiff_t *d_output_strides = nullptr;
+        __global_ptr__ const size_t *d_input_shapes = nullptr;
+        __global_ptr__ const ptrdiff_t *d_input_strides = nullptr;
+
+        CHECK_STATUS(infoToDevice<N>(info, workspace, inputs.data(), d_inputs_arr,
+                                     d_input_contiguous, d_input_broadcasted,
+                                     d_output_shape, d_output_strides,
+                                     d_input_shapes, d_input_strides, stream));
+
+        // NOTE(review): the status of the launch itself is not checked here.
+        kernel_func<<<BLOCK_SIZE, 64, stream>>>(
+            output_size,
+            info.getNdim(),
+            info.isOutputContiguous(),
+            d_input_contiguous,
+            d_input_broadcasted,
+            reinterpret_cast<__global_ptr__ const void *>(d_output_shape),
+            reinterpret_cast<__global_ptr__ const void *>(d_input_shapes),
+            reinterpret_cast<__global_ptr__ const void *>(d_output_strides),
+            reinterpret_cast<__global_ptr__ const void *>(d_input_strides),
+            output,
+            reinterpret_cast<__global_ptr__ const void **>(d_inputs_arr),
+            args...);
+
        return INFINI_STATUS_SUCCESS;
    }
 };
@@ -101,37 +302,35 @@ utils::Result<DeviceImpl *> DeviceImpl::create(Args &&...args) {
    return utils::Result<DeviceImpl *>(new DeviceImpl(opaque));
 }

-template <typename Op, typename Tdata, typename... Args>
+// Public entry point for an elementwise operation whose operands share the
+// type Tdata. Takes N from Op::num_inputs, casts `stream` to kunlunStream_t
+// and forwards everything to Opaque::calculateImpl.
+template <unsigned int BLOCK_SIZE, typename Op, typename Tdata, typename... Args>
 infiniStatus_t DeviceImpl::calculate(const op::elementwise::ElementwiseInfo &info,
                                     void *workspace,
                                     void *output,
                                     const std::vector<const void *> &inputs,
                                     void *stream,
                                     Args &&...args) {
-    constexpr size_t N = Op::num_inputs;
-    return _opaque->calculateImpl<N, Op, Tdata>(
+    constexpr int N = Op::num_inputs;
+    return _opaque->calculateImpl<BLOCK_SIZE, N, Op, Tdata>(
        info, workspace, output, inputs,
        reinterpret_cast<kunlunStream_t>(stream),
        std::forward<Args>(args)...);
 }
-} // namespace op::elementwise::kunlun

-// Template for kunlun kernel interface declaration
-#define LAUNCH_ELEMENTWISE_KERNEL(OpName)       \
-    template <typename Tdata, typename... Args> \
-    void launch##OpName##Kernel(                \
-        size_t output_size,                     \
-        size_t ndim,                            \
+// Explicitly instantiates elementwiseKernel<N, Op, Tdata, ...>. The optional
+// trailing macro arguments are the types of the kernel's extra arguments.
+#define INSTANTIATE_ELEMENTWISE_KERNEL(N, Op, Tdata, ...)                    \
+    template __global__ void elementwiseKernel<N, Op, Tdata, ##__VA_ARGS__>( \
+        int output_size,                                                     \
+        int ndim,                                                            \
        bool output_contiguous,                                              \
-        const void *input_contiguous,           \
-        const void *input_broadcasted,          \
-        const void *output_shape,               \
-        const void *input_shapes,               \
-        const void *output_strides,             \
-        const void *input_strides,              \
-        void *output,                           \
+        const bool *input_contiguous_gm,                                     \
+        const bool *input_broadcasted_gm,                                    \
+        const void *output_shape_gm,                                         \
+        const void *input_shapes_gm,                                         \
+        const void *output_strides_gm,                                       \
+        const void *input_strides_gm,                                        \
+        Tdata *output,                                                       \
        const void *const *inputs,                                           \
-        XPUStream stream,                       \
-        Args... args);
+        ##__VA_ARGS__);
+
+} // namespace op::elementwise::kunlun

 #endif
--- a/src/infiniop/elementwise/kunlun/elementwise_kunlun_api.h
+++ b/src/infiniop/elementwise/kunlun/elementwise_kunlun_api.h
@@ -17,7 +17,10 @@ public:
    template <typename... Args>
    static utils::Result<DeviceImpl *> create(Args &&...args);

-    template <typename Op, typename Tdata, typename... Args>
+    /**
+     * @brief Launches elementwise operation. Operands have same types.
+     */
+    template <unsigned int BLOCK_SIZE, typename Op, typename Tdata, typename... Args>
    infiniStatus_t calculate(
        const op::elementwise::ElementwiseInfo &info,
        void *workspace,
@@ -25,6 +28,20 @@ public:
        const std::vector<const void *> &inputs,
        void *stream,
        Args &&...args);
+
+    //     /**
+    //      * @brief Launches elementwise operation where operands' types differ
+    //      */
+    //     template <unsigned int BLOCK_SIZE, typename Op, typename Tout, typename... Tin,
+    //               typename... Args,
+    //               std::enable_if_t<(sizeof...(Tin) == Op::num_inputs), int> = 0>
+    //     infiniStatus_t calculate(
+    //         const op::elementwise::ElementwiseInfo &info,
+    //         void *workspace,
+    //         void *output,
+    //         const std::vector<const void *> &inputs,
+    //         void *stream,
+    //         Args &&...args);
 };

 } // namespace op::elementwise::kunlun

--- a/src/infiniop/elementwise/kunlun/elementwise_kunlun_kernel.h
+++ b/src/infiniop/elementwise/kunlun/elementwise_kunlun_kernel.h
-#ifndef __INFINIOP_ELEMENTWISE_KUNLUN_XPU__
-#define __INFINIOP_ELEMENTWISE_KUNLUN_XPU__
-
-#include "../../devices/kunlun/kunlun_kernel_common.h"
-
-using namespace device::kunlun::kernel;
-
-/**
- * @brief Computes input tile offset
- */
-struct InputIndexer {
-    size_t idx;
-    size_t ndim;
-    const bool *input_contiguous;
-    const bool *input_broadcasted;
-    const _size_t *input_shapes;
-    const _ptrdiff_t *input_strides;
-    const _ptrdiff_t *output_strides;
-
-    __device__ size_t operator()(size_t input_id) const {
-        return input_contiguous[input_id]
-                 ? idx
-                 : (input_broadcasted[input_id]
-                        ? indexToReducedOffset(idx, ndim, output_strides, input_strides + input_id * ndim)
-                        : indexToOffset(idx, ndim, input_shapes + input_id * ndim, input_strides + input_id * ndim));
-    }
-};
-
-/**
- * @brief Computes the output index in memory, accounting for strides if non-contiguous.
- *
- * @param idx            Linear index.
- * @param is_contiguous  Whether the output tensor is contiguous.
- * @param ndim           Number of dimensions.
- * @param shape          Shape of the output tensor.
- * @param strides        Strides of the output tensor.
- * @return               Memory offset index.
- */
-inline __device__ size_t
-getOutputIndex(size_t idx,
-               bool is_contiguous,
-               size_t ndim,
-               const _size_t *shape,
-               const _ptrdiff_t *strides) {
-    return is_contiguous ? idx : indexToOffset(idx, ndim, shape, strides);
-}
-
-template <size_t N, typename Op, typename Tdata, typename... Args>
-__device__ void launchOp(
-    __global_ptr__ Tdata **typed_inputs, // gm pointer
-    __global_ptr__ Tdata *output,        // gm pointer output
-    Tdata *inputs_buf,                   // local mem buffer
-    size_t *input_indexes,
-    size_t output_index,
-    Args... args) {
-
-    static_assert(N == Op::num_inputs, "template N is not equal to Op::num_inputs!\n");
-
-#pragma unroll
-    // Copy inputs to buf
-    for (size_t i = 0; i < N; i++) {
-        auto gm = typed_inputs[i] + input_indexes[i];
-        auto lm = inputs_buf + i;
-        GM2LM_ASYNC(gm, lm, 1 * sizeof(Tdata));
-    }
-    mfence();
-
-    // Calculate elementwise
-    // Inputs save all operands
-    Tdata out = Op{}(inputs_buf, args...);
-
-    // Copy out to gm
-    LM2GM_ASYNC(&out, output + output_index, 1 * sizeof(Tdata));
-    mfence();
-}
-
-template <size_t N, typename Op, typename Tdata, typename... Args>
-__global__ void elementwiseKernel(
-    size_t output_size,
-    size_t ndim,
-    bool output_contiguous,
-    const bool *input_contiguous_gm,
-    const bool *input_broadcasted_gm,
-    const _size_t *output_shape_gm,
-    const _size_t *input_shapes_gm,
-    const _ptrdiff_t *output_strides_gm,
-    const _ptrdiff_t *input_strides_gm,
-    Tdata *output,
-    const void *const *inputs,
-    Args... args) {
-
-    int cid = core_id();
-    int ncores = core_num();
-    if (cid >= ncores) {
-        return;
-    }
-    int thread_id = ncores * cluster_id() + cid;
-    int nthreads = ncores * cluster_num();
-
-    // Cast input gm pointer type
-    auto typed_inputs = reinterpret_cast<const __global_ptr__ Tdata *const __global_ptr__ *>(inputs);
-
-    const int BUFF_SIZE = 64;
-    // Input data cache
-    __local__ Tdata inputs_buf[N];
-    // Input contiguous/broadcasted flags
-    __local__ bool input_contiguous[N];
-    __local__ bool input_broadcasted[N];
-    // Input shape/strides
-    __local__ _size_t input_shapes[N * ndim];
-    __local__ _ptrdiff_t input_strides[N * ndim];
-    // Output shape/strides
-    __local__ _size_t output_shape[ndim];
-    __local__ _ptrdiff_t output_strides[ndim];
-    // Inputs gm ptr buf
-    __local__ __global_ptr__ Tdata *typed_inputs_ptr[N];
-
-    // Load from gm
-    GM2LM_ASYNC(input_contiguous_gm, input_contiguous, N * sizeof(bool));
-    GM2LM_ASYNC(input_broadcasted_gm, input_broadcasted, N * sizeof(bool));
-    GM2LM_ASYNC(input_shapes_gm, input_shapes, N * ndim * sizeof(_size_t));
-    GM2LM_ASYNC(input_strides_gm, input_strides, N * ndim * sizeof(_ptrdiff_t));
-    GM2LM_ASYNC(output_shape_gm, output_shape, ndim * sizeof(_size_t));
-    GM2LM_ASYNC(output_strides_gm, output_strides, ndim * sizeof(_ptrdiff_t));
-    GM2LM_ASYNC(typed_inputs, typed_inputs_ptr, N * sizeof(__global_ptr__ Tdata *));
-    mfence();
-
-    int len_per_loop = min(BUFF_SIZE, roundup_div(output_size, nthreads));
-
-    for (int start = thread_id * len_per_loop; start < output_size; start += nthreads * len_per_loop) {
-        size_t read_len = min(len_per_loop, output_size - start);
-        for (int idx = start; idx < start + read_len; ++idx) {
-            size_t out_idx = getOutputIndex(static_cast<size_t>(idx), output_contiguous,
-                                            ndim, output_shape, output_strides);
-            InputIndexer indexer{static_cast<size_t>(idx), ndim, input_contiguous, input_broadcasted,
-                                 input_shapes, input_strides, output_strides};
-            // Get index offset for every operand
-            size_t indexes[N];
-            for (size_t i = 0; i < N; i++) {
-                indexes[i] = indexer(i);
-            }
-            // Launch operater
-            launchOp<N, Op, Tdata>(&typed_inputs_ptr[0], output, inputs_buf, indexes, out_idx, args...);
-        }
-    }
-    sync_cluster();
-}
-
-#define LAUNCH_ELEMENTWISE_KERNEL_IMPL(OpName, Op)                       \
-    template <typename Tdata, typename... Args>                          \
-    void launch##OpName##Kernel(                                         \
-        size_t output_size,                                              \
-        size_t ndim,                                                     \
-        bool output_contiguous,                                          \
-        const void *input_contiguous,                                    \
-        const void *input_broadcasted,                                   \
-        const void *output_shape,                                        \
-        const void *input_shapes,                                        \
-        const void *output_strides,                                      \
-        const void *input_strides,                                       \
-        void *output,                                                    \
-        const void *const *inputs,                                       \
-        XPUStream stream,                                                \
-        Args... args) {                                                  \
-        elementwiseKernel<Op::num_inputs, Op, Tdata><<<8, 64, stream>>>( \
-            output_size, ndim, output_contiguous,                        \
-            reinterpret_cast<const bool *>(input_contiguous),            \
-            reinterpret_cast<const bool *>(input_broadcasted),           \
-            reinterpret_cast<const _size_t *>(output_shape),             \
-            reinterpret_cast<const _size_t *>(input_shapes),             \
-            reinterpret_cast<const _ptrdiff_t *>(output_strides),        \
-            reinterpret_cast<const _ptrdiff_t *>(input_strides),         \
-            reinterpret_cast<Tdata *>(output), inputs, args...);         \
-    }
-
-#define LAUNCH_ELEMENTWISE_KERNEL_INSTANTIATE(OpName, T, ...) \
-    template void launch##OpName##Kernel<T, ##__VA_ARGS__>(   \
-        size_t output_size,                                   \
-        size_t ndim,                                          \
-        bool output_contiguous,                               \
-        const void *input_contiguous,                         \
-        const void *input_broadcasted,                        \
-        const void *output_shape,                             \
-        const void *input_shapes,                             \
-        const void *output_strides,                           \
-        const void *input_strides,                            \
-        void *output,                                         \
-        const void *const *inputs,                            \
-        XPUStream stream,                                     \
-        ##__VA_ARGS__);
-
-#endif
--- a/src/infiniop/elementwise/nvidia/elementwise_nvidia.cuh
+++ b/src/infiniop/elementwise/nvidia/elementwise_nvidia.cuh
@@ -265,9 +265,9 @@ private:
     * @param info                   Elementwise operation metadata (shapes, strides, flags, etc.).
     * @param workspace              Pointer to device workspace memory for storing metadata and input pointers.
     * @param h_inputs_arr           Host array of input tensor pointers.
-     * @param d_inputs_arr           Output reference to device array of input tensor pointers.
-     * @param d_input_contiguous     Output reference to device array indicating whether each input is contiguous.
-     * @param d_input_broadcasted    Output reference to device array indicating whether each input is broadcasted.
+     * @param d_inputs_arr           Input reference to device array of input tensor pointers.
+     * @param d_input_contiguous     Input reference to device array indicating whether each input is contiguous.
+     * @param d_input_broadcasted    Input reference to device array indicating whether each input is broadcasted.
     * @param d_output_shape         Output reference to device array holding the output tensor shape.
     * @param d_output_strides       Output reference to device array holding output tensor strides.
     * @param d_input_shapes         Output reference to flattened input tensor shapes (N * ndim).

--- a/src/infiniop/ops/gemm/kunlun/gemm_kunlun.cc
+++ b/src/infiniop/ops/gemm/kunlun/gemm_kunlun.cc
 #include "gemm_kunlun.h"
 #include "../../../../utils.h"
-#include "../../../devices/kunlun/kunlun_handle.h"
+#include "../../../devices/kunlun/kunlun_common.h"

 namespace op::gemm::kunlun {


--- a/src/infiniop/ops/rms_norm/kunlun/rms_norm_kunlun.cc
+++ b/src/infiniop/ops/rms_norm/kunlun/rms_norm_kunlun.cc
 #include "rms_norm_kunlun.h"
-#include "../../../devices/kunlun/kunlun_handle.h"
+#include "../../../devices/kunlun/kunlun_common.h"
 #include <memory>
 #include <stdint.h>


--- a/src/infiniop/ops/swiglu/kunlun/swiglu_kunlun.h
+++ b/src/infiniop/ops/swiglu/kunlun/swiglu_kunlun.h
 #ifndef __SWIGLU_KUNLUN_H__
 #define __SWIGLU_KUNLUN_H__

-#include "../../../elementwise/kunlun/elementwise_kunlun.h"
+#include "../../../elementwise/kunlun/elementwise_kunlun_api.h"

 ELEMENTWISE_DESCRIPTOR(swiglu, kunlun)


--- a/src/infiniop/ops/swiglu/kunlun/swiglu_kunlun.cc
+++ b/src/infiniop/ops/swiglu/kunlun/swiglu_kunlun.cc
+#include "../../../elementwise/kunlun/elementwise_kunlun.h"
 #include "swiglu_kunlun.h"

-// Op interface declare
-LAUNCH_ELEMENTWISE_KERNEL(SwiGLU)
-
-namespace op::swiglu::kunlun {
+namespace op::elementwise::kunlun {

+/// @brief SwiGLU op kernel
 typedef struct SwiGLUOp {
-    static constexpr size_t num_inputs = 2;
-    template <typename Tdata, typename... Args>
-    static infiniStatus_t launch(Args... args) {
-        launchSwiGLUKernel<Tdata>(args...);
-        return INFINI_STATUS_SUCCESS;
+private:
+    template <typename T>
+    inline __device__ T sigmoid(T x) const {
+        return 1.0f / (1.0f + exp(-x));
+    }
+
+public:
+    // Number of input operands; every elementwise Op must define this static constant
+    static constexpr int num_inputs = 2;
+    template <typename T>
+    inline __device__ T operator()(const T *inputs) const {
+        T up = inputs[0];
+        T gate = inputs[1];
+        T out = gate * sigmoid(gate) * up;
+        return out;
    }
 } SwiGLUOp;

+INSTANTIATE_ELEMENTWISE_KERNEL(SwiGLUOp::num_inputs, SwiGLUOp, float);
+} // namespace op::elementwise::kunlun
+
+namespace op::swiglu::kunlun {
+
 Descriptor::~Descriptor() = default;

 infiniStatus_t Descriptor::create(
@@ -53,7 +67,7 @@ infiniStatus_t Descriptor::calculate(

    switch (_dtype) {
    case INFINI_DTYPE_F32:
-        return _device_info->calculate<SwiGLUOp, float>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<8, op::elementwise::kunlun::SwiGLUOp, float>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }

--- a/src/infiniop/ops/swiglu/kunlun/swiglu_kunlun_internal.xpu
+++ b/src/infiniop/ops/swiglu/kunlun/swiglu_kunlun_internal.xpu
-#ifndef __SWIGLU_KUNLUN_H__
-#define __SWIGLU_KUNLUN_H__
-
-#include "../../../devices/kunlun/kunlun_kernel_common.h"
-#include "../../../elementwise/kunlun/elementwise_kunlun_kernel.h"
-
-/// @brief Define swiglu op for local mem
-typedef struct SwiGLUOp {
-private:
-    template <typename T>
-    inline __device__ T sigmoid(T x) const {
-        return 1.0f / (1.0f + exp(-x));
-    }
-
-public:
-    // This static number must be set in other Ops
-    static constexpr size_t num_inputs = 2;
-    template <typename T>
-    inline __device__ T operator()(const T *inputs) const {
-        T up = inputs[0];
-        T gate = inputs[1];
-        T out = gate * sigmoid(gate) * up;
-        return out;
-    }
-} SwiGLUOp;
-
-// Definition for swiglu kernel interface
-LAUNCH_ELEMENTWISE_KERNEL_IMPL(SwiGLU, SwiGLUOp)
-
-// Template instantiate
-LAUNCH_ELEMENTWISE_KERNEL_INSTANTIATE(SwiGLU, float)
-
-#endif // __SWIGLU_KUNLUN_H__
--- a/xmake/kunlun.lua
+++ b/xmake/kunlun.lua
@@ -5,27 +5,22 @@ local XTDK_DIR = path.join(KUNLUN_HOME, "xtdk")
 local XDNN_DIR = path.join(KUNLUN_HOME, "xhpc", "xdnn")

 -- Add include dirs
-add_includedirs(path.join(XRE_DIR, "include"), {public=true})
-add_includedirs(path.join(XTDK_DIR, "include"), {public=true})
-add_includedirs(path.join(XDNN_DIR, "include"), {public=true})
-
+add_includedirs(path.join(XRE_DIR, "include"))
+add_includedirs(path.join(XDNN_DIR, "include"))
+add_includedirs(path.join(XTDK_DIR, "include"))
 add_linkdirs(path.join(XRE_DIR, "so"))
 add_linkdirs(path.join(XDNN_DIR, "so"))
-add_links("xpurt")
-add_links("xpuapi")
+add_links("xpurt", "xpuapi")

 rule("xpu")
    set_extensions(".xpu")
    
-    on_load(function (target)
-        target:add("includedirs", path.join(os.projectdir(), "include"))
-    end)
-
    on_build_file(function (target, sourcefile)

+        local sourcefile_config = target:fileconfig(sourcefile) or {}
+        local includedirs = sourcefile_config.includedirs or {}
+
        local objectfile = target:objectfile(sourcefile)
-        print("Compiling:", sourcefile, "->", objectfile)
-        -- local basename = objectfile:gsub("%.o$", "")
        os.mkdir(path.directory(objectfile))
        local cc = path.join(XTDK_DIR, "bin/clang++")
        local includedirs = table.concat(target:get("includedirs"), " ")
@@ -35,19 +30,15 @@ rule("xpu")
        }

        local args = {
-            -- "--sysroot=/",
+            "--sysroot=/",
            "--target=" .. arch_map[os.arch()],
            "-fPIC",
-            -- "-pie",
            "--xpu-arch=xpu3",
-            -- "--basename", basename,
            "-std=c++17",
            "-O2",
            "-fno-builtin",
-            -- "-g",
            "-c", sourcefile,
            "-o", objectfile
-            -- "-v"
        }
        
        for _, includedir in ipairs(target:get("includedirs")) do
@@ -59,8 +50,7 @@ rule("xpu")
        assert(ok == 0, "Compile failed: " .. sourcefile)

        table.insert(target:objectfiles(), objectfile)
-        -- table.insert(target:objectfiles(), basename .. ".device.bin.o")
-        -- print(target:objectfiles())
+        print(target:objectfiles())
    end)
 rule_end()

@@ -79,7 +69,15 @@ target("infiniop-kunlun")
    -- compile handwriting kernel
    local xpu_files = os.files(src_dir .. "/ops/*/kunlun/*.xpu")
    if #xpu_files > 0 then
-        add_files(xpu_files, {rule = "xpu"})
+        -- Register the hand-written .xpu kernels with the custom "xpu" rule and
+        -- attach per-file include search paths.
+        -- NOTE: os.projectdir is a function; it must be *called* so that
+        -- path.join receives the project root string rather than a function value.
+        add_files(xpu_files, {
+            rule = "xpu",
+            includedirs = {
+                path.join(os.projectdir(), "include"),
+                path.join(XRE_DIR, "include"),
+                path.join(XDNN_DIR, "include"),
+                path.join(XTDK_DIR, "include")
+            }
+        })
    end
 target_end()

@@ -89,7 +87,7 @@ target("infinirt-kunlun")
    set_languages("cxx17")
    on_install(function (target) end)
     -- Add source files
-    add_files("../src/infinirt/kunlun/*.cc")
+    add_files("$(projectdir)/src/infinirt/kunlun/*.cc")
    add_cxflags("-lstdc++ -Wall -Werror -fPIC")

 target_end()