fix conflicts of operator.cc in swiglu

802a75d3 · crapromer · 7d2acaf7 · 802a75d3 · 802a75d3 · 802a75d3
Commit 802a75d3 authored May 07, 2025 by crapromer
5 changed files
--- a/src/infiniop/devices/maca/maca_kernel_common.h
+++ b/src/infiniop/devices/maca/maca_kernel_common.h
-#ifdef ENABLE_SUGON_MACA_API
-#define INFINIOP_MACA_KERNEL __launch_bounds__(512) __global__ void
-#else
 #define INFINIOP_MACA_KERNEL __global__ void
-#endif
-
 // Possible maximum number of threads per block for MACA architectures
 // Used for picking correct kernel launch configuration
 #define MACA_BLOCK_SIZE_1024 1024

--- a/src/infiniop/elementwise/maca/elementwise_maca.h
+++ b/src/infiniop/elementwise/maca/elementwise_maca.h
@@ -107,7 +107,7 @@ struct DeviceImpl::Opaque {
    Opaque(const std::shared_ptr<device::maca::Handle::Internal> &internal)
        : internal(internal) {}

-    template <size_t BLOCK_SIZE, size_t N, typename Op, typename Tdata, typename... Args>
+    template <uint32_t BLOCK_SIZE, size_t N, typename Op, typename Tdata, typename... Args>
    infiniStatus_t calculateImpl(const op::elementwise::ElementwiseInfo &info,
                                 void *workspace,
                                 void *output,
@@ -122,7 +122,7 @@ struct DeviceImpl::Opaque {
            std::forward<Args>(args)...);
    }

-    template <size_t BLOCK_SIZE, size_t N, typename Op, typename Tout, typename... Tin, typename... Args,
+    template <uint32_t BLOCK_SIZE, size_t N, typename Op, typename Tout, typename... Tin, typename... Args,
              std::enable_if_t<(sizeof...(Tin) == Op::num_inputs), int> = 0>
    infiniStatus_t calculateImpl(const op::elementwise::ElementwiseInfo &info,
                                 void *workspace,
@@ -174,7 +174,7 @@ private:
        return INFINI_STATUS_SUCCESS;
    }

-    template <size_t BLOCK_SIZE, size_t N, typename KernelFunc, typename Tout, typename... Args>
+    template <uint32_t BLOCK_SIZE, size_t N, typename KernelFunc, typename Tout, typename... Args>
    infiniStatus_t launchElementwiseKernel(
        const op::elementwise::ElementwiseInfo &info,
        void *workspace,
@@ -203,8 +203,8 @@ private:
                                     d_output_shape, d_output_strides,
                                     d_input_shapes, d_input_strides, stream));

-        dim3 blockDims(std::min(BLOCK_SIZE, static_cast<size_t>(internal->maxThreadsPerBlock())));
-        dim3 gridDims(std::min(CEIL_DIV(output_size, blockDims.x), static_cast<size_t>(internal->gridSizeX())));
+        dim3 blockDims(std::min(BLOCK_SIZE, static_cast<uint32_t>(internal->maxThreadsPerBlock())));
+        dim3 gridDims(std::min(uint32_t(CEIL_DIV(output_size, blockDims.x)), static_cast<uint32_t>(internal->gridSizeX())));
        size_t step = gridDims.x * blockDims.x;

        for (size_t i = 0; i < output_size; i += step) {
@@ -228,7 +228,7 @@ utils::Result<DeviceImpl *> DeviceImpl::create(Args &&...args) {
 }

 /* Invoke elementwise operation for different input types */
-template <unsigned int BLOCK_SIZE, typename Op, typename Tout, typename... Tin, typename... Args,
+template <uint32_t BLOCK_SIZE, typename Op, typename Tout, typename... Tin, typename... Args,
          std::enable_if_t<(sizeof...(Tin) == Op::num_inputs), int>>
 infiniStatus_t DeviceImpl::calculate(const op::elementwise::ElementwiseInfo &info,
                                     void *workspace,
@@ -245,7 +245,7 @@ infiniStatus_t DeviceImpl::calculate(const op::elementwise::ElementwiseInfo &inf
 }

 /* Invoke elementwise operation when all inputs have the same dtype */
-template <unsigned int BLOCK_SIZE, typename Op, typename Tdata, typename... Args>
+template <uint32_t BLOCK_SIZE, typename Op, typename Tdata, typename... Args>
 infiniStatus_t DeviceImpl::calculate(const op::elementwise::ElementwiseInfo &info,
                                     void *workspace,
                                     void *output,

--- a/src/infiniop/elementwise/maca/elementwise_maca_api.h
+++ b/src/infiniop/elementwise/maca/elementwise_maca_api.h
@@ -17,7 +17,7 @@ public:
    template <typename... Args>
    static utils::Result<DeviceImpl *> create(Args &&...args);

-    template <unsigned int BLOCK_SIZE, typename Op, typename Tdata, typename... Args>
+    template <uint32_t BLOCK_SIZE, typename Op, typename Tdata, typename... Args>
    infiniStatus_t calculate(
        const op::elementwise::ElementwiseInfo &info,
        void *workspace,
@@ -26,7 +26,7 @@ public:
        void *stream,
        Args &&...args);

-    template <unsigned int BLOCK_SIZE, typename Op, typename Tout, typename... Tin,
+    template <uint32_t BLOCK_SIZE, typename Op, typename Tout, typename... Tin,
              typename... Args,
              std::enable_if_t<(sizeof...(Tin) == Op::num_inputs), int> = 0>
    infiniStatus_t calculate(

--- a/src/infiniop/ops/swiglu/maca/swiglu_maca_internal.h
+++ b/src/infiniop/ops/swiglu/maca/swiglu_maca_internal.h
+#ifndef __SWIGLU_MACA_H__
+#define __SWIGLU_MACA_H__
+
 #include "../../../elementwise/maca/elementwise_maca.h"
 #include <hctlass/half.h>
+
 namespace op::swiglu::maca {
 typedef struct SwiGLUOp {
 private:
    template <typename T>
    __device__ __forceinline__ T sigmoid(const T &x) const {
-        // if constexpr (std::is_same_v<T, half2>) {
-        //     return h2rcp(__hadd2(make_half2(1, 1), h2exp(__hneg2(x))));
-        // } else
-        if constexpr (std::is_same_v<T, half>) {
+        if constexpr (std::is_same_v<T, half2>) {
+            return h2rcp(__hadd2(make_half2(1, 1), h2exp(__hneg2(x))));
+        } else if constexpr (std::is_same_v<T, half>) {
            return hrcp(__hadd(half(1.f), __float2half(__expf(__half2float(__hneg(x))))));
        } else if constexpr (std::is_same_v<T, float>) {
            return __frcp_rd(__fadd_rd(1, __expf(-x)));
@@ -33,3 +36,5 @@ public:
    }
 } SwiGLUOp;
 } // namespace op::swiglu::maca
+
+#endif
--- a/src/infiniop/ops/swiglu/operator.cc
+++ b/src/infiniop/ops/swiglu/operator.cc
@@ -42,13 +42,11 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
 #ifdef ENABLE_CUDA_API
        CREATE(INFINI_DEVICE_NVIDIA, cuda);
 #endif
-        < < < < < < < HEAD
 #ifdef ENABLE_KUNLUN_API
        CREATE(INFINI_DEVICE_KUNLUN, kunlun);
-=======
+#endif
 #ifdef ENABLE_METAX_API
        CREATE(INFINI_DEVICE_METAX, maca);
->>>>>>> f3a0177 (Migrate elementwise base from cuda to maca, and implement swiglu with test pass)
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
    case DevCambriconMlu: {
@@ -94,13 +92,11 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
 #ifdef ENABLE_CUDA_API
        GET(INFINI_DEVICE_NVIDIA, cuda)
 #endif
-        < < < < < < < HEAD
 #ifdef ENABLE_KUNLUN_API
        GET(INFINI_DEVICE_KUNLUN, kunlun)
-=======
+#endif
 #ifdef ENABLE_METAX_API
        GET(INFINI_DEVICE_METAX, maca);
->>>>>>> f3a0177 (Migrate elementwise base from cuda to maca, and implement swiglu with test pass)
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
    case DevCambriconMlu: {
@@ -149,13 +145,11 @@ __C infiniStatus_t infiniopSwiGLU(
 #ifdef ENABLE_CUDA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
 #endif
-        < < < < < < < HEAD
 #ifdef ENABLE_KUNLUN_API
        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
-=======
+#endif
 #ifdef ENABLE_METAX_API
        CALCULATE(INFINI_DEVICE_METAX, maca);
->>>>>>> f3a0177 (Migrate elementwise base from cuda to maca, and implement swiglu with test pass)
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
    case DevCambriconMlu: {
@@ -197,13 +191,11 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
 #ifdef ENABLE_CUDA_API
        DELETE(INFINI_DEVICE_NVIDIA, cuda);
 #endif
-        < < < < < < < HEAD
 #ifdef ENABLE_KUNLUN_API
        DELETE(INFINI_DEVICE_KUNLUN, kunlun);
-=======
+#endif
 #ifdef ENABLE_METAX_API
        DELETE(INFINI_DEVICE_METAX, maca);
->>>>>>> f3a0177 (Migrate elementwise base from cuda to maca, and implement swiglu with test pass)
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
    case DevCambriconMlu: {