Commit 802a75d3 authored by crapromer's avatar crapromer
Browse files

fix conflicts of operator.cc in swiglu

parent 7d2acaf7
// Declaration prefix used in place of `__global__ void` when defining MACA kernels.
// When ENABLE_SUGON_MACA_API is defined, the kernel is additionally annotated with
// __launch_bounds__(512), i.e. it is compiled for at most 512 threads per block;
// otherwise the macro expands to a plain `__global__ void`.
#ifdef ENABLE_SUGON_MACA_API
#define INFINIOP_MACA_KERNEL __launch_bounds__(512) __global__ void
#else
#define INFINIOP_MACA_KERNEL __global__ void
#endif
// Possible maximum number of threads per block for MACA architectures
// Used for picking correct kernel launch configuration
#define MACA_BLOCK_SIZE_1024 1024
......
......@@ -107,7 +107,7 @@ struct DeviceImpl::Opaque {
Opaque(const std::shared_ptr<device::maca::Handle::Internal> &internal)
: internal(internal) {}
template <size_t BLOCK_SIZE, size_t N, typename Op, typename Tdata, typename... Args>
template <uint32_t BLOCK_SIZE, size_t N, typename Op, typename Tdata, typename... Args>
infiniStatus_t calculateImpl(const op::elementwise::ElementwiseInfo &info,
void *workspace,
void *output,
......@@ -122,7 +122,7 @@ struct DeviceImpl::Opaque {
std::forward<Args>(args)...);
}
template <size_t BLOCK_SIZE, size_t N, typename Op, typename Tout, typename... Tin, typename... Args,
template <uint32_t BLOCK_SIZE, size_t N, typename Op, typename Tout, typename... Tin, typename... Args,
std::enable_if_t<(sizeof...(Tin) == Op::num_inputs), int> = 0>
infiniStatus_t calculateImpl(const op::elementwise::ElementwiseInfo &info,
void *workspace,
......@@ -174,7 +174,7 @@ private:
return INFINI_STATUS_SUCCESS;
}
template <size_t BLOCK_SIZE, size_t N, typename KernelFunc, typename Tout, typename... Args>
template <uint32_t BLOCK_SIZE, size_t N, typename KernelFunc, typename Tout, typename... Args>
infiniStatus_t launchElementwiseKernel(
const op::elementwise::ElementwiseInfo &info,
void *workspace,
......@@ -203,8 +203,8 @@ private:
d_output_shape, d_output_strides,
d_input_shapes, d_input_strides, stream));
dim3 blockDims(std::min(BLOCK_SIZE, static_cast<size_t>(internal->maxThreadsPerBlock())));
dim3 gridDims(std::min(CEIL_DIV(output_size, blockDims.x), static_cast<size_t>(internal->gridSizeX())));
dim3 blockDims(std::min(BLOCK_SIZE, static_cast<uint32_t>(internal->maxThreadsPerBlock())));
dim3 gridDims(std::min(uint32_t(CEIL_DIV(output_size, blockDims.x)), static_cast<uint32_t>(internal->gridSizeX())));
size_t step = gridDims.x * blockDims.x;
for (size_t i = 0; i < output_size; i += step) {
......@@ -228,7 +228,7 @@ utils::Result<DeviceImpl *> DeviceImpl::create(Args &&...args) {
}
/* Invoke elementwise operation for different input types */
template <unsigned int BLOCK_SIZE, typename Op, typename Tout, typename... Tin, typename... Args,
template <uint32_t BLOCK_SIZE, typename Op, typename Tout, typename... Tin, typename... Args,
std::enable_if_t<(sizeof...(Tin) == Op::num_inputs), int>>
infiniStatus_t DeviceImpl::calculate(const op::elementwise::ElementwiseInfo &info,
void *workspace,
......@@ -245,7 +245,7 @@ infiniStatus_t DeviceImpl::calculate(const op::elementwise::ElementwiseInfo &inf
}
/* Invoke elementwise operation when all inputs have the same dtype */
template <unsigned int BLOCK_SIZE, typename Op, typename Tdata, typename... Args>
template <uint32_t BLOCK_SIZE, typename Op, typename Tdata, typename... Args>
infiniStatus_t DeviceImpl::calculate(const op::elementwise::ElementwiseInfo &info,
void *workspace,
void *output,
......
......@@ -17,7 +17,7 @@ public:
template <typename... Args>
static utils::Result<DeviceImpl *> create(Args &&...args);
template <unsigned int BLOCK_SIZE, typename Op, typename Tdata, typename... Args>
template <uint32_t BLOCK_SIZE, typename Op, typename Tdata, typename... Args>
infiniStatus_t calculate(
const op::elementwise::ElementwiseInfo &info,
void *workspace,
......@@ -26,7 +26,7 @@ public:
void *stream,
Args &&...args);
template <unsigned int BLOCK_SIZE, typename Op, typename Tout, typename... Tin,
template <uint32_t BLOCK_SIZE, typename Op, typename Tout, typename... Tin,
typename... Args,
std::enable_if_t<(sizeof...(Tin) == Op::num_inputs), int> = 0>
infiniStatus_t calculate(
......
#ifndef __SWIGLU_MACA_H__
#define __SWIGLU_MACA_H__
#include "../../../elementwise/maca/elementwise_maca.h"
#include <hctlass/half.h>
namespace op::swiglu::maca {
typedef struct SwiGLUOp {
private:
template <typename T>
__device__ __forceinline__ T sigmoid(const T &x) const {
// if constexpr (std::is_same_v<T, half2>) {
// return h2rcp(__hadd2(make_half2(1, 1), h2exp(__hneg2(x))));
// } else
if constexpr (std::is_same_v<T, half>) {
if constexpr (std::is_same_v<T, half2>) {
return h2rcp(__hadd2(make_half2(1, 1), h2exp(__hneg2(x))));
} else if constexpr (std::is_same_v<T, half>) {
return hrcp(__hadd(half(1.f), __float2half(__expf(__half2float(__hneg(x))))));
} else if constexpr (std::is_same_v<T, float>) {
return __frcp_rd(__fadd_rd(1, __expf(-x)));
......@@ -33,3 +36,5 @@ public:
}
} SwiGLUOp;
} // namespace op::swiglu::maca
#endif
......@@ -42,13 +42,11 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
#ifdef ENABLE_CUDA_API
CREATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
< < < < < < < HEAD
#ifdef ENABLE_KUNLUN_API
CREATE(INFINI_DEVICE_KUNLUN, kunlun);
=======
#endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, maca);
>>>>>>> f3a0177 (Migrate elementwise base from cuda to maca, and implement swiglu with test pass)
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
......@@ -94,13 +92,11 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
#ifdef ENABLE_CUDA_API
GET(INFINI_DEVICE_NVIDIA, cuda)
#endif
< < < < < < < HEAD
#ifdef ENABLE_KUNLUN_API
GET(INFINI_DEVICE_KUNLUN, kunlun)
=======
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, maca);
>>>>>>> f3a0177 (Migrate elementwise base from cuda to maca, and implement swiglu with test pass)
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
......@@ -149,13 +145,11 @@ __C infiniStatus_t infiniopSwiGLU(
#ifdef ENABLE_CUDA_API
CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
< < < < < < < HEAD
#ifdef ENABLE_KUNLUN_API
CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
=======
#endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, maca);
>>>>>>> f3a0177 (Migrate elementwise base from cuda to maca, and implement swiglu with test pass)
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
......@@ -197,13 +191,11 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
#ifdef ENABLE_CUDA_API
DELETE(INFINI_DEVICE_NVIDIA, cuda);
#endif
< < < < < < < HEAD
#ifdef ENABLE_KUNLUN_API
DELETE(INFINI_DEVICE_KUNLUN, kunlun);
=======
#endif
#ifdef ENABLE_METAX_API
DELETE(INFINI_DEVICE_METAX, maca);
>>>>>>> f3a0177 (Migrate elementwise base from cuda to maca, and implement swiglu with test pass)
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment