Commit abf1e021 authored by YdrMaster's avatar YdrMaster
Browse files

issue/291/refactor: 改造 rms_norm、rope、swiglu


Signed-off-by: default avatarYdrMaster <ydrml@hotmail.com>
parent f06eb359
#ifndef __RMS_NORM_CUDA_KERNEL_H__
#define __RMS_NORM_CUDA_KERNEL_H__
#include "../../../devices/cuda/cuda_kernel_common.cuh"
#include <cub/block/block_reduce.cuh>
#include "../../../reduce/cuda/reduce.cuh"
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tweight, typename Tcompute>
INFINIOP_CUDA_KERNEL rmsnormBlock(
Tdata *__restrict__ y,
......
#include "../../../devices/cuda/cuda_common.cuh"
#include "rms_norm_cuda.cuh"
#include "rms_norm_kernel.cuh"
#include "rms_norm_nvidia.cuh"
namespace op::rms_norm::cuda {
#include "../../../devices/cuda/cuda_kernel_common.cuh"
#include <cub/block/block_reduce.cuh>
#include "../../../reduce/cuda/reduce.cuh"
#include "../cuda/kernel.cuh"
namespace op::rms_norm::nvidia {
struct Descriptor::Opaque {
std::shared_ptr<device::cuda::Handle::Internal> internal;
......
......@@ -3,6 +3,6 @@
#include "../rms_norm.h"
DESCRIPTOR(cuda)
DESCRIPTOR(nvidia)
#endif
......@@ -6,7 +6,7 @@
#include "cpu/rms_norm_cpu.h"
#endif
#ifdef ENABLE_NVIDIA_API
#include "cuda/rms_norm_cuda.cuh"
#include "nvidia/rms_norm_nvidia.cuh"
#endif
#ifdef ENABLE_ASCEND_API
#include "ascend/rms_norm_aclnn.h"
......@@ -37,17 +37,17 @@ __C infiniStatus_t infiniopCreateRMSNormDescriptor(
y_desc, \
x_desc, \
w_desc, \
epsilon);
epsilon)
switch (handle->device) {
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu)
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, cuda)
CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
CREATE(INFINI_DEVICE_KUNLUN, kunlun)
CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
......@@ -55,13 +55,13 @@ __C infiniStatus_t infiniopCreateRMSNormDescriptor(
}
#endif
#ifdef ENABLE_ASCEND_API
CREATE(INFINI_DEVICE_ASCEND, ascend)
CREATE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, maca)
CREATE(INFINI_DEVICE_METAX, maca);
#endif
#ifdef ENABLE_MOORE_API
CREATE(INFINI_DEVICE_MOORE, musa)
CREATE(INFINI_DEVICE_MOORE, musa);
#endif
}
......@@ -75,17 +75,17 @@ __C infiniStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t d
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::rms_norm::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS;
return INFINI_STATUS_SUCCESS
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu)
GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, cuda)
GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
GET(INFINI_DEVICE_KUNLUN, kunlun)
GET(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
......@@ -93,13 +93,13 @@ __C infiniStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t d
}
#endif
#ifdef ENABLE_ASCEND_API
GET(INFINI_DEVICE_ASCEND, ascend)
GET(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, maca)
GET(INFINI_DEVICE_METAX, maca);
#endif
#ifdef ENABLE_MOORE_API
GET(INFINI_DEVICE_MOORE, musa)
GET(INFINI_DEVICE_MOORE, musa);
#endif
}
......@@ -114,17 +114,17 @@ __C infiniStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *works
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<op::rms_norm::NAMESPACE::Descriptor *>(desc)->calculate( \
workspace, workspace_size, y, x, w, stream);
workspace, workspace_size, y, x, w, stream)
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu)
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, cuda)
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
CALCULATE(INFINI_DEVICE_KUNLUN, kunlun)
CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
......@@ -132,13 +132,13 @@ __C infiniStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *works
}
#endif
#ifdef ENABLE_ASCEND_API
CALCULATE(INFINI_DEVICE_ASCEND, ascend)
CALCULATE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, maca)
CALCULATE(INFINI_DEVICE_METAX, maca);
#endif
#ifdef ENABLE_MOORE_API
CALCULATE(INFINI_DEVICE_MOORE, musa)
CALCULATE(INFINI_DEVICE_MOORE, musa);
#endif
}
......@@ -152,17 +152,17 @@ __C infiniStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_t
#define DESTROY(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<op::rms_norm::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS;
return INFINI_STATUS_SUCCESS
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
DESTROY(INFINI_DEVICE_CPU, cpu)
DESTROY(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
DESTROY(INFINI_DEVICE_NVIDIA, cuda)
DESTROY(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
DESTROY(INFINI_DEVICE_KUNLUN, kunlun)
DESTROY(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
......@@ -170,13 +170,13 @@ __C infiniStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_t
}
#endif
#ifdef ENABLE_ASCEND_API
DESTROY(INFINI_DEVICE_ASCEND, ascend)
DESTROY(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_METAX_API
DESTROY(INFINI_DEVICE_METAX, maca)
DESTROY(INFINI_DEVICE_METAX, maca);
#endif
#ifdef ENABLE_MOORE_API
DESTROY(INFINI_DEVICE_MOORE, musa)
DESTROY(INFINI_DEVICE_MOORE, musa);
#endif
}
......
#ifndef __INFINIOP_ROPE_CUDA_KERNEL_CUH__
#define __INFINIOP_ROPE_CUDA_KERNEL_CUH__
#include "../../../devices/cuda/cuda_kernel_common.cuh"
template <typename Tdata, typename Tindex, typename Tangle>
INFINIOP_CUDA_KERNEL ropeThreadPerItem(
Tdata *y_,
......
#include "../../../devices/cuda/cuda_common.cuh"
#include "rope_cuda.cuh"
#include "rope_cuda_kernel.cuh"
#include "rope_nvidia.cuh"
namespace op::rope::cuda {
#include "../../../devices/cuda/cuda_kernel_common.cuh"
#include "../cuda/kernel.cuh"
namespace op::rope::nvidia {
struct Descriptor::Opaque {
std::shared_ptr<device::cuda::Handle::Internal> internal;
......
......@@ -3,6 +3,6 @@
#include "../rope.h"
DESCRIPTOR(cuda)
DESCRIPTOR(nvidia)
#endif // __INFINIOP_ROPE_CUDA_H__
......@@ -6,7 +6,7 @@
#include "cpu/rope_cpu.h"
#endif
#ifdef ENABLE_NVIDIA_API
#include "cuda/rope_cuda.cuh"
#include "nvidia/rope_nvidia.cuh"
#endif
#ifdef ENABLE_ASCEND_API
#include "ascend/rope_ascend.h"
......@@ -40,7 +40,7 @@ __C infiniStatus_t infiniopCreateRoPEDescriptor(
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, cuda);
CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, maca);
......@@ -81,7 +81,7 @@ __C infiniStatus_t infiniopGetRoPEWorkspaceSize(infiniopRoPEDescriptor_t desc,
GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, cuda);
GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, maca);
......@@ -132,7 +132,7 @@ __C infiniStatus_t infiniopRoPE(
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, maca);
......@@ -178,7 +178,7 @@ infiniopDestroyRoPEDescriptor(infiniopRoPEDescriptor_t desc) {
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
DELETE(INFINI_DEVICE_NVIDIA, cuda);
DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_METAX_API
DELETE(INFINI_DEVICE_METAX, maca);
......
#include "swiglu_cuda.cuh"
#include "swiglu_cuda_internal.cuh"
#include "swiglu_nvidia.cuh"
#include "../cuda/kernel.cuh"
namespace op::swiglu::cuda {
namespace op::swiglu::nvidia {
Descriptor::~Descriptor() = default;
......@@ -42,13 +42,13 @@ infiniStatus_t Descriptor::calculate(
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, SwiGLUOp, half>(_info, workspace, output, inputs, stream);
return _device_info->calculate<256, cuda::SwiGLUOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, SwiGLUOp, __nv_bfloat16>(_info, workspace, output, inputs, stream);
return _device_info->calculate<256, cuda::SwiGLUOp, __nv_bfloat16>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, SwiGLUOp, float>(_info, workspace, output, inputs, stream);
return _device_info->calculate<256, cuda::SwiGLUOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, SwiGLUOp, double>(_info, workspace, output, inputs, stream);
return _device_info->calculate<256, cuda::SwiGLUOp, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
......
......@@ -3,6 +3,6 @@
#include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
ELEMENTWISE_DESCRIPTOR(swiglu, cuda, cuda)
ELEMENTWISE_DESCRIPTOR(swiglu, nvidia, cuda)
#endif // __SWIGLU_CUDA_API_H__
......@@ -6,7 +6,7 @@
#include "cpu/swiglu_cpu.h"
#endif
#ifdef ENABLE_NVIDIA_API
#include "cuda/swiglu_cuda.cuh"
#include "nvidia/swiglu_nvidia.cuh"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/swiglu_kunlun.h"
......@@ -40,7 +40,7 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, cuda);
CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
CREATE(INFINI_DEVICE_KUNLUN, kunlun);
......@@ -83,17 +83,17 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::swiglu::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS;
return INFINI_STATUS_SUCCESS
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu)
GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, cuda)
GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
GET(INFINI_DEVICE_KUNLUN, kunlun)
GET(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, maca);
......@@ -104,12 +104,7 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
}
#endif
#ifdef ENABLE_ASCEND_API
GET(INFINI_DEVICE_ASCEND, ascend)
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaGetSwiGLUWorkspaceSize((SwiGLUMacaDescriptor_t)desc, size);
}
GET(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: {
......@@ -143,7 +138,7 @@ __C infiniStatus_t infiniopSwiGLU(
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
......@@ -189,7 +184,7 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
DELETE(INFINI_DEVICE_NVIDIA, cuda);
DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
DELETE(INFINI_DEVICE_KUNLUN, kunlun);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment