Unverified Commit 8d09630a authored by gongchensu, committed by GitHub

Merge branch 'demo131' into Issue/862

parents ab52dead 012df56c
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "add_rms_norm_nvidia.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include <cub/block/block_reduce.cuh>
#include "../../../reduce/cuda/reduce.cuh"
#include "../cuda/kernel.cuh"
template <unsigned int BLOCK_SIZE, typename Tcompute, typename Tdata, typename Tweight>
INFINIOP_CUDA_KERNEL add_rmsnormKernel(
Tdata *__restrict__ y,
Tdata *__restrict__ residual_out,
ptrdiff_t stride_y_batch,
ptrdiff_t stride_y_nhead,
ptrdiff_t stride_residual_out_batch,
ptrdiff_t stride_residual_out_nhead,
const Tdata *__restrict__ a,
ptrdiff_t stride_a_batch,
ptrdiff_t stride_a_nhead,
const Tdata *__restrict__ b,
ptrdiff_t stride_b_batch,
ptrdiff_t stride_b_nhead,
const Tweight *__restrict__ w,
size_t nhead,
size_t dim,
float epsilon) {
add_rmsnormBlock<BLOCK_SIZE, Tcompute>(
y, residual_out,
stride_y_batch, stride_y_nhead,
stride_residual_out_batch, stride_residual_out_nhead,
a, stride_a_batch, stride_a_nhead,
b, stride_b_batch, stride_b_nhead,
w, nhead, dim, epsilon);
}
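// Note: add_rmsnormBlock is provided by ../cuda/kernel.cuh. As a minimal sketch (assuming the
// usual fused add + RMSNorm semantics; the exact reduction lives in that shared block kernel),
// each (batch, head) row of length `dim` is expected to be processed as:
//   residual_out[i] = a[i] + b[i]
//   rms             = sqrt(sum_i(residual_out[i]^2) / dim + epsilon)
//   y[i]            = (residual_out[i] / rms) * w[i]
// with the accumulation carried out in Tcompute (float) regardless of the storage dtypes.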
namespace op::add_rms_norm::nvidia {
struct Descriptor::Opaque {
std::shared_ptr<device::nvidia::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t residual_out_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc,
infiniopTensorDescriptor_t weight_desc,
float epsilon) {
auto result = AddRMSNormInfo::create(y_desc, residual_out_desc, a_desc, b_desc, weight_desc, epsilon);
CHECK_RESULT(result);
auto info = result.take();
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
std::move(info),
0,
handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
// launch kernel with different data types
template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(
uint32_t batch_size, size_t nhead, size_t dim,
void *y, infiniDtype_t atype, ptrdiff_t stride_y_batch, ptrdiff_t stride_y_nhead,
void *residual_out, ptrdiff_t stride_residual_out_batch, ptrdiff_t stride_residual_out_nhead,
const void *a, ptrdiff_t stride_a_batch, ptrdiff_t stride_a_nhead,
const void *b, ptrdiff_t stride_b_batch, ptrdiff_t stride_b_nhead,
const void *w, infiniDtype_t wtype,
float epsilon,
cudaStream_t cuda_stream) {
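// Each CUDA block is expected to handle one (batch, head) row of length `dim`, so the grid
// below is batch_size * nhead blocks of BLOCK_SIZE threads each.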
#define LAUNCH_KERNEL(Tdata, Tweight, Tcompute) \
add_rmsnormKernel<BLOCK_SIZE, Tcompute, Tdata, Tweight><<<batch_size * nhead, BLOCK_SIZE, 0, cuda_stream>>>( \
reinterpret_cast<Tdata *>(y), \
reinterpret_cast<Tdata *>(residual_out), \
stride_y_batch, \
stride_y_nhead, \
stride_residual_out_batch, \
stride_residual_out_nhead, \
reinterpret_cast<const Tdata *>(a), \
stride_a_batch, \
stride_a_nhead, \
reinterpret_cast<const Tdata *>(b), \
stride_b_batch, \
stride_b_nhead, \
reinterpret_cast<const Tweight *>(w), \
nhead, \
dim, \
epsilon)
if (atype == INFINI_DTYPE_F16 && wtype == INFINI_DTYPE_F16) {
LAUNCH_KERNEL(half, half, float);
} else if (atype == INFINI_DTYPE_F16 && wtype == INFINI_DTYPE_BF16) {
LAUNCH_KERNEL(half, __nv_bfloat16, float);
} else if (atype == INFINI_DTYPE_F16 && wtype == INFINI_DTYPE_F32) {
LAUNCH_KERNEL(half, float, float);
} else if (atype == INFINI_DTYPE_BF16 && wtype == INFINI_DTYPE_BF16) {
LAUNCH_KERNEL(__nv_bfloat16, __nv_bfloat16, float);
} else if (atype == INFINI_DTYPE_BF16 && wtype == INFINI_DTYPE_F16) {
LAUNCH_KERNEL(__nv_bfloat16, half, float);
} else if (atype == INFINI_DTYPE_BF16 && wtype == INFINI_DTYPE_F32) {
LAUNCH_KERNEL(__nv_bfloat16, float, float);
} else if (atype == INFINI_DTYPE_F32 && wtype == INFINI_DTYPE_F32) {
LAUNCH_KERNEL(float, float, float);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
#undef LAUNCH_KERNEL
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace, size_t workspace_size,
void *y, void *residual_out, const void *a, const void *b, const void *weight,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
auto stride_a_batch = _info.a_strides[0];
auto stride_a_nhead = _info.a_strides[1];
auto stride_b_batch = _info.b_strides[0];
auto stride_b_nhead = _info.b_strides[1];
auto stride_y_batch = _info.y_strides[0];
auto stride_y_nhead = _info.y_strides[1];
auto stride_residual_out_batch = _info.residual_out_strides[0];
auto stride_residual_out_nhead = _info.residual_out_strides[1];
auto dim = _info.dim();
uint32_t batch_size = static_cast<uint32_t>(_info.shape[0]);
size_t nhead = _info.shape.size() > 2 ? _info.shape[1] : 1;
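// The tensors are either 2-D [batch, dim] (treated as nhead == 1) or 3-D [batch, nhead, dim];
// shape[1] is the head count only in the 3-D case.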
auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
// launch kernel with different block sizes
if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) {
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_512>(
batch_size, nhead, dim,
y, _info.atype, stride_y_batch, stride_y_nhead,
residual_out, stride_residual_out_batch, stride_residual_out_nhead,
a, stride_a_batch, stride_a_nhead,
b, stride_b_batch, stride_b_nhead,
weight, _info.wtype, _info.epsilon, cuda_stream));
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_1024>(
batch_size, nhead, dim,
y, _info.atype, stride_y_batch, stride_y_nhead,
residual_out, stride_residual_out_batch, stride_residual_out_nhead,
a, stride_a_batch, stride_a_nhead,
b, stride_b_batch, stride_b_nhead,
weight, _info.wtype, _info.epsilon, cuda_stream));
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_2048) {
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_2048>(
batch_size, nhead, dim,
y, _info.atype, stride_y_batch, stride_y_nhead,
residual_out, stride_residual_out_batch, stride_residual_out_nhead,
a, stride_a_batch, stride_a_nhead,
b, stride_b_batch, stride_b_nhead,
weight, _info.wtype, _info.epsilon, cuda_stream));
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(
batch_size, nhead, dim,
y, _info.atype, stride_y_batch, stride_y_nhead,
residual_out, stride_residual_out_batch, stride_residual_out_nhead,
a, stride_a_batch, stride_a_nhead,
b, stride_b_batch, stride_b_nhead,
weight, _info.wtype, _info.epsilon, cuda_stream));
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::add_rms_norm::nvidia
#ifndef __ADD_RMS_NORM_NVIDIA_CUDA_H__
#define __ADD_RMS_NORM_NVIDIA_CUDA_H__
#include "../add_rms_norm.h"
DESCRIPTOR(nvidia)
#endif // __ADD_RMS_NORM_NVIDIA_CUDA_H__
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/add_rms_norm.h"
#ifdef ENABLE_CPU_API
#include "cpu/add_rms_norm_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_ALI_API)
#include "nvidia/add_rms_norm_nvidia.cuh"
#endif
#ifdef ENABLE_ASCEND_API
// TODO: Add Ascend implementation
// #include "ascend/add_rms_norm_aclnn.h"
#endif
#ifdef ENABLE_CAMBRICON_API
// TODO: Add Cambricon implementation
// #include "bang/add_rms_norm_bang.h"
#endif
#ifdef ENABLE_METAX_API
#include "metax/add_rms_norm_metax.cuh"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/add_rms_norm_moore.h"
#endif
#ifdef ENABLE_KUNLUN_API
// TODO: Add Kunlun implementation
// #include "kunlun/add_rms_norm_kunlun.h"
#endif
__C infiniStatus_t infiniopCreateAddRMSNormDescriptor(
infiniopHandle_t handle,
infiniopAddRMSNormDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t residual_out_desc,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc,
infiniopTensorDescriptor_t weight_desc,
float epsilon) {
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::add_rms_norm::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::add_rms_norm::NAMESPACE::Descriptor **>(desc_ptr), \
y_desc, \
residual_out_desc, \
a_desc, \
b_desc, \
weight_desc, \
epsilon)
switch (handle->device) {
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_ALI_API
CREATE(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_MOORE_API
CREATE(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_QY_API
CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
CREATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
// CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CREATE
}
__C infiniStatus_t infiniopGetAddRMSNormWorkspaceSize(infiniopAddRMSNormDescriptor_t desc, size_t *size) {
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::add_rms_norm::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_ALI_API
GET(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_MOORE_API
GET(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_QY_API
GET(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
GET(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
// GET(INFINI_DEVICE_KUNLUN, kunlun);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef GET
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniStatus_t infiniopAddRMSNorm(
infiniopAddRMSNormDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
void *residual_out,
const void *a,
const void *b,
const void *weight,
void *stream) {
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<const op::add_rms_norm::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, y, residual_out, a, b, weight, stream)
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_ALI_API
CALCULATE(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_MOORE_API
CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_QY_API
CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
CALCULATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
// CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CALCULATE
}
__C infiniStatus_t infiniopDestroyAddRMSNormDescriptor(infiniopAddRMSNormDescriptor_t desc) {
if (desc == nullptr) {
return INFINI_STATUS_SUCCESS;
}
#define DESTROY(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<op::add_rms_norm::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
DESTROY(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
DESTROY(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
DESTROY(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_ALI_API
DESTROY(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_MOORE_API
DESTROY(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_METAX_API
DESTROY(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_QY_API
DESTROY(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
DESTROY(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
// DESTROY(INFINI_DEVICE_KUNLUN, kunlun);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef DESTROY
}
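/*
 * A minimal host-side usage sketch for the C API defined above (the handle, tensor
 * descriptors, device buffers and stream are assumed to be created elsewhere; error
 * handling is omitted):
 *
 *   infiniopAddRMSNormDescriptor_t desc;
 *   infiniopCreateAddRMSNormDescriptor(handle, &desc, residual_out_desc, y_desc,
 *                                      a_desc, b_desc, weight_desc, 1e-6f);
 *   size_t workspace_size;
 *   infiniopGetAddRMSNormWorkspaceSize(desc, &workspace_size);
 *   // allocate a device buffer `workspace` of at least `workspace_size` bytes
 *   infiniopAddRMSNorm(desc, workspace, workspace_size, y, residual_out, a, b, weight, stream);
 *   infiniopDestroyAddRMSNormDescriptor(desc);
 */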
@@ -131,7 +131,7 @@ void causalSoftmaxUnion(void *workspace, int core_per_cluster, int cluster_count
kernel_dim.x = core_per_cluster;
kernel_dim.y = cluster_count;
kernel_dim.z = 1;
kernel_type = CNRT_FUNC_TYPE_UNION1;
kernel_type = cnrtFuncTypeUnion1;
// Launch kernel
causalSoftmax<T><<<kernel_dim, kernel_type, queue>>>(
@@ -28,7 +28,7 @@ __device__ void causalSoftmaxKernel(
// 1 | * * * ... * * |
// 2 | * * * ... * * * |
// height: 3 col_id->
if (width + blockIdx.x >= threadIdx.x + height) {
if (width + blockIdx.x >= col + height) {
if constexpr (std::is_same_v<Tdata, half> || std::is_same_v<Tdata, cuda_bfloat16>) {
/*
* MUSA does not support CUDA's native `hexp` function.
@@ -76,7 +76,15 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
const void *x,
void *stream_) const {
cudaStream_t stream = (cudaStream_t)stream_;
if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(
y, x, _info.dtype, _info.batch_size, _info.seq_len, _info.total_seq_len,
_info.y_stride_b, _info.y_stride_i, _info.x_stride_b, _info.x_stride_i, stream));
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_2048) {
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_2048>(
y, x, _info.dtype, _info.batch_size, _info.seq_len, _info.total_seq_len,
_info.y_stride_b, _info.y_stride_i, _info.x_stride_b, _info.x_stride_i, stream));
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_1024>(
y, x, _info.dtype, _info.batch_size, _info.seq_len, _info.total_seq_len,
_info.y_stride_b, _info.y_stride_i, _info.x_stride_b, _info.x_stride_i, stream));
@@ -84,10 +92,6 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_512>(
y, x, _info.dtype, _info.batch_size, _info.seq_len, _info.total_seq_len,
_info.y_stride_b, _info.y_stride_i, _info.x_stride_b, _info.x_stride_i, stream));
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(
y, x, _info.dtype, _info.batch_size, _info.seq_len, _info.total_seq_len,
_info.y_stride_b, _info.y_stride_i, _info.x_stride_b, _info.x_stride_i, stream));
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
@@ -5,7 +5,7 @@
#ifdef ENABLE_CPU_API
#include "cpu/causal_softmax_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_ALI_API)
#include "nvidia/causal_softmax_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
@@ -48,6 +48,9 @@ __C infiniStatus_t infiniopCreateCausalSoftmaxDescriptor(
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_ALI_API
CREATE(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_QY_API
CREATE(INFINI_DEVICE_QY, nvidia);
#endif
@@ -90,6 +93,9 @@ __C infiniStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDe
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_ALI_API
GET(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_QY_API
GET(INFINI_DEVICE_QY, nvidia);
#endif
@@ -137,6 +143,9 @@ __C infiniStatus_t infiniopCausalSoftmax(
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_ALI_API
CALCULATE(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_QY_API
CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
@@ -179,6 +188,9 @@ __C infiniStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxD
#ifdef ENABLE_ILUVATAR_API
DESTROY(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_ALI_API
DESTROY(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_QY_API
DESTROY(INFINI_DEVICE_QY, nvidia);
#endif
@@ -5,7 +5,7 @@
#ifdef ENABLE_CPU_API
#include "cpu/clip_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API)
#include "nvidia/clip_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
@@ -42,6 +42,9 @@ __C infiniStatus_t infiniopCreateClipDescriptor(
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_ALI_API
CREATE(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_QY_API
CREATE(INFINI_DEVICE_QY, nvidia);
#endif
@@ -76,6 +79,9 @@ __C infiniStatus_t infiniopGetClipWorkspaceSize(infiniopClipDescriptor_t desc, s
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_ALI_API
GET(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_QY_API
GET(INFINI_DEVICE_QY, nvidia);
#endif
@@ -118,6 +124,9 @@ __C infiniStatus_t infiniopClip(
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_ALI_API
CALCULATE(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_QY_API
CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
@@ -154,6 +163,9 @@ infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc) {
#ifdef ENABLE_ILUVATAR_API
DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_ALI_API
DELETE(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_QY_API
DELETE(INFINI_DEVICE_QY, nvidia);
#endif
@@ -5,7 +5,7 @@
#ifdef ENABLE_CPU_API
#include "cpu/conv_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API)
#include "nvidia/conv_nvidia.cuh"
#endif
@@ -45,6 +45,9 @@ __C __export infiniStatus_t infiniopCreateConvDescriptor(infiniopHandle_t handle
#ifdef ENABLE_QY_API
CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_ALI_API
CREATE(INFINI_DEVICE_ALI, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -76,6 +79,9 @@ infiniopGetConvWorkspaceSize(
#ifdef ENABLE_QY_API
GET(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_ALI_API
GET(INFINI_DEVICE_ALI, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -115,6 +121,9 @@ __C infiniStatus_t infiniopConv(
#ifdef ENABLE_QY_API
CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_ALI_API
CALCULATE(INFINI_DEVICE_ALI, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -142,6 +151,9 @@ infiniopDestroyConvDescriptor(infiniopConvDescriptor_t desc) {
#ifdef ENABLE_QY_API
DELETE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_ALI_API
DELETE(INFINI_DEVICE_ALI, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API)
#include "../../../devices/nvidia/nvidia_handle.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
@@ -2,7 +2,7 @@
#include "../../handle.h"
#include "infiniop/ops/dequantize_awq.h"
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API)
#include "nvidia/dequantize_w42f16_nvidia.cuh"
#endif
#ifdef ENABLE_MOORE_API
@@ -43,6 +43,10 @@ __C infiniStatus_t infiniopCreateDequantizeAWQDescriptor(
#ifdef ENABLE_QY_API
CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_ALI_API
CREATE(INFINI_DEVICE_ALI, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
@@ -70,6 +74,10 @@ __C infiniStatus_t infiniopGetDequantizeAWQWorkspaceSize(infiniopDequantizeAWQDe
#ifdef ENABLE_QY_API
GET(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_ALI_API
GET(INFINI_DEVICE_ALI, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
@@ -104,6 +112,10 @@ __C infiniStatus_t infiniopDequantizeAWQ(
#ifdef ENABLE_QY_API
CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_ALI_API
CALCULATE(INFINI_DEVICE_ALI, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
@@ -132,6 +144,10 @@ infiniopDestroyDequantizeAWQDescriptor(infiniopDequantizeAWQDescriptor_t desc) {
#ifdef ENABLE_QY_API
DELETE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_ALI_API
DELETE(INFINI_DEVICE_ALI, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#include "embedding_cpu.h"
#include "../../../../utils.h"
#include "../../../handle.h"
#include "../../../tensor.h"
#include <cstring>
namespace op::embedding::cpu {
struct Descriptor::Opaque {};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t output_desc,
infiniopTensorDescriptor_t input_desc,
infiniopTensorDescriptor_t weight_desc) {
auto input_shape = input_desc->shape();
auto weight_shape = weight_desc->shape();
CHECK_OR_RETURN(weight_shape.size() == 2, INFINI_STATUS_BAD_TENSOR_SHAPE);
CHECK_OR_RETURN(output_desc->shape().size() == input_shape.size() + 1, INFINI_STATUS_BAD_TENSOR_SHAPE);
auto output_shape = output_desc->shape();
size_t embedding_dim = weight_shape[1];
CHECK_OR_RETURN(output_shape.back() == embedding_dim, INFINI_STATUS_BAD_TENSOR_SHAPE);
for (size_t i = 0; i < input_shape.size(); ++i) {
CHECK_OR_RETURN(output_shape[i] == input_shape[i], INFINI_STATUS_BAD_TENSOR_SHAPE);
}
auto input_dtype = input_desc->dtype();
auto weight_dtype = weight_desc->dtype();
CHECK_OR_RETURN(input_dtype == INFINI_DTYPE_I32 || input_dtype == INFINI_DTYPE_I64,
INFINI_STATUS_BAD_TENSOR_DTYPE);
CHECK_OR_RETURN(weight_dtype == INFINI_DTYPE_F32 || weight_dtype == INFINI_DTYPE_F16 || weight_dtype == INFINI_DTYPE_BF16, INFINI_STATUS_BAD_TENSOR_DTYPE);
CHECK_OR_RETURN(output_desc->dtype() == weight_dtype, INFINI_STATUS_BAD_TENSOR_DTYPE);
size_t num_indices = 1;
for (auto dim : input_shape) {
num_indices *= dim;
}
size_t vocab_size = weight_shape[0];
*desc_ptr = new Descriptor(
num_indices,
embedding_dim,
vocab_size,
input_dtype,
weight_dtype,
new Opaque{},
handle->device,
handle->device_id);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *output,
const void *input,
const void *weight,
void *stream) const {
if (_num_indices == 0) {
return INFINI_STATUS_SUCCESS;
}
size_t element_size = infiniSizeOf(_weight_dtype);
size_t row_bytes = _embedding_dim * element_size;
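// Out-of-range and negative indices are skipped below, leaving the corresponding output
// row untouched; callers that need zero-filled rows must initialize the output themselves.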
if (_input_dtype == INFINI_DTYPE_I32) {
const int32_t *indices_ptr = reinterpret_cast<const int32_t *>(input);
const std::byte *weight_ptr = reinterpret_cast<const std::byte *>(weight);
std::byte *out_ptr = reinterpret_cast<std::byte *>(output);
for (size_t i = 0; i < _num_indices; ++i) {
int32_t idx = indices_ptr[i];
if (idx >= 0 && static_cast<size_t>(idx) < _vocab_size) {
std::memcpy(out_ptr + i * row_bytes,
weight_ptr + static_cast<size_t>(idx) * row_bytes,
row_bytes);
}
}
} else if (_input_dtype == INFINI_DTYPE_I64) {
const int64_t *indices_ptr = reinterpret_cast<const int64_t *>(input);
const std::byte *weight_ptr = reinterpret_cast<const std::byte *>(weight);
std::byte *out_ptr = reinterpret_cast<std::byte *>(output);
for (size_t i = 0; i < _num_indices; ++i) {
int64_t idx = indices_ptr[i];
if (idx >= 0 && static_cast<size_t>(idx) < _vocab_size) {
std::memcpy(out_ptr + i * row_bytes,
weight_ptr + static_cast<size_t>(idx) * row_bytes,
row_bytes);
}
}
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::embedding::cpu
#ifndef __EMBEDDING_CPU_H__
#define __EMBEDDING_CPU_H__
#include "../embedding.h"
DESCRIPTOR(cpu)
#endif // __EMBEDDING_CPU_H__
#ifndef __EMBEDDING_CUDA_KERNEL_CUH__
#define __EMBEDDING_CUDA_KERNEL_CUH__
#include <type_traits>
// Helper function to check memory alignment
__forceinline__ __device__ bool is_aligned(const void *ptr, size_t alignment) {
// Use size_t for pointer arithmetic in device code (more compatible)
return (reinterpret_cast<size_t>(ptr) % alignment == 0);
}
// Vectorized copy for float type using float4
template <typename IndexType>
__forceinline__ __device__ void copyVectorizedFloat4(
float *__restrict__ dst,
const float *__restrict__ src,
size_t embedding_dim) {
// Use float4 for vectorized access (16 bytes, 4 floats)
const float4 *src_vec = reinterpret_cast<const float4 *>(src);
float4 *dst_vec = reinterpret_cast<float4 *>(dst);
size_t vec_count = embedding_dim / 4;
// Vectorized copy using __ldg for read-only weight
for (size_t i = 0; i < vec_count; ++i) {
dst_vec[i] = __ldg(&src_vec[i]);
}
// Copy remaining elements
size_t remaining = embedding_dim % 4;
if (remaining > 0) {
size_t offset = vec_count * 4;
for (size_t i = 0; i < remaining; ++i) {
dst[offset + i] = __ldg(&src[offset + i]);
}
}
}
// Vectorized copy for float type using float2 (fallback when not aligned to 16 bytes)
template <typename IndexType>
__forceinline__ __device__ void copyVectorizedFloat2(
float *__restrict__ dst,
const float *__restrict__ src,
size_t embedding_dim) {
// Use float2 for vectorized access (8 bytes, 2 floats)
const float2 *src_vec = reinterpret_cast<const float2 *>(src);
float2 *dst_vec = reinterpret_cast<float2 *>(dst);
size_t vec_count = embedding_dim / 2;
// Vectorized copy using __ldg for read-only weight
for (size_t i = 0; i < vec_count; ++i) {
dst_vec[i] = __ldg(&src_vec[i]);
}
// Copy remaining element if odd
if (embedding_dim % 2 != 0) {
dst[embedding_dim - 1] = __ldg(&src[embedding_dim - 1]);
}
}
// Vectorized copy for half type using half2
template <typename IndexType>
__forceinline__ __device__ void copyVectorizedHalf2(
half *__restrict__ dst,
const half *__restrict__ src,
size_t embedding_dim) {
// Use half2 for vectorized access (4 bytes, 2 half values)
const half2 *src_vec = reinterpret_cast<const half2 *>(src);
half2 *dst_vec = reinterpret_cast<half2 *>(dst);
size_t vec_count = embedding_dim / 2;
// Vectorized copy using __ldg for read-only weight
for (size_t i = 0; i < vec_count; ++i) {
dst_vec[i] = __ldg(&src_vec[i]);
}
// Copy remaining element if odd
if (embedding_dim % 2 != 0) {
dst[embedding_dim - 1] = __ldg(&src[embedding_dim - 1]);
}
}
// Vectorized copy for bfloat16 type using bfloat162
template <typename IndexType>
__forceinline__ __device__ void copyVectorizedBFloat162(
cuda_bfloat16 *__restrict__ dst,
const cuda_bfloat16 *__restrict__ src,
size_t embedding_dim) {
// Use bfloat162 for vectorized access (4 bytes, 2 bfloat16s)
const cuda_bfloat162 *src_vec = reinterpret_cast<const cuda_bfloat162 *>(src);
cuda_bfloat162 *dst_vec = reinterpret_cast<cuda_bfloat162 *>(dst);
size_t vec_count = embedding_dim / 2;
// Vectorized copy using __ldg for read-only weight
for (size_t i = 0; i < vec_count; ++i) {
dst_vec[i] = __ldg(&src_vec[i]);
}
// Copy remaining element if odd
if (embedding_dim % 2 != 0) {
dst[embedding_dim - 1] = __ldg(&src[embedding_dim - 1]);
}
}
// Scalar copy fallback with __ldg optimization
template <typename T, typename IndexType>
__forceinline__ __device__ void copyScalar(
T *__restrict__ dst,
const T *__restrict__ src,
size_t embedding_dim) {
// Scalar copy with __ldg for read-only weight
for (size_t i = 0; i < embedding_dim; ++i) {
dst[i] = __ldg(&src[i]);
}
}
#endif // __EMBEDDING_CUDA_KERNEL_CUH__
#ifndef __EMBEDDING_H__
#define __EMBEDDING_H__
#include "../../../utils.h"
#include "../../operator.h"
#define DESCRIPTOR(NAMESPACE) \
\
namespace op::embedding::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
size_t _num_indices; \
size_t _embedding_dim; \
size_t _vocab_size; \
infiniDtype_t _input_dtype; \
infiniDtype_t _weight_dtype; \
\
Descriptor( \
size_t num_indices, \
size_t embedding_dim, \
size_t vocab_size, \
infiniDtype_t input_dtype, \
infiniDtype_t weight_dtype, \
Opaque *opaque, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_num_indices(num_indices), \
_embedding_dim(embedding_dim), \
_vocab_size(vocab_size), \
_input_dtype(input_dtype), \
_weight_dtype(weight_dtype) {} \
\
public: \
~Descriptor(); \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t output_desc, \
infiniopTensorDescriptor_t input_desc, \
infiniopTensorDescriptor_t weight_desc); \
\
infiniStatus_t calculate( \
void *output, \
const void *input, \
const void *weight, \
void *stream) const; \
}; \
}
#endif // __EMBEDDING_H__
#ifndef __EMBEDDING_METAX_H__
#define __EMBEDDING_METAX_H__
#include "../embedding.h"
DESCRIPTOR(metax)
#endif // __EMBEDDING_METAX_H__
#include "../../../../utils.h"
#include "../../../devices/metax/metax_common.h"
#include "../../../devices/metax/metax_kernel_common.h"
#include "../../../tensor.h"
#include "../cuda/embedding_kernel.cuh"
#include "embedding_metax.cuh"
template <typename T, typename IndexType>
INFINIOP_METAX_KERNEL embeddingKernel(
T *__restrict__ output,
const IndexType *__restrict__ indices,
const T *__restrict__ weight,
size_t num_indices,
size_t embedding_dim,
size_t vocab_size) {
// Calculate global thread index
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < num_indices) {
// Get the index value
IndexType index_val = __ldg(&indices[idx]);
// Bounds check - handle negative indices gracefully
if (index_val >= 0 && static_cast<size_t>(index_val) < vocab_size) {
// Copy embedding vector from weight to output
const T *src = weight + static_cast<size_t>(index_val) * embedding_dim;
T *dst = output + idx * embedding_dim;
// Choose optimal copy strategy based on type and alignment
if constexpr (std::is_same_v<T, float>) {
// Check alignment for float4 (16 bytes)
bool aligned_16 = is_aligned(src, 16) && is_aligned(dst, 16);
if (aligned_16 && embedding_dim >= 4 && embedding_dim % 4 == 0) {
copyVectorizedFloat4<IndexType>(dst, src, embedding_dim);
} else if (embedding_dim >= 2 && embedding_dim % 2 == 0) {
// Try float2 if not aligned to 16 bytes
copyVectorizedFloat2<IndexType>(dst, src, embedding_dim);
} else {
copyScalar<T, IndexType>(dst, src, embedding_dim);
}
} else if constexpr (std::is_same_v<T, half>) {
// Use half2 for vectorized access
if (embedding_dim >= 2 && embedding_dim % 2 == 0) {
copyVectorizedHalf2<IndexType>(dst, src, embedding_dim);
} else {
copyScalar<T, IndexType>(dst, src, embedding_dim);
}
} else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
// Use bfloat162 for vectorized access
if (embedding_dim >= 2 && embedding_dim % 2 == 0) {
copyVectorizedBFloat162<IndexType>(dst, src, embedding_dim);
} else {
copyScalar<T, IndexType>(dst, src, embedding_dim);
}
} else {
// Fallback to scalar copy with __ldg
copyScalar<T, IndexType>(dst, src, embedding_dim);
}
}
}
}
namespace op::embedding::metax {
struct Descriptor::Opaque {
std::shared_ptr<device::metax::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t output_desc,
infiniopTensorDescriptor_t input_desc,
infiniopTensorDescriptor_t weight_desc) {
auto input_shape = input_desc->shape();
auto weight_shape = weight_desc->shape();
// Validate shapes
CHECK_OR_RETURN(weight_shape.size() == 2, INFINI_STATUS_BAD_TENSOR_SHAPE);
CHECK_OR_RETURN(output_desc->shape().size() == input_shape.size() + 1, INFINI_STATUS_BAD_TENSOR_SHAPE);
// Check output shape matches input shape + embedding_dim
auto output_shape = output_desc->shape();
size_t embedding_dim = weight_shape[1];
CHECK_OR_RETURN(output_shape.back() == embedding_dim, INFINI_STATUS_BAD_TENSOR_SHAPE);
for (size_t i = 0; i < input_shape.size(); ++i) {
CHECK_OR_RETURN(output_shape[i] == input_shape[i], INFINI_STATUS_BAD_TENSOR_SHAPE);
}
// Validate dtypes
auto input_dtype = input_desc->dtype();
auto weight_dtype = weight_desc->dtype();
CHECK_OR_RETURN(input_dtype == INFINI_DTYPE_I32 || input_dtype == INFINI_DTYPE_I64,
INFINI_STATUS_BAD_TENSOR_DTYPE);
CHECK_OR_RETURN(weight_dtype == INFINI_DTYPE_F32 || weight_dtype == INFINI_DTYPE_F16 ||
weight_dtype == INFINI_DTYPE_BF16, INFINI_STATUS_BAD_TENSOR_DTYPE);
CHECK_OR_RETURN(output_desc->dtype() == weight_dtype, INFINI_STATUS_BAD_TENSOR_DTYPE);
// Calculate number of indices (supporting batch dimension)
size_t num_indices = 1;
for (auto dim : input_shape) {
num_indices *= dim;
}
size_t vocab_size = weight_shape[0];
*desc_ptr = new Descriptor(
num_indices,
embedding_dim,
vocab_size,
input_dtype,
weight_dtype,
new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()},
handle->device,
handle->device_id);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *output,
const void *input,
const void *weight,
void *stream) const {
if (_num_indices == 0) {
return INFINI_STATUS_SUCCESS;
}
auto hc_stream = reinterpret_cast<hcStream_t>(stream);
// Dynamic block size optimization based on embedding_dim for Metax platform
size_t block_size = 256; // Default block size for Metax
if (_embedding_dim <= 64) {
block_size = 512; // Small embedding_dim: use larger block for better occupancy
} else if (_embedding_dim >= 1024) {
block_size = 128; // Large embedding_dim: use smaller block to reduce register pressure
}
size_t grid_size = (_num_indices + block_size - 1) / block_size;
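// One thread per index: each thread copies a single embedding row of `_embedding_dim` elements.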
// Launch kernel based on dtypes for Metax platform
if (_input_dtype == INFINI_DTYPE_I32) {
const int32_t *indices_ptr = reinterpret_cast<const int32_t *>(input);
if (_weight_dtype == INFINI_DTYPE_F32) {
embeddingKernel<float, int32_t><<<grid_size, block_size, 0, hc_stream>>>(
reinterpret_cast<float *>(output),
indices_ptr,
reinterpret_cast<const float *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else if (_weight_dtype == INFINI_DTYPE_F16) {
embeddingKernel<half, int32_t><<<grid_size, block_size, 0, hc_stream>>>(
reinterpret_cast<half *>(output),
indices_ptr,
reinterpret_cast<const half *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else if (_weight_dtype == INFINI_DTYPE_BF16) {
// Use Metax's bfloat16 type
embeddingKernel<__hpcc_bfloat16, int32_t><<<grid_size, block_size, 0, hc_stream>>>(
reinterpret_cast<__hpcc_bfloat16 *>(output),
indices_ptr,
reinterpret_cast<const __hpcc_bfloat16 *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} else if (_input_dtype == INFINI_DTYPE_I64) {
const int64_t *indices_ptr = reinterpret_cast<const int64_t *>(input);
if (_weight_dtype == INFINI_DTYPE_F32) {
embeddingKernel<float, int64_t><<<grid_size, block_size, 0, hc_stream>>>(
reinterpret_cast<float *>(output),
indices_ptr,
reinterpret_cast<const float *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else if (_weight_dtype == INFINI_DTYPE_F16) {
embeddingKernel<half, int64_t><<<grid_size, block_size, 0, hc_stream>>>(
reinterpret_cast<half *>(output),
indices_ptr,
reinterpret_cast<const half *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else if (_weight_dtype == INFINI_DTYPE_BF16) {
embeddingKernel<__hpcc_bfloat16, int64_t><<<grid_size, block_size, 0, hc_stream>>>(
reinterpret_cast<__hpcc_bfloat16 *>(output),
indices_ptr,
reinterpret_cast<const __hpcc_bfloat16 *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::embedding::metax
#ifndef __EMBEDDING_MOORE_H__
#define __EMBEDDING_MOORE_H__
#include "../embedding.h"
DESCRIPTOR(moore)
#endif // __EMBEDDING_MOORE_H__
#include "../../../../utils.h"
#include "../../../devices/moore/moore_common.h"
#include "../../../devices/moore/moore_kernel_common.h"
#include "../../../tensor.h"
#include "embedding_moore_kernel.h"
#include "embedding_moore.h"
#include <musa_runtime.h>
template <typename T, typename IndexType>
INFINIOP_MOORE_KERNEL embeddingKernel(
T *__restrict__ output,
const IndexType *__restrict__ indices,
const T *__restrict__ weight,
size_t num_indices,
size_t embedding_dim,
size_t vocab_size) {
// Calculate global thread index
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < num_indices) {
// Get the index value with Moore-optimized memory access
IndexType index_val = indices[idx];
// Bounds check - handle negative indices gracefully
if (index_val >= 0 && static_cast<size_t>(index_val) < vocab_size) {
// Copy embedding vector from weight to output
const T *src = weight + static_cast<size_t>(index_val) * embedding_dim;
T *dst = output + idx * embedding_dim;
// Choose optimal copy strategy based on type and alignment
if constexpr (std::is_same_v<T, float>) {
// Check alignment for float4 (16 bytes)
bool aligned_16 = is_aligned(src, 16) && is_aligned(dst, 16);
if (aligned_16 && embedding_dim >= 4 && embedding_dim % 4 == 0) {
copyVectorizedFloat4<IndexType>(dst, src, embedding_dim);
} else if (embedding_dim >= 2 && embedding_dim % 2 == 0) {
// Try float2 if not aligned to 16 bytes
copyVectorizedFloat2<IndexType>(dst, src, embedding_dim);
} else {
copyScalar<T, IndexType>(dst, src, embedding_dim);
}
} else if constexpr (std::is_same_v<T, half>) {
// Use half2 for vectorized access
if (embedding_dim >= 2 && embedding_dim % 2 == 0) {
copyVectorizedHalf2<IndexType>(dst, src, embedding_dim);
} else {
copyScalar<T, IndexType>(dst, src, embedding_dim);
}
} else if constexpr (std::is_same_v<T, __mt_bfloat16>) {
// Use mt_bfloat162 for vectorized access (Moore-specific type)
if (embedding_dim >= 2 && embedding_dim % 2 == 0) {
copyVectorizedBFloat162<IndexType>(dst, src, embedding_dim);
} else {
copyScalar<T, IndexType>(dst, src, embedding_dim);
}
} else {
// Fallback to scalar copy with Moore-optimized memory access
copyScalar<T, IndexType>(dst, src, embedding_dim);
}
}
}
}
namespace op::embedding::moore {
struct Descriptor::Opaque {
std::shared_ptr<device::moore::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t output_desc,
infiniopTensorDescriptor_t input_desc,
infiniopTensorDescriptor_t weight_desc) {
auto input_shape = input_desc->shape();
auto weight_shape = weight_desc->shape();
// Validate shapes
CHECK_OR_RETURN(weight_shape.size() == 2, INFINI_STATUS_BAD_TENSOR_SHAPE);
CHECK_OR_RETURN(output_desc->shape().size() == input_shape.size() + 1, INFINI_STATUS_BAD_TENSOR_SHAPE);
// Check output shape matches input shape + embedding_dim
auto output_shape = output_desc->shape();
size_t embedding_dim = weight_shape[1];
CHECK_OR_RETURN(output_shape.back() == embedding_dim, INFINI_STATUS_BAD_TENSOR_SHAPE);
for (size_t i = 0; i < input_shape.size(); ++i) {
CHECK_OR_RETURN(output_shape[i] == input_shape[i], INFINI_STATUS_BAD_TENSOR_SHAPE);
}
// Validate dtypes
auto input_dtype = input_desc->dtype();
auto weight_dtype = weight_desc->dtype();
CHECK_OR_RETURN(input_dtype == INFINI_DTYPE_I32 || input_dtype == INFINI_DTYPE_I64,
INFINI_STATUS_BAD_TENSOR_DTYPE);
CHECK_OR_RETURN(weight_dtype == INFINI_DTYPE_F32 || weight_dtype == INFINI_DTYPE_F16 || weight_dtype == INFINI_DTYPE_BF16, INFINI_STATUS_BAD_TENSOR_DTYPE);
CHECK_OR_RETURN(output_desc->dtype() == weight_dtype, INFINI_STATUS_BAD_TENSOR_DTYPE);
// Calculate number of indices (supporting batch dimension)
size_t num_indices = 1;
for (auto dim : input_shape) {
num_indices *= dim;
}
size_t vocab_size = weight_shape[0];
*desc_ptr = new Descriptor(
num_indices,
embedding_dim,
vocab_size,
input_dtype,
weight_dtype,
new Opaque{reinterpret_cast<device::moore::Handle *>(handle)->internal()},
handle->device,
handle->device_id);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *output,
const void *input,
const void *weight,
void *stream) const {
if (_num_indices == 0) {
return INFINI_STATUS_SUCCESS;
}
auto musa_stream = reinterpret_cast<musaStream_t>(stream);
// Dynamic block size optimization based on embedding_dim
// Moore platform typically has different performance characteristics
size_t block_size = 256; // Default for Moore
if (_embedding_dim <= 64) {
block_size = 512; // Small embedding_dim: use larger block for better occupancy
} else if (_embedding_dim >= 1024) {
block_size = 128; // Large embedding_dim: use smaller block to reduce register pressure
} else if (_embedding_dim <= 256) {
block_size = 384; // Medium embedding_dim: balanced configuration
}
size_t grid_size = (_num_indices + block_size - 1) / block_size;
// Launch kernel based on dtypes
// Note: Moore uses __mt_bfloat16 instead of __nv_bfloat16
if (_input_dtype == INFINI_DTYPE_I32) {
const int32_t *indices_ptr = reinterpret_cast<const int32_t *>(input);
if (_weight_dtype == INFINI_DTYPE_F32) {
embeddingKernel<float, int32_t><<<grid_size, block_size, 0, musa_stream>>>(
reinterpret_cast<float *>(output),
indices_ptr,
reinterpret_cast<const float *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else if (_weight_dtype == INFINI_DTYPE_F16) {
embeddingKernel<half, int32_t><<<grid_size, block_size, 0, musa_stream>>>(
reinterpret_cast<half *>(output),
indices_ptr,
reinterpret_cast<const half *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else if (_weight_dtype == INFINI_DTYPE_BF16) {
// Use Moore's bfloat16 type
embeddingKernel<__mt_bfloat16, int32_t><<<grid_size, block_size, 0, musa_stream>>>(
reinterpret_cast<__mt_bfloat16 *>(output),
indices_ptr,
reinterpret_cast<const __mt_bfloat16 *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} else if (_input_dtype == INFINI_DTYPE_I64) {
const int64_t *indices_ptr = reinterpret_cast<const int64_t *>(input);
if (_weight_dtype == INFINI_DTYPE_F32) {
embeddingKernel<float, int64_t><<<grid_size, block_size, 0, musa_stream>>>(
reinterpret_cast<float *>(output),
indices_ptr,
reinterpret_cast<const float *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else if (_weight_dtype == INFINI_DTYPE_F16) {
embeddingKernel<half, int64_t><<<grid_size, block_size, 0, musa_stream>>>(
reinterpret_cast<half *>(output),
indices_ptr,
reinterpret_cast<const half *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else if (_weight_dtype == INFINI_DTYPE_BF16) {
embeddingKernel<__mt_bfloat16, int64_t><<<grid_size, block_size, 0, musa_stream>>>(
reinterpret_cast<__mt_bfloat16 *>(output),
indices_ptr,
reinterpret_cast<const __mt_bfloat16 *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
// Check for kernel launch errors
musaError_t err = musaGetLastError();
if (err != musaSuccess) {
return INFINI_STATUS_INTERNAL_ERROR;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::embedding::moore
#ifndef __EMBEDDING_MOORE_KERNEL_CUH__
#define __EMBEDDING_MOORE_KERNEL_CUH__
#include <type_traits>
// Helper function to check memory alignment
__forceinline__ __device__ bool is_aligned(const void *ptr, size_t alignment) {
// Use size_t for pointer arithmetic in device code (more compatible)
return (reinterpret_cast<size_t>(ptr) % alignment == 0);
}
// Vectorized copy for float type using float4
template <typename IndexType>
__forceinline__ __device__ void copyVectorizedFloat4(
float *__restrict__ dst,
const float *__restrict__ src,
size_t embedding_dim) {
// Use float4 for vectorized access (16 bytes, 4 floats)
const float4 *src_vec = reinterpret_cast<const float4 *>(src);
float4 *dst_vec = reinterpret_cast<float4 *>(dst);
size_t vec_count = embedding_dim / 4;
// Vectorized copy using plain loads (the Moore path does not use __ldg)
for (size_t i = 0; i < vec_count; ++i) {
dst_vec[i] = src_vec[i];
}
// Copy remaining elements
size_t remaining = embedding_dim % 4;
if (remaining > 0) {
size_t offset = vec_count * 4;
for (size_t i = 0; i < remaining; ++i) {
dst[offset + i] = src[offset + i];
}
}
}
// Vectorized copy for float type using float2 (fallback when not aligned to 16 bytes)
template <typename IndexType>
__forceinline__ __device__ void copyVectorizedFloat2(
float *__restrict__ dst,
const float *__restrict__ src,
size_t embedding_dim) {
// Use float2 for vectorized access (8 bytes, 2 floats)
const float2 *src_vec = reinterpret_cast<const float2 *>(src);
float2 *dst_vec = reinterpret_cast<float2 *>(dst);
size_t vec_count = embedding_dim / 2;
// Vectorized copy with Moore-optimized memory access
for (size_t i = 0; i < vec_count; ++i) {
dst_vec[i] = src_vec[i];
}
// Copy remaining element if odd
if (embedding_dim % 2 != 0) {
dst[embedding_dim - 1] = src[embedding_dim - 1];
}
}
// Vectorized copy for half type using half2
template <typename IndexType>
__forceinline__ __device__ void copyVectorizedHalf2(
half *__restrict__ dst,
const half *__restrict__ src,
size_t embedding_dim) {
// Use half2 for vectorized access (4 bytes, 2 half values)
const half2 *src_vec = reinterpret_cast<const half2 *>(src);
half2 *dst_vec = reinterpret_cast<half2 *>(dst);
size_t vec_count = embedding_dim / 2;
// Vectorized copy optimized for Moore architecture
for (size_t i = 0; i < vec_count; ++i) {
dst_vec[i] = src_vec[i];
}
// Copy remaining element if odd
if (embedding_dim % 2 != 0) {
dst[embedding_dim - 1] = src[embedding_dim - 1];
}
}
// Vectorized copy for Moore bfloat16 type using bfloat162
template <typename IndexType>
__forceinline__ __device__ void copyVectorizedBFloat162(
__mt_bfloat16 *__restrict__ dst,
const __mt_bfloat16 *__restrict__ src,
size_t embedding_dim) {
// Use mt_bfloat162 for vectorized access (4 bytes, 2 bfloat16s)
const __mt_bfloat162 *src_vec = reinterpret_cast<const __mt_bfloat162 *>(src);
__mt_bfloat162 *dst_vec = reinterpret_cast<__mt_bfloat162 *>(dst);
size_t vec_count = embedding_dim / 2;
// Vectorized copy with Moore-specific optimization
for (size_t i = 0; i < vec_count; ++i) {
dst_vec[i] = src_vec[i];
}
// Copy remaining element if odd
if (embedding_dim % 2 != 0) {
dst[embedding_dim - 1] = src[embedding_dim - 1];
}
}
// Scalar copy fallback with Moore-optimized memory access
template <typename T, typename IndexType>
__forceinline__ __device__ void copyScalar(
T *__restrict__ dst,
const T *__restrict__ src,
size_t embedding_dim) {
// Scalar copy fallback using plain loads on the Moore platform
for (size_t i = 0; i < embedding_dim; ++i) {
dst[i] = src[i];
}
}
#endif // __EMBEDDING_MOORE_KERNEL_CUH__