jerrrrry / infinicore · Commits · 784139b9

Unverified commit 784139b9, authored Feb 13, 2026 by thatPepe, committed by GitHub on Feb 13, 2026.

Merge pull request #990 from InfiniTensor/demo131

Demo-131 Cuda graph with optimized paged attention

Parents: 3c8fb3c0, 1d6527cb
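
The merge title refers to CUDA graph capture for the paged-attention path; the files shown on this page of the diff cover the new embedding operator and device-dispatch changes rather than the graph code itself. As background only, a minimal capture-and-replay sketch using the standard CUDA runtime API (hypothetical helper names, not code from this PR):

    #include <cuda_runtime.h>

    // Capture a fixed sequence of kernel launches once, then replay it with a
    // single cudaGraphLaunch per step to cut per-kernel launch overhead.
    cudaError_t buildAndRunGraph(cudaStream_t stream, void (*enqueueWork)(cudaStream_t)) {
        cudaGraph_t graph = nullptr;
        cudaGraphExec_t exec = nullptr;

        // Everything issued on `stream` between Begin/EndCapture becomes a graph node.
        cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
        enqueueWork(stream); // e.g. attention + causal-softmax kernels
        cudaStreamEndCapture(stream, &graph);

        // Instantiate once, replay many times (CUDA 12 signature; older toolkits
        // take error-node/log-buffer arguments instead of a flags value).
        cudaGraphInstantiate(&exec, graph, 0);
        cudaGraphLaunch(exec, stream);
        cudaStreamSynchronize(stream);

        cudaGraphExecDestroy(exec);
        cudaGraphDestroy(graph);
        return cudaGetLastError();
    }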
Showing 20 changed files with 1317 additions and 12 deletions.
src/infiniop/ops/causal_softmax/bang/causal_softmax_bang.mlu (+1 / -1)
src/infiniop/ops/causal_softmax/moore/causal_softmax_kernel.h (+1 / -1)
src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu (+9 / -5)
src/infiniop/ops/causal_softmax/operator.cc (+13 / -1)
src/infiniop/ops/clip/operator.cc (+13 / -1)
src/infiniop/ops/conv/operator.cc (+13 / -1)
src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_nvidia.cu (+1 / -1)
src/infiniop/ops/dequantize_awq/operator.cc (+17 / -1)
src/infiniop/ops/embedding/cpu/embedding_cpu.cc (+109 / -0)
src/infiniop/ops/embedding/cpu/embedding_cpu.h (+8 / -0)
src/infiniop/ops/embedding/cuda/embedding_kernel.cuh (+116 / -0)
src/infiniop/ops/embedding/embedding.h (+54 / -0)
src/infiniop/ops/embedding/metax/embedding_metax.cuh (+8 / -0)
src/infiniop/ops/embedding/metax/embedding_metax.maca (+217 / -0)
src/infiniop/ops/embedding/moore/embedding_moore.h (+8 / -0)
src/infiniop/ops/embedding/moore/embedding_moore.mu (+227 / -0)
src/infiniop/ops/embedding/moore/embedding_moore_kernel.h (+116 / -0)
src/infiniop/ops/embedding/nvidia/embedding_nvidia.cu (+224 / -0)
src/infiniop/ops/embedding/nvidia/embedding_nvidia.cuh (+8 / -0)
src/infiniop/ops/embedding/operator.cc (+154 / -0)
src/infiniop/ops/causal_softmax/bang/causal_softmax_bang.mlu

@@ -131,7 +131,7 @@ void causalSoftmaxUnion(void *workspace, int core_per_cluster, int cluster_count
     kernel_dim.x = core_per_cluster;
     kernel_dim.y = cluster_count;
     kernel_dim.z = 1;
-    kernel_type = CNRT_FUNC_TYPE_UNION1;
+    kernel_type = cnrtFuncTypeUnion1;
     // Launch kernel
     causalSoftmax<T><<<kernel_dim, kernel_type, queue>>>(
src/infiniop/ops/causal_softmax/moore/causal_softmax_kernel.h

@@ -28,7 +28,7 @@ __device__ void causalSoftmaxKernel(
     // 1 | * * * ... * *     |
     // 2 | * * * ... * * *   |
     // height: 3    col_id->
-    if (width + blockIdx.x >= threadIdx.x + height) {
+    if (width + blockIdx.x >= col + height) {
         if constexpr (std::is_same_v<Tdata, half> || std::is_same_v<Tdata, cuda_bfloat16>) {
             /*
              * MUSA does not support CUDA's native `hexp` function.
src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu

@@ -76,7 +76,15 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
                                      const void *x,
                                      void *stream_) const {
     cudaStream_t stream = (cudaStream_t)stream_;
-    if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
+    if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
+        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(
+            y, x, _info.dtype, _info.batch_size, _info.seq_len, _info.total_seq_len,
+            _info.y_stride_b, _info.y_stride_i, _info.x_stride_b, _info.x_stride_i, stream));
+    } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_2048) {
+        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_2048>(
+            y, x, _info.dtype, _info.batch_size, _info.seq_len, _info.total_seq_len,
+            _info.y_stride_b, _info.y_stride_i, _info.x_stride_b, _info.x_stride_i, stream));
+    } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
         CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_1024>(
             y, x, _info.dtype, _info.batch_size, _info.seq_len, _info.total_seq_len,
             _info.y_stride_b, _info.y_stride_i, _info.x_stride_b, _info.x_stride_i, stream));

@@ -84,10 +92,6 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
         CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_512>(
             y, x, _info.dtype, _info.batch_size, _info.seq_len, _info.total_seq_len,
             _info.y_stride_b, _info.y_stride_i, _info.x_stride_b, _info.x_stride_i, stream));
-    } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
-        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(
-            y, x, _info.dtype, _info.batch_size, _info.seq_len, _info.total_seq_len,
-            _info.y_stride_b, _info.y_stride_i, _info.x_stride_b, _info.x_stride_i, stream));
     } else {
         return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
     }
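
These hunks widen the maxThreadsPerBlock dispatch so that devices reporting 4096- or 2048-thread blocks (the non-NVIDIA backends routed through this file) hit a matching launchKernel specialization before falling back to 1024/512, and the old trailing 4096 branch is removed. A standalone sketch of the same compile-time dispatch pattern, with illustrative names and stock-CUDA block sizes (1024/512), not code from the PR:

    #include <cuda_runtime.h>

    // Hypothetical sketch: pick a compile-time block size that matches the
    // device's maxThreadsPerBlock, so each specialization knows its blockDim.
    template <unsigned int BLOCK_SIZE>
    __global__ void scaleKernel(float *y, const float *x, size_t n) {
        size_t i = static_cast<size_t>(blockIdx.x) * BLOCK_SIZE + threadIdx.x;
        if (i < n) {
            y[i] = 2.0f * x[i];
        }
    }

    template <unsigned int BLOCK_SIZE>
    cudaError_t launchScale(float *y, const float *x, size_t n, cudaStream_t stream) {
        unsigned int grid = static_cast<unsigned int>((n + BLOCK_SIZE - 1) / BLOCK_SIZE);
        scaleKernel<BLOCK_SIZE><<<grid, BLOCK_SIZE, 0, stream>>>(y, x, n);
        return cudaGetLastError();
    }

    cudaError_t launchScaleForDevice(float *y, const float *x, size_t n,
                                     int maxThreadsPerBlock, cudaStream_t stream) {
        // Dispatch on the device limit, largest specialization first.
        if (maxThreadsPerBlock >= 1024) {
            return launchScale<1024>(y, x, n, stream);
        } else if (maxThreadsPerBlock >= 512) {
            return launchScale<512>(y, x, n, stream);
        }
        return cudaErrorInvalidConfiguration;
    }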
src/infiniop/ops/causal_softmax/operator.cc

@@ -5,7 +5,7 @@
 #ifdef ENABLE_CPU_API
 #include "cpu/causal_softmax_cpu.h"
 #endif
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_ALI_API)
 #include "nvidia/causal_softmax_nvidia.cuh"
 #endif
 #ifdef ENABLE_METAX_API

@@ -48,6 +48,9 @@ __C infiniStatus_t infiniopCreateCausalSoftmaxDescriptor(
 #ifdef ENABLE_ILUVATAR_API
     CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+    CREATE(INFINI_DEVICE_ALI, nvidia);
+#endif
 #ifdef ENABLE_QY_API
     CREATE(INFINI_DEVICE_QY, nvidia);
 #endif

@@ -90,6 +93,9 @@ __C infiniStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDe
 #ifdef ENABLE_ILUVATAR_API
     GET(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+    GET(INFINI_DEVICE_ALI, nvidia);
+#endif
 #ifdef ENABLE_QY_API
     GET(INFINI_DEVICE_QY, nvidia);
 #endif

@@ -137,6 +143,9 @@ __C infiniStatus_t infiniopCausalSoftmax(
 #ifdef ENABLE_ILUVATAR_API
     CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+    CALCULATE(INFINI_DEVICE_ALI, nvidia);
+#endif
 #ifdef ENABLE_QY_API
     CALCULATE(INFINI_DEVICE_QY, nvidia);
 #endif

@@ -179,6 +188,9 @@ __C infiniStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxD
 #ifdef ENABLE_ILUVATAR_API
     DESTROY(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+    DESTROY(INFINI_DEVICE_ALI, nvidia);
+#endif
 #ifdef ENABLE_QY_API
     DESTROY(INFINI_DEVICE_QY, nvidia);
 #endif
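
The same three-line #ifdef ENABLE_ALI_API block recurs in the clip, conv and dequantize_awq operators below. CREATE/GET/CALCULATE/DESTROY (and DELETE) are local switch-case macros, so each added block simply routes the INFINI_DEVICE_ALI device type to the shared nvidia backend. A rough expansion of the added CREATE case is shown here; the descriptor argument names are assumed, while the macro shape itself mirrors the CREATE macro visible verbatim in embedding/operator.cc further down:

    // Assumed expansion of `CREATE(INFINI_DEVICE_ALI, nvidia);` inside the device switch;
    // the y_desc/x_desc argument names are illustrative, not taken from this file.
    case INFINI_DEVICE_ALI:
        return op::causal_softmax::nvidia::Descriptor::create(
            handle,
            reinterpret_cast<op::causal_softmax::nvidia::Descriptor **>(desc_ptr),
            y_desc, x_desc);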
src/infiniop/ops/clip/operator.cc

@@ -5,7 +5,7 @@
 #ifdef ENABLE_CPU_API
 #include "cpu/clip_cpu.h"
 #endif
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API)
 #include "nvidia/clip_nvidia.cuh"
 #endif
 #ifdef ENABLE_METAX_API

@@ -42,6 +42,9 @@ __C infiniStatus_t infiniopCreateClipDescriptor(
 #ifdef ENABLE_ILUVATAR_API
     CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+    CREATE(INFINI_DEVICE_ALI, nvidia);
+#endif
 #ifdef ENABLE_QY_API
     CREATE(INFINI_DEVICE_QY, nvidia);
 #endif

@@ -76,6 +79,9 @@ __C infiniStatus_t infiniopGetClipWorkspaceSize(infiniopClipDescriptor_t desc, s
 #ifdef ENABLE_ILUVATAR_API
     GET(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+    GET(INFINI_DEVICE_ALI, nvidia);
+#endif
 #ifdef ENABLE_QY_API
     GET(INFINI_DEVICE_QY, nvidia);
 #endif

@@ -118,6 +124,9 @@ __C infiniStatus_t infiniopClip(
 #ifdef ENABLE_ILUVATAR_API
     CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+    CALCULATE(INFINI_DEVICE_ALI, nvidia);
+#endif
 #ifdef ENABLE_QY_API
     CALCULATE(INFINI_DEVICE_QY, nvidia);
 #endif

@@ -154,6 +163,9 @@ infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc) {
 #ifdef ENABLE_ILUVATAR_API
     DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+    DELETE(INFINI_DEVICE_ALI, nvidia);
+#endif
 #ifdef ENABLE_QY_API
     DELETE(INFINI_DEVICE_QY, nvidia);
 #endif
src/infiniop/ops/conv/operator.cc

@@ -5,7 +5,7 @@
 #ifdef ENABLE_CPU_API
 #include "cpu/conv_cpu.h"
 #endif
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API)
 #include "nvidia/conv_nvidia.cuh"
 #endif

@@ -45,6 +45,9 @@ __C __export infiniStatus_t infiniopCreateConvDescriptor(infiniopHandle_t handle
 #ifdef ENABLE_QY_API
     CREATE(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+    CREATE(INFINI_DEVICE_ALI, nvidia);
+#endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;

@@ -76,6 +79,9 @@ infiniopGetConvWorkspaceSize(
 #ifdef ENABLE_QY_API
     GET(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+    GET(INFINI_DEVICE_ALI, nvidia);
+#endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;

@@ -115,6 +121,9 @@ __C infiniStatus_t infiniopConv(
 #ifdef ENABLE_QY_API
     CALCULATE(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+    CALCULATE(INFINI_DEVICE_ALI, nvidia);
+#endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;

@@ -142,6 +151,9 @@ infiniopDestroyConvDescriptor(infiniopConvDescriptor_t desc) {
 #ifdef ENABLE_QY_API
     DELETE(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+    DELETE(INFINI_DEVICE_ALI, nvidia);
+#endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_nvidia.cu

-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API)
 #include "../../../devices/nvidia/nvidia_handle.cuh"
 #include "../../../devices/nvidia/nvidia_kernel_common.cuh"
src/infiniop/ops/dequantize_awq/operator.cc

@@ -2,7 +2,7 @@
 #include "../../handle.h"
 #include "infiniop/ops/dequantize_awq.h"
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API)
 #include "nvidia/dequantize_w42f16_nvidia.cuh"
 #endif
 #ifdef ENABLE_MOORE_API

@@ -43,6 +43,10 @@ __C infiniStatus_t infiniopCreateDequantizeAWQDescriptor(
 #ifdef ENABLE_QY_API
     CREATE(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+    CREATE(INFINI_DEVICE_ALI, nvidia);
+#endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
     }

@@ -70,6 +74,10 @@ __C infiniStatus_t infiniopGetDequantizeAWQWorkspaceSize(infiniopDequantizeAWQDe
 #ifdef ENABLE_QY_API
     GET(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+    GET(INFINI_DEVICE_ALI, nvidia);
+#endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
     }

@@ -104,6 +112,10 @@ __C infiniStatus_t infiniopDequantizeAWQ(
 #ifdef ENABLE_QY_API
     CALCULATE(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+    CALCULATE(INFINI_DEVICE_ALI, nvidia);
+#endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
     }

@@ -132,6 +144,10 @@ infiniopDestroyDequantizeAWQDescriptor(infiniopDequantizeAWQDescriptor_t desc) {
 #ifdef ENABLE_QY_API
     DELETE(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+    DELETE(INFINI_DEVICE_ALI, nvidia);
+#endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
     }
src/infiniop/ops/embedding/cpu/embedding_cpu.cc (new file)

#include "embedding_cpu.h"
#include "../../../../utils.h"
#include "../../../handle.h"
#include "../../../tensor.h"
#include <cstring>

namespace op::embedding::cpu {

struct Descriptor::Opaque {};

Descriptor::~Descriptor() {
    delete _opaque;
}

infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t output_desc,
    infiniopTensorDescriptor_t input_desc,
    infiniopTensorDescriptor_t weight_desc) {
    auto input_shape = input_desc->shape();
    auto weight_shape = weight_desc->shape();
    CHECK_OR_RETURN(weight_shape.size() == 2, INFINI_STATUS_BAD_TENSOR_SHAPE);
    CHECK_OR_RETURN(output_desc->shape().size() == input_shape.size() + 1, INFINI_STATUS_BAD_TENSOR_SHAPE);
    auto output_shape = output_desc->shape();
    size_t embedding_dim = weight_shape[1];
    CHECK_OR_RETURN(output_shape.back() == embedding_dim, INFINI_STATUS_BAD_TENSOR_SHAPE);
    for (size_t i = 0; i < input_shape.size(); ++i) {
        CHECK_OR_RETURN(output_shape[i] == input_shape[i], INFINI_STATUS_BAD_TENSOR_SHAPE);
    }
    auto input_dtype = input_desc->dtype();
    auto weight_dtype = weight_desc->dtype();
    CHECK_OR_RETURN(input_dtype == INFINI_DTYPE_I32 || input_dtype == INFINI_DTYPE_I64,
                    INFINI_STATUS_BAD_TENSOR_DTYPE);
    CHECK_OR_RETURN(weight_dtype == INFINI_DTYPE_F32 || weight_dtype == INFINI_DTYPE_F16 || weight_dtype == INFINI_DTYPE_BF16,
                    INFINI_STATUS_BAD_TENSOR_DTYPE);
    CHECK_OR_RETURN(output_desc->dtype() == weight_dtype, INFINI_STATUS_BAD_TENSOR_DTYPE);
    size_t num_indices = 1;
    for (auto dim : input_shape) {
        num_indices *= dim;
    }
    size_t vocab_size = weight_shape[0];
    *desc_ptr = new Descriptor(
        num_indices,
        embedding_dim,
        vocab_size,
        input_dtype,
        weight_dtype,
        new Opaque{},
        handle->device,
        handle->device_id);
    return INFINI_STATUS_SUCCESS;
}

infiniStatus_t Descriptor::calculate(
    void *output,
    const void *input,
    const void *weight,
    void *stream) const {
    if (_num_indices == 0) {
        return INFINI_STATUS_SUCCESS;
    }
    size_t element_size = infiniSizeOf(_weight_dtype);
    size_t row_bytes = _embedding_dim * element_size;
    if (_input_dtype == INFINI_DTYPE_I32) {
        const int32_t *indices_ptr = reinterpret_cast<const int32_t *>(input);
        const std::byte *weight_ptr = reinterpret_cast<const std::byte *>(weight);
        std::byte *out_ptr = reinterpret_cast<std::byte *>(output);
        for (size_t i = 0; i < _num_indices; ++i) {
            int32_t idx = indices_ptr[i];
            if (idx >= 0 && static_cast<size_t>(idx) < _vocab_size) {
                std::memcpy(out_ptr + i * row_bytes,
                            weight_ptr + static_cast<size_t>(idx) * row_bytes,
                            row_bytes);
            }
        }
    } else if (_input_dtype == INFINI_DTYPE_I64) {
        const int64_t *indices_ptr = reinterpret_cast<const int64_t *>(input);
        const std::byte *weight_ptr = reinterpret_cast<const std::byte *>(weight);
        std::byte *out_ptr = reinterpret_cast<std::byte *>(output);
        for (size_t i = 0; i < _num_indices; ++i) {
            int64_t idx = indices_ptr[i];
            if (idx >= 0 && static_cast<size_t>(idx) < _vocab_size) {
                std::memcpy(out_ptr + i * row_bytes,
                            weight_ptr + static_cast<size_t>(idx) * row_bytes,
                            row_bytes);
            }
        }
    } else {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    return INFINI_STATUS_SUCCESS;
}

} // namespace op::embedding::cpu
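
The CPU path above is a plain row gather: each index selects one weight row, which is copied byte-for-byte into the corresponding output row, and out-of-range indices are silently skipped. A self-contained host-side illustration of those semantics (toy sizes, not part of the PR):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>
    #include <vector>

    int main() {
        const size_t vocab_size = 4, embedding_dim = 3;
        // Weight row r holds the value r in every column.
        std::vector<float> weight = {
            0, 0, 0,   // row 0
            1, 1, 1,   // row 1
            2, 2, 2,   // row 2
            3, 3, 3};  // row 3
        std::vector<int32_t> indices = {2, 0, 7}; // 7 is out of range
        std::vector<float> output(indices.size() * embedding_dim, -1.0f);

        const size_t row_bytes = embedding_dim * sizeof(float);
        for (size_t i = 0; i < indices.size(); ++i) {
            int32_t idx = indices[i];
            if (idx >= 0 && static_cast<size_t>(idx) < vocab_size) {
                std::memcpy(output.data() + i * embedding_dim,
                            weight.data() + static_cast<size_t>(idx) * embedding_dim,
                            row_bytes);
            } // out-of-range index: output row keeps its previous contents
        }
        for (float v : output) {
            std::printf("%.0f ", v); // prints: 2 2 2 0 0 0 -1 -1 -1
        }
        std::printf("\n");
        return 0;
    }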
src/infiniop/ops/embedding/cpu/embedding_cpu.h (new file)

#ifndef __EMBEDDING_CPU_H__
#define __EMBEDDING_CPU_H__

#include "../embedding.h"

DESCRIPTOR(cpu)

#endif // __EMBEDDING_CPU_H__
src/infiniop/ops/embedding/cuda/embedding_kernel.cuh (new file)

#ifndef __EMBEDDING_CUDA_KERNEL_CUH__
#define __EMBEDDING_CUDA_KERNEL_CUH__

#include <type_traits>

// Helper function to check memory alignment
__forceinline__ __device__ bool is_aligned(const void *ptr, size_t alignment) {
    // Use size_t for pointer arithmetic in device code (more compatible)
    return (reinterpret_cast<size_t>(ptr) % alignment == 0);
}

// Vectorized copy for float type using float4
template <typename IndexType>
__forceinline__ __device__ void copyVectorizedFloat4(
    float *__restrict__ dst,
    const float *__restrict__ src,
    size_t embedding_dim) {
    // Use float4 for vectorized access (16 bytes, 4 floats)
    const float4 *src_vec = reinterpret_cast<const float4 *>(src);
    float4 *dst_vec = reinterpret_cast<float4 *>(dst);
    size_t vec_count = embedding_dim / 4;
    // Vectorized copy using __ldg for read-only weight
    for (size_t i = 0; i < vec_count; ++i) {
        dst_vec[i] = __ldg(&src_vec[i]);
    }
    // Copy remaining elements
    size_t remaining = embedding_dim % 4;
    if (remaining > 0) {
        size_t offset = vec_count * 4;
        for (size_t i = 0; i < remaining; ++i) {
            dst[offset + i] = __ldg(&src[offset + i]);
        }
    }
}

// Vectorized copy for float type using float2 (fallback when not aligned to 16 bytes)
template <typename IndexType>
__forceinline__ __device__ void copyVectorizedFloat2(
    float *__restrict__ dst,
    const float *__restrict__ src,
    size_t embedding_dim) {
    // Use float2 for vectorized access (8 bytes, 2 floats)
    const float2 *src_vec = reinterpret_cast<const float2 *>(src);
    float2 *dst_vec = reinterpret_cast<float2 *>(dst);
    size_t vec_count = embedding_dim / 2;
    // Vectorized copy using __ldg for read-only weight
    for (size_t i = 0; i < vec_count; ++i) {
        dst_vec[i] = __ldg(&src_vec[i]);
    }
    // Copy remaining element if odd
    if (embedding_dim % 2 != 0) {
        dst[embedding_dim - 1] = __ldg(&src[embedding_dim - 1]);
    }
}

// Vectorized copy for half type using half2
template <typename IndexType>
__forceinline__ __device__ void copyVectorizedHalf2(
    half *__restrict__ dst,
    const half *__restrict__ src,
    size_t embedding_dim) {
    // Use half2 for vectorized access (4 bytes, 2 halfs)
    const half2 *src_vec = reinterpret_cast<const half2 *>(src);
    half2 *dst_vec = reinterpret_cast<half2 *>(dst);
    size_t vec_count = embedding_dim / 2;
    // Vectorized copy using __ldg for read-only weight
    for (size_t i = 0; i < vec_count; ++i) {
        dst_vec[i] = __ldg(&src_vec[i]);
    }
    // Copy remaining element if odd
    if (embedding_dim % 2 != 0) {
        dst[embedding_dim - 1] = __ldg(&src[embedding_dim - 1]);
    }
}

// Vectorized copy for bfloat16 type using bfloat162
template <typename IndexType>
__forceinline__ __device__ void copyVectorizedBFloat162(
    cuda_bfloat16 *__restrict__ dst,
    const cuda_bfloat16 *__restrict__ src,
    size_t embedding_dim) {
    // Use bfloat162 for vectorized access (4 bytes, 2 bfloat16s)
    const cuda_bfloat162 *src_vec = reinterpret_cast<const cuda_bfloat162 *>(src);
    cuda_bfloat162 *dst_vec = reinterpret_cast<cuda_bfloat162 *>(dst);
    size_t vec_count = embedding_dim / 2;
    // Vectorized copy using __ldg for read-only weight
    for (size_t i = 0; i < vec_count; ++i) {
        dst_vec[i] = __ldg(&src_vec[i]);
    }
    // Copy remaining element if odd
    if (embedding_dim % 2 != 0) {
        dst[embedding_dim - 1] = __ldg(&src[embedding_dim - 1]);
    }
}

// Scalar copy fallback with __ldg optimization
template <typename T, typename IndexType>
__forceinline__ __device__ void copyScalar(
    T *__restrict__ dst,
    const T *__restrict__ src,
    size_t embedding_dim) {
    // Scalar copy with __ldg for read-only weight
    for (size_t i = 0; i < embedding_dim; ++i) {
        dst[i] = __ldg(&src[i]);
    }
}

#endif // __EMBEDDING_CUDA_KERNEL_CUH__
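
The float4 path above is only taken when both pointers pass is_aligned(ptr, 16), because a float4 load or store requires a 16-byte-aligned address; misaligned rows fall back to float2 or scalar copies. A tiny host-side illustration of that alignment rule (assumption-level demo, not PR code):

    #include <cstdint>
    #include <cstdio>

    // Host-side twin of the device helper: true when ptr is a multiple of alignment.
    static bool is_aligned_host(const void *ptr, size_t alignment) {
        return reinterpret_cast<uintptr_t>(ptr) % alignment == 0;
    }

    int main() {
        alignas(16) float buf[8];
        std::printf("buf     aligned to 16: %d\n", is_aligned_host(buf, 16));     // 1
        std::printf("buf + 1 aligned to 16: %d\n", is_aligned_host(buf + 1, 16)); // 0 (offset 4 bytes)
        std::printf("buf + 4 aligned to 16: %d\n", is_aligned_host(buf + 4, 16)); // 1 (offset 16 bytes)
        return 0;
    }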
src/infiniop/ops/embedding/embedding.h (new file)

#ifndef __EMBEDDING_H__
#define __EMBEDDING_H__
#include "../../../utils.h"
#include "../../operator.h"
#define DESCRIPTOR(NAMESPACE) \
\
namespace op::embedding::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
size_t _num_indices; \
size_t _embedding_dim; \
size_t _vocab_size; \
infiniDtype_t _input_dtype; \
infiniDtype_t _weight_dtype; \
\
Descriptor( \
size_t num_indices, \
size_t embedding_dim, \
size_t vocab_size, \
infiniDtype_t input_dtype, \
infiniDtype_t weight_dtype, \
Opaque *opaque, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_num_indices(num_indices), \
_embedding_dim(embedding_dim), \
_vocab_size(vocab_size), \
_input_dtype(input_dtype), \
_weight_dtype(weight_dtype) {} \
\
public: \
~Descriptor(); \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t output_desc, \
infiniopTensorDescriptor_t input_desc, \
infiniopTensorDescriptor_t weight_desc); \
\
infiniStatus_t calculate( \
void *output, \
const void *input, \
const void *weight, \
void *stream) const; \
}; \
}
#endif // __EMBEDDING_H__
src/infiniop/ops/embedding/metax/embedding_metax.cuh (new file)

#ifndef __EMBEDDING_METAX_H__
#define __EMBEDDING_METAX_H__

#include "../embedding.h"

DESCRIPTOR(metax)

#endif // __EMBEDDING_METAX_H__
src/infiniop/ops/embedding/metax/embedding_metax.maca (new file)

#include "../../../../utils.h"
#include "../../../devices/metax/metax_common.h"
#include "../../../devices/metax/metax_kernel_common.h"
#include "../../../tensor.h"
#include "../cuda/embedding_kernel.cuh"
#include "embedding_metax.cuh"
template <typename T, typename IndexType>
INFINIOP_METAX_KERNEL embeddingKernel(
T *__restrict__ output,
const IndexType *__restrict__ indices,
const T *__restrict__ weight,
size_t num_indices,
size_t embedding_dim,
size_t vocab_size) {
// Calculate global thread index
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < num_indices) {
// Get the index value
IndexType index_val = __ldg(&indices[idx]);
// Bounds check - handle negative indices gracefully
if (index_val >= 0 && static_cast<size_t>(index_val) < vocab_size) {
// Copy embedding vector from weight to output
const T *src = weight + static_cast<size_t>(index_val) * embedding_dim;
T *dst = output + idx * embedding_dim;
// Choose optimal copy strategy based on type and alignment
if constexpr (std::is_same_v<T, float>) {
// Check alignment for float4 (16 bytes)
bool aligned_16 = is_aligned(src, 16) && is_aligned(dst, 16);
if (aligned_16 && embedding_dim >= 4 && embedding_dim % 4 == 0) {
copyVectorizedFloat4<IndexType>(dst, src, embedding_dim);
} else if (embedding_dim >= 2 && embedding_dim % 2 == 0) {
// Try float2 if not aligned to 16 bytes
copyVectorizedFloat2<IndexType>(dst, src, embedding_dim);
} else {
copyScalar<T, IndexType>(dst, src, embedding_dim);
}
} else if constexpr (std::is_same_v<T, half>) {
// Use half2 for vectorized access
if (embedding_dim >= 2 && embedding_dim % 2 == 0) {
copyVectorizedHalf2<IndexType>(dst, src, embedding_dim);
} else {
copyScalar<T, IndexType>(dst, src, embedding_dim);
}
} else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
// Use bfloat162 for vectorized access
if (embedding_dim >= 2 && embedding_dim % 2 == 0) {
copyVectorizedBFloat162<IndexType>(dst, src, embedding_dim);
} else {
copyScalar<T, IndexType>(dst, src, embedding_dim);
}
} else {
// Fallback to scalar copy with __ldg
copyScalar<T, IndexType>(dst, src, embedding_dim);
}
}
}
}
namespace op::embedding::metax {
struct Descriptor::Opaque {
std::shared_ptr<device::metax::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t output_desc,
infiniopTensorDescriptor_t input_desc,
infiniopTensorDescriptor_t weight_desc) {
auto input_shape = input_desc->shape();
auto weight_shape = weight_desc->shape();
// Validate shapes
CHECK_OR_RETURN(weight_shape.size() == 2, INFINI_STATUS_BAD_TENSOR_SHAPE);
CHECK_OR_RETURN(output_desc->shape().size() == input_shape.size() + 1, INFINI_STATUS_BAD_TENSOR_SHAPE);
// Check output shape matches input shape + embedding_dim
auto output_shape = output_desc->shape();
size_t embedding_dim = weight_shape[1];
CHECK_OR_RETURN(output_shape.back() == embedding_dim, INFINI_STATUS_BAD_TENSOR_SHAPE);
for (size_t i = 0; i < input_shape.size(); ++i) {
CHECK_OR_RETURN(output_shape[i] == input_shape[i], INFINI_STATUS_BAD_TENSOR_SHAPE);
}
// Validate dtypes
auto input_dtype = input_desc->dtype();
auto weight_dtype = weight_desc->dtype();
CHECK_OR_RETURN(input_dtype == INFINI_DTYPE_I32 || input_dtype == INFINI_DTYPE_I64,
INFINI_STATUS_BAD_TENSOR_DTYPE);
CHECK_OR_RETURN(weight_dtype == INFINI_DTYPE_F32 || weight_dtype == INFINI_DTYPE_F16 ||
weight_dtype == INFINI_DTYPE_BF16, INFINI_STATUS_BAD_TENSOR_DTYPE);
CHECK_OR_RETURN(output_desc->dtype() == weight_dtype, INFINI_STATUS_BAD_TENSOR_DTYPE);
// Calculate number of indices (supporting batch dimension)
size_t num_indices = 1;
for (auto dim : input_shape) {
num_indices *= dim;
}
size_t vocab_size = weight_shape[0];
*desc_ptr = new Descriptor(
num_indices,
embedding_dim,
vocab_size,
input_dtype,
weight_dtype,
new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()},
handle->device,
handle->device_id);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *output,
const void *input,
const void *weight,
void *stream) const {
if (_num_indices == 0) {
return INFINI_STATUS_SUCCESS;
}
auto hc_stream = reinterpret_cast<hcStream_t>(stream);
// Dynamic block size optimization based on embedding_dim for Metax platform
size_t block_size = 256; // Default block size for Metax
if (_embedding_dim <= 64) {
block_size = 512; // Small embedding_dim: use larger block for better occupancy
} else if (_embedding_dim >= 1024) {
block_size = 128; // Large embedding_dim: use smaller block to reduce register pressure
}
size_t grid_size = (_num_indices + block_size - 1) / block_size;
// Launch kernel based on dtypes for Metax platform
if (_input_dtype == INFINI_DTYPE_I32) {
const int32_t *indices_ptr = reinterpret_cast<const int32_t *>(input);
if (_weight_dtype == INFINI_DTYPE_F32) {
embeddingKernel<float, int32_t><<<grid_size, block_size, 0, hc_stream>>>(
reinterpret_cast<float *>(output),
indices_ptr,
reinterpret_cast<const float *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else if (_weight_dtype == INFINI_DTYPE_F16) {
embeddingKernel<half, int32_t><<<grid_size, block_size, 0, hc_stream>>>(
reinterpret_cast<half *>(output),
indices_ptr,
reinterpret_cast<const half *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else if (_weight_dtype == INFINI_DTYPE_BF16) {
// Use Metax's bfloat16 type
embeddingKernel<__hpcc_bfloat16, int32_t><<<grid_size, block_size, 0, hc_stream>>>(
reinterpret_cast<__hpcc_bfloat16 *>(output),
indices_ptr,
reinterpret_cast<const __hpcc_bfloat16 *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} else if (_input_dtype == INFINI_DTYPE_I64) {
const int64_t *indices_ptr = reinterpret_cast<const int64_t *>(input);
if (_weight_dtype == INFINI_DTYPE_F32) {
embeddingKernel<float, int64_t><<<grid_size, block_size, 0, hc_stream>>>(
reinterpret_cast<float *>(output),
indices_ptr,
reinterpret_cast<const float *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else if (_weight_dtype == INFINI_DTYPE_F16) {
embeddingKernel<half, int64_t><<<grid_size, block_size, 0, hc_stream>>>(
reinterpret_cast<half *>(output),
indices_ptr,
reinterpret_cast<const half *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else if (_weight_dtype == INFINI_DTYPE_BF16) {
embeddingKernel<__hpcc_bfloat16, int64_t><<<grid_size, block_size, 0, hc_stream>>>(
reinterpret_cast<__hpcc_bfloat16 *>(output),
indices_ptr,
reinterpret_cast<const __hpcc_bfloat16 *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::embedding::metax
src/infiniop/ops/embedding/moore/embedding_moore.h (new file)

#ifndef __EMBEDDING_MOORE_H__
#define __EMBEDDING_MOORE_H__

#include "../embedding.h"

DESCRIPTOR(moore)

#endif // __EMBEDDING_MOORE_H__
src/infiniop/ops/embedding/moore/embedding_moore.mu (new file)

#include "../../../../utils.h"
#include "../../../devices/moore/moore_common.h"
#include "../../../devices/moore/moore_kernel_common.h"
#include "../../../tensor.h"
#include "embedding_moore_kernel.h"
#include "embedding_moore.h"
#include <musa_runtime.h>
template <typename T, typename IndexType>
INFINIOP_MOORE_KERNEL embeddingKernel(
T *__restrict__ output,
const IndexType *__restrict__ indices,
const T *__restrict__ weight,
size_t num_indices,
size_t embedding_dim,
size_t vocab_size) {
// Calculate global thread index
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < num_indices) {
// Get the index value with Moore-optimized memory access
IndexType index_val = indices[idx];
// Bounds check - handle negative indices gracefully
if (index_val >= 0 && static_cast<size_t>(index_val) < vocab_size) {
// Copy embedding vector from weight to output
const T *src = weight + static_cast<size_t>(index_val) * embedding_dim;
T *dst = output + idx * embedding_dim;
// Choose optimal copy strategy based on type and alignment
if constexpr (std::is_same_v<T, float>) {
// Check alignment for float4 (16 bytes)
bool aligned_16 = is_aligned(src, 16) && is_aligned(dst, 16);
if (aligned_16 && embedding_dim >= 4 && embedding_dim % 4 == 0) {
copyVectorizedFloat4<IndexType>(dst, src, embedding_dim);
} else if (embedding_dim >= 2 && embedding_dim % 2 == 0) {
// Try float2 if not aligned to 16 bytes
copyVectorizedFloat2<IndexType>(dst, src, embedding_dim);
} else {
copyScalar<T, IndexType>(dst, src, embedding_dim);
}
} else if constexpr (std::is_same_v<T, half>) {
// Use half2 for vectorized access
if (embedding_dim >= 2 && embedding_dim % 2 == 0) {
copyVectorizedHalf2<IndexType>(dst, src, embedding_dim);
} else {
copyScalar<T, IndexType>(dst, src, embedding_dim);
}
} else if constexpr (std::is_same_v<T, __mt_bfloat16>) {
// Use mt_bfloat162 for vectorized access (Moore-specific type)
if (embedding_dim >= 2 && embedding_dim % 2 == 0) {
copyVectorizedBFloat162<IndexType>(dst, src, embedding_dim);
} else {
copyScalar<T, IndexType>(dst, src, embedding_dim);
}
} else {
// Fallback to scalar copy with Moore-optimized memory access
copyScalar<T, IndexType>(dst, src, embedding_dim);
}
}
}
}
namespace op::embedding::moore {
struct Descriptor::Opaque {
std::shared_ptr<device::moore::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t output_desc,
infiniopTensorDescriptor_t input_desc,
infiniopTensorDescriptor_t weight_desc) {
auto input_shape = input_desc->shape();
auto weight_shape = weight_desc->shape();
// Validate shapes
CHECK_OR_RETURN(weight_shape.size() == 2, INFINI_STATUS_BAD_TENSOR_SHAPE);
CHECK_OR_RETURN(output_desc->shape().size() == input_shape.size() + 1, INFINI_STATUS_BAD_TENSOR_SHAPE);
// Check output shape matches input shape + embedding_dim
auto output_shape = output_desc->shape();
size_t embedding_dim = weight_shape[1];
CHECK_OR_RETURN(output_shape.back() == embedding_dim, INFINI_STATUS_BAD_TENSOR_SHAPE);
for (size_t i = 0; i < input_shape.size(); ++i) {
CHECK_OR_RETURN(output_shape[i] == input_shape[i], INFINI_STATUS_BAD_TENSOR_SHAPE);
}
// Validate dtypes
auto input_dtype = input_desc->dtype();
auto weight_dtype = weight_desc->dtype();
CHECK_OR_RETURN(input_dtype == INFINI_DTYPE_I32 || input_dtype == INFINI_DTYPE_I64,
INFINI_STATUS_BAD_TENSOR_DTYPE);
CHECK_OR_RETURN(weight_dtype == INFINI_DTYPE_F32 || weight_dtype == INFINI_DTYPE_F16 || weight_dtype == INFINI_DTYPE_BF16, INFINI_STATUS_BAD_TENSOR_DTYPE);
CHECK_OR_RETURN(output_desc->dtype() == weight_dtype, INFINI_STATUS_BAD_TENSOR_DTYPE);
// Calculate number of indices (supporting batch dimension)
size_t num_indices = 1;
for (auto dim : input_shape) {
num_indices *= dim;
}
size_t vocab_size = weight_shape[0];
*desc_ptr = new Descriptor(
num_indices,
embedding_dim,
vocab_size,
input_dtype,
weight_dtype,
new Opaque{reinterpret_cast<device::moore::Handle *>(handle)->internal()},
handle->device,
handle->device_id);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *output,
const void *input,
const void *weight,
void *stream) const {
if (_num_indices == 0) {
return INFINI_STATUS_SUCCESS;
}
auto musa_stream = reinterpret_cast<musaStream_t>(stream);
// Dynamic block size optimization based on embedding_dim
// Moore platform typically has different performance characteristics
size_t block_size = 256; // Default for Moore
if (_embedding_dim <= 64) {
block_size = 512; // Small embedding_dim: use larger block for better occupancy
} else if (_embedding_dim >= 1024) {
block_size = 128; // Large embedding_dim: use smaller block to reduce register pressure
} else if (_embedding_dim <= 256) {
block_size = 384; // Medium embedding_dim: balanced configuration
}
size_t grid_size = (_num_indices + block_size - 1) / block_size;
// Launch kernel based on dtypes
// Note: Moore uses __mt_bfloat16 instead of __nv_bfloat16
if (_input_dtype == INFINI_DTYPE_I32) {
const int32_t *indices_ptr = reinterpret_cast<const int32_t *>(input);
if (_weight_dtype == INFINI_DTYPE_F32) {
embeddingKernel<float, int32_t><<<grid_size, block_size, 0, musa_stream>>>(
reinterpret_cast<float *>(output),
indices_ptr,
reinterpret_cast<const float *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else if (_weight_dtype == INFINI_DTYPE_F16) {
embeddingKernel<half, int32_t><<<grid_size, block_size, 0, musa_stream>>>(
reinterpret_cast<half *>(output),
indices_ptr,
reinterpret_cast<const half *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else if (_weight_dtype == INFINI_DTYPE_BF16) {
// Use Moore's bfloat16 type
embeddingKernel<__mt_bfloat16, int32_t><<<grid_size, block_size, 0, musa_stream>>>(
reinterpret_cast<__mt_bfloat16 *>(output),
indices_ptr,
reinterpret_cast<const __mt_bfloat16 *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} else if (_input_dtype == INFINI_DTYPE_I64) {
const int64_t *indices_ptr = reinterpret_cast<const int64_t *>(input);
if (_weight_dtype == INFINI_DTYPE_F32) {
embeddingKernel<float, int64_t><<<grid_size, block_size, 0, musa_stream>>>(
reinterpret_cast<float *>(output),
indices_ptr,
reinterpret_cast<const float *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else if (_weight_dtype == INFINI_DTYPE_F16) {
embeddingKernel<half, int64_t><<<grid_size, block_size, 0, musa_stream>>>(
reinterpret_cast<half *>(output),
indices_ptr,
reinterpret_cast<const half *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else if (_weight_dtype == INFINI_DTYPE_BF16) {
embeddingKernel<__mt_bfloat16, int64_t><<<grid_size, block_size, 0, musa_stream>>>(
reinterpret_cast<__mt_bfloat16 *>(output),
indices_ptr,
reinterpret_cast<const __mt_bfloat16 *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
// Check for kernel launch errors
musaError_t err = musaGetLastError();
if (err != musaSuccess) {
return INFINI_STATUS_INTERNAL_ERROR;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::embedding::moore
src/infiniop/ops/embedding/moore/embedding_moore_kernel.h (new file)

#ifndef __EMBEDDING_MOORE_KERNEL_CUH__
#define __EMBEDDING_MOORE_KERNEL_CUH__

#include <type_traits>

// Helper function to check memory alignment
__forceinline__ __device__ bool is_aligned(const void *ptr, size_t alignment) {
    // Use size_t for pointer arithmetic in device code (more compatible)
    return (reinterpret_cast<size_t>(ptr) % alignment == 0);
}

// Vectorized copy for float type using float4
template <typename IndexType>
__forceinline__ __device__ void copyVectorizedFloat4(
    float *__restrict__ dst,
    const float *__restrict__ src,
    size_t embedding_dim) {
    // Use float4 for vectorized access (16 bytes, 4 floats)
    const float4 *src_vec = reinterpret_cast<const float4 *>(src);
    float4 *dst_vec = reinterpret_cast<float4 *>(dst);
    size_t vec_count = embedding_dim / 4;
    // Vectorized copy with __ldg equivalent for Moore platform
    for (size_t i = 0; i < vec_count; ++i) {
        dst_vec[i] = src_vec[i];
    }
    // Copy remaining elements
    size_t remaining = embedding_dim % 4;
    if (remaining > 0) {
        size_t offset = vec_count * 4;
        for (size_t i = 0; i < remaining; ++i) {
            dst[offset + i] = src[offset + i];
        }
    }
}

// Vectorized copy for float type using float2 (fallback when not aligned to 16 bytes)
template <typename IndexType>
__forceinline__ __device__ void copyVectorizedFloat2(
    float *__restrict__ dst,
    const float *__restrict__ src,
    size_t embedding_dim) {
    // Use float2 for vectorized access (8 bytes, 2 floats)
    const float2 *src_vec = reinterpret_cast<const float2 *>(src);
    float2 *dst_vec = reinterpret_cast<float2 *>(dst);
    size_t vec_count = embedding_dim / 2;
    // Vectorized copy with Moore-optimized memory access
    for (size_t i = 0; i < vec_count; ++i) {
        dst_vec[i] = src_vec[i];
    }
    // Copy remaining element if odd
    if (embedding_dim % 2 != 0) {
        dst[embedding_dim - 1] = src[embedding_dim - 1];
    }
}

// Vectorized copy for half type using half2
template <typename IndexType>
__forceinline__ __device__ void copyVectorizedHalf2(
    half *__restrict__ dst,
    const half *__restrict__ src,
    size_t embedding_dim) {
    // Use half2 for vectorized access (4 bytes, 2 halfs)
    const half2 *src_vec = reinterpret_cast<const half2 *>(src);
    half2 *dst_vec = reinterpret_cast<half2 *>(dst);
    size_t vec_count = embedding_dim / 2;
    // Vectorized copy optimized for Moore architecture
    for (size_t i = 0; i < vec_count; ++i) {
        dst_vec[i] = src_vec[i];
    }
    // Copy remaining element if odd
    if (embedding_dim % 2 != 0) {
        dst[embedding_dim - 1] = src[embedding_dim - 1];
    }
}

// Vectorized copy for Moore bfloat16 type using bfloat162
template <typename IndexType>
__forceinline__ __device__ void copyVectorizedBFloat162(
    __mt_bfloat16 *__restrict__ dst,
    const __mt_bfloat16 *__restrict__ src,
    size_t embedding_dim) {
    // Use mt_bfloat162 for vectorized access (4 bytes, 2 bfloat16s)
    const __mt_bfloat162 *src_vec = reinterpret_cast<const __mt_bfloat162 *>(src);
    __mt_bfloat162 *dst_vec = reinterpret_cast<__mt_bfloat162 *>(dst);
    size_t vec_count = embedding_dim / 2;
    // Vectorized copy with Moore-specific optimization
    for (size_t i = 0; i < vec_count; ++i) {
        dst_vec[i] = src_vec[i];
    }
    // Copy remaining element if odd
    if (embedding_dim % 2 != 0) {
        dst[embedding_dim - 1] = src[embedding_dim - 1];
    }
}

// Scalar copy fallback with Moore-optimized memory access
template <typename T, typename IndexType>
__forceinline__ __device__ void copyScalar(
    T *__restrict__ dst,
    const T *__restrict__ src,
    size_t embedding_dim) {
    // Scalar copy with Moore read-only weight optimization
    for (size_t i = 0; i < embedding_dim; ++i) {
        dst[i] = src[i];
    }
}

#endif // __EMBEDDING_MOORE_KERNEL_CUH__
src/infiniop/ops/embedding/nvidia/embedding_nvidia.cu (new file)

#include "../../../../utils.h"
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include "../../../tensor.h"
#include "../cuda/embedding_kernel.cuh"
#include "embedding_nvidia.cuh"
#include <cuda_runtime.h>

template <typename T, typename IndexType>
INFINIOP_CUDA_KERNEL embeddingKernel(
    T *__restrict__ output,
    const IndexType *__restrict__ indices,
    const T *__restrict__ weight,
    size_t num_indices,
    size_t embedding_dim,
    size_t vocab_size) {
    // Calculate global thread index
    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < num_indices) {
        // Get the index value
        IndexType index_val = __ldg(&indices[idx]);
        // Bounds check - handle negative indices gracefully
        if (index_val >= 0 && static_cast<size_t>(index_val) < vocab_size) {
            // Copy embedding vector from weight to output
            const T *src = weight + static_cast<size_t>(index_val) * embedding_dim;
            T *dst = output + idx * embedding_dim;
            // Choose optimal copy strategy based on type and alignment
            if constexpr (std::is_same_v<T, float>) {
                // Check alignment for float4 (16 bytes)
                bool aligned_16 = is_aligned(src, 16) && is_aligned(dst, 16);
                if (aligned_16 && embedding_dim >= 4 && embedding_dim % 4 == 0) {
                    copyVectorizedFloat4<IndexType>(dst, src, embedding_dim);
                } else if (embedding_dim >= 2 && embedding_dim % 2 == 0) {
                    // Try float2 if not aligned to 16 bytes
                    copyVectorizedFloat2<IndexType>(dst, src, embedding_dim);
                } else {
                    copyScalar<T, IndexType>(dst, src, embedding_dim);
                }
            } else if constexpr (std::is_same_v<T, half>) {
                // Use half2 for vectorized access
                if (embedding_dim >= 2 && embedding_dim % 2 == 0) {
                    copyVectorizedHalf2<IndexType>(dst, src, embedding_dim);
                } else {
                    copyScalar<T, IndexType>(dst, src, embedding_dim);
                }
            } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
                // Use bfloat162 for vectorized access
                if (embedding_dim >= 2 && embedding_dim % 2 == 0) {
                    copyVectorizedBFloat162<IndexType>(dst, src, embedding_dim);
                } else {
                    copyScalar<T, IndexType>(dst, src, embedding_dim);
                }
            } else {
                // Fallback to scalar copy with __ldg
                copyScalar<T, IndexType>(dst, src, embedding_dim);
            }
        }
    }
}

namespace op::embedding::nvidia {

struct Descriptor::Opaque {
    std::shared_ptr<device::nvidia::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
    delete _opaque;
}

infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t output_desc,
    infiniopTensorDescriptor_t input_desc,
    infiniopTensorDescriptor_t weight_desc) {
    auto input_shape = input_desc->shape();
    auto weight_shape = weight_desc->shape();
    // Validate shapes
    CHECK_OR_RETURN(weight_shape.size() == 2, INFINI_STATUS_BAD_TENSOR_SHAPE);
    CHECK_OR_RETURN(output_desc->shape().size() == input_shape.size() + 1, INFINI_STATUS_BAD_TENSOR_SHAPE);
    // Check output shape matches input shape + embedding_dim
    auto output_shape = output_desc->shape();
    size_t embedding_dim = weight_shape[1];
    CHECK_OR_RETURN(output_shape.back() == embedding_dim, INFINI_STATUS_BAD_TENSOR_SHAPE);
    for (size_t i = 0; i < input_shape.size(); ++i) {
        CHECK_OR_RETURN(output_shape[i] == input_shape[i], INFINI_STATUS_BAD_TENSOR_SHAPE);
    }
    // Validate dtypes
    auto input_dtype = input_desc->dtype();
    auto weight_dtype = weight_desc->dtype();
    CHECK_OR_RETURN(input_dtype == INFINI_DTYPE_I32 || input_dtype == INFINI_DTYPE_I64,
                    INFINI_STATUS_BAD_TENSOR_DTYPE);
    CHECK_OR_RETURN(weight_dtype == INFINI_DTYPE_F32 || weight_dtype == INFINI_DTYPE_F16 || weight_dtype == INFINI_DTYPE_BF16,
                    INFINI_STATUS_BAD_TENSOR_DTYPE);
    CHECK_OR_RETURN(output_desc->dtype() == weight_dtype, INFINI_STATUS_BAD_TENSOR_DTYPE);
    // Calculate number of indices (supporting batch dimension)
    size_t num_indices = 1;
    for (auto dim : input_shape) {
        num_indices *= dim;
    }
    size_t vocab_size = weight_shape[0];
    *desc_ptr = new Descriptor(
        num_indices,
        embedding_dim,
        vocab_size,
        input_dtype,
        weight_dtype,
        new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
        handle->device,
        handle->device_id);
    return INFINI_STATUS_SUCCESS;
}

infiniStatus_t Descriptor::calculate(
    void *output,
    const void *input,
    const void *weight,
    void *stream) const {
    if (_num_indices == 0) {
        return INFINI_STATUS_SUCCESS;
    }
    auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
    // Dynamic block size optimization based on embedding_dim
    // Smaller embedding_dim benefits from larger block size (better occupancy)
    // Larger embedding_dim benefits from smaller block size (more registers per thread)
    size_t block_size = 256; // Default
    if (_embedding_dim <= 64) {
        block_size = 512; // Small embedding_dim: use larger block for better occupancy
    } else if (_embedding_dim >= 1024) {
        block_size = 128; // Large embedding_dim: use smaller block to reduce register pressure
    }
    size_t grid_size = (_num_indices + block_size - 1) / block_size;
    // Launch kernel based on dtypes
    if (_input_dtype == INFINI_DTYPE_I32) {
        const int32_t *indices_ptr = reinterpret_cast<const int32_t *>(input);
        if (_weight_dtype == INFINI_DTYPE_F32) {
            embeddingKernel<float, int32_t><<<grid_size, block_size, 0, cuda_stream>>>(
                reinterpret_cast<float *>(output),
                indices_ptr,
                reinterpret_cast<const float *>(weight),
                _num_indices,
                _embedding_dim,
                _vocab_size);
        } else if (_weight_dtype == INFINI_DTYPE_F16) {
            embeddingKernel<half, int32_t><<<grid_size, block_size, 0, cuda_stream>>>(
                reinterpret_cast<half *>(output),
                indices_ptr,
                reinterpret_cast<const half *>(weight),
                _num_indices,
                _embedding_dim,
                _vocab_size);
        } else if (_weight_dtype == INFINI_DTYPE_BF16) {
            embeddingKernel<cuda_bfloat16, int32_t><<<grid_size, block_size, 0, cuda_stream>>>(
                reinterpret_cast<cuda_bfloat16 *>(output),
                indices_ptr,
                reinterpret_cast<const cuda_bfloat16 *>(weight),
                _num_indices,
                _embedding_dim,
                _vocab_size);
        } else {
            return INFINI_STATUS_BAD_TENSOR_DTYPE;
        }
    } else if (_input_dtype == INFINI_DTYPE_I64) {
        const int64_t *indices_ptr = reinterpret_cast<const int64_t *>(input);
        if (_weight_dtype == INFINI_DTYPE_F32) {
            embeddingKernel<float, int64_t><<<grid_size, block_size, 0, cuda_stream>>>(
                reinterpret_cast<float *>(output),
                indices_ptr,
                reinterpret_cast<const float *>(weight),
                _num_indices,
                _embedding_dim,
                _vocab_size);
        } else if (_weight_dtype == INFINI_DTYPE_F16) {
            embeddingKernel<half, int64_t><<<grid_size, block_size, 0, cuda_stream>>>(
                reinterpret_cast<half *>(output),
                indices_ptr,
                reinterpret_cast<const half *>(weight),
                _num_indices,
                _embedding_dim,
                _vocab_size);
        } else if (_weight_dtype == INFINI_DTYPE_BF16) {
            embeddingKernel<cuda_bfloat16, int64_t><<<grid_size, block_size, 0, cuda_stream>>>(
                reinterpret_cast<cuda_bfloat16 *>(output),
                indices_ptr,
                reinterpret_cast<const cuda_bfloat16 *>(weight),
                _num_indices,
                _embedding_dim,
                _vocab_size);
        } else {
            return INFINI_STATUS_BAD_TENSOR_DTYPE;
        }
    } else {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // Check for kernel launch errors
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        return INFINI_STATUS_INTERNAL_ERROR;
    }
    return INFINI_STATUS_SUCCESS;
}

} // namespace op::embedding::nvidia
src/infiniop/ops/embedding/nvidia/embedding_nvidia.cuh (new file)

#ifndef __EMBEDDING_CUDA_H__
#define __EMBEDDING_CUDA_H__

#include "../embedding.h"

DESCRIPTOR(nvidia)

#endif // __EMBEDDING_CUDA_H__
src/infiniop/ops/embedding/operator.cc (new file)

#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/embedding.h"

#ifdef ENABLE_CPU_API
#include "cpu/embedding_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_ALI_API)
#include "nvidia/embedding_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "metax/embedding_metax.cuh"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/embedding_moore.h"
#endif

__C infiniStatus_t infiniopCreateEmbeddingDescriptor(
    infiniopHandle_t handle,
    infiniopEmbeddingDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t output_desc,
    infiniopTensorDescriptor_t input_desc,
    infiniopTensorDescriptor_t weight_desc) {

#define CREATE(CASE, NAMESPACE)                                                   \
    case CASE:                                                                    \
        return op::embedding::NAMESPACE::Descriptor::create(                      \
            handle,                                                               \
            reinterpret_cast<op::embedding::NAMESPACE::Descriptor **>(desc_ptr),  \
            output_desc,                                                          \
            input_desc,                                                           \
            weight_desc)

    switch (handle->device) {
#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_ALI_API
        CREATE(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_QY_API
        CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
        CREATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_METAX_API
        CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
        CREATE(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CREATE
}

__C infiniStatus_t infiniopEmbedding(
    infiniopEmbeddingDescriptor_t desc,
    void *output,
    const void *input,
    const void *weight,
    void *stream) {

#define CALCULATE(CASE, NAMESPACE)                                                   \
    case CASE:                                                                       \
        return reinterpret_cast<const op::embedding::NAMESPACE::Descriptor *>(desc)  \
            ->calculate(output, input, weight, stream)

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_ALI_API
        CALCULATE(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_QY_API
        CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
        CALCULATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_METAX_API
        CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
        CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CALCULATE
}

__C infiniStatus_t infiniopDestroyEmbeddingDescriptor(
    infiniopEmbeddingDescriptor_t desc) {

#define DESTROY(CASE, NAMESPACE)                                                      \
    case CASE:                                                                        \
        delete reinterpret_cast<const op::embedding::NAMESPACE::Descriptor *>(desc);  \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        DESTROY(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        DESTROY(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        DESTROY(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_ALI_API
        DESTROY(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_QY_API
        DESTROY(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
        DESTROY(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_METAX_API
        DESTROY(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
        DESTROY(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef DESTROY
    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
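
Taken together, these three entry points give the usual create/run/destroy lifecycle for the new embedding operator. A hypothetical call sequence (handle creation, tensor descriptors, and device buffers are assumed to be set up beforehand in the usual infiniop way; not code from the PR):

    // Sketch of how a caller would drive the new API: create a descriptor from
    // the tensor descriptors, gather one weight row per index, then destroy it.
    infiniStatus_t runEmbedding(infiniopHandle_t handle,
                                infiniopTensorDescriptor_t output_desc,
                                infiniopTensorDescriptor_t input_desc,
                                infiniopTensorDescriptor_t weight_desc,
                                void *output, const void *input, const void *weight,
                                void *stream) {
        infiniopEmbeddingDescriptor_t desc = nullptr;
        infiniStatus_t status = infiniopCreateEmbeddingDescriptor(
            handle, &desc, output_desc, input_desc, weight_desc);
        if (status != INFINI_STATUS_SUCCESS) {
            return status;
        }
        // output[i, :] = weight[input[i], :] for every index in the input tensor.
        status = infiniopEmbedding(desc, output, input, weight, stream);
        infiniopDestroyEmbeddingDescriptor(desc);
        return status;
    }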