jerrrrry / infinicore · Commit 8d09630a (unverified)

Merge branch 'demo131' into Issue/862

Authored Feb 11, 2026 by gongchensu; committed by GitHub, Feb 11, 2026.
Parents: ab52dead, 012df56c
Changes: 387 files in total; this page shows 20 changed files with 1345 additions and 12 deletions (+1345 −12).
src/infiniop/ops/add_rms_norm/nvidia/add_rms_norm_nvidia.cu         +183  −0
src/infiniop/ops/add_rms_norm/nvidia/add_rms_norm_nvidia.cuh        +8    −0
src/infiniop/ops/add_rms_norm/operator.cc                           +223  −0
src/infiniop/ops/causal_softmax/bang/causal_softmax_bang.mlu        +1    −1
src/infiniop/ops/causal_softmax/moore/causal_softmax_kernel.h       +1    −1
src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu     +9    −5
src/infiniop/ops/causal_softmax/operator.cc                         +13   −1
src/infiniop/ops/clip/operator.cc                                   +13   −1
src/infiniop/ops/conv/operator.cc                                   +13   −1
src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_nvidia.cu  +1    −1
src/infiniop/ops/dequantize_awq/operator.cc                         +17   −1
src/infiniop/ops/embedding/cpu/embedding_cpu.cc                     +109  −0
src/infiniop/ops/embedding/cpu/embedding_cpu.h                      +8    −0
src/infiniop/ops/embedding/cuda/embedding_kernel.cuh                +116  −0
src/infiniop/ops/embedding/embedding.h                              +54   −0
src/infiniop/ops/embedding/metax/embedding_metax.cuh                +8    −0
src/infiniop/ops/embedding/metax/embedding_metax.maca               +217  −0
src/infiniop/ops/embedding/moore/embedding_moore.h                  +8    −0
src/infiniop/ops/embedding/moore/embedding_moore.mu                 +227  −0
src/infiniop/ops/embedding/moore/embedding_moore_kernel.h           +116  −0
src/infiniop/ops/add_rms_norm/nvidia/add_rms_norm_nvidia.cu (new file, mode 100644)
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "add_rms_norm_nvidia.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include <cub/block/block_reduce.cuh>
#include "../../../reduce/cuda/reduce.cuh"
#include "../cuda/kernel.cuh"
template <unsigned int BLOCK_SIZE, typename Tcompute, typename Tdata, typename Tweight>
INFINIOP_CUDA_KERNEL add_rmsnormKernel(
    Tdata *__restrict__ y,
    Tdata *__restrict__ residual_out,
    ptrdiff_t stride_y_batch,
    ptrdiff_t stride_y_nhead,
    ptrdiff_t stride_residual_out_batch,
    ptrdiff_t stride_residual_out_nhead,
    const Tdata *__restrict__ a,
    ptrdiff_t stride_a_batch,
    ptrdiff_t stride_a_nhead,
    const Tdata *__restrict__ b,
    ptrdiff_t stride_b_batch,
    ptrdiff_t stride_b_nhead,
    const Tweight *__restrict__ w,
    size_t nhead,
    size_t dim,
    float epsilon) {
    add_rmsnormBlock<BLOCK_SIZE, Tcompute>(
        y, residual_out,
        stride_y_batch, stride_y_nhead,
        stride_residual_out_batch, stride_residual_out_nhead,
        a, stride_a_batch, stride_a_nhead,
        b, stride_b_batch, stride_b_nhead,
        w, nhead, dim, epsilon);
}
namespace op::add_rms_norm::nvidia {

struct Descriptor::Opaque {
    std::shared_ptr<device::nvidia::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
    delete _opaque;
}

infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t residual_out_desc,
    infiniopTensorDescriptor_t a_desc,
    infiniopTensorDescriptor_t b_desc,
    infiniopTensorDescriptor_t weight_desc,
    float epsilon) {
    auto result = AddRMSNormInfo::create(y_desc, residual_out_desc, a_desc, b_desc, weight_desc, epsilon);
    CHECK_RESULT(result);
    auto info = result.take();

    *desc_ptr = new Descriptor(
        new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
        std::move(info),
        0,
        handle->device,
        handle->device_id);

    return INFINI_STATUS_SUCCESS;
}
// launch kernel with different data types
template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(
    uint32_t batch_size,
    size_t nhead,
    size_t dim,
    void *y,
    infiniDtype_t atype,
    ptrdiff_t stride_y_batch,
    ptrdiff_t stride_y_nhead,
    void *residual_out,
    ptrdiff_t stride_residual_out_batch,
    ptrdiff_t stride_residual_out_nhead,
    const void *a,
    ptrdiff_t stride_a_batch,
    ptrdiff_t stride_a_nhead,
    const void *b,
    ptrdiff_t stride_b_batch,
    ptrdiff_t stride_b_nhead,
    const void *w,
    infiniDtype_t wtype,
    float epsilon,
    cudaStream_t cuda_stream) {
#define LAUNCH_KERNEL(Tdata, Tweight, Tcompute) \
add_rmsnormKernel<BLOCK_SIZE, Tcompute, Tdata, Tweight><<<batch_size * nhead, BLOCK_SIZE, 0, cuda_stream>>>( \
reinterpret_cast<Tdata *>(y), \
reinterpret_cast<Tdata *>(residual_out), \
stride_y_batch, \
stride_y_nhead, \
stride_residual_out_batch, \
stride_residual_out_nhead, \
reinterpret_cast<const Tdata *>(a), \
stride_a_batch, \
stride_a_nhead, \
reinterpret_cast<const Tdata *>(b), \
stride_b_batch, \
stride_b_nhead, \
reinterpret_cast<const Tweight *>(w), \
nhead, \
dim, \
epsilon)
    if (atype == INFINI_DTYPE_F16 && wtype == INFINI_DTYPE_F16) {
        LAUNCH_KERNEL(half, half, float);
    } else if (atype == INFINI_DTYPE_F16 && wtype == INFINI_DTYPE_BF16) {
        LAUNCH_KERNEL(half, __nv_bfloat16, float);
    } else if (atype == INFINI_DTYPE_F16 && wtype == INFINI_DTYPE_F32) {
        LAUNCH_KERNEL(half, float, float);
    } else if (atype == INFINI_DTYPE_BF16 && wtype == INFINI_DTYPE_BF16) {
        LAUNCH_KERNEL(__nv_bfloat16, __nv_bfloat16, float);
    } else if (atype == INFINI_DTYPE_BF16 && wtype == INFINI_DTYPE_F16) {
        LAUNCH_KERNEL(__nv_bfloat16, half, float);
    } else if (atype == INFINI_DTYPE_BF16 && wtype == INFINI_DTYPE_F32) {
        LAUNCH_KERNEL(__nv_bfloat16, float, float);
    } else if (atype == INFINI_DTYPE_F32 && wtype == INFINI_DTYPE_F32) {
        LAUNCH_KERNEL(float, float, float);
    } else {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }

#undef LAUNCH_KERNEL

    return INFINI_STATUS_SUCCESS;
}
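For reference, each branch is a plain expansion of the LAUNCH_KERNEL macro above; the template order is <BLOCK_SIZE, Tcompute, Tdata, Tweight>, and one thread block is launched per (batch, head) row. A sketch of what LAUNCH_KERNEL(half, half, float) expands to:

    add_rmsnormKernel<BLOCK_SIZE, float, half, half><<<batch_size * nhead, BLOCK_SIZE, 0, cuda_stream>>>(
        reinterpret_cast<half *>(y),
        reinterpret_cast<half *>(residual_out),
        stride_y_batch, stride_y_nhead,
        stride_residual_out_batch, stride_residual_out_nhead,
        reinterpret_cast<const half *>(a), stride_a_batch, stride_a_nhead,
        reinterpret_cast<const half *>(b), stride_b_batch, stride_b_nhead,
        reinterpret_cast<const half *>(w),
        nhead, dim, epsilon);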
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *y,
    void *residual_out,
    const void *a,
    const void *b,
    const void *weight,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

    auto stride_a_batch = _info.a_strides[0];
    auto stride_a_nhead = _info.a_strides[1];
    auto stride_b_batch = _info.b_strides[0];
    auto stride_b_nhead = _info.b_strides[1];
    auto stride_y_batch = _info.y_strides[0];
    auto stride_y_nhead = _info.y_strides[1];
    auto stride_residual_out_batch = _info.residual_out_strides[0];
    auto stride_residual_out_nhead = _info.residual_out_strides[1];
    auto dim = _info.dim();
    uint32_t batch_size = static_cast<uint32_t>(_info.shape[0]);
    size_t nhead = _info.shape.size() > 2 ? _info.shape[1] : 1;
    auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);

    // launch kernel with different block sizes
    if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) {
        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_512>(
            batch_size, nhead, dim,
            y, _info.atype, stride_y_batch, stride_y_nhead,
            residual_out, stride_residual_out_batch, stride_residual_out_nhead,
            a, stride_a_batch, stride_a_nhead,
            b, stride_b_batch, stride_b_nhead,
            weight, _info.wtype, _info.epsilon, cuda_stream));
    } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_1024>(
            batch_size, nhead, dim,
            y, _info.atype, stride_y_batch, stride_y_nhead,
            residual_out, stride_residual_out_batch, stride_residual_out_nhead,
            a, stride_a_batch, stride_a_nhead,
            b, stride_b_batch, stride_b_nhead,
            weight, _info.wtype, _info.epsilon, cuda_stream));
    } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_2048) {
        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_2048>(
            batch_size, nhead, dim,
            y, _info.atype, stride_y_batch, stride_y_nhead,
            residual_out, stride_residual_out_batch, stride_residual_out_nhead,
            a, stride_a_batch, stride_a_nhead,
            b, stride_b_batch, stride_b_nhead,
            weight, _info.wtype, _info.epsilon, cuda_stream));
    } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(
            batch_size, nhead, dim,
            y, _info.atype, stride_y_batch, stride_y_nhead,
            residual_out, stride_residual_out_batch, stride_residual_out_nhead,
            a, stride_a_batch, stride_a_nhead,
            b, stride_b_batch, stride_b_nhead,
            weight, _info.wtype, _info.epsilon, cuda_stream));
    } else {
        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
    }

    return INFINI_STATUS_SUCCESS;
}

} // namespace op::add_rms_norm::nvidia
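The shared add_rmsnormBlock body lives in ../cuda/kernel.cuh and is not part of this page. Judging from the operator's name and signature, it presumably computes the conventional fused residual-add + RMSNorm per (batch, head) row, accumulating in Tcompute (float) and storing back in Tdata:

    residual_out = a + b
    y = w * (a + b) / sqrt(mean((a + b)^2) + epsilon)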
src/infiniop/ops/add_rms_norm/nvidia/add_rms_norm_nvidia.cuh (new file, mode 100644)
#ifndef __ADD_RMS_NORM_NVIDIA_CUDA_H__
#define __ADD_RMS_NORM_NVIDIA_CUDA_H__
#include "../add_rms_norm.h"
DESCRIPTOR(nvidia)
#endif
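DESCRIPTOR(nvidia) comes from ../add_rms_norm.h, which this page does not show. Judging from how operator.cc below uses the class, and by analogy with the DESCRIPTOR macro spelled out in full in embedding.h further down, it presumably expands to a per-backend Descriptor declaration along these lines (a sketch, not the actual header):

    namespace op::add_rms_norm::nvidia {
    class Descriptor final : public InfiniopDescriptor {
        struct Opaque; // backend-specific state, defined in the .cu file
        Opaque *_opaque;
        // ... op metadata (AddRMSNormInfo) and _workspace_size ...
    public:
        ~Descriptor();
        size_t workspaceSize() const;
        static infiniStatus_t create(
            infiniopHandle_t handle, Descriptor **desc_ptr,
            infiniopTensorDescriptor_t y_desc,
            infiniopTensorDescriptor_t residual_out_desc,
            infiniopTensorDescriptor_t a_desc,
            infiniopTensorDescriptor_t b_desc,
            infiniopTensorDescriptor_t weight_desc,
            float epsilon);
        infiniStatus_t calculate(
            void *workspace, size_t workspace_size,
            void *y, void *residual_out,
            const void *a, const void *b, const void *weight,
            void *stream) const;
    };
    } // namespace op::add_rms_norm::nvidia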
src/infiniop/ops/add_rms_norm/operator.cc (new file, mode 100644)
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/add_rms_norm.h"
#ifdef ENABLE_CPU_API
#include "cpu/add_rms_norm_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_ALI_API)
#include "nvidia/add_rms_norm_nvidia.cuh"
#endif
#ifdef ENABLE_ASCEND_API
// TODO: Add Ascend implementation
// #include "ascend/add_rms_norm_aclnn.h"
#endif
#ifdef ENABLE_CAMBRICON_API
// TODO: Add Cambricon implementation
// #include "bang/add_rms_norm_bang.h"
#endif
#ifdef ENABLE_METAX_API
#include "metax/add_rms_norm_metax.cuh"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/add_rms_norm_moore.h"
#endif
#ifdef ENABLE_KUNLUN_API
// TODO: Add Kunlun implementation
// #include "kunlun/add_rms_norm_kunlun.h"
#endif
__C infiniStatus_t infiniopCreateAddRMSNormDescriptor(
    infiniopHandle_t handle,
    infiniopAddRMSNormDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t residual_out_desc,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t a_desc,
    infiniopTensorDescriptor_t b_desc,
    infiniopTensorDescriptor_t weight_desc,
    float epsilon) {
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::add_rms_norm::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::add_rms_norm::NAMESPACE::Descriptor **>(desc_ptr), \
y_desc, \
residual_out_desc, \
a_desc, \
b_desc, \
weight_desc, \
epsilon)
    switch (handle->device) {

#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_ALI_API
        CREATE(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_MOORE_API
        CREATE(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_METAX_API
        CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_QY_API
        CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
        CREATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
        // CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif

    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CREATE
}
__C infiniStatus_t infiniopGetAddRMSNormWorkspaceSize(infiniopAddRMSNormDescriptor_t desc, size_t *size) {
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::add_rms_norm::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS
    switch (desc->device_type) {

#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_ALI_API
        GET(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_MOORE_API
        GET(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_METAX_API
        GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_QY_API
        GET(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
        GET(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
        // GET(INFINI_DEVICE_KUNLUN, kunlun);
#endif

    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef GET

    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniStatus_t infiniopAddRMSNorm(
    infiniopAddRMSNormDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *y,
    void *residual_out,
    const void *a,
    const void *b,
    const void *weight,
    void *stream) {
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<const op::add_rms_norm::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, y, residual_out, a, b, weight, stream)
    switch (desc->device_type) {

#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_ALI_API
        CALCULATE(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_MOORE_API
        CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_METAX_API
        CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_QY_API
        CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
        CALCULATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
        // CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif

    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CALCULATE
}
__C infiniStatus_t infiniopDestroyAddRMSNormDescriptor(infiniopAddRMSNormDescriptor_t desc) {
    if (desc == nullptr) {
        return INFINI_STATUS_SUCCESS;
    }
#define DESTROY(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<op::add_rms_norm::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS
    switch (desc->device_type) {

#ifdef ENABLE_CPU_API
        DESTROY(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        DESTROY(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        DESTROY(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_ALI_API
        DESTROY(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_MOORE_API
        DESTROY(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_METAX_API
        DESTROY(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_QY_API
        DESTROY(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
        DESTROY(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
        // DESTROY(INFINI_DEVICE_KUNLUN, kunlun);
#endif

    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef DESTROY
}
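Taken together, the four __C entry points follow the usual infiniop descriptor lifecycle: create a descriptor from tensor descriptors, query the workspace size, run, destroy. A minimal host-side sketch, assuming a valid infiniopHandle_t, tensor descriptors, and device buffers already exist (CHECK and device_malloc are hypothetical helpers; error handling elided):

    infiniopAddRMSNormDescriptor_t desc;
    CHECK(infiniopCreateAddRMSNormDescriptor(
        handle, &desc, residual_out_desc, y_desc, a_desc, b_desc, weight_desc, /*epsilon=*/1e-6f));

    size_t workspace_size = 0;
    CHECK(infiniopGetAddRMSNormWorkspaceSize(desc, &workspace_size));
    void *workspace = device_malloc(workspace_size); /* hypothetical allocator */

    CHECK(infiniopAddRMSNorm(desc, workspace, workspace_size,
                             y, residual_out, a, b, weight, stream));

    CHECK(infiniopDestroyAddRMSNormDescriptor(desc));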
src/infiniop/ops/causal_softmax/bang/causal_softmax_bang.mlu

...
@@ -131,7 +131,7 @@ void causalSoftmaxUnion(void *workspace, int core_per_cluster, int cluster_count
     kernel_dim.x = core_per_cluster;
     kernel_dim.y = cluster_count;
     kernel_dim.z = 1;
-    kernel_type = CNRT_FUNC_TYPE_UNION1;
+    kernel_type = cnrtFuncTypeUnion1;
     // Launch kernel
     causalSoftmax<T><<<kernel_dim, kernel_type, queue>>>(
...
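The one-line change swaps the legacy CNRT_FUNC_TYPE_UNION1 enumerator for cnrtFuncTypeUnion1, which appears to be the newer CNRT spelling of the same union-task launch type.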
src/infiniop/ops/causal_softmax/moore/causal_softmax_kernel.h

...
@@ -28,7 +28,7 @@ __device__ void causalSoftmaxKernel(
     //      1 | * * * ... * *   |
     //      2 | * * * ... * * * |
     // height: 3    col_id->
-    if (width + blockIdx.x >= threadIdx.x + height) {
+    if (width + blockIdx.x >= col + height) {
         if constexpr (std::is_same_v<Tdata, half> || std::is_same_v<Tdata, cuda_bfloat16>) {
             /*
              * MUSA does not support CUDA's native `hexp` function.
...
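The fix replaces threadIdx.x with col in the causal-mask test; presumably col is the column index the thread is currently processing, so the old test was only correct when each thread handled a single column.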
src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu

...
@@ -76,7 +76,15 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
     const void *x,
     void *stream_) const {
     cudaStream_t stream = (cudaStream_t)stream_;
-    if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
+    if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
+        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(
+            y, x, _info.dtype, _info.batch_size, _info.seq_len, _info.total_seq_len,
+            _info.y_stride_b, _info.y_stride_i, _info.x_stride_b, _info.x_stride_i, stream));
+    } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_2048) {
+        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_2048>(
+            y, x, _info.dtype, _info.batch_size, _info.seq_len, _info.total_seq_len,
+            _info.y_stride_b, _info.y_stride_i, _info.x_stride_b, _info.x_stride_i, stream));
+    } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
         CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_1024>(
             y, x, _info.dtype, _info.batch_size, _info.seq_len, _info.total_seq_len,
             _info.y_stride_b, _info.y_stride_i, _info.x_stride_b, _info.x_stride_i, stream));
...
@@ -84,10 +92,6 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
         CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_512>(
             y, x, _info.dtype, _info.batch_size, _info.seq_len, _info.total_seq_len,
             _info.y_stride_b, _info.y_stride_i, _info.x_stride_b, _info.x_stride_i, stream));
-    } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
-        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(
-            y, x, _info.dtype, _info.batch_size, _info.seq_len, _info.total_seq_len,
-            _info.y_stride_b, _info.y_stride_i, _info.x_stride_b, _info.x_stride_i, stream));
     } else {
         return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
     }
...
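Net effect of the two hunks: the block-size dispatch now checks 4096, 2048, 1024, then 512, and gains a CUDA_BLOCK_SIZE_2048 branch that was previously missing; before, the order was 1024, 512, 4096.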
src/infiniop/ops/causal_softmax/operator.cc

...
@@ -5,7 +5,7 @@
 #ifdef ENABLE_CPU_API
 #include "cpu/causal_softmax_cpu.h"
 #endif
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_ALI_API)
 #include "nvidia/causal_softmax_nvidia.cuh"
 #endif
 #ifdef ENABLE_METAX_API
...
@@ -48,6 +48,9 @@ __C infiniStatus_t infiniopCreateCausalSoftmaxDescriptor(
 #ifdef ENABLE_ILUVATAR_API
         CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+        CREATE(INFINI_DEVICE_ALI, nvidia);
+#endif
 #ifdef ENABLE_QY_API
         CREATE(INFINI_DEVICE_QY, nvidia);
 #endif
...
@@ -90,6 +93,9 @@ __C infiniStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDe
 #ifdef ENABLE_ILUVATAR_API
         GET(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+        GET(INFINI_DEVICE_ALI, nvidia);
+#endif
 #ifdef ENABLE_QY_API
         GET(INFINI_DEVICE_QY, nvidia);
 #endif
...
@@ -137,6 +143,9 @@ __C infiniStatus_t infiniopCausalSoftmax(
 #ifdef ENABLE_ILUVATAR_API
         CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+        CALCULATE(INFINI_DEVICE_ALI, nvidia);
+#endif
 #ifdef ENABLE_QY_API
         CALCULATE(INFINI_DEVICE_QY, nvidia);
 #endif
...
@@ -179,6 +188,9 @@ __C infiniStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxD
 #ifdef ENABLE_ILUVATAR_API
         DESTROY(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+        DESTROY(INFINI_DEVICE_ALI, nvidia);
+#endif
 #ifdef ENABLE_QY_API
         DESTROY(INFINI_DEVICE_QY, nvidia);
 #endif
...
src/infiniop/ops/clip/operator.cc

...
@@ -5,7 +5,7 @@
 #ifdef ENABLE_CPU_API
 #include "cpu/clip_cpu.h"
 #endif
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API)
 #include "nvidia/clip_nvidia.cuh"
 #endif
 #ifdef ENABLE_METAX_API
...
@@ -42,6 +42,9 @@ __C infiniStatus_t infiniopCreateClipDescriptor(
 #ifdef ENABLE_ILUVATAR_API
         CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+        CREATE(INFINI_DEVICE_ALI, nvidia);
+#endif
 #ifdef ENABLE_QY_API
         CREATE(INFINI_DEVICE_QY, nvidia);
 #endif
...
@@ -76,6 +79,9 @@ __C infiniStatus_t infiniopGetClipWorkspaceSize(infiniopClipDescriptor_t desc, s
 #ifdef ENABLE_ILUVATAR_API
         GET(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+        GET(INFINI_DEVICE_ALI, nvidia);
+#endif
 #ifdef ENABLE_QY_API
         GET(INFINI_DEVICE_QY, nvidia);
 #endif
...
@@ -118,6 +124,9 @@ __C infiniStatus_t infiniopClip(
 #ifdef ENABLE_ILUVATAR_API
         CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+        CALCULATE(INFINI_DEVICE_ALI, nvidia);
+#endif
 #ifdef ENABLE_QY_API
         CALCULATE(INFINI_DEVICE_QY, nvidia);
 #endif
...
@@ -154,6 +163,9 @@ infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc) {
 #ifdef ENABLE_ILUVATAR_API
         DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+        DELETE(INFINI_DEVICE_ALI, nvidia);
+#endif
 #ifdef ENABLE_QY_API
         DELETE(INFINI_DEVICE_QY, nvidia);
 #endif
...
src/infiniop/ops/conv/operator.cc

...
@@ -5,7 +5,7 @@
 #ifdef ENABLE_CPU_API
 #include "cpu/conv_cpu.h"
 #endif
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API)
 #include "nvidia/conv_nvidia.cuh"
 #endif
...
@@ -45,6 +45,9 @@ __C __export infiniStatus_t infiniopCreateConvDescriptor(infiniopHandle_t handle
 #ifdef ENABLE_QY_API
         CREATE(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+        CREATE(INFINI_DEVICE_ALI, nvidia);
+#endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
...
@@ -76,6 +79,9 @@ infiniopGetConvWorkspaceSize(
 #ifdef ENABLE_QY_API
         GET(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+        GET(INFINI_DEVICE_ALI, nvidia);
+#endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
...
@@ -115,6 +121,9 @@ __C infiniStatus_t infiniopConv(
 #ifdef ENABLE_QY_API
         CALCULATE(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+        CALCULATE(INFINI_DEVICE_ALI, nvidia);
+#endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
...
@@ -142,6 +151,9 @@ infiniopDestroyConvDescriptor(infiniopConvDescriptor_t desc) {
 #ifdef ENABLE_QY_API
         DELETE(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+        DELETE(INFINI_DEVICE_ALI, nvidia);
+#endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
...
src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_nvidia.cu

-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API)
 #include "../../../devices/nvidia/nvidia_handle.cuh"
 #include "../../../devices/nvidia/nvidia_kernel_common.cuh"
...
src/infiniop/ops/dequantize_awq/operator.cc

...
@@ -2,7 +2,7 @@
 #include "../../handle.h"
 #include "infiniop/ops/dequantize_awq.h"
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API)
 #include "nvidia/dequantize_w42f16_nvidia.cuh"
 #endif
 #ifdef ENABLE_MOORE_API
...
@@ -43,6 +43,10 @@ __C infiniStatus_t infiniopCreateDequantizeAWQDescriptor(
 #ifdef ENABLE_QY_API
         CREATE(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+        CREATE(INFINI_DEVICE_ALI, nvidia);
+#endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
     }
...
@@ -70,6 +74,10 @@ __C infiniStatus_t infiniopGetDequantizeAWQWorkspaceSize(infiniopDequantizeAWQDe
 #ifdef ENABLE_QY_API
         GET(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+        GET(INFINI_DEVICE_ALI, nvidia);
+#endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
     }
...
@@ -104,6 +112,10 @@ __C infiniStatus_t infiniopDequantizeAWQ(
 #ifdef ENABLE_QY_API
         CALCULATE(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+        CALCULATE(INFINI_DEVICE_ALI, nvidia);
+#endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
     }
...
@@ -132,6 +144,10 @@ infiniopDestroyDequantizeAWQDescriptor(infiniopDequantizeAWQDescriptor_t desc) {
 #ifdef ENABLE_QY_API
         DELETE(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+        DELETE(INFINI_DEVICE_ALI, nvidia);
+#endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
     }
...
src/infiniop/ops/embedding/cpu/embedding_cpu.cc (new file, mode 100644)
#include "embedding_cpu.h"
#include "../../../../utils.h"
#include "../../../handle.h"
#include "../../../tensor.h"
#include <cstring>
namespace op::embedding::cpu {

struct Descriptor::Opaque {};

Descriptor::~Descriptor() {
    delete _opaque;
}

infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t output_desc,
    infiniopTensorDescriptor_t input_desc,
    infiniopTensorDescriptor_t weight_desc) {
    auto input_shape = input_desc->shape();
    auto weight_shape = weight_desc->shape();

    CHECK_OR_RETURN(weight_shape.size() == 2, INFINI_STATUS_BAD_TENSOR_SHAPE);
    CHECK_OR_RETURN(output_desc->shape().size() == input_shape.size() + 1, INFINI_STATUS_BAD_TENSOR_SHAPE);

    auto output_shape = output_desc->shape();
    size_t embedding_dim = weight_shape[1];
    CHECK_OR_RETURN(output_shape.back() == embedding_dim, INFINI_STATUS_BAD_TENSOR_SHAPE);
    for (size_t i = 0; i < input_shape.size(); ++i) {
        CHECK_OR_RETURN(output_shape[i] == input_shape[i], INFINI_STATUS_BAD_TENSOR_SHAPE);
    }

    auto input_dtype = input_desc->dtype();
    auto weight_dtype = weight_desc->dtype();
    CHECK_OR_RETURN(input_dtype == INFINI_DTYPE_I32 || input_dtype == INFINI_DTYPE_I64,
                    INFINI_STATUS_BAD_TENSOR_DTYPE);
    CHECK_OR_RETURN(weight_dtype == INFINI_DTYPE_F32 || weight_dtype == INFINI_DTYPE_F16 || weight_dtype == INFINI_DTYPE_BF16,
                    INFINI_STATUS_BAD_TENSOR_DTYPE);
    CHECK_OR_RETURN(output_desc->dtype() == weight_dtype, INFINI_STATUS_BAD_TENSOR_DTYPE);

    size_t num_indices = 1;
    for (auto dim : input_shape) {
        num_indices *= dim;
    }
    size_t vocab_size = weight_shape[0];

    *desc_ptr = new Descriptor(
        num_indices,
        embedding_dim,
        vocab_size,
        input_dtype,
        weight_dtype,
        new Opaque{},
        handle->device,
        handle->device_id);

    return INFINI_STATUS_SUCCESS;
}

infiniStatus_t Descriptor::calculate(
    void *output,
    const void *input,
    const void *weight,
    void *stream) const {
    if (_num_indices == 0) {
        return INFINI_STATUS_SUCCESS;
    }

    size_t element_size = infiniSizeOf(_weight_dtype);
    size_t row_bytes = _embedding_dim * element_size;

    if (_input_dtype == INFINI_DTYPE_I32) {
        const int32_t *indices_ptr = reinterpret_cast<const int32_t *>(input);
        const std::byte *weight_ptr = reinterpret_cast<const std::byte *>(weight);
        std::byte *out_ptr = reinterpret_cast<std::byte *>(output);
        for (size_t i = 0; i < _num_indices; ++i) {
            int32_t idx = indices_ptr[i];
            if (idx >= 0 && static_cast<size_t>(idx) < _vocab_size) {
                std::memcpy(out_ptr + i * row_bytes,
                            weight_ptr + static_cast<size_t>(idx) * row_bytes,
                            row_bytes);
            }
        }
    } else if (_input_dtype == INFINI_DTYPE_I64) {
        const int64_t *indices_ptr = reinterpret_cast<const int64_t *>(input);
        const std::byte *weight_ptr = reinterpret_cast<const std::byte *>(weight);
        std::byte *out_ptr = reinterpret_cast<std::byte *>(output);
        for (size_t i = 0; i < _num_indices; ++i) {
            int64_t idx = indices_ptr[i];
            if (idx >= 0 && static_cast<size_t>(idx) < _vocab_size) {
                std::memcpy(out_ptr + i * row_bytes,
                            weight_ptr + static_cast<size_t>(idx) * row_bytes,
                            row_bytes);
            }
        }
    } else {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }

    return INFINI_STATUS_SUCCESS;
}

} // namespace op::embedding::cpu
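In effect this is a row-gather: for each flattened index i, output row i receives weight row indices[i], copied as raw bytes. Negative or out-of-range indices are silently skipped, leaving the corresponding output row untouched, so callers that need defined contents there must pre-fill the output. A standalone host-side analogue of the I32 branch, specialized to float weights for illustration (a sketch, not part of the commit):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Gather: out[i, :] = weight[indices[i], :] for in-range indices.
    void embedding_gather(float *out, const int32_t *indices, const float *weight,
                          size_t num_indices, size_t vocab_size, size_t embedding_dim) {
        const size_t row_bytes = embedding_dim * sizeof(float);
        for (size_t i = 0; i < num_indices; ++i) {
            int32_t idx = indices[i];
            if (idx >= 0 && static_cast<size_t>(idx) < vocab_size) {
                std::memcpy(out + i * embedding_dim,
                            weight + static_cast<size_t>(idx) * embedding_dim,
                            row_bytes);
            }
            // else: row i of `out` is left as-is, matching the operator's behavior
        }
    }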
src/infiniop/ops/embedding/cpu/embedding_cpu.h (new file, mode 100644)
#ifndef __EMBEDDING_CPU_H__
#define __EMBEDDING_CPU_H__
#include "../embedding.h"
DESCRIPTOR(cpu)
#endif // __EMBEDDING_CPU_H__
src/infiniop/ops/embedding/cuda/embedding_kernel.cuh (new file, mode 100644)
#ifndef __EMBEDDING_CUDA_KERNEL_CUH__
#define __EMBEDDING_CUDA_KERNEL_CUH__
#include <type_traits>
// Helper function to check memory alignment
__forceinline__ __device__ bool is_aligned(const void *ptr, size_t alignment) {
    // Use size_t for pointer arithmetic in device code (more compatible)
    return (reinterpret_cast<size_t>(ptr) % alignment == 0);
}

// Vectorized copy for float type using float4
template <typename IndexType>
__forceinline__ __device__ void copyVectorizedFloat4(
    float *__restrict__ dst,
    const float *__restrict__ src,
    size_t embedding_dim) {
    // Use float4 for vectorized access (16 bytes, 4 floats)
    const float4 *src_vec = reinterpret_cast<const float4 *>(src);
    float4 *dst_vec = reinterpret_cast<float4 *>(dst);
    size_t vec_count = embedding_dim / 4;
    // Vectorized copy using __ldg for read-only weight
    for (size_t i = 0; i < vec_count; ++i) {
        dst_vec[i] = __ldg(&src_vec[i]);
    }
    // Copy remaining elements
    size_t remaining = embedding_dim % 4;
    if (remaining > 0) {
        size_t offset = vec_count * 4;
        for (size_t i = 0; i < remaining; ++i) {
            dst[offset + i] = __ldg(&src[offset + i]);
        }
    }
}

// Vectorized copy for float type using float2 (fallback when not aligned to 16 bytes)
template <typename IndexType>
__forceinline__ __device__ void copyVectorizedFloat2(
    float *__restrict__ dst,
    const float *__restrict__ src,
    size_t embedding_dim) {
    // Use float2 for vectorized access (8 bytes, 2 floats)
    const float2 *src_vec = reinterpret_cast<const float2 *>(src);
    float2 *dst_vec = reinterpret_cast<float2 *>(dst);
    size_t vec_count = embedding_dim / 2;
    // Vectorized copy using __ldg for read-only weight
    for (size_t i = 0; i < vec_count; ++i) {
        dst_vec[i] = __ldg(&src_vec[i]);
    }
    // Copy remaining element if odd
    if (embedding_dim % 2 != 0) {
        dst[embedding_dim - 1] = __ldg(&src[embedding_dim - 1]);
    }
}

// Vectorized copy for half type using half2
template <typename IndexType>
__forceinline__ __device__ void copyVectorizedHalf2(
    half *__restrict__ dst,
    const half *__restrict__ src,
    size_t embedding_dim) {
    // Use half2 for vectorized access (4 bytes, 2 halfs)
    const half2 *src_vec = reinterpret_cast<const half2 *>(src);
    half2 *dst_vec = reinterpret_cast<half2 *>(dst);
    size_t vec_count = embedding_dim / 2;
    // Vectorized copy using __ldg for read-only weight
    for (size_t i = 0; i < vec_count; ++i) {
        dst_vec[i] = __ldg(&src_vec[i]);
    }
    // Copy remaining element if odd
    if (embedding_dim % 2 != 0) {
        dst[embedding_dim - 1] = __ldg(&src[embedding_dim - 1]);
    }
}

// Vectorized copy for bfloat16 type using bfloat162
template <typename IndexType>
__forceinline__ __device__ void copyVectorizedBFloat162(
    cuda_bfloat16 *__restrict__ dst,
    const cuda_bfloat16 *__restrict__ src,
    size_t embedding_dim) {
    // Use bfloat162 for vectorized access (4 bytes, 2 bfloat16s)
    const cuda_bfloat162 *src_vec = reinterpret_cast<const cuda_bfloat162 *>(src);
    cuda_bfloat162 *dst_vec = reinterpret_cast<cuda_bfloat162 *>(dst);
    size_t vec_count = embedding_dim / 2;
    // Vectorized copy using __ldg for read-only weight
    for (size_t i = 0; i < vec_count; ++i) {
        dst_vec[i] = __ldg(&src_vec[i]);
    }
    // Copy remaining element if odd
    if (embedding_dim % 2 != 0) {
        dst[embedding_dim - 1] = __ldg(&src[embedding_dim - 1]);
    }
}

// Scalar copy fallback with __ldg optimization
template <typename T, typename IndexType>
__forceinline__ __device__ void copyScalar(
    T *__restrict__ dst,
    const T *__restrict__ src,
    size_t embedding_dim) {
    // Scalar copy with __ldg for read-only weight
    for (size_t i = 0; i < embedding_dim; ++i) {
        dst[i] = __ldg(&src[i]);
    }
}
#endif // __EMBEDDING_CUDA_KERNEL_CUH__
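The vectorized/scalar split is plain arithmetic: for embedding_dim = 10 floats, copyVectorizedFloat4 would issue vec_count = 10 / 4 = 2 float4 loads (8 floats) followed by remaining = 10 % 4 = 2 scalar tail loads. The float4 path additionally requires both pointers to be 16-byte aligned, which the metax and moore kernels below verify with is_aligned before dispatching; the half2/bfloat162 paths are only taken for an even embedding_dim.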
src/infiniop/ops/embedding/embedding.h (new file, mode 100644)
#ifndef __EMBEDDING_H__
#define __EMBEDDING_H__
#include "../../../utils.h"
#include "../../operator.h"
#define DESCRIPTOR(NAMESPACE) \
\
namespace op::embedding::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
size_t _num_indices; \
size_t _embedding_dim; \
size_t _vocab_size; \
infiniDtype_t _input_dtype; \
infiniDtype_t _weight_dtype; \
\
Descriptor( \
size_t num_indices, \
size_t embedding_dim, \
size_t vocab_size, \
infiniDtype_t input_dtype, \
infiniDtype_t weight_dtype, \
Opaque *opaque, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_num_indices(num_indices), \
_embedding_dim(embedding_dim), \
_vocab_size(vocab_size), \
_input_dtype(input_dtype), \
_weight_dtype(weight_dtype) {} \
\
public: \
~Descriptor(); \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t output_desc, \
infiniopTensorDescriptor_t input_desc, \
infiniopTensorDescriptor_t weight_desc); \
\
infiniStatus_t calculate( \
void *output, \
const void *input, \
const void *weight, \
void *stream) const; \
}; \
}
#endif // __EMBEDDING_H__
src/infiniop/ops/embedding/metax/embedding_metax.cuh (new file, mode 100644)
#ifndef __EMBEDDING_METAX_H__
#define __EMBEDDING_METAX_H__
#include "../embedding.h"
DESCRIPTOR(metax)
#endif // __EMBEDDING_METAX_H__
src/infiniop/ops/embedding/metax/embedding_metax.maca (new file, mode 100644)
#include "../../../../utils.h"
#include "../../../devices/metax/metax_common.h"
#include "../../../devices/metax/metax_kernel_common.h"
#include "../../../tensor.h"
#include "../cuda/embedding_kernel.cuh"
#include "embedding_metax.cuh"
template <typename T, typename IndexType>
INFINIOP_METAX_KERNEL embeddingKernel(
T *__restrict__ output,
const IndexType *__restrict__ indices,
const T *__restrict__ weight,
size_t num_indices,
size_t embedding_dim,
size_t vocab_size) {
// Calculate global thread index
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < num_indices) {
// Get the index value
IndexType index_val = __ldg(&indices[idx]);
// Bounds check - handle negative indices gracefully
if (index_val >= 0 && static_cast<size_t>(index_val) < vocab_size) {
// Copy embedding vector from weight to output
const T *src = weight + static_cast<size_t>(index_val) * embedding_dim;
T *dst = output + idx * embedding_dim;
// Choose optimal copy strategy based on type and alignment
if constexpr (std::is_same_v<T, float>) {
// Check alignment for float4 (16 bytes)
bool aligned_16 = is_aligned(src, 16) && is_aligned(dst, 16);
if (aligned_16 && embedding_dim >= 4 && embedding_dim % 4 == 0) {
copyVectorizedFloat4<IndexType>(dst, src, embedding_dim);
} else if (embedding_dim >= 2 && embedding_dim % 2 == 0) {
// Try float2 if not aligned to 16 bytes
copyVectorizedFloat2<IndexType>(dst, src, embedding_dim);
} else {
copyScalar<T, IndexType>(dst, src, embedding_dim);
}
} else if constexpr (std::is_same_v<T, half>) {
// Use half2 for vectorized access
if (embedding_dim >= 2 && embedding_dim % 2 == 0) {
copyVectorizedHalf2<IndexType>(dst, src, embedding_dim);
} else {
copyScalar<T, IndexType>(dst, src, embedding_dim);
}
} else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
// Use bfloat162 for vectorized access
if (embedding_dim >= 2 && embedding_dim % 2 == 0) {
copyVectorizedBFloat162<IndexType>(dst, src, embedding_dim);
} else {
copyScalar<T, IndexType>(dst, src, embedding_dim);
}
} else {
// Fallback to scalar copy with __ldg
copyScalar<T, IndexType>(dst, src, embedding_dim);
}
}
}
}
namespace op::embedding::metax {
struct Descriptor::Opaque {
std::shared_ptr<device::metax::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t output_desc,
infiniopTensorDescriptor_t input_desc,
infiniopTensorDescriptor_t weight_desc) {
auto input_shape = input_desc->shape();
auto weight_shape = weight_desc->shape();
// Validate shapes
CHECK_OR_RETURN(weight_shape.size() == 2, INFINI_STATUS_BAD_TENSOR_SHAPE);
CHECK_OR_RETURN(output_desc->shape().size() == input_shape.size() + 1, INFINI_STATUS_BAD_TENSOR_SHAPE);
// Check output shape matches input shape + embedding_dim
auto output_shape = output_desc->shape();
size_t embedding_dim = weight_shape[1];
CHECK_OR_RETURN(output_shape.back() == embedding_dim, INFINI_STATUS_BAD_TENSOR_SHAPE);
for (size_t i = 0; i < input_shape.size(); ++i) {
CHECK_OR_RETURN(output_shape[i] == input_shape[i], INFINI_STATUS_BAD_TENSOR_SHAPE);
}
// Validate dtypes
auto input_dtype = input_desc->dtype();
auto weight_dtype = weight_desc->dtype();
CHECK_OR_RETURN(input_dtype == INFINI_DTYPE_I32 || input_dtype == INFINI_DTYPE_I64,
INFINI_STATUS_BAD_TENSOR_DTYPE);
CHECK_OR_RETURN(weight_dtype == INFINI_DTYPE_F32 || weight_dtype == INFINI_DTYPE_F16 ||
weight_dtype == INFINI_DTYPE_BF16, INFINI_STATUS_BAD_TENSOR_DTYPE);
CHECK_OR_RETURN(output_desc->dtype() == weight_dtype, INFINI_STATUS_BAD_TENSOR_DTYPE);
// Calculate number of indices (supporting batch dimension)
size_t num_indices = 1;
for (auto dim : input_shape) {
num_indices *= dim;
}
size_t vocab_size = weight_shape[0];
*desc_ptr = new Descriptor(
num_indices,
embedding_dim,
vocab_size,
input_dtype,
weight_dtype,
new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()},
handle->device,
handle->device_id);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *output,
const void *input,
const void *weight,
void *stream) const {
if (_num_indices == 0) {
return INFINI_STATUS_SUCCESS;
}
auto hc_stream = reinterpret_cast<hcStream_t>(stream);
// Dynamic block size optimization based on embedding_dim for Metax platform
size_t block_size = 256; // Default block size for Metax
if (_embedding_dim <= 64) {
block_size = 512; // Small embedding_dim: use larger block for better occupancy
} else if (_embedding_dim >= 1024) {
block_size = 128; // Large embedding_dim: use smaller block to reduce register pressure
}
size_t grid_size = (_num_indices + block_size - 1) / block_size;
// Launch kernel based on dtypes for Metax platform
if (_input_dtype == INFINI_DTYPE_I32) {
const int32_t *indices_ptr = reinterpret_cast<const int32_t *>(input);
if (_weight_dtype == INFINI_DTYPE_F32) {
embeddingKernel<float, int32_t><<<grid_size, block_size, 0, hc_stream>>>(
reinterpret_cast<float *>(output),
indices_ptr,
reinterpret_cast<const float *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else if (_weight_dtype == INFINI_DTYPE_F16) {
embeddingKernel<half, int32_t><<<grid_size, block_size, 0, hc_stream>>>(
reinterpret_cast<half *>(output),
indices_ptr,
reinterpret_cast<const half *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else if (_weight_dtype == INFINI_DTYPE_BF16) {
// Use Metax's bfloat16 type
embeddingKernel<__hpcc_bfloat16, int32_t><<<grid_size, block_size, 0, hc_stream>>>(
reinterpret_cast<__hpcc_bfloat16 *>(output),
indices_ptr,
reinterpret_cast<const __hpcc_bfloat16 *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} else if (_input_dtype == INFINI_DTYPE_I64) {
const int64_t *indices_ptr = reinterpret_cast<const int64_t *>(input);
if (_weight_dtype == INFINI_DTYPE_F32) {
embeddingKernel<float, int64_t><<<grid_size, block_size, 0, hc_stream>>>(
reinterpret_cast<float *>(output),
indices_ptr,
reinterpret_cast<const float *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else if (_weight_dtype == INFINI_DTYPE_F16) {
embeddingKernel<half, int64_t><<<grid_size, block_size, 0, hc_stream>>>(
reinterpret_cast<half *>(output),
indices_ptr,
reinterpret_cast<const half *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else if (_weight_dtype == INFINI_DTYPE_BF16) {
embeddingKernel<__hpcc_bfloat16, int64_t><<<grid_size, block_size, 0, hc_stream>>>(
reinterpret_cast<__hpcc_bfloat16 *>(output),
indices_ptr,
reinterpret_cast<const __hpcc_bfloat16 *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::embedding::metax
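Launch geometry in calculate is one thread per index: with, say, _num_indices = 1000 and _embedding_dim = 512, the default block_size = 256 applies and grid_size = (1000 + 256 - 1) / 256 = 4 blocks are launched; each in-range thread then copies one 512-element row, taking the float4 fast path when the row is 16-byte aligned.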
src/infiniop/ops/embedding/moore/embedding_moore.h (new file, mode 100644)
#ifndef __EMBEDDING_MOORE_H__
#define __EMBEDDING_MOORE_H__
#include "../embedding.h"
DESCRIPTOR(moore)
#endif // __EMBEDDING_MOORE_H__
src/infiniop/ops/embedding/moore/embedding_moore.mu (new file, mode 100644)
#include "../../../../utils.h"
#include "../../../devices/moore/moore_common.h"
#include "../../../devices/moore/moore_kernel_common.h"
#include "../../../tensor.h"
#include "embedding_moore_kernel.h"
#include "embedding_moore.h"
#include <musa_runtime.h>
template <typename T, typename IndexType>
INFINIOP_MOORE_KERNEL embeddingKernel(
T *__restrict__ output,
const IndexType *__restrict__ indices,
const T *__restrict__ weight,
size_t num_indices,
size_t embedding_dim,
size_t vocab_size) {
// Calculate global thread index
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < num_indices) {
// Get the index value with Moore-optimized memory access
IndexType index_val = indices[idx];
// Bounds check - handle negative indices gracefully
if (index_val >= 0 && static_cast<size_t>(index_val) < vocab_size) {
// Copy embedding vector from weight to output
const T *src = weight + static_cast<size_t>(index_val) * embedding_dim;
T *dst = output + idx * embedding_dim;
// Choose optimal copy strategy based on type and alignment
if constexpr (std::is_same_v<T, float>) {
// Check alignment for float4 (16 bytes)
bool aligned_16 = is_aligned(src, 16) && is_aligned(dst, 16);
if (aligned_16 && embedding_dim >= 4 && embedding_dim % 4 == 0) {
copyVectorizedFloat4<IndexType>(dst, src, embedding_dim);
} else if (embedding_dim >= 2 && embedding_dim % 2 == 0) {
// Try float2 if not aligned to 16 bytes
copyVectorizedFloat2<IndexType>(dst, src, embedding_dim);
} else {
copyScalar<T, IndexType>(dst, src, embedding_dim);
}
} else if constexpr (std::is_same_v<T, half>) {
// Use half2 for vectorized access
if (embedding_dim >= 2 && embedding_dim % 2 == 0) {
copyVectorizedHalf2<IndexType>(dst, src, embedding_dim);
} else {
copyScalar<T, IndexType>(dst, src, embedding_dim);
}
} else if constexpr (std::is_same_v<T, __mt_bfloat16>) {
// Use mt_bfloat162 for vectorized access (Moore-specific type)
if (embedding_dim >= 2 && embedding_dim % 2 == 0) {
copyVectorizedBFloat162<IndexType>(dst, src, embedding_dim);
} else {
copyScalar<T, IndexType>(dst, src, embedding_dim);
}
} else {
// Fallback to scalar copy with Moore-optimized memory access
copyScalar<T, IndexType>(dst, src, embedding_dim);
}
}
}
}
namespace op::embedding::moore {
struct Descriptor::Opaque {
std::shared_ptr<device::moore::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t output_desc,
infiniopTensorDescriptor_t input_desc,
infiniopTensorDescriptor_t weight_desc) {
auto input_shape = input_desc->shape();
auto weight_shape = weight_desc->shape();
// Validate shapes
CHECK_OR_RETURN(weight_shape.size() == 2, INFINI_STATUS_BAD_TENSOR_SHAPE);
CHECK_OR_RETURN(output_desc->shape().size() == input_shape.size() + 1, INFINI_STATUS_BAD_TENSOR_SHAPE);
// Check output shape matches input shape + embedding_dim
auto output_shape = output_desc->shape();
size_t embedding_dim = weight_shape[1];
CHECK_OR_RETURN(output_shape.back() == embedding_dim, INFINI_STATUS_BAD_TENSOR_SHAPE);
for (size_t i = 0; i < input_shape.size(); ++i) {
CHECK_OR_RETURN(output_shape[i] == input_shape[i], INFINI_STATUS_BAD_TENSOR_SHAPE);
}
// Validate dtypes
auto input_dtype = input_desc->dtype();
auto weight_dtype = weight_desc->dtype();
CHECK_OR_RETURN(input_dtype == INFINI_DTYPE_I32 || input_dtype == INFINI_DTYPE_I64,
INFINI_STATUS_BAD_TENSOR_DTYPE);
CHECK_OR_RETURN(weight_dtype == INFINI_DTYPE_F32 || weight_dtype == INFINI_DTYPE_F16 || weight_dtype == INFINI_DTYPE_BF16, INFINI_STATUS_BAD_TENSOR_DTYPE);
CHECK_OR_RETURN(output_desc->dtype() == weight_dtype, INFINI_STATUS_BAD_TENSOR_DTYPE);
// Calculate number of indices (supporting batch dimension)
size_t num_indices = 1;
for (auto dim : input_shape) {
num_indices *= dim;
}
size_t vocab_size = weight_shape[0];
*desc_ptr = new Descriptor(
num_indices,
embedding_dim,
vocab_size,
input_dtype,
weight_dtype,
new Opaque{reinterpret_cast<device::moore::Handle *>(handle)->internal()},
handle->device,
handle->device_id);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *output,
const void *input,
const void *weight,
void *stream) const {
if (_num_indices == 0) {
return INFINI_STATUS_SUCCESS;
}
auto musa_stream = reinterpret_cast<musaStream_t>(stream);
// Dynamic block size optimization based on embedding_dim
// Moore platform typically has different performance characteristics
size_t block_size = 256; // Default for Moore
if (_embedding_dim <= 64) {
block_size = 512; // Small embedding_dim: use larger block for better occupancy
} else if (_embedding_dim >= 1024) {
block_size = 128; // Large embedding_dim: use smaller block to reduce register pressure
} else if (_embedding_dim <= 256) {
block_size = 384; // Medium embedding_dim: balanced configuration
}
size_t grid_size = (_num_indices + block_size - 1) / block_size;
// Launch kernel based on dtypes
// Note: Moore uses __mt_bfloat16 instead of __nv_bfloat16
if (_input_dtype == INFINI_DTYPE_I32) {
const int32_t *indices_ptr = reinterpret_cast<const int32_t *>(input);
if (_weight_dtype == INFINI_DTYPE_F32) {
embeddingKernel<float, int32_t><<<grid_size, block_size, 0, musa_stream>>>(
reinterpret_cast<float *>(output),
indices_ptr,
reinterpret_cast<const float *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else if (_weight_dtype == INFINI_DTYPE_F16) {
embeddingKernel<half, int32_t><<<grid_size, block_size, 0, musa_stream>>>(
reinterpret_cast<half *>(output),
indices_ptr,
reinterpret_cast<const half *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else if (_weight_dtype == INFINI_DTYPE_BF16) {
// Use Moore's bfloat16 type
embeddingKernel<__mt_bfloat16, int32_t><<<grid_size, block_size, 0, musa_stream>>>(
reinterpret_cast<__mt_bfloat16 *>(output),
indices_ptr,
reinterpret_cast<const __mt_bfloat16 *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} else if (_input_dtype == INFINI_DTYPE_I64) {
const int64_t *indices_ptr = reinterpret_cast<const int64_t *>(input);
if (_weight_dtype == INFINI_DTYPE_F32) {
embeddingKernel<float, int64_t><<<grid_size, block_size, 0, musa_stream>>>(
reinterpret_cast<float *>(output),
indices_ptr,
reinterpret_cast<const float *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else if (_weight_dtype == INFINI_DTYPE_F16) {
embeddingKernel<half, int64_t><<<grid_size, block_size, 0, musa_stream>>>(
reinterpret_cast<half *>(output),
indices_ptr,
reinterpret_cast<const half *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else if (_weight_dtype == INFINI_DTYPE_BF16) {
embeddingKernel<__mt_bfloat16, int64_t><<<grid_size, block_size, 0, musa_stream>>>(
reinterpret_cast<__mt_bfloat16 *>(output),
indices_ptr,
reinterpret_cast<const __mt_bfloat16 *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
// Check for kernel launch errors
musaError_t err = musaGetLastError();
if (err != musaSuccess) {
return INFINI_STATUS_INTERNAL_ERROR;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::embedding::moore
src/infiniop/ops/embedding/moore/embedding_moore_kernel.h (new file, mode 100644)
#ifndef __EMBEDDING_MOORE_KERNEL_CUH__
#define __EMBEDDING_MOORE_KERNEL_CUH__
#include <type_traits>
// Helper function to check memory alignment
__forceinline__ __device__ bool is_aligned(const void *ptr, size_t alignment) {
    // Use size_t for pointer arithmetic in device code (more compatible)
    return (reinterpret_cast<size_t>(ptr) % alignment == 0);
}

// Vectorized copy for float type using float4
template <typename IndexType>
__forceinline__ __device__ void copyVectorizedFloat4(
    float *__restrict__ dst,
    const float *__restrict__ src,
    size_t embedding_dim) {
    // Use float4 for vectorized access (16 bytes, 4 floats)
    const float4 *src_vec = reinterpret_cast<const float4 *>(src);
    float4 *dst_vec = reinterpret_cast<float4 *>(dst);
    size_t vec_count = embedding_dim / 4;
    // Vectorized copy with __ldg equivalent for Moore platform
    for (size_t i = 0; i < vec_count; ++i) {
        dst_vec[i] = src_vec[i];
    }
    // Copy remaining elements
    size_t remaining = embedding_dim % 4;
    if (remaining > 0) {
        size_t offset = vec_count * 4;
        for (size_t i = 0; i < remaining; ++i) {
            dst[offset + i] = src[offset + i];
        }
    }
}

// Vectorized copy for float type using float2 (fallback when not aligned to 16 bytes)
template <typename IndexType>
__forceinline__ __device__ void copyVectorizedFloat2(
    float *__restrict__ dst,
    const float *__restrict__ src,
    size_t embedding_dim) {
    // Use float2 for vectorized access (8 bytes, 2 floats)
    const float2 *src_vec = reinterpret_cast<const float2 *>(src);
    float2 *dst_vec = reinterpret_cast<float2 *>(dst);
    size_t vec_count = embedding_dim / 2;
    // Vectorized copy with Moore-optimized memory access
    for (size_t i = 0; i < vec_count; ++i) {
        dst_vec[i] = src_vec[i];
    }
    // Copy remaining element if odd
    if (embedding_dim % 2 != 0) {
        dst[embedding_dim - 1] = src[embedding_dim - 1];
    }
}

// Vectorized copy for half type using half2
template <typename IndexType>
__forceinline__ __device__ void copyVectorizedHalf2(
    half *__restrict__ dst,
    const half *__restrict__ src,
    size_t embedding_dim) {
    // Use half2 for vectorized access (4 bytes, 2 halfs)
    const half2 *src_vec = reinterpret_cast<const half2 *>(src);
    half2 *dst_vec = reinterpret_cast<half2 *>(dst);
    size_t vec_count = embedding_dim / 2;
    // Vectorized copy optimized for Moore architecture
    for (size_t i = 0; i < vec_count; ++i) {
        dst_vec[i] = src_vec[i];
    }
    // Copy remaining element if odd
    if (embedding_dim % 2 != 0) {
        dst[embedding_dim - 1] = src[embedding_dim - 1];
    }
}

// Vectorized copy for Moore bfloat16 type using bfloat162
template <typename IndexType>
__forceinline__ __device__ void copyVectorizedBFloat162(
    __mt_bfloat16 *__restrict__ dst,
    const __mt_bfloat16 *__restrict__ src,
    size_t embedding_dim) {
    // Use mt_bfloat162 for vectorized access (4 bytes, 2 bfloat16s)
    const __mt_bfloat162 *src_vec = reinterpret_cast<const __mt_bfloat162 *>(src);
    __mt_bfloat162 *dst_vec = reinterpret_cast<__mt_bfloat162 *>(dst);
    size_t vec_count = embedding_dim / 2;
    // Vectorized copy with Moore-specific optimization
    for (size_t i = 0; i < vec_count; ++i) {
        dst_vec[i] = src_vec[i];
    }
    // Copy remaining element if odd
    if (embedding_dim % 2 != 0) {
        dst[embedding_dim - 1] = src[embedding_dim - 1];
    }
}

// Scalar copy fallback with Moore-optimized memory access
template <typename T, typename IndexType>
__forceinline__ __device__ void copyScalar(
    T *__restrict__ dst,
    const T *__restrict__ src,
    size_t embedding_dim) {
    // Scalar copy with Moore read-only weight optimization
    for (size_t i = 0; i < embedding_dim; ++i) {
        dst[i] = src[i];
    }
}
#endif // __EMBEDDING_MOORE_KERNEL_CUH__