jerrrrry / infinicore · Commits · 9ad23fad

Commit 9ad23fad (unverified), authored Sep 02, 2025 by blkmjsian, committed by GitHub on Sep 02, 2025.

[T2-2-3] blkmjsian
- dequantize awq
- rope v2

Parent: b3170335
Showing 20 changed files with 1470 additions and 1 deletion (+1470, -1).
src/infiniop/ops/rope_v2/cpu/rope_v2_cpu.h                 +8    -0
src/infiniop/ops/rope_v2/cuda/kernel.cuh                   +59   -0
src/infiniop/ops/rope_v2/metax/rope_metax.h                +8    -0
src/infiniop/ops/rope_v2/metax/rope_metax.maca             +144  -0
src/infiniop/ops/rope_v2/nvidia/rope_v2_nvidia.cu          +144  -0
src/infiniop/ops/rope_v2/nvidia/rope_v2_nvidia.cuh         +8    -0
src/infiniop/ops/rope_v2/operator.cc                       +197  -0
src/infiniop/ops/rope_v2/rope_v2.h                         +125  -0
src/infiniop/ops/topkrouter/cpu/topkrouter_cpu.cc          +28   -0
src/infiniop/ops/topkrouter/cpu/topkrouter_cpu.h           +7    -0
src/infiniop/ops/topkrouter/cuda/kernel.cuh                +168  -0
src/infiniop/ops/topkrouter/info.h                         +44   -0
src/infiniop/ops/topkrouter/nvidia/topkrouter_nvidia.cu    +88   -0
src/infiniop/ops/topkrouter/nvidia/topkrouter_nvidia.cuh   +8    -0
src/infiniop/ops/topkrouter/operator.cc                    +101  -0
src/infiniop/ops/topkrouter/topkrouter.h                   +50   -0
test/infiniop/dequantize.py                                +173  -0
test/infiniop/libinfiniop/op_register.py                   +107  -0
test/infiniop/libinfiniop/utils.py                         +2    -0
test/infiniop/rms_norm.py                                  +1    -1
src/infiniop/ops/rope_v2/cpu/rope_v2_cpu.h (new file, mode 100644)

#ifndef __INFINIOP_ROPE_V2_CPU_H__
#define __INFINIOP_ROPE_V2_CPU_H__

#include "../rope_v2.h"

DESCRIPTOR(cpu)

#endif // __INFINIOP_ROPE_V2_CPU_H__
src/infiniop/ops/rope_v2/cuda/kernel.cuh (new file, mode 100644)

#ifndef __INFINIOP_ROPE_V2_CUDA_KERNEL_CUH__
#define __INFINIOP_ROPE_V2_CUDA_KERNEL_CUH__

template <typename Tdata, typename Tindex, typename Tangle>
__device__ void ropeThreadPerItemBlock(
    Tdata *y_,
    const Tdata *x_,
    const Tindex *__restrict__ pos_ids,
    const Tangle *__restrict__ sin_table,
    const Tangle *__restrict__ cos_table,
    size_t table_dim,
    ptrdiff_t y_stride_seqlen,
    ptrdiff_t y_stride_nhead,
    ptrdiff_t x_stride_seqlen,
    ptrdiff_t x_stride_nhead) {
    auto y_offset = blockIdx.x * y_stride_seqlen + blockIdx.y * y_stride_nhead;
    auto x_offset = blockIdx.x * x_stride_seqlen + blockIdx.y * x_stride_nhead;
    size_t pos_id = size_t(pos_ids[blockIdx.x]);
    auto table_offset = pos_id * table_dim;
    const size_t half_dim = table_dim; // Head dimension = 2 * table_dim

    for (size_t i = threadIdx.x; i < table_dim; i += blockDim.x) {
        Tangle sin__ = sin_table[table_offset + i];
        Tangle cos__ = cos_table[table_offset + i];

        // Calculate positions in first and second halves
        size_t pos0 = i;
        size_t pos1 = i + half_dim;

        if constexpr (std::is_same<Tdata, half>::value) {
            Tangle x0 = __half2float(x_[x_offset + pos0]);
            Tangle x1 = __half2float(x_[x_offset + pos1]);
            Tangle y0 = x0 * cos__ - x1 * sin__;
            Tangle y1 = x0 * sin__ + x1 * cos__;
            y_[y_offset + pos0] = __float2half(y0);
            y_[y_offset + pos1] = __float2half(y1);
        } else if constexpr (std::is_same<Tdata, cuda_bfloat16>::value) {
            Tangle x0 = __bfloat162float(x_[x_offset + pos0]);
            Tangle x1 = __bfloat162float(x_[x_offset + pos1]);
            Tangle y0 = x0 * cos__ - x1 * sin__;
            Tangle y1 = x0 * sin__ + x1 * cos__;
            y_[y_offset + pos0] = __float2bfloat16(y0);
            y_[y_offset + pos1] = __float2bfloat16(y1);
        } else {
            Tangle x0 = x_[x_offset + pos0];
            Tangle x1 = x_[x_offset + pos1];
            y_[y_offset + pos0] = x0 * cos__ - x1 * sin__;
            y_[y_offset + pos1] = x0 * sin__ + x1 * cos__;
        }
    }
}

#endif
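For reference, with head dimension d = 2 * table_dim and the angles read from the tables at row pos_ids[p], the rotation this kernel applies to each pair (i, i + d/2) within a head is the half-split (non-interleaved) RoPE layout:

\[
\begin{aligned}
y_{p,h,i}       &= x_{p,h,i}\cos\theta_{p,i} - x_{p,h,i+d/2}\sin\theta_{p,i} \\
y_{p,h,i+d/2}   &= x_{p,h,i}\sin\theta_{p,i} + x_{p,h,i+d/2}\cos\theta_{p,i}
\end{aligned}
\]

One block handles one (position, head) pair, with one thread per rotation pair, striding when table_dim exceeds the block size.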
src/infiniop/ops/rope_v2/metax/rope_metax.h (new file, mode 100644)

#ifndef __INFINIOP_ROPE_METAX_H__
#define __INFINIOP_ROPE_METAX_H__

#include "../rope.h"

DESCRIPTOR(metax)

#endif // __INFINIOP_ROPE_METAX_H__
src/infiniop/ops/rope_v2/metax/rope_metax.maca (new file, mode 100644)
#include "../../../devices/metax/metax_common.h"
#include "rope_metax.h"
#include "../../../devices/metax/metax_kernel_common.h"
#include "../cuda/kernel.cuh"
template <typename Tdata, typename Tindex, typename Tangle>
INFINIOP_METAX_KERNEL ropeThreadPerItemKernel(
Tdata *y_,
const Tdata *x_,
const Tindex *__restrict__ pos_ids,
const Tangle *__restrict__ sin_table,
const Tangle *__restrict__ cos_table,
size_t table_dim,
ptrdiff_t y_stride_seqlen,
ptrdiff_t y_stride_nhead,
ptrdiff_t x_stride_seqlen,
ptrdiff_t x_stride_nhead) {
ropeThreadPerItemBlock(
y_, x_, pos_ids,
sin_table, cos_table,
table_dim,
y_stride_seqlen, y_stride_nhead,
x_stride_seqlen, x_stride_nhead);
}
namespace op::rope::metax {
struct Descriptor::Opaque {
std::shared_ptr<device::metax::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t pos_desc,
infiniopTensorDescriptor_t sin_desc,
infiniopTensorDescriptor_t cos_desc) {
auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
auto info = RoPEInfo::createRoPEInfo(y_desc, x_desc, pos_desc, sin_desc, cos_desc);
CHECK_RESULT(info);
// Create descriptor
*desc_ptr = new Descriptor(
info.take(),
0,
new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()},
handle->device,
handle->device_id);
return INFINI_STATUS_SUCCESS;
}
template <typename Tdata, typename Tindex>
infiniStatus_t calculateRoPE(const RoPEInfo &info,
int block_size,
Tdata *y,
const Tdata *x,
const Tindex *pos_ids,
const Tdata *sin_table,
const Tdata *cos_table,
hcStream_t stream) {
auto dimx = uint32_t(info.seqlen),
dimy = uint32_t(info.nhead);
int nthreads = std::max(int(info.table_dim), block_size);
ropeThreadPerItemKernel<<<dim3(dimx, dimy), nthreads, 0, stream>>>(
y, x, pos_ids, sin_table, cos_table, info.table_dim,
info.y_stride_seqlen, info.y_stride_nhead, info.x_stride_seqlen, info.x_stride_nhead);
return INFINI_STATUS_SUCCESS;
}
#define CALCULATE_ROPE(TDATA, TINDEX) \
calculateRoPE(_info, \
_opaque->internal->maxThreadsPerBlock(), \
(TDATA *)y, \
(const TDATA *)x, \
(const TINDEX *)pos_ids, \
(const TDATA *)sin_table, \
(const TDATA *)cos_table, \
(hcStream_t)stream)
#define ROPE_TYPE(TDATA) \
switch (_info.pos_type) { \
case INFINI_DTYPE_U8: \
return CALCULATE_ROPE(TDATA, uint8_t); \
case INFINI_DTYPE_U16: \
return CALCULATE_ROPE(TDATA, uint16_t); \
case INFINI_DTYPE_U32: \
return CALCULATE_ROPE(TDATA, uint32_t); \
case INFINI_DTYPE_U64: \
return CALCULATE_ROPE(TDATA, uint64_t); \
case INFINI_DTYPE_I8: \
return CALCULATE_ROPE(TDATA, int8_t); \
case INFINI_DTYPE_I16: \
return CALCULATE_ROPE(TDATA, int16_t); \
case INFINI_DTYPE_I32: \
return CALCULATE_ROPE(TDATA, int32_t); \
case INFINI_DTYPE_I64: \
return CALCULATE_ROPE(TDATA, int64_t); \
default: \
return INFINI_STATUS_BAD_TENSOR_DTYPE; \
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *y,
const void *x,
const void *pos_ids,
const void *sin_table,
const void *cos_table,
void *stream) const {
switch (_info.data_type) {
case INFINI_DTYPE_F16:
ROPE_TYPE(half);
case INFINI_DTYPE_BF16:
ROPE_TYPE(cuda_bfloat16);
case INFINI_DTYPE_F32:
ROPE_TYPE(float);
case INFINI_DTYPE_F64:
ROPE_TYPE(double);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
#undef ROPE_TYPE
#undef CALCULATE_ROPE
} // namespace op::rope::metax
src/infiniop/ops/rope_v2/nvidia/rope_v2_nvidia.cu (new file, mode 100644)
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "rope_v2_nvidia.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include "../cuda/kernel.cuh"
namespace
op
::
rope_v2
::
nvidia
{
template
<
typename
Tdata
,
typename
Tindex
,
typename
Tangle
>
INFINIOP_CUDA_KERNEL
ropev2ThreadPerItemKernel
(
Tdata
*
y_
,
const
Tdata
*
x_
,
const
Tindex
*
__restrict__
pos_ids
,
const
Tangle
*
__restrict__
sin_table
,
const
Tangle
*
__restrict__
cos_table
,
size_t
table_dim
,
ptrdiff_t
y_stride_seqlen
,
ptrdiff_t
y_stride_nhead
,
ptrdiff_t
x_stride_seqlen
,
ptrdiff_t
x_stride_nhead
)
{
ropeThreadPerItemBlock
(
y_
,
x_
,
pos_ids
,
sin_table
,
cos_table
,
table_dim
,
y_stride_seqlen
,
y_stride_nhead
,
x_stride_seqlen
,
x_stride_nhead
);
}
struct
Descriptor
::
Opaque
{
std
::
shared_ptr
<
device
::
nvidia
::
Handle
::
Internal
>
internal
;
};
Descriptor
::~
Descriptor
()
{
delete
_opaque
;
}
infiniStatus_t
Descriptor
::
create
(
infiniopHandle_t
handle_
,
Descriptor
**
desc_ptr
,
infiniopTensorDescriptor_t
y_desc
,
infiniopTensorDescriptor_t
x_desc
,
infiniopTensorDescriptor_t
pos_desc
,
infiniopTensorDescriptor_t
sin_desc
,
infiniopTensorDescriptor_t
cos_desc
)
{
auto
handle
=
reinterpret_cast
<
device
::
nvidia
::
Handle
*>
(
handle_
);
auto
info
=
RoPEv2Info
::
createRoPEv2Info
(
y_desc
,
x_desc
,
pos_desc
,
sin_desc
,
cos_desc
);
CHECK_RESULT
(
info
);
// Create descriptor
*
desc_ptr
=
new
Descriptor
(
info
.
take
(),
0
,
new
Opaque
{
reinterpret_cast
<
device
::
nvidia
::
Handle
*>
(
handle
)
->
internal
()},
handle
->
device
,
handle
->
device_id
);
return
INFINI_STATUS_SUCCESS
;
}
template
<
typename
Tdata
,
typename
Tindex
>
infiniStatus_t
calculateRoPEv2
(
const
RoPEv2Info
&
info
,
int
block_size
,
Tdata
*
y
,
const
Tdata
*
x
,
const
Tindex
*
pos_ids
,
const
Tdata
*
sin_table
,
const
Tdata
*
cos_table
,
cudaStream_t
stream
)
{
auto
dimx
=
uint32_t
(
info
.
seqlen
),
dimy
=
uint32_t
(
info
.
nhead
);
int
nthreads
=
std
::
max
(
int
(
info
.
table_dim
),
block_size
);
ropev2ThreadPerItemKernel
<<<
dim3
(
dimx
,
dimy
),
nthreads
,
0
,
stream
>>>
(
y
,
x
,
pos_ids
,
sin_table
,
cos_table
,
info
.
table_dim
,
info
.
y_stride_seqlen
,
info
.
y_stride_nhead
,
info
.
x_stride_seqlen
,
info
.
x_stride_nhead
);
return
INFINI_STATUS_SUCCESS
;
}
#define CALCULATE_ROPE_V2(TDATA, TINDEX) \
calculateRoPEv2(_info, \
_opaque->internal->maxThreadsPerBlock(), \
(TDATA *)y, \
(const TDATA *)x, \
(const TINDEX *)pos_ids, \
(const TDATA *)sin_table, \
(const TDATA *)cos_table, \
(cudaStream_t)stream)
#define ROPE_TYPE(TDATA) \
switch (_info.pos_type) { \
case INFINI_DTYPE_U8: \
return CALCULATE_ROPE_V2(TDATA, uint8_t); \
case INFINI_DTYPE_U16: \
return CALCULATE_ROPE_V2(TDATA, uint16_t); \
case INFINI_DTYPE_U32: \
return CALCULATE_ROPE_V2(TDATA, uint32_t); \
case INFINI_DTYPE_U64: \
return CALCULATE_ROPE_V2(TDATA, uint64_t); \
case INFINI_DTYPE_I8: \
return CALCULATE_ROPE_V2(TDATA, int8_t); \
case INFINI_DTYPE_I16: \
return CALCULATE_ROPE_V2(TDATA, int16_t); \
case INFINI_DTYPE_I32: \
return CALCULATE_ROPE_V2(TDATA, int32_t); \
case INFINI_DTYPE_I64: \
return CALCULATE_ROPE_V2(TDATA, int64_t); \
default: \
return INFINI_STATUS_BAD_TENSOR_DTYPE; \
}
infiniStatus_t
Descriptor
::
calculate
(
void
*
workspace
,
size_t
workspace_size
,
void
*
y
,
const
void
*
x
,
const
void
*
pos_ids
,
const
void
*
sin_table
,
const
void
*
cos_table
,
void
*
stream
)
const
{
switch
(
_info
.
data_type
)
{
case
INFINI_DTYPE_F16
:
ROPE_TYPE
(
half
);
case
INFINI_DTYPE_BF16
:
ROPE_TYPE
(
cuda_bfloat16
);
case
INFINI_DTYPE_F32
:
ROPE_TYPE
(
float
);
case
INFINI_DTYPE_F64
:
ROPE_TYPE
(
double
);
default:
return
INFINI_STATUS_BAD_TENSOR_DTYPE
;
}
return
INFINI_STATUS_SUCCESS
;
}
#undef ROPE_TYPE
#undef CALCULATE_ROPE
}
// namespace op::rope_v2::nvidia
src/infiniop/ops/rope_v2/nvidia/rope_v2_nvidia.cuh (new file, mode 100644)

#ifndef __INFINIOP_ROPE_V2_CUDA_H__
#define __INFINIOP_ROPE_V2_CUDA_H__

#include "../rope_v2.h"

DESCRIPTOR(nvidia)

#endif // __INFINIOP_ROPE_V2_CUDA_H__
src/infiniop/ops/rope_v2/operator.cc (new file, mode 100644)
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/rope_v2.h"
#ifdef ENABLE_CPU_API
#include "cpu/rope_v2_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#include "nvidia/rope_v2_nvidia.cuh"
#endif
#ifdef ENABLE_ASCEND_API
#include "ascend/rope_v2_ascend.h"
#endif
#ifdef ENABLE_CAMBRICON_API
#include "bang/rope_v2_bang.h"
#endif
#ifdef ENABLE_METAX_API
#include "metax/rope_v2_metax.h"
#endif
__C
infiniStatus_t
infiniopCreateRoPEv2Descriptor
(
infiniopHandle_t
handle
,
infiniopRoPEv2Descriptor_t
*
desc_ptr
,
infiniopTensorDescriptor_t
y
,
infiniopTensorDescriptor_t
x
,
infiniopTensorDescriptor_t
pos_ids
,
infiniopTensorDescriptor_t
sin_table
,
infiniopTensorDescriptor_t
cos_table
)
{
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::rope_v2::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::rope_v2::NAMESPACE::Descriptor **>(desc_ptr), \
y, \
x, \
pos_ids, \
sin_table, \
cos_table)
switch
(
handle
->
device
)
{
#ifdef ENABLE_CPU_API
CREATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_NVIDIA_API
CREATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#ifdef ENABLE_METAX_API
CREATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#ifdef ENABLE_ASCEND_API
CREATE
(
INFINI_DEVICE_ASCEND
,
ascend
);
#endif
#ifdef ENABLE_CAMBRICON_API
CREATE
(
INFINI_DEVICE_CAMBRICON
,
bang
);
#endif
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
{
return
musaCreateRoPEDescriptor
((
MusaHandle_t
)
handle
,
(
RoPEMusaDescriptor_t
*
)
desc_ptr
,
t
,
pos_ids
,
sin_table
,
cos_table
);
}
#endif
}
#undef CREATE
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
__C
infiniStatus_t
infiniopGetRoPEv2WorkspaceSize
(
infiniopRoPEv2Descriptor_t
desc
,
size_t
*
size
)
{
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<const op::rope_v2::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS
switch
(
desc
->
device_type
)
{
#ifdef ENABLE_CPU_API
GET
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_NVIDIA_API
GET
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
GET
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#ifdef ENABLE_METAX_API
GET
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#ifdef ENABLE_CAMBRICON_API
GET
(
INFINI_DEVICE_CAMBRICON
,
bang
);
#endif
#ifdef ENABLE_ASCEND_API
GET
(
INFINI_DEVICE_ASCEND
,
ascend
);
#endif
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
{
return
musaGetRoPEWorkspaceSize
((
RoPEMusaDescriptor_t
)
desc
,
size
);
}
#endif
}
#undef GET
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
__C
infiniStatus_t
infiniopRoPEv2
(
infiniopRoPEv2Descriptor_t
desc
,
void
*
workspace
,
size_t
workspace_size
,
void
*
y
,
const
void
*
x
,
const
void
*
pos_ids
,
const
void
*
sin_table
,
const
void
*
cos_table
,
void
*
stream
)
{
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<const op::rope_v2::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, y, x, pos_ids, sin_table, cos_table, stream)
switch
(
desc
->
device_type
)
{
#ifdef ENABLE_CPU_API
CALCULATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_NVIDIA_API
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
CALCULATE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#ifdef ENABLE_METAX_API
CALCULATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#ifdef ENABLE_CAMBRICON_API
CALCULATE
(
INFINI_DEVICE_CAMBRICON
,
bang
);
#endif
#ifdef ENABLE_ASCEND_API
CALCULATE
(
INFINI_DEVICE_ASCEND
,
ascend
);
#endif
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
{
return
musaRoPE
((
RoPEMusaDescriptor_t
)
desc
,
workspace
,
workspace_size
,
t
,
pos_ids
,
sin_table
,
cos_table
,
stream
);
}
#endif
}
#undef CALCULATE
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
__C
infiniStatus_t
infiniopDestroyRoPEv2Descriptor
(
infiniopRoPEv2Descriptor_t
desc
)
{
#define DELETE(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<const op::rope_v2::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS;
switch
(
desc
->
device_type
)
{
#ifdef ENABLE_CPU_API
DELETE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_NVIDIA_API
DELETE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
DELETE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#ifdef ENABLE_METAX_API
DELETE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#ifdef ENABLE_CAMBRICON_API
DELETE
(
INFINI_DEVICE_CAMBRICON
,
bang
);
#endif
#ifdef ENABLE_ASCEND_API
DELETE
(
INFINI_DEVICE_ASCEND
,
ascend
);
#endif
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
{
return
musaDestroyRoPEDescriptor
((
RoPEMusaDescriptor_t
)
desc
);
}
#endif
}
#undef DELETE
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
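Taken together, these entry points follow infiniop's usual create / query-workspace / run / destroy pattern. A minimal host-side sketch (a sketch only: check() is a hypothetical helper that aborts on a non-success status, and the tensor descriptors and device buffers are assumed to have been created elsewhere; descriptor construction is not part of this diff):

    // Sketch: y, x are [seqlen, nhead, dhead] device buffers, pos_ids is [seqlen],
    // sin_table / cos_table are [table_len, dhead / 2], per the checks in rope_v2.h.
    infiniopRoPEv2Descriptor_t desc;
    check(infiniopCreateRoPEv2Descriptor(handle, &desc,
                                         y_desc, x_desc, pos_desc, sin_desc, cos_desc));

    size_t workspace_size = 0;
    check(infiniopGetRoPEv2WorkspaceSize(desc, &workspace_size));

    check(infiniopRoPEv2(desc, workspace, workspace_size,
                         y, x, pos_ids, sin_table, cos_table, stream));
    check(infiniopDestroyRoPEv2Descriptor(desc));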
src/infiniop/ops/rope_v2/rope_v2.h (new file, mode 100644)
#ifndef __ROPE_V2_H__
#define __ROPE_V2_H__
#include "../../../utils.h"
#include "../../operator.h"
#include "../../tensor.h"
#define DESCRIPTOR(NAMESPACE) \
\
namespace op::rope_v2::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
RoPEv2Info _info; \
size_t _workspace_size; \
\
Descriptor( \
RoPEv2Info info, \
size_t workspace_size_, \
Opaque *opaque, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_info(info), \
_workspace_size(workspace_size_) {} \
\
public: \
~Descriptor(); \
\
size_t workspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t y_desc, \
infiniopTensorDescriptor_t x_desc, \
infiniopTensorDescriptor_t pos_desc, \
infiniopTensorDescriptor_t sin_desc, \
infiniopTensorDescriptor_t cos_desc); \
\
infiniStatus_t calculate( \
void *workspace, \
size_t workspace_size, \
void *y, \
const void *x, \
const void *pos_ids, \
const void *sin_table, \
const void *cos_table, \
void *stream) const; \
}; \
}
class RoPEv2Info {
private:
    RoPEv2Info() = default;

public:
    infiniDtype_t data_type, pos_type;
    size_t seqlen, nhead, dhead, table_len, table_dim;
    ptrdiff_t y_stride_seqlen, y_stride_nhead, x_stride_seqlen, x_stride_nhead;

    static utils::Result<RoPEv2Info> createRoPEv2Info(
        infiniopTensorDescriptor_t y_desc,
        infiniopTensorDescriptor_t x_desc,
        infiniopTensorDescriptor_t pos_desc,
        infiniopTensorDescriptor_t sin_desc,
        infiniopTensorDescriptor_t cos_desc) {
        CHECK_OR_RETURN(
            y_desc != nullptr && pos_desc != nullptr && sin_desc != nullptr && cos_desc != nullptr,
            INFINI_STATUS_NULL_POINTER);

        const infiniDtype_t data_type = y_desc->dtype();
        const infiniDtype_t pos_type = pos_desc->dtype();
        CHECK_OR_RETURN(data_type == x_desc->dtype()
                            && data_type == sin_desc->dtype()
                            && data_type == cos_desc->dtype(),
                        INFINI_STATUS_BAD_TENSOR_DTYPE);
        CHECK_DTYPE(data_type, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
        CHECK_DTYPE_ANY_INT(pos_type);

        CHECK_OR_RETURN(y_desc->ndim() == 3
                            && x_desc->ndim() == 3
                            && pos_desc->ndim() == 1
                            && sin_desc->ndim() == 2
                            && cos_desc->ndim() == 2,
                        INFINI_STATUS_BAD_TENSOR_SHAPE);

        const auto seqlen = y_desc->dim(0),
                   nhead = y_desc->dim(1),
                   dhead = y_desc->dim(2),
                   table_len = sin_desc->dim(0),
                   table_dim = sin_desc->dim(1);

        CHECK_OR_RETURN(seqlen == x_desc->dim(0)
                            && seqlen == pos_desc->dim(0)
                            && nhead == x_desc->dim(1)
                            && dhead == x_desc->dim(2)
                            && table_len == cos_desc->dim(0)
                            && table_dim == cos_desc->dim(1),
                        INFINI_STATUS_BAD_TENSOR_SHAPE);
        CHECK_OR_RETURN(dhead == table_dim * 2, INFINI_STATUS_BAD_TENSOR_SHAPE);

        // Last dimension of x and y must be contiguous
        CHECK_OR_RETURN(y_desc->stride(2) == 1 && x_desc->stride(2) == 1,
                        INFINI_STATUS_BAD_TENSOR_STRIDES);
        // sin table and cos table must be totally contiguous
        CHECK_OR_RETURN(sin_desc->isContiguous() && cos_desc->isContiguous(),
                        INFINI_STATUS_BAD_TENSOR_STRIDES);

        return utils::Result<RoPEv2Info>(RoPEv2Info{
            data_type,
            pos_type,
            seqlen,
            nhead,
            dhead,
            table_len,
            table_dim,
            y_desc->stride(0),
            y_desc->stride(1),
            x_desc->stride(0),
            x_desc->stride(1),
        });
    }
};

#endif
src/infiniop/ops/topkrouter/cpu/topkrouter_cpu.cc (new file, mode 100644)
#include "topkrouter_cpu.h"
#include "../../../devices/cpu/common_cpu.h"
#include "../../../reduce/cpu/reduce.h"
namespace
op
::
topkrouter
::
cpu
{
Descriptor
::~
Descriptor
()
{}
infiniStatus_t
Descriptor
::
create
(
infiniopHandle_t
handle
,
Descriptor
**
desc_ptr
,
infiniopTensorDescriptor_t
x_desc
,
infiniopTensorDescriptor_t
correction_bias_desc
)
{
return
INFINI_STATUS_NOT_IMPLEMENTED
;
}
infiniStatus_t
Descriptor
::
calculate
(
void
*
workspace
,
size_t
workspace_size
,
float
*
values
,
int
*
indices
,
void
*
x
,
float
*
correction_bias
,
float
routed_scaling_factor
,
size_t
topk
,
void
*
stream
)
const
{
return
INFINI_STATUS_NOT_IMPLEMENTED
;
}
}
// namespace op::topkrouter::cpu
src/infiniop/ops/topkrouter/cpu/topkrouter_cpu.h (new file, mode 100644)

#ifndef __Topkrouter_CPU_H__
#define __Topkrouter_CPU_H__

#include "../topkrouter.h"

DESCRIPTOR(cpu)

#endif
src/infiniop/ops/topkrouter/cuda/kernel.cuh (new file, mode 100644)
#ifndef _Topkrouter_KERNEL_CUH__
#define _Topkrouter_KERNEL_CUH__

#include <cfloat>
#include <cub/block/block_load.cuh>
#include <cub/block/block_radix_sort.cuh>
#include <cub/block/block_reduce.cuh>
#include <cub/block/block_store.cuh>
#include <cub/cub.cuh>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>

template <typename T>
inline __device__ float exp_func(T x) {
    float data;
    if constexpr (std::is_same_v<T, float>) {
        data = x;
    } else if constexpr (std::is_same_v<T, __nv_bfloat16>) {
        data = __bfloat162float(x);
    } else if constexpr (std::is_same_v<T, half>) {
        data = __half2float(x);
    }
    return __expf(data);
}

template <typename T>
inline __device__ T sigmoid_func(T x) {
    // sigmoid(x) = 1 / (1 + exp(-x))
    return 1.0f / (1.0f + exp_func<T>(-x));
}

struct CustomLess {
    template <typename DataType>
    __device__ bool operator()(const DataType &lhs, const DataType &rhs) {
        return lhs > rhs;
    }
};

//
// DeepSeek-style top-k
//
template <typename T, int BLOCK_THREADS = 256>
__global__ void topkrouter_kernel(
    float *values_topk,         // output values, shape [N, topk]
    int *indices_topk,          // output indices, shape [N, topk]
    T *input,                   // input data [N, width]
    float *d_correction_bias,   // input data [width]
    float routed_scaling_factor,
    const size_t N,             // total number of rows (tokens)
    const size_t width,         // elements per row
    const size_t topk) {
    const int bid = blockIdx.x;
    if (bid >= N) {
        return;
    }
    const int tid = threadIdx.x;

    const T *data_input = input + bid * width;
    float *values_topk_output = values_topk + bid * topk;
    int *indices_topk_output = indices_topk + bid * topk;

    constexpr int warp_threads = 32;
    constexpr int block_threads = 256;
    constexpr int warps_per_block = block_threads / warp_threads;
    const int warp_id = tid / warp_threads;
    const int lane_id = tid % warp_threads;

    __shared__ float share_data[256];
    __shared__ float share_data_group[8];
    __shared__ float share_data_group_mask[8]; // valid groups
    __shared__ float share_sum;

    if (tid < 8) {
        share_data_group_mask[tid] = 0.0f;
    }

    // ------------------------------------------------------ //
    // Apply sigmoid to the input                              //
    // ------------------------------------------------------ //
    float value = sigmoid_func(data_input[tid]);

    // ------------------------------------------------------ //
    // Add the correction bias                                 //
    // ------------------------------------------------------ //
    value += d_correction_bias[tid];

    // ------------------------------------------------------------ //
    // One warp per group, 8 groups total; find each group's top 2  //
    // ------------------------------------------------------------ //
    float thread_values[1] = {value};
    int thread_indices[1] = {tid};
    using WarpMergeSortT = cub::WarpMergeSort<float, 1, warp_threads, int>;
    {
        __shared__ typename WarpMergeSortT::TempStorage temp_storage[warps_per_block];
        WarpMergeSortT(temp_storage[warp_id]).Sort(thread_values, thread_indices, CustomLess());
    }
    __syncthreads();
    share_data[tid] = thread_values[0];

    // ------------------------------------------------------------ //
    // Sum of the top two values within each group                   //
    // ------------------------------------------------------------ //
    __syncthreads();
    if (0 == lane_id) {
        share_data_group[warp_id] = share_data[warp_id * warp_threads] + share_data[warp_id * warp_threads + 1];
    }
    __syncthreads();

    // ------------------------------------------------------------ //
    // Then select the top 4 groups                                  //
    // ------------------------------------------------------------ //
    if (0 == warp_id) {
        float thread_values[1] = {-FLT_MAX};
        int thread_indices[1] = {-1};
        if (lane_id < 8) {
            thread_values[0] = share_data_group[lane_id];
            thread_indices[0] = lane_id;
        }
        __shared__ typename WarpMergeSortT::TempStorage temp_storage[1];
        WarpMergeSortT(temp_storage[0]).Sort(thread_values, thread_indices, CustomLess());
        if (lane_id < 4) {
            int indices = thread_indices[0];
            share_data_group_mask[indices] = 1.0f;
        }
    }
    __syncthreads();

    // ------------------------------------------------------------ //
    // Final top-k over the masked values                            //
    // ------------------------------------------------------------ //
    value = value * share_data_group_mask[warp_id];
    thread_values[0] = value;
    thread_indices[0] = tid;
    {
        typedef cub::BlockRadixSort<float, BLOCK_THREADS, 1, int> BlockRadixSort;
        __shared__ typename BlockRadixSort::TempStorage temp_storage;
        BlockRadixSort(temp_storage).SortDescending(thread_values, thread_indices);
    }
    __syncthreads();

    // ------------------------------------------------------------ //
    // Normalization                                                 //
    // ------------------------------------------------------------ //
    if (0 == warp_id) {
        value = 0.0f;
        if (tid < 8) {
            int index = thread_indices[0];
            value = sigmoid_func(data_input[index]);
        }
        typedef cub::WarpReduce<float, warp_threads> WarpReduce;
        __shared__ typename WarpReduce::TempStorage temp_storage;
        // Partial reduction over the valid entries
        float warp_sum = WarpReduce(temp_storage).Sum(value);
        if (0 == tid) {
            share_sum = warp_sum + 1e-20;
        }
        __syncwarp();

        if (tid < 8) {
            int index = thread_indices[0];
            indices_topk_output[tid] = index;
            values_topk_output[tid] = routed_scaling_factor * value / share_sum;
        }
    }
}

#endif // _topkrouter_KERNEL_CUH__
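In equation form, one block processes one token row x of 256 logits, partitioned into 8 hardcoded groups G_0..G_7 of 32 experts (one warp per group, one thread per expert). With b the correction bias and gamma = routed_scaling_factor, the kernel computes:

\[
s_j = \sigma(x_j) + b_j, \qquad
g_k = \sum_{j \in \mathrm{top2}(G_k)} s_j, \qquad
v_t = \gamma \, \frac{\sigma(x_{j_t})}{\sum_{u \in J} \sigma(x_{j_u}) + 10^{-20}}
\]

where the 4 groups with the largest g_k are kept, and J is the set of the 8 largest s_j within the kept groups. Note that the final weights normalize the raw sigmoid scores, without the bias term, which is consistent with DeepSeek-V3-style routing where the bias steers expert selection but not the output weights.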
src/infiniop/ops/topkrouter/info.h (new file, mode 100644)
#ifndef __topkrouter_INFO_H__
#define __topkrouter_INFO_H__

#include "../../../utils.h"
#include "../../tensor.h"
#include <vector>

namespace op::topkrouter {

class TopkrouterInfo {
    TopkrouterInfo() = default;

public:
    infiniDtype_t xtype;
    std::vector<size_t> shape;
    std::vector<ptrdiff_t> x_strides;
    size_t N;
    size_t width;

public:
    size_t ndim() const {
        return shape.size();
    }

    size_t dim() const {
        return shape[ndim() - 1];
    }

    static utils::Result<TopkrouterInfo> create(infiniopTensorDescriptor_t x_desc) {
        auto xtype = x_desc->dtype();
        if ((xtype != infiniDtype_t::INFINI_DTYPE_F32) && (xtype != infiniDtype_t::INFINI_DTYPE_F16) && (xtype != infiniDtype_t::INFINI_DTYPE_BF16)) {
            return INFINI_STATUS_BAD_TENSOR_DTYPE;
        }

        size_t N = x_desc->shape()[0];     // number of tokens
        size_t width = x_desc->shape()[1]; // number of experts

        if (x_desc->ndim() != 2) {
            return INFINI_STATUS_BAD_TENSOR_SHAPE;
        }

        return utils::Result<TopkrouterInfo>(TopkrouterInfo{xtype, x_desc->shape(), x_desc->strides(), N, width});
    }
};

} // namespace op::topkrouter

#endif // __Topkrouter_INFO_H__
src/infiniop/ops/topkrouter/nvidia/topkrouter_nvidia.cu (new file, mode 100644)
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include "../cuda/kernel.cuh"
#include "topkrouter_nvidia.cuh"
#include <cub/block/block_reduce.cuh>
namespace
op
::
topkrouter
::
nvidia
{
struct
Descriptor
::
Opaque
{
std
::
shared_ptr
<
device
::
nvidia
::
Handle
::
Internal
>
internal
;
};
Descriptor
::~
Descriptor
()
{
delete
_opaque
;
}
infiniStatus_t
Descriptor
::
create
(
infiniopHandle_t
handle
,
Descriptor
**
desc_ptr
,
infiniopTensorDescriptor_t
x_desc
,
infiniopTensorDescriptor_t
correction_bias_desc
)
{
auto
result
=
TopkrouterInfo
::
create
(
x_desc
);
CHECK_RESULT
(
result
);
auto
info
=
result
.
take
();
if
(
info
.
x_strides
[
1
]
!=
1
)
{
return
INFINI_STATUS_BAD_TENSOR_STRIDES
;
}
*
desc_ptr
=
new
Descriptor
(
new
Opaque
{
reinterpret_cast
<
device
::
nvidia
::
Handle
*>
(
handle
)
->
internal
()},
std
::
move
(
info
),
0
,
handle
->
device
,
handle
->
device_id
);
return
INFINI_STATUS_SUCCESS
;
}
namespace
{
template
<
int
BLOCK_SIZE
=
128
>
infiniStatus_t
launch_topkrouter
(
float
*
d_values_out
,
int
*
d_indices_out
,
void
*
d_input
,
float
*
d_correction_bias
,
float
routed_scaling_factor
,
size_t
N
,
size_t
width
,
size_t
topk
,
infiniDtype_t
xtype
,
cudaStream_t
stream
)
{
const
int
block_threads
=
BLOCK_SIZE
;
dim3
blocks
(
N
);
dim3
threads
(
block_threads
);
if
(
xtype
==
INFINI_DTYPE_F32
)
{
topkrouter_kernel
<
float
,
BLOCK_SIZE
><<<
blocks
,
threads
,
0
,
stream
>>>
(
d_values_out
,
d_indices_out
,
(
float
*
)
d_input
,
d_correction_bias
,
routed_scaling_factor
,
N
,
width
,
topk
);
}
else
if
(
xtype
==
INFINI_DTYPE_F16
)
{
topkrouter_kernel
<
half
,
BLOCK_SIZE
><<<
blocks
,
threads
,
0
,
stream
>>>
(
d_values_out
,
d_indices_out
,
(
half
*
)
d_input
,
d_correction_bias
,
routed_scaling_factor
,
N
,
width
,
topk
);
}
else
if
(
xtype
==
INFINI_DTYPE_BF16
)
{
topkrouter_kernel
<
__nv_bfloat16
,
BLOCK_SIZE
><<<
blocks
,
threads
,
0
,
stream
>>>
(
d_values_out
,
d_indices_out
,
(
__nv_bfloat16
*
)
d_input
,
d_correction_bias
,
routed_scaling_factor
,
N
,
width
,
topk
);
}
else
{
return
INFINI_STATUS_BAD_TENSOR_DTYPE
;
}
return
INFINI_STATUS_SUCCESS
;
}
};
// namespace
infiniStatus_t
Descriptor
::
calculate
(
void
*
workspace
,
size_t
workspace_size
,
float
*
values
,
int
*
indices
,
void
*
x
,
float
*
correction_bias
,
float
routed_scaling_factor
,
size_t
topk
,
void
*
stream
)
const
{
if
(
workspace_size
<
_workspace_size
)
{
return
INFINI_STATUS_INSUFFICIENT_WORKSPACE
;
}
size_t
N
=
_info
.
N
;
size_t
width
=
_info
.
width
;
// 256
// size_t n_routed_experts = 256;
// size_t n_group = 8;
// size_t topk_group = 4;
auto
cuda_stream
=
reinterpret_cast
<
cudaStream_t
>
(
stream
);
if
(
256
==
width
)
{
launch_topkrouter
<
256
>
(
values
,
indices
,
x
,
correction_bias
,
routed_scaling_factor
,
N
,
width
,
topk
,
_info
.
xtype
,
cuda_stream
);
}
else
{
return
INFINI_STATUS_INTERNAL_ERROR
;
}
return
INFINI_STATUS_SUCCESS
;
}
}
// namespace op::topkrouter::nvidia
src/infiniop/ops/topkrouter/nvidia/topkrouter_nvidia.cuh (new file, mode 100644)

#ifndef __Topkrouter_CUDA_H__
#define __Topkrouter_CUDA_H__

#include "../topkrouter.h"

DESCRIPTOR(nvidia)

#endif
src/infiniop/ops/topkrouter/operator.cc (new file, mode 100644)
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/topkrouter.h"
#ifdef ENABLE_CPU_API
#include "cpu/topkrouter_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API)
#include "nvidia/topkrouter_nvidia.cuh"
#endif
__C
infiniStatus_t
infiniopCreateTopkrouterDescriptor
(
infiniopHandle_t
handle
,
infiniopTopkrouterDescriptor_t
*
desc_ptr
,
infiniopTensorDescriptor_t
x_desc
,
infiniopTensorDescriptor_t
correction_bias_desc
)
{
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::topkrouter::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::topkrouter::NAMESPACE::Descriptor **>(desc_ptr), \
x_desc, correction_bias_desc)
switch
(
handle
->
device
)
{
#ifdef ENABLE_CPU_API
CREATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_NVIDIA_API
CREATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
}
#undef CREATE
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
__C
infiniStatus_t
infiniopGetTopkrouterWorkspaceSize
(
infiniopTopkrouterDescriptor_t
desc
,
size_t
*
size
)
{
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::topkrouter::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS
switch
(
desc
->
device_type
)
{
#ifdef ENABLE_CPU_API
GET
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_NVIDIA_API
GET
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
}
#undef GET
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
__C
infiniStatus_t
infiniopTopkrouter
(
infiniopTopkrouterDescriptor_t
desc
,
void
*
workspace
,
size_t
workspace_size
,
void
*
values
,
void
*
indices
,
void
*
x
,
void
*
correction_bias
,
float
routed_scaling_factor
,
size_t
topk
,
void
*
stream
)
{
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<op::topkrouter::NAMESPACE::Descriptor *>(desc)->calculate( \
workspace, workspace_size, (float *)values, (int *)indices, x, (float *)correction_bias, routed_scaling_factor, topk, stream)
switch
(
desc
->
device_type
)
{
#ifdef ENABLE_CPU_API
CALCULATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_NVIDIA_API
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
}
#undef CALCULATE
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
__C
infiniStatus_t
infiniopDestroyTopkrouterDescriptor
(
infiniopTopkrouterDescriptor_t
desc
)
{
#define DESTROY(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<op::topkrouter::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS
switch
(
desc
->
device_type
)
{
#ifdef ENABLE_CPU_API
DESTROY
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_NVIDIA_API
DESTROY
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
}
#undef DESTROY
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
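As with RoPE v2, these entry points follow the create / query / run / destroy pattern. A minimal host-side sketch (check(), the descriptors, and the device buffers are assumed, not part of this diff; the scaling-factor and topk values shown are placeholders):

    // Sketch: x is a [num_tokens, 256] logits buffer; values (float) and
    // indices (int) are device buffers of shape [num_tokens, topk].
    infiniopTopkrouterDescriptor_t desc;
    check(infiniopCreateTopkrouterDescriptor(handle, &desc, x_desc, bias_desc));

    size_t workspace_size = 0;
    check(infiniopGetTopkrouterWorkspaceSize(desc, &workspace_size));

    check(infiniopTopkrouter(desc, workspace, workspace_size,
                             values, indices, x, correction_bias,
                             /*routed_scaling_factor=*/2.5f, /*topk=*/8, stream));
    check(infiniopDestroyTopkrouterDescriptor(desc));

Two caveats visible in the code above: the NVIDIA path rejects any width other than 256 (returning INFINI_STATUS_INTERNAL_ERROR), and the kernel writes exactly 8 results per token, using the topk argument only as the output row stride.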
src/infiniop/ops/topkrouter/topkrouter.h (new file, mode 100644)
#ifndef _Topkrouter_H_
#define _Topkrouter_H_
#include "../../operator.h"
#include "info.h"
#define DESCRIPTOR(NAMESPACE) \
\
namespace op::topkrouter::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
TopkrouterInfo _info; \
size_t _workspace_size; \
\
Descriptor( \
Opaque *opaque, \
TopkrouterInfo info, \
size_t workspace_size, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_info(info), \
_workspace_size(workspace_size) {} \
\
public: \
~Descriptor(); \
\
size_t workspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t x_desc, \
infiniopTensorDescriptor_t correction_bias_desc); \
\
infiniStatus_t calculate( \
void *workspace, size_t workspace_size, \
float *values, \
int *indices, \
void *x, \
float *correction_bias, \
float routed_scaling_factor, \
size_t topk, \
void *stream) const; \
}; \
}
#endif // _Topkrouter_H_
test/infiniop/dequantize.py (new file, mode 100644)
import torch
import ctypes
from ctypes import c_uint64
from libinfiniop import (
    LIBINFINIOP,
    TestTensor,
    get_test_devices,
    check_error,
    test_operator,
    get_args,
    debug,
    get_tolerance,
    profile_operation,
    TestWorkspace,
    InfiniDtype,
    InfiniDtypeNames,
    InfiniDeviceNames,
    infiniopOperatorDescriptor_t,
)

# ==============================================================================
#  Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES = [
    # alpha, beta, a_shape, b_shape, c_shape, a_stride, b_stride, c_stride
    (1.0, 0.0, (1, 2048), (2048, 2048), (1, 2048), None, None, None),
    (1.0, 0.0, (2, 4, 2048), (2, 2048, 2048), (2, 4, 2048), None, None, None),
    (1.0, 0.0, (1, 2048), (2048, 2048), (1, 2048), (4096, 1), (4096, 1), (4096, 1)),
    (1.0, 1.0, (6, 2048), (2048, 2560), (6, 2560), (2048, 1), (1, 2048), (2560, 1)),
    (1.0 / 8.0, 0.0, (4, 8 * 6, 64), (4, 64, 6), (4, 8 * 6, 6), None, None, None),
]

# Data types used for testing
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]

# Tolerance map for different data types
_TOLERANCE_MAP = {
    InfiniDtype.F16: {"atol": 0, "rtol": 1e-2},
    InfiniDtype.F32: {"atol": 0, "rtol": 1e-3},
    InfiniDtype.BF16: {"atol": 0, "rtol": 5e-2},
}

DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000


# PyTorch implementation for matrix multiplication
def gemm(d, _c, beta, _a, _b, alpha):
    try:
        if _c.ndim == 2:
            torch.addmm(_c, _a, _b, beta=beta, alpha=alpha, out=d)
        elif _c.ndim == 3:
            torch.baddbmm(_c, _a, _b, beta=beta, alpha=alpha, out=d)
        else:
            raise
    except Exception:
        torch.matmul(_a, _b, out=d)
        d.mul_(alpha).add_(_c, alpha=beta)


# The argument list should be (lib, handle, torch_device, <param list>, dtype)
# The <param list> should keep the same order as the one specified in _TEST_CASES
def test(
    handle,
    device,
    alpha,
    beta,
    a_shape,
    b_shape,
    c_shape,
    a_stride=None,
    b_stride=None,
    c_stride=None,
    dtype=InfiniDtype.F16,
    sync=None,
):
    print(
        f"Testing Gemm on {InfiniDeviceNames[device]} with alpha: {alpha}, beta: {beta},"
        f" a_shape: {a_shape}, b_shape: {b_shape}, c_shape: {c_shape},"
        f" a_stride: {a_stride}, b_stride: {b_stride}, c_stride: {c_stride}, dtype: {InfiniDtypeNames[dtype]}"
    )

    qweight = TestTensor((8192, 256), None, InfiniDtype.I32, device, mode="randint")
    scales = TestTensor((64, 2048), None, InfiniDtype.F16, device)
    zeros = TestTensor((64, 256), None, InfiniDtype.I32, device, mode="zeros")
    out = TestTensor((8192, 2048), None, InfiniDtype.F16, device, mode="zeros")
    print(out.actual_tensor())

    descriptor = infiniopOperatorDescriptor_t()
    check_error(
        LIBINFINIOP.infiniopCreateDequantizeDescriptor(
            handle,
            ctypes.byref(descriptor),
            out.descriptor,
            qweight.descriptor,
            scales.descriptor,
            zeros.descriptor,
        )
    )

    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    # for tensor in [a, b, c]:
    #     tensor.destroy_desc()

    # Get workspace size and create workspace
    workspace_size = c_uint64(0)
    check_error(
        LIBINFINIOP.infiniopGetDequantizeWorkspaceSize(
            descriptor, ctypes.byref(workspace_size)
        )
    )
    workspace = TestWorkspace(workspace_size.value, device)

    # Execute infiniop gemm operator
    def lib_dequantize():
        check_error(
            LIBINFINIOP.infiniopDequantize(
                descriptor,
                workspace.data(),
                workspace_size.value,
                out.data(),
                qweight.data(),
                scales.data(),
                zeros.data(),
                0,
                0,
                0,
                None,
            )
        )

    lib_dequantize()
    print(out.actual_tensor())

    # # Validate results
    # atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
    # if DEBUG:
    #     debug(c.actual_tensor(), ans.torch_tensor(), atol=atol, rtol=rtol)
    # assert torch.allclose(c.actual_tensor(), ans.torch_tensor(), atol=atol, rtol=rtol)

    # # Profiling workflow
    # if PROFILE:
    #     # fmt: off
    #     profile_operation("PyTorch", lambda: torch_gemm(), device, NUM_PRERUN, NUM_ITERATIONS)
    #     profile_operation("    lib", lambda: lib_gemm(), device, NUM_PRERUN, NUM_ITERATIONS)
    #     # fmt: on

    # check_error(LIBINFINIOP.infiniopDestroyDequantizeDescriptor(descriptor))


# ==============================================================================
#  Main Execution
# ==============================================================================
if __name__ == "__main__":
    args = get_args()

    # Configure testing options
    DEBUG = args.debug
    PROFILE = args.profile
    NUM_PRERUN = args.num_prerun
    NUM_ITERATIONS = args.num_iterations

    # Execute tests
    for device in get_test_devices(args):
        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)

    print("\033[92mTest passed!\033[0m")
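The tensor shapes exercised here are consistent with AWQ 4-bit quantization: qweight is (8192, 256) int32, i.e. eight 4-bit values packed per word along the output dimension (256 x 8 = 2048 columns), zeros is packed the same way, and scales is (64, 2048), implying a quantization group size of 8192 / 64 = 128 rows. Under the standard AWQ layout (the kernel itself is not part of this diff, so this is an inference from the test tensors, not the committed implementation), the dequantized weight would be:

\[
\hat{W}_{ij} = s_{\lfloor i/128 \rfloor,\, j} \cdot \bigl( q_{ij} - z_{\lfloor i/128 \rfloor,\, j} \bigr)
\]

where q and z are the unpacked 4-bit values. Note the test currently only prints the output; result validation and profiling remain commented out.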
test/infiniop/libinfiniop/op_register.py (changed)
...
@@ -387,6 +387,42 @@ def rope_(lib):
    ]


@OpRegister.operator
def rope_v2_(lib):
    lib.infiniopCreateRoPEv2Descriptor.restype = c_int32
    lib.infiniopCreateRoPEv2Descriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopGetRoPEv2WorkspaceSize.restype = c_int32
    lib.infiniopGetRoPEv2WorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopRoPEv2.restype = c_int32
    lib.infiniopRoPEv2.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,
        c_size_t,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyRoPEv2Descriptor.restype = c_int32
    lib.infiniopDestroyRoPEv2Descriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]


@OpRegister.operator
def sub_(lib):
    lib.infiniopCreateSubDescriptor.restype = c_int32
...

@@ -489,3 +525,74 @@ def conv_(lib):
    lib.infiniopDestroyConvDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]


@OpRegister.operator
def topkrouter_(lib):
    lib.infiniopCreateTopkrouterDescriptor.restype = c_int32
    lib.infiniopCreateTopkrouterDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t
    ]
    lib.infiniopGetTopkrouterWorkspaceSize.restype = c_int32
    lib.infiniopGetTopkrouterWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopTopkrouter.restype = c_int32
    lib.infiniopTopkrouter.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,
        c_size_t,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
        c_float,
        c_size_t,
        c_void_p,
    ]
    lib.infiniopDestroyTopkrouterDescriptor.restype = c_int32
    lib.infiniopDestroyTopkrouterDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]


@OpRegister.operator
def dequantize_(lib):
    lib.infiniopCreateDequantizeDescriptor.restype = c_int32
    lib.infiniopCreateDequantizeDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopGetDequantizeWorkspaceSize.restype = c_int32
    lib.infiniopGetDequantizeWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopDequantize.restype = c_int32
    lib.infiniopDequantize.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,
        c_size_t,
        c_void_p,
        c_void_p,
        c_void_p,
        c_size_t,
        c_size_t,
        c_size_t,
        c_void_p,
    ]
    lib.infiniopDestroyDequantizeDescriptor.restype = c_int32
    lib.infiniopDestroyDequantizeDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]
\ No newline at end of file
test/infiniop/libinfiniop/utils.py (changed)
...
@@ -78,6 +78,8 @@ class TestTensor(CTensor):
            self._torch_tensor = torch.ones(
                torch_shape, dtype=to_torch_dtype(dt), device=torch_device_map[device]
            )
+        elif mode == "randint":
+            self._torch_tensor = torch.randint(-2000000000, 2000000000, torch_shape, dtype=to_torch_dtype(dt), device=torch_device_map[device])
        elif mode == "manual":
            assert set_tensor is not None
            assert torch_shape == list(set_tensor.shape)
...
test/infiniop/rms_norm.py (changed)
...
@@ -37,7 +37,7 @@ _TEST_CASES_ = [
 # w (weight) types
 # Note: 'None' means the same as input dtype
-_WEIGHT_DTYPES = [None, InfiniDtype.F32]
+_WEIGHT_DTYPES = [None, InfiniDtype.F32, InfiniDtype.F16, InfiniDtype.BF16]
 # x types used for testing
 _TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16]
...