Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
ed530e11
Commit
ed530e11
authored
Sep 29, 2025
by
pengcheng888
Browse files
issue/427 - the sigmoid, topksoftmax, and topkrouter ops
parent
3959c943
Changes
38
Hide whitespace changes
Inline
Side-by-side
Showing
18 changed files
with
1569 additions
and
83 deletions
+1569
-83
src/infiniop/ops/topkrouter/nvidia/topkrouter_nvidia.cuh
src/infiniop/ops/topkrouter/nvidia/topkrouter_nvidia.cuh
+2
-2
src/infiniop/ops/topkrouter/operator.cc
src/infiniop/ops/topkrouter/operator.cc
+14
-20
src/infiniop/ops/topkrouter/topkrouter.h
src/infiniop/ops/topkrouter/topkrouter.h
+44
-43
src/infiniop/ops/topksoftmax/cpu/topksoftmax_cpu.cc
src/infiniop/ops/topksoftmax/cpu/topksoftmax_cpu.cc
+143
-0
src/infiniop/ops/topksoftmax/cpu/topksoftmax_cpu.h
src/infiniop/ops/topksoftmax/cpu/topksoftmax_cpu.h
+7
-0
src/infiniop/ops/topksoftmax/cuda/kernel.cuh
src/infiniop/ops/topksoftmax/cuda/kernel.cuh
+145
-0
src/infiniop/ops/topksoftmax/info.h
src/infiniop/ops/topksoftmax/info.h
+48
-0
src/infiniop/ops/topksoftmax/nvidia/topksoftmax_nvidia.cu
src/infiniop/ops/topksoftmax/nvidia/topksoftmax_nvidia.cu
+92
-0
src/infiniop/ops/topksoftmax/nvidia/topksoftmax_nvidia.cuh
src/infiniop/ops/topksoftmax/nvidia/topksoftmax_nvidia.cuh
+8
-0
src/infiniop/ops/topksoftmax/operator.cc
src/infiniop/ops/topksoftmax/operator.cc
+101
-0
src/infiniop/ops/topksoftmax/topksoftmax.h
src/infiniop/ops/topksoftmax/topksoftmax.h
+34
-0
test/infiniop-test/test_generate/testcases/sigmoid.py
test/infiniop-test/test_generate/testcases/sigmoid.py
+136
-0
test/infiniop-test/test_generate/testcases/topkrouter.py
test/infiniop-test/test_generate/testcases/topkrouter.py
+225
-0
test/infiniop-test/test_generate/testcases/topksoftmax.py
test/infiniop-test/test_generate/testcases/topksoftmax.py
+148
-0
test/infiniop/libinfiniop/op_register.py
test/infiniop/libinfiniop/op_register.py
+61
-3
test/infiniop/sigmoid.py
test/infiniop/sigmoid.py
+172
-0
test/infiniop/topkrouter.py
test/infiniop/topkrouter.py
+19
-15
test/infiniop/topksoftmax.py
test/infiniop/topksoftmax.py
+170
-0
No files found.
src/infiniop/ops/topkrouter/nvidia/topkrouter_nvidia.cuh
View file @
ed530e11
#ifndef __T
opkrouter
_CUDA_H__
#ifndef __T
OPKROUTER
_CUDA_H__
#define __T
opkrouter
_CUDA_H__
#define __T
OPKROUTER
_CUDA_H__
#include "../topkrouter.h"
#include "../topkrouter.h"
...
...
src/infiniop/ops/topkrouter/operator.cc
View file @
ed530e11
...
@@ -9,18 +9,13 @@
...
@@ -9,18 +9,13 @@
#include "nvidia/topkrouter_nvidia.cuh"
#include "nvidia/topkrouter_nvidia.cuh"
#endif
#endif
__C
infiniStatus_t
infiniopCreateTopkrouterDescriptor
(
__C
infiniStatus_t
infiniopCreateTopkrouterDescriptor
(
infiniopHandle_t
handle
,
infiniopTopkrouterDescriptor_t
*
desc_ptr
,
infiniopHandle_t
handle
,
infiniopTensorDescriptor_t
x_desc
,
infiniopTopkrouterDescriptor_t
*
desc_ptr
,
infiniopTensorDescriptor_t
correction_bias_desc
)
{
infiniopTensorDescriptor_t
x_desc
,
#define CREATE(CASE, NAMESPACE) \
infiniopTensorDescriptor_t
correction_bias_desc
)
{
case CASE: \
return op::topkrouter::NAMESPACE::Descriptor::create( \
#define CREATE(CASE, NAMESPACE) \
handle, reinterpret_cast<op::topkrouter::NAMESPACE::Descriptor **>(desc_ptr), x_desc, correction_bias_desc)
case CASE: \
return op::topkrouter::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::topkrouter::NAMESPACE::Descriptor **>(desc_ptr), \
x_desc, correction_bias_desc)
switch
(
handle
->
device
)
{
switch
(
handle
->
device
)
{
#ifdef ENABLE_CPU_API
#ifdef ENABLE_CPU_API
...
@@ -37,7 +32,6 @@ __C infiniStatus_t infiniopCreateTopkrouterDescriptor(
...
@@ -37,7 +32,6 @@ __C infiniStatus_t infiniopCreateTopkrouterDescriptor(
}
}
__C
infiniStatus_t
infiniopGetTopkrouterWorkspaceSize
(
infiniopTopkrouterDescriptor_t
desc
,
size_t
*
size
)
{
__C
infiniStatus_t
infiniopGetTopkrouterWorkspaceSize
(
infiniopTopkrouterDescriptor_t
desc
,
size_t
*
size
)
{
#define GET(CASE, NAMESPACE) \
#define GET(CASE, NAMESPACE) \
case CASE: \
case CASE: \
*size = reinterpret_cast<op::topkrouter::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
*size = reinterpret_cast<op::topkrouter::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
...
@@ -58,12 +52,13 @@ __C infiniStatus_t infiniopGetTopkrouterWorkspaceSize(infiniopTopkrouterDescript
...
@@ -58,12 +52,13 @@ __C infiniStatus_t infiniopGetTopkrouterWorkspaceSize(infiniopTopkrouterDescript
}
}
__C
infiniStatus_t
infiniopTopkrouter
(
infiniopTopkrouterDescriptor_t
desc
,
void
*
workspace
,
size_t
workspace_size
,
__C
infiniStatus_t
infiniopTopkrouter
(
infiniopTopkrouterDescriptor_t
desc
,
void
*
workspace
,
size_t
workspace_size
,
void
*
values
,
void
*
indices
,
void
*
x
,
void
*
correction_bias
,
float
routed_scaling_factor
,
size_t
topk
,
void
*
stream
)
{
void
*
values
,
void
*
indices
,
const
void
*
x
,
const
void
*
correction_bias
,
const
float
routed_scaling_factor
,
const
size_t
topk
,
void
*
stream
)
{
#define CALCULATE(CASE, NAMESPACE) \
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
case CASE: \
return reinterpret_cast<op::topkrouter::NAMESPACE::Descriptor *>(desc)->calculate( \
return reinterpret_cast<op::topkrouter::NAMESPACE::Descriptor *>(desc)->calculate( \
workspace, workspace_size, (float *)values, (int *)indices, x, (float *)correction_bias, routed_scaling_factor, topk, stream)
workspace, workspace_size, (float *)values, (int *)indices, x, (float *)correction_bias, routed_scaling_factor, \
topk, stream)
switch
(
desc
->
device_type
)
{
switch
(
desc
->
device_type
)
{
#ifdef ENABLE_CPU_API
#ifdef ENABLE_CPU_API
...
@@ -80,7 +75,6 @@ __C infiniStatus_t infiniopTopkrouter(infiniopTopkrouterDescriptor_t desc, void
...
@@ -80,7 +75,6 @@ __C infiniStatus_t infiniopTopkrouter(infiniopTopkrouterDescriptor_t desc, void
}
}
__C
infiniStatus_t
infiniopDestroyTopkrouterDescriptor
(
infiniopTopkrouterDescriptor_t
desc
)
{
__C
infiniStatus_t
infiniopDestroyTopkrouterDescriptor
(
infiniopTopkrouterDescriptor_t
desc
)
{
#define DESTROY(CASE, NAMESPACE) \
#define DESTROY(CASE, NAMESPACE) \
case CASE: \
case CASE: \
delete reinterpret_cast<op::topkrouter::NAMESPACE::Descriptor *>(desc); \
delete reinterpret_cast<op::topkrouter::NAMESPACE::Descriptor *>(desc); \
...
...
src/infiniop/ops/topkrouter/topkrouter.h
View file @
ed530e11
#ifndef _T
opkrouter
_H_
#ifndef _T
OPKTOUTER
_H_
#define _T
opkrouter
_H_
#define _T
OPKTOUTER
_H_
#include "../../operator.h"
#include "../../operator.h"
#include "info.h"
#include "info.h"
#define DESCRIPTOR(NAMESPACE) \
#define DESCRIPTOR(NAMESPACE) \
\
\
namespace op::topkrouter::NAMESPACE { \
namespace op::topkrouter::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
struct Opaque; \
Opaque *_opaque; \
Opaque *_opaque; \
TopkrouterInfo _info; \
TopkrouterInfo _info; \
size_t _workspace_size; \
size_t _workspace_size; \
\
\
Descriptor( \
Descriptor(Opaque *opaque, \
Opaque *opaque, \
TopkrouterInfo info, \
TopkrouterInfo info, \
size_t workspace_size, \
size_t workspace_size, \
infiniDevice_t device_type, \
infiniDevice_t device_type, \
int device_id) \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_opaque(opaque), \
_info(info), \
_info(info), \
_workspace_size(workspace_size) { \
_workspace_size(workspace_size) {} \
} \
\
\
public: \
public: \
~Descriptor(); \
~Descriptor(); \
\
\
size_t workspaceSize() const { return _workspace_size; } \
size_t workspaceSize() const { \
\
return _workspace_size; \
static infiniStatus_t create( \
} \
infiniopHandle_t handle, \
\
Descriptor **desc_ptr, \
static infiniStatus_t create(infiniopHandle_t handle, \
infiniopTensorDescriptor_t x_desc, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t correction_bias_desc); \
infiniopTensorDescriptor_t x_desc, \
\
infiniopTensorDescriptor_t correction_bias_desc); \
infiniStatus_t calculate( \
\
void *workspace, size_t workspace_size, \
infiniStatus_t calculate(void *workspace, \
float *values, \
size_t workspace_size, \
int *indices, \
float *values, \
void *x, \
int *indices, \
float *correction_bias, \
const void *x, \
float routed_scaling_factor, \
const float *correction_bias, \
size_t topk, \
const float routed_scaling_factor, \
void *stream) const; \
const size_t topk, \
}; \
void *stream) const; \
}; \
}
}
#endif // _Topkrouter_H_
#endif // _Topkrouter_H_
src/infiniop/ops/topksoftmax/cpu/topksoftmax_cpu.cc
0 → 100644
View file @
ed530e11
#include "topksoftmax_cpu.h"
#include "../../../../utils.h"
#include "../../../devices/cpu/common_cpu.h"
#include "../../../reduce/cpu/reduce.h"
#include "topksoftmax_cpu.h"
#include <algorithm>
namespace
op
::
topksoftmax
::
cpu
{
// CPU descriptor owns no opaque state (constructed with nullptr below), so the
// destructor has nothing to release.
Descriptor::~Descriptor() {
}
// Builds a CPU top-k-softmax descriptor after validating the input tensor.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t x_desc) {
    // Validate dtype/shape and capture row-count / width metadata.
    auto info_result = TopksoftmaxInfo::create(x_desc);
    CHECK_RESULT(info_result);
    auto op_info = info_result.take();

    // The CPU path walks each row linearly, so the innermost dimension must
    // be contiguous.
    const bool row_contiguous = (op_info.x_strides[1] == 1);
    if (!row_contiguous) {
        return INFINI_STATUS_BAD_TENSOR_STRIDES;
    }

    // No opaque state and no workspace are needed on the CPU backend.
    *desc_ptr = new Descriptor(nullptr, std::move(op_info), 0, handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
// Softmax + top-k selection for a single token (one row of the logits).
//
// values_input:    out — the top-k softmax probabilities, in descending order.
// indices_input:   out — the original (expert) indices matching values_input.
// value_index_arr: in/out — (logit, original-index) pairs for the whole row;
//                  used as scratch and reordered in place.
// topk:            number of entries to keep (clamped to width for safety —
//                  the previous version read out of range for topk > width).
// norm:            when true, re-normalize the kept top-k values to sum to 1.
// width:           row length (number of experts).
void topksoftmax_cpu_one_token(float *values_input,
                               int *indices_input,
                               std::vector<std::pair<float, size_t>> &value_index_arr,
                               size_t topk,
                               bool norm,
                               size_t width) {
    if (width == 0) {
        return; // empty row: nothing to emit
    }
    topk = std::min(topk, width);

    // Step 1: row maximum (numerically stable softmax).
    float value_max = value_index_arr[0].first;
    for (size_t i = 1; i < width; ++i) {
        value_max = std::max(value_max, value_index_arr[i].first);
    }

    // Step 2: exponentials and their sum.
    float exp_sum = 0.0f;
    for (size_t i = 0; i < width; ++i) {
        const float value = std::exp(value_index_arr[i].first - value_max);
        value_index_arr[i].first = value;
        exp_sum += value;
    }

    // Step 3: softmax.
    for (size_t i = 0; i < width; ++i) {
        value_index_arr[i].first /= exp_sum;
    }

    // Step 4: only the first topk entries need to be ordered, so use
    // partial_sort (O(width log topk)) instead of a full sort of the row.
    std::partial_sort(
        value_index_arr.begin(), value_index_arr.begin() + topk, value_index_arr.end(),
        [](const std::pair<float, size_t> &a, const std::pair<float, size_t> &b) {
            return a.first > b.first;
        });

    // Step 5: emit the top-k values/indices and accumulate their sum.
    exp_sum = 0.0f;
    for (size_t i = 0; i < topk; ++i) {
        values_input[i] = value_index_arr[i].first;
        indices_input[i] = static_cast<int>(value_index_arr[i].second);
        exp_sum += values_input[i];
    }

    // Step 6: optional re-normalization over the kept top-k.
    if (norm) {
        for (size_t i = 0; i < topk; ++i) {
            values_input[i] /= exp_sum;
        }
    }
}
// Applies softmax + top-k to every row of x, widening elements to float first.
//
//   O-----------> width (contiguous)
//   |
//   N (rows / tokens)
template <typename T>
infiniStatus_t topksoftmax_cpu_func(float *values, int *indices, const T *x, size_t topk, bool norm, size_t N, size_t width) {
    for (size_t row = 0; row < N; ++row) {
        float *row_values = values + row * topk;
        int *row_indices = indices + row * topk;
        const T *row_input = x + row * width;

        // Step 0: widen the row to float, pairing each value with its index.
        std::vector<std::pair<float, size_t>> scored;
        scored.reserve(width);
        for (size_t col = 0; col < width; ++col) {
            float widened;
            if constexpr (std::is_same<T, fp16_t>::value) {
                widened = _f16_to_f32(row_input[col]);
            } else if constexpr (std::is_same<T, bf16_t>::value) {
                widened = _bf16_to_f32(row_input[col]);
            } else {
                widened = row_input[col];
            }
            scored.emplace_back(widened, col);
        }

        topksoftmax_cpu_one_token(row_values, row_indices, scored, topk, norm, width);
    }
    return INFINI_STATUS_SUCCESS;
}
// Runs top-k softmax on the CPU for the tensor described at creation time.
// workspace / workspace_size / stream are unused on this backend.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    float *values,
    int *indices,
    const void *x,
    const size_t topk,
    const bool norm,
    void *stream) const {
    const size_t N = _info.N;
    const size_t width = _info.width;

    // Dispatch on dtype; propagate the helper's status instead of discarding
    // it and unconditionally reporting success as before.
    switch (_info.xtype) {
    case INFINI_DTYPE_F32:
        return topksoftmax_cpu_func<float>(values, indices, (const float *)x, topk, norm, N, width);
    case INFINI_DTYPE_F16:
        return topksoftmax_cpu_func<fp16_t>(values, indices, (const fp16_t *)x, topk, norm, N, width);
    case INFINI_DTYPE_BF16:
        return topksoftmax_cpu_func<bf16_t>(values, indices, (const bf16_t *)x, topk, norm, N, width);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
}
// namespace op::topksoftmax::cpu
src/infiniop/ops/topksoftmax/cpu/topksoftmax_cpu.h
0 → 100644
View file @
ed530e11
#ifndef __TOPKSOFTMAX_CPU_H__
#define __TOPKSOFTMAX_CPU_H__

#include "../topksoftmax.h"

// Declares op::topksoftmax::cpu::Descriptor via the shared DESCRIPTOR macro.
DESCRIPTOR(cpu)

#endif
src/infiniop/ops/topksoftmax/cuda/kernel.cuh
0 → 100644
View file @
ed530e11
#ifndef _TOPKSOFTMAX_KERNEL_CUH__
#define _TOPKSOFTMAX_KERNEL_CUH__
#include <cfloat>
#include <cub/block/block_load.cuh>
#include <cub/block/block_radix_sort.cuh>
#include <cub/block/block_reduce.cuh>
#include <cub/block/block_store.cuh>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
// Converts x to float and returns exp(x) via the fast device intrinsic.
// Handles float / __nv_bfloat16 / half explicitly; any other type falls back
// to static_cast, which also fixes the previously uninitialized read of
// `data` when T matched none of the branches.
template <typename T>
inline __device__ float exp_func(T x) {
    float data;
    if constexpr (std::is_same_v<T, float>) {
        data = x;
    } else if constexpr (std::is_same_v<T, __nv_bfloat16>) {
        data = __bfloat162float(x);
    } else if constexpr (std::is_same_v<T, half>) {
        data = __half2float(x);
    } else {
        data = static_cast<float>(x);
    }
    return __expf(data);
}
// Fused softmax + top-k over rows: one thread block per row/token.
//
// values_topk  : out, [N, topk] — top-k softmax probabilities, descending.
// indices_topk : out, [N, topk] — matching expert indices.
// input        : in,  [N, width] row-major logits.
//
// Assumes width <= BLOCK_SIZE (one element per thread) and topk <= 32 — the
// final gather runs entirely in warp 0 (the C API entry point rejects
// topk > 32).
template <typename T, int BLOCK_SIZE = 128>
__global__ void softmax_topk_row_kernel(
    float *values_topk,  // output values, shape [N, topk]
    int *indices_topk,   // output indices, shape [N, topk]
    const T *input,      // input data, shape [N, width]
    const size_t N,
    const size_t width,
    const size_t topk,
    bool norm) {
    const int bid = blockIdx.x; // one block per row
    if (bid >= N) {
        return;
    }
    const int tid = threadIdx.x;
    const T *data_input = input + bid * width;
    float *values_topk_output = values_topk + bid * topk;
    int *indices_topk_output = indices_topk + bid * topk;
    const int warp_id = tid / 32;
    __shared__ T shared_max;
    __shared__ float shared_sum;
    typedef cub::BlockReduce<T, BLOCK_SIZE> BlockReduce;
    // Step 1: block-wide maximum (for a numerically stable softmax).
    // Threads past `width` contribute data_input[0], which never exceeds the max.
    T thread_max = data_input[0];
    if (tid < width) {
        thread_max = thread_max > data_input[tid] ? thread_max : data_input[tid];
    }
    {
        __shared__ typename BlockReduce::TempStorage temp_storage_max;
        T value_max = BlockReduce(temp_storage_max).Reduce(thread_max, cub::Max());
        if (tid == 0) {
            shared_max = value_max;
        }
    }
    __syncthreads();
    // Step 2: sum of exponentials.
    // NOTE(review): this reuses BlockReduce<T>, so for half/bf16 the sum is
    // accumulated in T precision rather than float — confirm intended.
    float exp_val = 0.0f;
    if (tid < width) {
        T temp_val = data_input[tid] - shared_max;
        exp_val = exp_func<T>(temp_val);
    }
    {
        __shared__ typename BlockReduce::TempStorage temp_storage_sum;
        float value_sum = BlockReduce(temp_storage_sum).Sum(exp_val);
        if (tid == 0) {
            shared_sum = value_sum;
        }
    }
    __syncthreads();
    // Step 3: this thread's softmax probability.
    exp_val /= shared_sum;
    // Step 4: block-wide descending sort, one key per thread; afterwards
    // thread i holds the i-th largest probability. Inactive lanes carry
    // -FLT_MAX so they sort to the back.
    float thread_values[1] = {-FLT_MAX};
    int thread_indices[1] = {-1};
    if ((tid < width) && (exp_val > thread_values[0])) {
        thread_values[0] = exp_val;
        thread_indices[0] = tid;
    }
    {
        typedef cub::BlockRadixSort<float, BLOCK_SIZE, 1, int> BlockRadixSort;
        __shared__ typename BlockRadixSort::TempStorage temp_storage;
        BlockRadixSort(temp_storage).SortDescending(thread_values, thread_indices);
    }
    __syncthreads();
    if (0 == warp_id) { // warp 0 now owns the topk largest entries (topk <= 32)
        int indice = -1;
        float value = 0.0f;
        if (tid < topk) {
            indice = thread_indices[0];
            value = thread_values[0];
        }
        // Step 5: sum of the kept top-k values.
        {
            typedef cub::WarpReduce<float, 32> WarpReduce;
            __shared__ typename WarpReduce::TempStorage temp_storage;
            float warp_sum = WarpReduce(temp_storage).Sum(value);
            if (0 == tid) {
                shared_sum = warp_sum + 1e-9f; // epsilon guards divide-by-zero
            }
        }
        __syncwarp();
        // Step 6: optional re-normalization over the top-k.
        if (norm && (tid < topk)) {
            value /= shared_sum;
        }
        // Step 7: write the final values/indices.
        if (tid < topk) {
            values_topk_output[tid] = value;
            indices_topk_output[tid] = indice;
        }
    }
}
#endif // _TOPKSOFTMAX_KERNEL_CUH__
src/infiniop/ops/topksoftmax/info.h
0 → 100644
View file @
ed530e11
#ifndef __TOPKSOFTMAX_INFO_H__
#define __TOPKSOFTMAX_INFO_H__
#include "../../../utils.h"
#include "../../tensor.h"
#include <vector>
namespace op::topksoftmax {

// Validated metadata for a top-k softmax op over a 2-D [tokens, experts]
// logits tensor. Construct via create(); the default constructor is private.
class TopksoftmaxInfo {
    TopksoftmaxInfo() = default;

public:
    infiniDtype_t xtype;              // element type of x: F32, F16 or BF16
    std::vector<size_t> shape;        // full tensor shape (always rank 2)
    std::vector<ptrdiff_t> x_strides; // element strides of x
    size_t N;                         // number of tokens (rows)
    size_t width;                     // number of experts (row length)

public:
    size_t ndim() const {
        return shape.size();
    }
    // Size of the innermost dimension.
    size_t dim() const {
        return shape[ndim() - 1];
    }

    // Validates x_desc and extracts the metadata above.
    // Fails with BAD_TENSOR_DTYPE unless x is F32/F16/BF16, and with
    // BAD_TENSOR_SHAPE unless x is 2-D.
    static utils::Result<TopksoftmaxInfo> create(infiniopTensorDescriptor_t x_desc) {
        auto xtype = x_desc->dtype();
        if ((xtype != infiniDtype_t::INFINI_DTYPE_F32) && (xtype != infiniDtype_t::INFINI_DTYPE_F16) && (xtype != infiniDtype_t::INFINI_DTYPE_BF16)) {
            return INFINI_STATUS_BAD_TENSOR_DTYPE;
        }
        if (x_desc->ndim() != 2) {
            return INFINI_STATUS_BAD_TENSOR_SHAPE;
        }
        size_t N = x_desc->shape()[0];     // token count
        size_t width = x_desc->shape()[1]; // expert count
        // NOTE(review): the brace-init below relies on this class being an
        // aggregate; the defaulted (non-user-provided) constructor keeps that
        // true in C++17 but not in C++20 — confirm the project's standard.
        return utils::Result<TopksoftmaxInfo>(TopksoftmaxInfo{xtype, x_desc->shape(), x_desc->strides(), N, width});
    }
};

}
// namespace op::topksoftmax
#endif // __TOPKSOFTMAX_INFO_H__
src/infiniop/ops/topksoftmax/nvidia/topksoftmax_nvidia.cu
0 → 100644
View file @
ed530e11
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "topksoftmax_nvidia.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include <cub/block/block_reduce.cuh>
#include "../../../reduce/cuda/reduce.cuh"
#include "../cuda/kernel.cuh"
namespace
op
::
topksoftmax
::
nvidia
{
// Backend-private state: a shared handle to the NVIDIA device internals.
struct Descriptor::Opaque {
    std::shared_ptr<device::nvidia::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
    // Owned Opaque was allocated with new in create().
    delete _opaque;
}
// Builds an NVIDIA top-k-softmax descriptor after validating the input tensor.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t x_desc) {
    // Validate dtype/shape and capture row-count / width metadata.
    auto info_result = TopksoftmaxInfo::create(x_desc);
    CHECK_RESULT(info_result);
    auto op_info = info_result.take();

    // The kernel indexes rows as input + row * width, so the innermost
    // dimension has to be contiguous.
    if (op_info.x_strides[1] != 1) {
        return INFINI_STATUS_BAD_TENSOR_STRIDES;
    }

    auto *nvidia_handle = reinterpret_cast<device::nvidia::Handle *>(handle);
    *desc_ptr = new Descriptor(
        new Opaque{nvidia_handle->internal()},
        std::move(op_info),
        0, // no device workspace required
        handle->device,
        handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
namespace {

// Launches softmax_topk_row_kernel with one thread block per row.
// Returns BAD_TENSOR_DTYPE for element types the kernel does not support.
template <int BLOCK_SIZE = 128>
infiniStatus_t launch_topksoftmax(
    float *d_values_out,
    int *d_indices_out,
    const void *d_input,
    const size_t N,
    const size_t width,
    const size_t topk,
    const bool norm,
    infiniDtype_t xtype,
    cudaStream_t stream) {
    dim3 blocks(static_cast<unsigned int>(N));
    dim3 threads(BLOCK_SIZE);

    // Const-correct named casts: the kernel takes a const T*, and the old
    // C-style casts silently dropped the const qualifier.
    if (xtype == INFINI_DTYPE_F32) {
        softmax_topk_row_kernel<float, BLOCK_SIZE><<<blocks, threads, 0, stream>>>(
            d_values_out, d_indices_out, reinterpret_cast<const float *>(d_input), N, width, topk, norm);
    } else if (xtype == INFINI_DTYPE_F16) {
        softmax_topk_row_kernel<half, BLOCK_SIZE><<<blocks, threads, 0, stream>>>(
            d_values_out, d_indices_out, reinterpret_cast<const half *>(d_input), N, width, topk, norm);
    } else if (xtype == INFINI_DTYPE_BF16) {
        softmax_topk_row_kernel<__nv_bfloat16, BLOCK_SIZE><<<blocks, threads, 0, stream>>>(
            d_values_out, d_indices_out, reinterpret_cast<const __nv_bfloat16 *>(d_input), N, width, topk, norm);
    } else {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    return INFINI_STATUS_SUCCESS;
}

} // namespace
// Runs top-k softmax on the GPU. `workspace` is unused (the op needs none);
// the size check keeps the standard workspace contract.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    float *values,
    int *indices,
    const void *x,
    const size_t topk,
    const bool norm,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    const size_t N = _info.N;
    const size_t width = _info.width;
    auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);

    // Pick the smallest block size covering one full row (one thread per
    // expert). Propagate the launcher's status: the previous version
    // discarded it, silently masking BAD_TENSOR_DTYPE as success.
    if (width <= 128) {
        return launch_topksoftmax<128>(values, indices, x, N, width, topk, norm, _info.xtype, cuda_stream);
    } else if (width <= 256) {
        return launch_topksoftmax<256>(values, indices, x, N, width, topk, norm, _info.xtype, cuda_stream);
    } else if (width <= 512) {
        return launch_topksoftmax<512>(values, indices, x, N, width, topk, norm, _info.xtype, cuda_stream);
    }
    // Rows wider than 512 experts are not supported by this kernel.
    return INFINI_STATUS_INTERNAL_ERROR;
}
}
// namespace op::topksoftmax::nvidia
src/infiniop/ops/topksoftmax/nvidia/topksoftmax_nvidia.cuh
0 → 100644
View file @
ed530e11
#ifndef __TOPKSOFTMAX_CUDA_H__
#define __TOPKSOFTMAX_CUDA_H__

#include "../topksoftmax.h"

// Declares op::topksoftmax::nvidia::Descriptor via the shared DESCRIPTOR macro.
DESCRIPTOR(nvidia)

#endif
src/infiniop/ops/topksoftmax/operator.cc
0 → 100644
View file @
ed530e11
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/topksoftmax.h"
#ifdef ENABLE_CPU_API
#include "cpu/topksoftmax_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API)
#include "nvidia/topksoftmax_nvidia.cuh"
#endif
// Creates a top-k softmax descriptor for the device owned by `handle`.
// x_desc must describe a 2-D [tokens, experts] tensor of F32/F16/BF16.
__C infiniStatus_t infiniopCreateTopksoftmaxDescriptor(infiniopHandle_t handle, infiniopTopksoftmaxDescriptor_t *desc_ptr, infiniopTensorDescriptor_t x_desc) {
#define CREATE(CASE, NAMESPACE) \
    case CASE: \
        return op::topksoftmax::NAMESPACE::Descriptor::create( \
            handle, reinterpret_cast<op::topksoftmax::NAMESPACE::Descriptor **>(desc_ptr), x_desc)

    // Dispatch on the compile-time-enabled backends.
    switch (handle->device) {
#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
    }
#undef CREATE
    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Reports the device workspace size required by infiniopTopksoftmax for this
// descriptor (currently 0 on both backends; see the Descriptor constructors).
__C infiniStatus_t infiniopGetTopksoftmaxWorkspaceSize(infiniopTopksoftmaxDescriptor_t desc, size_t *size) {
#define GET(CASE, NAMESPACE) \
    case CASE: \
        *size = reinterpret_cast<op::topksoftmax::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
    }
#undef GET
    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Runs top-k softmax. values/indices receive, per row of x, the top-k softmax
// probabilities (float) and their expert indices (int). norm != 0 additionally
// re-normalizes the kept top-k values so they sum to 1.
__C infiniStatus_t infiniopTopksoftmax(infiniopTopksoftmaxDescriptor_t desc, void *workspace, size_t workspace_size, void *values, void *indices, const void *x, const size_t topk, const int norm, void *stream) {
    // topk is capped at 32: the CUDA kernel gathers the final top-k within a
    // single 32-lane warp (warp 0).
    if (topk > 32) {
        return INFINI_STATUS_BAD_PARAM;
    }
#define CALCULATE(CASE, NAMESPACE) \
    case CASE: \
        return reinterpret_cast<op::topksoftmax::NAMESPACE::Descriptor *>(desc)->calculate( \
            workspace, workspace_size, (float *)values, (int *)indices, x, topk, static_cast<bool>(norm), stream)

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
    }
#undef CALCULATE
    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Destroys a descriptor created by infiniopCreateTopksoftmaxDescriptor,
// releasing its backend-specific state.
__C infiniStatus_t infiniopDestroyTopksoftmaxDescriptor(infiniopTopksoftmaxDescriptor_t desc) {
#define DESTROY(CASE, NAMESPACE) \
    case CASE: \
        delete reinterpret_cast<op::topksoftmax::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        DESTROY(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        DESTROY(INFINI_DEVICE_NVIDIA, nvidia);
#endif
    }
#undef DESTROY
    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
src/infiniop/ops/topksoftmax/topksoftmax.h
0 → 100644
View file @
ed530e11
#ifndef _TOPKSOFTMAX_H_
#define _TOPKSOFTMAX_H_

#include "../../operator.h"
#include "info.h"

// Stamps out a per-backend Descriptor class in op::topksoftmax::NAMESPACE.
// Each backend supplies the out-of-line ~Descriptor, create() and calculate()
// definitions. (Comments must stay outside the macro: a // before a trailing
// backslash would swallow the next continuation line.)
#define DESCRIPTOR(NAMESPACE)                                                                                             \
                                                                                                                          \
    namespace op::topksoftmax::NAMESPACE {                                                                                \
    class Descriptor final : public InfiniopDescriptor {                                                                  \
        struct Opaque;                                                                                                    \
        Opaque *_opaque;                                                                                                  \
        TopksoftmaxInfo _info;                                                                                            \
        size_t _workspace_size;                                                                                           \
                                                                                                                          \
        Descriptor(Opaque *opaque, TopksoftmaxInfo info, size_t workspace_size, infiniDevice_t device_type, int device_id) \
            : InfiniopDescriptor{device_type, device_id}, _opaque(opaque), _info(info), _workspace_size(workspace_size) { \
        }                                                                                                                 \
                                                                                                                          \
    public:                                                                                                               \
        ~Descriptor();                                                                                                    \
                                                                                                                          \
        size_t workspaceSize() const {                                                                                    \
            return _workspace_size;                                                                                       \
        }                                                                                                                 \
                                                                                                                          \
        static infiniStatus_t create(infiniopHandle_t handle, Descriptor **desc_ptr, infiniopTensorDescriptor_t x_desc);  \
                                                                                                                          \
        infiniStatus_t calculate(void *workspace, size_t workspace_size, float *values, int *indices, const void *x,      \
                                 const size_t topk, const bool norm, void *stream) const;                                 \
    };                                                                                                                    \
    }

#endif // _TOPKSOFTMAX_H_
test/infiniop-test/test_generate/testcases/sigmoid.py
0 → 100644
View file @
ed530e11
import
numpy
as
np
from
numpy.lib.stride_tricks
import
as_strided
import
gguf
from
typing
import
List
from
..
import
InfiniopTestWriter
,
InfiniopTestCase
,
np_dtype_to_ggml
,
gguf_strides
,
contiguous_gguf_strides
,
process_zero_stride_tensor
def sigmoid(
    x: np.ndarray,
):
    """Numerically stable logistic sigmoid, 1 / (1 + exp(-x)).

    The naive form overflows ``exp(-x)`` for large-magnitude negative inputs;
    computing with ``exp(-|x|)`` (always <= 1) avoids that while returning
    identical values elsewhere.
    """
    e = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) — same function, no overflow.
    return np.where(x >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
def random_tensor(shape, dtype):
    """Uniform random tensor in [-rate/2, rate/2), cast to `dtype`."""
    rate = 1e-3
    half_span = 0.5 * rate
    samples = np.random.rand(*shape).astype(dtype)
    return samples * rate - half_span
def process_tensors(a, b, stride_a=None, stride_b=None):
    """Collapse broadcast (zero-stride) dimensions of two tensors to size 1.

    A falsy stride leaves the corresponding tensor untouched.
    """

    def _collapse(tensor, stride):
        if not stride:
            return tensor
        picks = []
        for s in stride:
            picks.append(slice(0, 1) if s == 0 else slice(None))
        return tensor[tuple(picks)]

    return _collapse(a, stride_a), _collapse(b, stride_b)
def process_tensor(a, stride_a=None):
    """Collapse broadcast (zero-stride) dimensions of one tensor to size 1.

    Single-tensor counterpart of process_tensors; a falsy stride is a no-op.
    """
    if not stride_a:
        return a
    picks = tuple(slice(0, 1) if s == 0 else slice(None) for s in stride_a)
    return a[picks]
class SigmoidTestCase(InfiniopTestCase):
    """GGUF test-case record for the sigmoid op.

    Serializes the input tensor ``x``, an output placeholder ``y``, their
    shapes/strides, and a float64 reference answer computed with the local
    ``sigmoid()`` helper.
    """

    def __init__(
        self,
        x: np.ndarray,
        shape_x: List[int] | None,
        stride_x: List[int] | None,
        y: np.ndarray,
        shape_y: List[int] | None,
        stride_y: List[int] | None,
    ):
        super().__init__("sigmoid")
        self.x = x
        self.shape_x = shape_x
        self.stride_x = stride_x
        self.y = y
        self.shape_y = shape_y
        self.stride_y = stride_y

    def write_test(self, test_writer: "InfiniopTestWriter"):
        """Write shapes, strides, tensors and the reference answer to GGUF."""
        super().write_test(test_writer)
        if self.shape_x is not None:
            test_writer.add_array(test_writer.gguf_key("x.shape"), self.shape_x)
        if self.shape_y is not None:
            test_writer.add_array(test_writer.gguf_key("y.shape"), self.shape_y)
        if self.stride_x is not None:
            test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.stride_x))
        # y falls back to contiguous strides when none are given.
        test_writer.add_array(
            test_writer.gguf_key("y.strides"),
            gguf_strides(*self.stride_y if self.stride_y is not None else contiguous_gguf_strides(self.shape_y))
        )
        test_writer.add_tensor(
            test_writer.gguf_key("x"), self.x, raw_dtype=np_dtype_to_ggml(self.x.dtype)
        )
        test_writer.add_tensor(
            test_writer.gguf_key("y"), self.y, raw_dtype=np_dtype_to_ggml(self.y.dtype)
        )
        input_x = self.x.astype(np.float64)
        # A zero stride marks a broadcast dimension: rebuild the broadcast
        # view with as_strided so the reference answer has the full logical
        # shape even though x stores only one copy of the data.
        if (self.stride_x is not None) and (0 in self.stride_x):
            typesize = np.dtype(input_x.dtype).itemsize
            new_strides_bytes = tuple(x * typesize for x in self.stride_x)
            input_x = as_strided(x=input_x, shape=self.shape_x, strides=new_strides_bytes)
        ans = sigmoid(input_x)
        test_writer.add_tensor(
            test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64
        )
if __name__ == '__main__':
    test_writer = InfiniopTestWriter("sigmoid.gguf")
    test_cases = []
    # Each entry: (shape, x_stride, y_stride). None means contiguous; a 0
    # stride marks a broadcast dimension.
    _TEST_CASES_ = [
        # shape, x_stride, y_stride
        ((13, 4), None, None),
        ((13, 4), (10, 1), (10, 1)),
        ((13, 4), (0, 1), None),
        ((13, 4, 4), None, None),
        ((13, 4, 4), (20, 4, 1), (20, 4, 1)),
        ((13, 4, 4), (4, 0, 1), None),
        ((16, 5632), None, None),
        ((16, 5632), (13312, 1), (13312, 1)),
        ((4, 4, 5632), None, None),
        ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)),
    ]
    _TENSOR_DTYPES_ = [np.float16, np.float32]
    for dtype in _TENSOR_DTYPES_:
        for shape, stride_x, stride_y in _TEST_CASES_:
            x = np.random.rand(*shape).astype(dtype)
            # y is an empty placeholder; the op under test produces the output.
            y = np.empty(tuple(0 for _ in shape), dtype=dtype)
            x = process_zero_stride_tensor(x, stride_x)
            test_case = SigmoidTestCase(
                x=x,
                shape_x=shape,
                stride_x=stride_x,
                y=y,
                shape_y=shape,
                stride_y=stride_y,
            )
            test_cases.append(test_case)
    test_writer.add_tests(test_cases)
    test_writer.save()
test/infiniop-test/test_generate/testcases/topkrouter.py
0 → 100644
View file @
ed530e11
import
numpy
as
np
from
typing
import
List
import
torch
import
torch.nn
as
nn
from
..
import
InfiniopTestWriter
,
InfiniopTestCase
,
np_dtype_to_ggml
,
gguf_strides
,
contiguous_gguf_strides
def random_tensor(shape: tuple, dtype: np.dtype) -> np.ndarray:
    """Small uniform noise in (-0.001, 0.001), cast to `dtype` before scaling."""
    base = np.random.uniform(-1.0, 1.0, shape)
    return base.astype(dtype) * 0.001
class DeepseekV3TopkRouter(nn.Module):
    """Reference (PyTorch) implementation of the DeepSeek-V3 top-k router.

    Selection is two-stage: experts are first pre-filtered by group
    (`topk_group` of `n_group` groups, scored by each group's best two
    experts), then the final `top_k` experts are chosen and their sigmoid
    scores normalized and scaled by `routed_scaling_factor`.
    """

    def __init__(self, correction_bias, routed_scaling_factor: float, topk: int, config=None):
        super().__init__()
        self.config = config
        # Hard-coded defaults mirroring the DeepSeek-V3 config fields named in
        # the trailing comments.
        self.top_k = 8  # config.num_experts_per_tok
        self.n_routed_experts = 256  # config.n_routed_experts
        self.routed_scaling_factor = 2.5  # config.routed_scaling_factor
        self.n_group = 8  # config.n_group
        self.topk_group = 4  # config.topk_group
        self.norm_topk_prob = True  # config.norm_topk_prob
        # Caller-provided values override the two defaults above.
        self.routed_scaling_factor = routed_scaling_factor
        self.top_k = topk
        # self.weight = nn.Parameter(torch.empty((self.n_routed_experts, config.hidden_size)))
        # self.weight = torch.rand(256, 7168) * 2 - 1
        # self.register_buffer("e_score_correction_bias", torch.zeros(self.n_routed_experts))
        self.e_score_correction_bias = torch.zeros(256,)
        self.e_score_correction_bias[:] = correction_bias[:]

    @torch.no_grad()
    def get_topk_indices(self, scores):
        """Two-stage expert selection; returns indices of the top_k experts."""
        # The bias affects which experts are chosen, not the returned weights.
        scores_for_choice = scores.view(-1, self.n_routed_experts) + self.e_score_correction_bias.unsqueeze(0)  # Size([1, 256])
        # Group score = sum of the best two expert scores inside each group.
        group_scores = (
            scores_for_choice.view(-1, self.n_group, self.n_routed_experts // self.n_group)
            .topk(2, dim=-1)[0]
            .sum(dim=-1)
        )
        group_idx = torch.topk(group_scores, k=self.topk_group, dim=-1, sorted=True)[1]  # Size([1, 4])
        group_mask = torch.zeros_like(group_scores)  # Size([1, 8])
        group_mask.scatter_(1, group_idx, 1)  # Size([1, 8])
        # Expand the per-group mask back to per-expert resolution.
        score_mask = (
            group_mask.unsqueeze(-1)
            .expand(-1, self.n_group, self.n_routed_experts // self.n_group)
            .reshape(-1, self.n_routed_experts)
        )
        # Experts outside the selected groups are zeroed out of contention.
        scores_for_choice = scores_for_choice.masked_fill(~score_mask.bool(), 0.0)  # Size([1, 256])
        topk_indices = torch.topk(scores_for_choice, k=self.top_k, dim=-1, sorted=True)[1]  # Size([1, 8])
        return topk_indices

    def forward(self, router_logits):
        """Return (topk_indices, topk_weights) for the given router logits."""
        # hidden_states = hidden_states.view(-1, 7168)
        # router_logits = F.linear(hidden_states.type(torch.float32), self.weight.type(torch.float32))
        scores = router_logits.sigmoid()  # (1, 256)
        scores = scores.to(torch.float32)
        topk_indices = self.get_topk_indices(scores)  # (1, 8)
        # Weights come from the unbiased sigmoid scores.
        topk_weights = scores.gather(1, topk_indices)
        if self.norm_topk_prob:
            denominator = topk_weights.sum(dim=-1, keepdim=True) + 1e-20
            topk_weights /= denominator
        topk_weights = topk_weights * self.routed_scaling_factor
        return topk_indices, topk_weights
def python_topkrouter(x: np.ndarray, correction_bias: np.ndarray, routed_scaling_factor: float, topk: int):
    """NumPy-in/NumPy-out wrapper around the DeepseekV3TopkRouter reference.

    Returns (values, indices): the routed expert weights (float32) and
    expert indices (int32) as NumPy arrays.
    """
    router_logits = torch.from_numpy(x)
    bias_t = torch.from_numpy(correction_bias)
    router = DeepseekV3TopkRouter(bias_t, routed_scaling_factor=routed_scaling_factor, topk=topk)
    lable_indices, lable_values = router(router_logits)
    lable_indices = lable_indices.to(torch.int32)
    return lable_values.numpy(), lable_indices.numpy()
class TopkrouterTestCase(InfiniopTestCase):
    """One serialized topkrouter test case for the infiniop GGUF test files.

    Stores the op inputs (x, correction_bias, scalars) together with
    placeholder output buffers and the reference results computed by
    ``python_topkrouter``.
    """

    def __init__(
        self,
        values: np.ndarray,  # output parameter (written by the op under test)
        indices: np.ndarray,  # output parameter (written by the op under test)
        x: np.ndarray,  # input parameter: router logits
        correction_bias: np.ndarray,  # input parameter: per-expert score bias
        routed_scaling_factor: float,
        topk: int,
        values_shape: List[int] | None,
        values_strides: List[int] | None,
        indices_shape: List[int] | None,
        indices_strides: List[int] | None,
        x_shape: List[int] | None,
        x_strides: List[int] | None,
        correction_bias_shape: List[int] | None,
        correction_bias_strides: List[int] | None,
    ):
        super().__init__("topkrouter")
        self.values = values
        self.indices = indices
        self.x = x
        self.correction_bias = correction_bias
        self.routed_scaling_factor = routed_scaling_factor
        self.topk = topk
        self.values_shape = values_shape
        self.values_strides = values_strides
        self.indices_shape = indices_shape
        self.indices_strides = indices_strides
        self.x_shape = x_shape
        self.x_strides = x_strides
        self.correction_bias_shape = correction_bias_shape
        self.correction_bias_strides = correction_bias_strides

    def write_test(self, test_writer: "InfiniopTestWriter"):
        """Emit shapes, strides, tensors, scalars and reference outputs to the writer."""
        super().write_test(test_writer)
        # Shapes are only written when explicitly provided.
        if self.values_shape is not None:
            print("self.values_shape: ", self.values_shape)
            test_writer.add_array(test_writer.gguf_key("values.shape"), self.values_shape)
        if self.indices_shape is not None:
            test_writer.add_array(test_writer.gguf_key("indices.shape"), self.indices_shape)
        if self.x_shape is not None:
            test_writer.add_array(test_writer.gguf_key("x.shape"), self.x_shape)
        if self.correction_bias_shape is not None:
            test_writer.add_array(test_writer.gguf_key("correction_bias.shape"), self.correction_bias_shape)
        if self.x_strides is not None:
            test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.x_strides))
        if self.correction_bias_strides is not None:
            # NOTE(review): key reads "correction_bias_strides.strides" while every
            # other stride key is "<name>.strides" — looks like a typo; confirm the
            # reader side before changing it.
            test_writer.add_array(test_writer.gguf_key("correction_bias_strides.strides"), gguf_strides(*self.correction_bias_strides))
        # Output strides fall back to contiguous layout when not given.
        test_writer.add_array(
            test_writer.gguf_key("values.strides"),
            gguf_strides(*self.values_strides if self.values_strides is not None else contiguous_gguf_strides(self.values_shape))
        )
        test_writer.add_array(
            test_writer.gguf_key("indices.strides"),
            gguf_strides(*self.indices_strides if self.indices_strides is not None else contiguous_gguf_strides(self.indices_shape))
        )
        test_writer.add_tensor(test_writer.gguf_key("values"), self.values, raw_dtype=np_dtype_to_ggml(self.values.dtype))
        test_writer.add_tensor(test_writer.gguf_key("indices"), self.indices, raw_dtype=np_dtype_to_ggml(self.indices.dtype))
        test_writer.add_tensor(test_writer.gguf_key("x"), self.x, raw_dtype=np_dtype_to_ggml(self.x.dtype))
        test_writer.add_tensor(test_writer.gguf_key("correction_bias"), self.correction_bias, raw_dtype=np_dtype_to_ggml(self.correction_bias.dtype))
        test_writer.add_float32(test_writer.gguf_key("routed_scaling_factor"), self.routed_scaling_factor)
        test_writer.add_int32(test_writer.gguf_key("topk"), self.topk)
        # Reference outputs computed on copies so the stored inputs stay untouched.
        lable_values, lable_indices = python_topkrouter(self.x.copy(), self.correction_bias.copy(), self.routed_scaling_factor, self.topk)
        test_writer.add_tensor(
            test_writer.gguf_key("lable_values"),
            lable_values,
            raw_dtype=np_dtype_to_ggml(lable_values.dtype),
        )
        test_writer.add_tensor(
            test_writer.gguf_key("lable_indices"),
            lable_indices,
            raw_dtype=np_dtype_to_ggml(lable_indices.dtype)
        )
if __name__ == "__main__":
    # Generate topkrouter.gguf containing every test case below.
    test_writer = InfiniopTestWriter("topkrouter.gguf")
    test_cases = []

    _TEST_CASES_ = [
        # x_shape, x_strides, correction_bias_shape, correction_bias_stride, routed_scaling_factor, topk
        ((1, 256), None, (256,), None, 2.5, 8),
        ((2, 256), None, (256,), None, 1.5, 8),
    ]
    _TENSOR_DTYPES_ = [np.float32, np.float16]
    for dtype in _TENSOR_DTYPES_:
        for x_shape, x_strides, correction_bias_shape, b_stride, routed_scaling_factor, topk in _TEST_CASES_:
            ntoken = x_shape[0]
            values_indices_shape = (ntoken, topk)
            # Zero-sized placeholders: the op under test fills the real outputs;
            # only the declared shapes/strides matter here.
            values = np.empty(tuple(0 for _ in values_indices_shape), dtype=np.float32)
            indices = np.empty(tuple(0 for _ in values_indices_shape), dtype=np.int32)
            x = np.random.rand(*x_shape).astype(dtype)
            correction_bias = np.random.rand(*correction_bias_shape).astype(np.float32)
            test_case = TopkrouterTestCase(
                values=values,
                indices=indices,
                x=x,
                correction_bias=correction_bias,
                routed_scaling_factor=routed_scaling_factor,
                topk=topk,
                values_shape=list(values_indices_shape),
                values_strides=None,
                indices_shape=list(values_indices_shape),
                indices_strides=None,
                x_shape=list(x_shape),
                x_strides=None,
                correction_bias_shape=list(correction_bias_shape),
                correction_bias_strides=None,
            )
            test_cases.append(test_case)
    test_writer.add_tests(test_cases)
    test_writer.save()
test/infiniop-test/test_generate/testcases/topksoftmax.py
0 → 100644
View file @
ed530e11
import
numpy
as
np
from
typing
import
List
import
torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
from
..
import
InfiniopTestWriter
,
InfiniopTestCase
,
np_dtype_to_ggml
,
gguf_strides
,
contiguous_gguf_strides
def random_tensor(shape: tuple, dtype: np.dtype) -> np.ndarray:
    """Uniform noise in roughly (-0.001, 0.001) with the requested shape and dtype."""
    noise = np.random.uniform(-1.0, 1.0, shape)
    return noise.astype(dtype) * 0.001
def torch_Topksoftmax(router_logits, top_k: int, norm_topk_prob: bool):
    """Row-wise softmax over logits followed by top-k selection.

    Returns (routing_weights, selected_experts); when ``norm_topk_prob`` is
    set, the kept weights are renormalized to sum to 1 per row.
    """
    probs = F.softmax(router_logits, dim=1, dtype=torch.float)
    topk_vals, topk_idx = torch.topk(probs, top_k, dim=-1)
    if norm_topk_prob:
        # only diff with mixtral sparse moe block!
        topk_vals /= topk_vals.sum(dim=-1, keepdim=True)
    topk_vals = topk_vals.to(torch.float)
    return topk_vals, topk_idx
def python_Topksoftmax(router_logits, top_k: int, norm_topk_prob: bool):
    """NumPy-in/NumPy-out wrapper around ``torch_Topksoftmax``."""
    logits_t = torch.from_numpy(router_logits)
    lable_values, lable_indices = torch_Topksoftmax(logits_t, top_k, norm_topk_prob)
    return lable_values.numpy(), lable_indices.numpy()
class TopksoftmaxTestCase(InfiniopTestCase):
    """One serialized topksoftmax test case for the infiniop GGUF test files."""

    def __init__(
        self,
        values: np.ndarray,  # output parameter (written by the op under test)
        indices: np.ndarray,  # output parameter (written by the op under test)
        x: np.ndarray,  # input parameter: router logits
        topk: int,  # number of experts to keep per row
        norm: bool,  # renormalize the kept weights to sum to 1
        values_shape: List[int] | None,
        values_strides: List[int] | None,
        indices_shape: List[int] | None,
        indices_strides: List[int] | None,
        x_shape: List[int] | None,
        x_strides: List[int] | None,
    ):
        super().__init__("topksoftmax")
        self.values = values
        self.indices = indices
        self.x = x
        self.topk = topk
        self.norm = norm
        self.values_shape = values_shape
        self.values_strides = values_strides
        self.indices_shape = indices_shape
        self.indices_strides = indices_strides
        self.x_shape = x_shape
        self.x_strides = x_strides

    def write_test(self, test_writer: "InfiniopTestWriter"):
        """Emit shapes, strides, tensors, scalars and reference outputs to the writer."""
        super().write_test(test_writer)
        if self.values_shape is not None:
            print("self.values_shape: ", self.values_shape)
            test_writer.add_array(test_writer.gguf_key("values.shape"), self.values_shape)
        if self.indices_shape is not None:
            test_writer.add_array(test_writer.gguf_key("indices.shape"), self.indices_shape)
        if self.x_shape is not None:
            test_writer.add_array(test_writer.gguf_key("x.shape"), self.x_shape)
        if self.x_strides is not None:
            test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.x_strides))
        # Output strides fall back to contiguous layout when not given.
        test_writer.add_array(
            test_writer.gguf_key("values.strides"),
            gguf_strides(*self.values_strides if self.values_strides is not None else contiguous_gguf_strides(self.values_shape))
        )
        test_writer.add_array(
            test_writer.gguf_key("indices.strides"),
            gguf_strides(*self.indices_strides if self.indices_strides is not None else contiguous_gguf_strides(self.indices_shape))
        )
        test_writer.add_tensor(test_writer.gguf_key("values"), self.values, raw_dtype=np_dtype_to_ggml(self.values.dtype))
        test_writer.add_tensor(test_writer.gguf_key("indices"), self.indices, raw_dtype=np_dtype_to_ggml(self.indices.dtype))
        test_writer.add_tensor(test_writer.gguf_key("x"), self.x, raw_dtype=np_dtype_to_ggml(self.x.dtype))
        test_writer.add_int32(test_writer.gguf_key("topk"), self.topk)
        test_writer.add_bool(test_writer.gguf_key("norm"), self.norm)
        # Reference outputs computed on a copy so the stored input stays untouched.
        lable_values, lable_indices = python_Topksoftmax(self.x.copy(), self.topk, self.norm)
        test_writer.add_tensor(
            test_writer.gguf_key("lable_values"),
            lable_values,
            raw_dtype=np_dtype_to_ggml(lable_values.dtype),
        )
        test_writer.add_tensor(
            test_writer.gguf_key("lable_indices"),
            lable_indices,
            raw_dtype=np_dtype_to_ggml(lable_indices.dtype)
        )
if __name__ == "__main__":
    # Generate topksoftmax.gguf containing every test case below.
    test_writer = InfiniopTestWriter("topksoftmax.gguf")
    test_cases = []
    _TEST_CASES_ = [
        # x_shape, x_strides, topk, norm
        ((1, 32), None, 4, True),
        ((8, 20), None, 8, False),
        ((2, 128), None, 10, True)
    ]
    _TENSOR_DTYPES_ = [np.float32, np.float16]
    for dtype in _TENSOR_DTYPES_:
        for x_shape, x_strides, topk, norm in _TEST_CASES_:
            ntoken = x_shape[0]
            values_indices_shape = (ntoken, topk)
            # Zero-sized placeholders: only the declared shapes/strides matter;
            # the op under test fills the real outputs at run time.
            values = np.empty(tuple(0 for _ in values_indices_shape), dtype=np.float32)
            indices = np.empty(tuple(0 for _ in values_indices_shape), dtype=np.int32)
            x = np.random.rand(*x_shape).astype(dtype)
            test_case = TopksoftmaxTestCase(
                values=values,
                indices=indices,
                x=x,
                topk=topk,
                norm=norm,
                values_shape=list(values_indices_shape),
                values_strides=None,
                indices_shape=list(values_indices_shape),
                indices_strides=None,
                x_shape=list(x_shape),
                x_strides=None
            )
            test_cases.append(test_case)
    test_writer.add_tests(test_cases)
    test_writer.save()
test/infiniop/libinfiniop/op_register.py
View file @
ed530e11
...
@@ -495,6 +495,66 @@ def conv_(lib):
...
@@ -495,6 +495,66 @@ def conv_(lib):
]
]
@OpRegister.operator
def sigmoid_(lib):
    """Declare ctypes prototypes for the Sigmoid operator entry points."""
    lib.infiniopCreateSigmoidDescriptor.restype = c_int32
    lib.infiniopCreateSigmoidDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,  # y (output)
        infiniopTensorDescriptor_t,  # x (input)
    ]
    lib.infiniopGetSigmoidWorkspaceSize.restype = c_int32
    lib.infiniopGetSigmoidWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopSigmoid.restype = c_int32
    # Call order per test/infiniop/sigmoid.py:
    # (descriptor, workspace, workspace_size, y, x, stream)
    lib.infiniopSigmoid.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,
        c_size_t,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroySigmoidDescriptor.restype = c_int32
    lib.infiniopDestroySigmoidDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]
@OpRegister.operator
def topksoftmax_(lib):
    """Declare ctypes prototypes for the Topksoftmax operator entry points."""
    lib.infiniopCreateTopksoftmaxDescriptor.restype = c_int32
    lib.infiniopCreateTopksoftmaxDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,  # x (input logits)
    ]
    lib.infiniopGetTopksoftmaxWorkspaceSize.restype = c_int32
    lib.infiniopGetTopksoftmaxWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopTopksoftmax.restype = c_int32
    # Call order per test/infiniop/topksoftmax.py:
    # (descriptor, workspace, workspace_size, values, indices, x, topk, norm, stream)
    lib.infiniopTopksoftmax.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,
        c_size_t,
        c_void_p,
        c_void_p,
        c_void_p,
        c_size_t,
        c_int32,
        c_void_p,
    ]
    lib.infiniopDestroyTopksoftmaxDescriptor.restype = c_int32
    lib.infiniopDestroyTopksoftmaxDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]
@
OpRegister
.
operator
@
OpRegister
.
operator
def
topkrouter_
(
lib
):
def
topkrouter_
(
lib
):
lib
.
infiniopCreateTopkrouterDescriptor
.
restype
=
c_int32
lib
.
infiniopCreateTopkrouterDescriptor
.
restype
=
c_int32
...
@@ -502,7 +562,7 @@ def topkrouter_(lib):
...
@@ -502,7 +562,7 @@ def topkrouter_(lib):
infiniopHandle_t
,
infiniopHandle_t
,
POINTER
(
infiniopOperatorDescriptor_t
),
POINTER
(
infiniopOperatorDescriptor_t
),
infiniopTensorDescriptor_t
,
infiniopTensorDescriptor_t
,
infiniopTensorDescriptor_t
,
infiniopTensorDescriptor_t
]
]
lib
.
infiniopGetTopkrouterWorkspaceSize
.
restype
=
c_int32
lib
.
infiniopGetTopkrouterWorkspaceSize
.
restype
=
c_int32
...
@@ -510,7 +570,6 @@ def topkrouter_(lib):
...
@@ -510,7 +570,6 @@ def topkrouter_(lib):
infiniopOperatorDescriptor_t
,
infiniopOperatorDescriptor_t
,
POINTER
(
c_size_t
),
POINTER
(
c_size_t
),
]
]
lib
.
infiniopTopkrouter
.
restype
=
c_int32
lib
.
infiniopTopkrouter
.
restype
=
c_int32
lib
.
infiniopTopkrouter
.
argtypes
=
[
lib
.
infiniopTopkrouter
.
argtypes
=
[
infiniopOperatorDescriptor_t
,
infiniopOperatorDescriptor_t
,
...
@@ -524,7 +583,6 @@ def topkrouter_(lib):
...
@@ -524,7 +583,6 @@ def topkrouter_(lib):
c_size_t
,
c_size_t
,
c_void_p
,
c_void_p
,
]
]
lib
.
infiniopDestroyTopkrouterDescriptor
.
restype
=
c_int32
lib
.
infiniopDestroyTopkrouterDescriptor
.
restype
=
c_int32
lib
.
infiniopDestroyTopkrouterDescriptor
.
argtypes
=
[
lib
.
infiniopDestroyTopkrouterDescriptor
.
argtypes
=
[
infiniopOperatorDescriptor_t
,
infiniopOperatorDescriptor_t
,
...
...
test/infiniop/sigmoid.py
0 → 100644
View file @
ed530e11
import
torch
import
ctypes
from
ctypes
import
c_uint64
from
libinfiniop
import
(
LIBINFINIOP
,
TestTensor
,
get_test_devices
,
check_error
,
test_operator
,
get_args
,
debug
,
get_tolerance
,
profile_operation
,
TestWorkspace
,
InfiniDtype
,
InfiniDtypeNames
,
InfiniDeviceNames
,
infiniopOperatorDescriptor_t
,
)
from
enum
import
Enum
,
auto
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES_ = [
    # shape, x_stride, y_stride
    ((13, 4), None, None),
    ((13, 4), (10, 1), (10, 1)),
    ((13, 4), (0, 1), (0, 1)),  # zero stride -> broadcast row
    ((13, 4, 4), None, None),
    ((13, 4, 4), (20, 4, 1), (20, 4, 1)),
    ((13, 4, 4), (4, 0, 1), (4, 0, 1)),
    ((16, 5632), None, None),
    ((16, 5632), (13312, 1), (13312, 1)),
    ((4, 4, 5632), None, None),
    ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)),
    ((4, 4, 56320), None, None),
]


class Inplace(Enum):
    # Whether the op writes into a fresh tensor or back into x.
    OUT_OF_PLACE = auto()
    INPLACE_X = auto()


# Inplace options applied for each test case in _TEST_CASES_
_INPLACE = [
    Inplace.OUT_OF_PLACE,
    Inplace.INPLACE_X,
]

# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
_TEST_CASES = [
    test_case + (inplace_item,)
    for test_case in _TEST_CASES_
    for inplace_item in _INPLACE
]

# Data types used for testing
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]

# Tolerance map for different data types
_TOLERANCE_MAP = {
    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
    InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7},
    InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
}

DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
def torch_sigmoid(y, x):
    """Reference: write sigmoid(x) into y using PyTorch."""
    torch.special.expit(x, out=y)
def test(
    handle,
    device,
    shape,
    x_stride=None,
    y_stride=None,
    inplace=Inplace.OUT_OF_PLACE,
    dtype=torch.float16,
    sync=None,
):
    """Run one Sigmoid case: build tensors, call the library op, compare with torch.

    Skips cases where in-place operation is requested with mismatched strides,
    or where the output tensor would be a broadcast view.
    """
    x = TestTensor(shape, x_stride, dtype, device)
    if inplace == Inplace.INPLACE_X:
        # In-place only makes sense when x and y share the same layout.
        if x_stride != y_stride:
            return
        y = x
    else:
        y = TestTensor(shape, y_stride, dtype, device, mode="ones")

    if y.is_broadcast():
        return

    print(
        f"Testing Sigmoid on {InfiniDeviceNames[device]} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} "
        f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
    )
    # Reference result computed first, on the torch-side copies.
    torch_sigmoid(y.torch_tensor(), x.torch_tensor())

    if sync is not None:
        sync()

    descriptor = infiniopOperatorDescriptor_t()
    check_error(
        LIBINFINIOP.infiniopCreateSigmoidDescriptor(
            handle,
            ctypes.byref(descriptor),
            y.descriptor,
            x.descriptor,
        )
    )

    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    for tensor in [x, y]:
        tensor.destroy_desc()

    workspace_size = c_uint64(0)
    check_error(
        LIBINFINIOP.infiniopGetSigmoidWorkspaceSize(
            descriptor, ctypes.byref(workspace_size)
        )
    )
    workspace = TestWorkspace(workspace_size.value, y.device)

    def lib_sigmoid():
        # (descriptor, workspace, workspace_size, y, x, stream)
        check_error(
            LIBINFINIOP.infiniopSigmoid(
                descriptor,
                workspace.data(),
                workspace.size(),
                y.data(),
                x.data(),
                None,
            )
        )

    lib_sigmoid()

    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
    if DEBUG:
        debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
    assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)

    # Profiling workflow
    if PROFILE:
        # fmt: off
        profile_operation("PyTorch", lambda: torch_sigmoid(y.torch_tensor(), x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
        profile_operation("    lib", lambda: lib_sigmoid(), device, NUM_PRERUN, NUM_ITERATIONS)
        # fmt: on
    check_error(LIBINFINIOP.infiniopDestroySigmoidDescriptor(descriptor))
if __name__ == "__main__":
    args = get_args()

    # Configure testing options
    DEBUG = args.debug
    PROFILE = args.profile
    NUM_PRERUN = args.num_prerun
    NUM_ITERATIONS = args.num_iterations

    # Run every configured case on every requested device.
    for device in get_test_devices(args):
        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)

    print("\033[92m Test passed!\033[0m")
test/infiniop/topkrouter.py
View file @
ed530e11
...
@@ -27,14 +27,13 @@ from libinfiniop import (
...
@@ -27,14 +27,13 @@ from libinfiniop import (
# ==============================================================================
# ==============================================================================
# These are not meant to be imported from other modules
# These are not meant to be imported from other modules
_TEST_CASES_
=
[
_TEST_CASES_
=
[
# x_shape, x_stride, select_experts
# x_shape, x_stride, topk, routed_scaling_factor
((
1
,
256
),
None
,
8
),
((
1
,
256
),
None
,
8
,
2.5
),
((
3
,
256
),
None
,
8
),
]
]
# w (weight) types
# w (weight) types
# Note: 'None' means the same as input dtype
# Note: 'None' means the same as input dtype
_X_DTYPES
=
[
InfiniDtype
.
F32
,
InfiniDtype
.
BF16
,
InfiniDtype
.
F16
]
#
_X_DTYPES
=
[]
#
[InfiniDtype.F32, InfiniDtype.BF16, InfiniDtype.F16]
# x types used for testing
# x types used for testing
_VALUE_DTYPES
=
[
InfiniDtype
.
F32
]
_VALUE_DTYPES
=
[
InfiniDtype
.
F32
]
...
@@ -46,6 +45,8 @@ _TEST_CASES = [
...
@@ -46,6 +45,8 @@ _TEST_CASES = [
# Tolerance map for different data types
# Tolerance map for different data types
_TOLERANCE_MAP
=
{
_TOLERANCE_MAP
=
{
InfiniDtype
.
F32
:
{
"atol"
:
1e-3
,
"rtol"
:
1e-3
},
InfiniDtype
.
F32
:
{
"atol"
:
1e-3
,
"rtol"
:
1e-3
},
InfiniDtype
.
BF16
:
{
"atol"
:
1e-3
,
"rtol"
:
1e-3
},
InfiniDtype
.
F16
:
{
"atol"
:
1e-3
,
"rtol"
:
1e-3
},
}
}
DEBUG
=
False
DEBUG
=
False
...
@@ -59,12 +60,13 @@ def tensorInfo(data):
...
@@ -59,12 +60,13 @@ def tensorInfo(data):
class
DeepseekV3TopkRouter
(
nn
.
Module
):
class
DeepseekV3TopkRouter
(
nn
.
Module
):
def
__init__
(
self
,
correction_bias
,
config
=
None
):
def
__init__
(
self
,
correction_bias
,
routed_scaling_factor
:
float
=
2.5
,
topk
:
int
=
8
,
config
=
None
):
super
().
__init__
()
super
().
__init__
()
self
.
config
=
config
self
.
config
=
config
self
.
top_k
=
8
# config.num_experts_per_tok
self
.
top_k
=
topk
# config.num_experts_per_tok 8
assert
topk
==
8
self
.
n_routed_experts
=
256
# config.n_routed_experts
self
.
n_routed_experts
=
256
# config.n_routed_experts
self
.
routed_scaling_factor
=
2.5
# config.routed_scaling_factor
self
.
routed_scaling_factor
=
routed_scaling_factor
# config.routed_scaling_factor
2.5
self
.
n_group
=
8
# config.n_group
self
.
n_group
=
8
# config.n_group
self
.
topk_group
=
4
# config.topk_group
self
.
topk_group
=
4
# config.topk_group
self
.
norm_topk_prob
=
True
# config.norm_topk_prob
self
.
norm_topk_prob
=
True
# config.norm_topk_prob
...
@@ -73,7 +75,7 @@ class DeepseekV3TopkRouter(nn.Module):
...
@@ -73,7 +75,7 @@ class DeepseekV3TopkRouter(nn.Module):
# self.weight = torch.rand(256, 7168) * 2 - 1
# self.weight = torch.rand(256, 7168) * 2 - 1
# self.register_buffer("e_score_correction_bias", torch.zeros(self.n_routed_experts))
# self.register_buffer("e_score_correction_bias", torch.zeros(self.n_routed_experts))
self
.
e_score_correction_bias
=
torch
.
zeros
(
256
,
device
=
"cuda"
)
self
.
e_score_correction_bias
=
torch
.
zeros
(
256
,
device
=
correction_bias
.
device
)
self
.
e_score_correction_bias
[:]
=
correction_bias
[:]
self
.
e_score_correction_bias
[:]
=
correction_bias
[:]
@
torch
.
no_grad
()
@
torch
.
no_grad
()
...
@@ -114,11 +116,12 @@ class DeepseekV3TopkRouter(nn.Module):
...
@@ -114,11 +116,12 @@ class DeepseekV3TopkRouter(nn.Module):
denominator
=
topk_weights
.
sum
(
dim
=-
1
,
keepdim
=
True
)
+
1e-20
denominator
=
topk_weights
.
sum
(
dim
=-
1
,
keepdim
=
True
)
+
1e-20
topk_weights
/=
denominator
topk_weights
/=
denominator
topk_weights
=
topk_weights
*
self
.
routed_scaling_factor
topk_weights
=
topk_weights
*
self
.
routed_scaling_factor
return
topk_indices
,
topk_weights
return
topk_indices
,
topk_weights
def
torch_topkrouter
(
router_logits
,
correction_bias
):
def
torch_topkrouter
(
router_logits
,
correction_bias
,
routed_scaling_factor
,
topk
):
lable_indices
,
lable_values
=
DeepseekV3TopkRouter
(
correction_bias
)(
router_logits
)
lable_indices
,
lable_values
=
DeepseekV3TopkRouter
(
correction_bias
,
routed_scaling_factor
,
topk
)(
router_logits
)
lable_indices
=
lable_indices
.
to
(
torch
.
int32
)
lable_indices
=
lable_indices
.
to
(
torch
.
int32
)
return
lable_values
,
lable_indices
return
lable_values
,
lable_indices
...
@@ -129,12 +132,13 @@ def test(
...
@@ -129,12 +132,13 @@ def test(
x_shape
,
x_shape
,
x_stride
,
x_stride
,
topk
,
topk
,
routed_scaling_factor
,
x_dtype
=
InfiniDtype
.
F32
,
x_dtype
=
InfiniDtype
.
F32
,
dtype
=
InfiniDtype
.
F16
,
dtype
=
InfiniDtype
.
F16
,
sync
=
None
,
sync
=
None
,
):
):
print
(
print
(
f
"Testing topkrouter on
{
InfiniDeviceNames
[
device
]
}
with x_shape:
{
x_shape
}
"
f
"Testing topkrouter on
{
InfiniDeviceNames
[
device
]
}
with x_shape:
{
x_shape
}
"
f
"x_stride:
{
x_stride
}
w_dtype:
{
InfiniDtypeNames
[
x_dtype
]
}
dtype:
{
InfiniDtypeNames
[
dtype
]
}
"
f
"x_stride:
{
x_stride
}
w_dtype:
{
InfiniDtypeNames
[
x_dtype
]
}
dtype:
{
InfiniDtypeNames
[
dtype
]
}
"
)
)
...
@@ -182,27 +186,27 @@ def test(
...
@@ -182,27 +186,27 @@ def test(
indices
.
data_ptr
(),
indices
.
data_ptr
(),
x
.
data
(),
x
.
data
(),
correction_bias
.
data
(),
correction_bias
.
data
(),
2.5
,
routed_scaling_factor
,
topk
,
topk
,
None
,
None
,
)
)
)
)
lib_topkrouter
()
lib_topkrouter
()
lable_values
,
lable_indices
=
torch_topkrouter
(
x
.
actual_tensor
(),
correction_bias
.
actual_tensor
())
lable_values
,
lable_indices
=
torch_topkrouter
(
x
.
actual_tensor
(),
correction_bias
.
actual_tensor
(),
routed_scaling_factor
,
topk
)
atol
,
rtol
=
get_tolerance
(
_TOLERANCE_MAP
,
dtype
)
atol
,
rtol
=
get_tolerance
(
_TOLERANCE_MAP
,
dtype
)
if
DEBUG
:
if
DEBUG
:
debug
(
lable_values
,
values
,
atol
=
atol
,
rtol
=
rtol
)
debug
(
lable_values
,
values
,
atol
=
atol
,
rtol
=
rtol
)
debug
(
lable_indices
,
indices
,
atol
=
atol
,
rtol
=
rtol
)
debug
(
lable_indices
,
indices
,
atol
=
atol
,
rtol
=
rtol
)
assert
torch
.
allclose
(
lable_values
,
values
,
atol
=
atol
,
rtol
=
rtol
)
assert
torch
.
allclose
(
lable_values
,
values
,
atol
=
atol
,
rtol
=
rtol
)
assert
torch
.
allclose
(
lable_indices
,
lable_
indices
,
atol
=
atol
,
rtol
=
rtol
)
assert
torch
.
allclose
(
lable_indices
,
indices
,
atol
=
atol
,
rtol
=
rtol
)
# Profiling workflow
# Profiling workflow
if
PROFILE
:
if
PROFILE
:
# fmt: off
# fmt: off
profile_operation
(
"PyTorch"
,
lambda
:
torch_topkrouter
(
x
.
actual_tensor
()
.
clone
(),
tokp
),
device
,
NUM_PRERUN
,
NUM_ITERATIONS
)
profile_operation
(
"PyTorch"
,
lambda
:
torch_topkrouter
(
x
.
actual_tensor
()
,
correction_bias
.
actual_tensor
(),
routed_scaling_factor
,
topk
),
device
,
NUM_PRERUN
,
NUM_ITERATIONS
)
profile_operation
(
" lib"
,
lambda
:
lib_topkrouter
(),
device
,
NUM_PRERUN
,
NUM_ITERATIONS
)
profile_operation
(
" lib"
,
lambda
:
lib_topkrouter
(),
device
,
NUM_PRERUN
,
NUM_ITERATIONS
)
# fmt: on
# fmt: on
check_error
(
LIBINFINIOP
.
infiniopDestroyTopkrouterDescriptor
(
descriptor
))
check_error
(
LIBINFINIOP
.
infiniopDestroyTopkrouterDescriptor
(
descriptor
))
...
...
test/infiniop/topksoftmax.py
0 → 100644
View file @
ed530e11
import
torch
import
ctypes
from
ctypes
import
c_uint64
import
torch.nn.functional
as
F
from
libinfiniop
import
(
LIBINFINIOP
,
TestTensor
,
get_test_devices
,
check_error
,
test_operator
,
get_args
,
debug
,
get_tolerance
,
profile_operation
,
TestWorkspace
,
InfiniDtype
,
InfiniDtypeNames
,
InfiniDeviceNames
,
infiniopOperatorDescriptor_t
,
torch_device_map
)
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES_ = [
    # x_shape, x_stride, topk, norm
    ((1, 10), None, 7, True),
    ((2, 20), None, 4, True),
    ((1, 128), None, 10, True),
]

# w (weight) types
# Note: 'None' means the same as input dtype
_X_DTYPES = [InfiniDtype.F32, InfiniDtype.F16, InfiniDtype.BF16]  #
# x types used for testing
_VALUE_DTYPES = [InfiniDtype.F32]

# Form the test cases by appending each element of _X_DTYPES to each tuple in _TEST_CASES_
_TEST_CASES = [
    test_case + (x_dtype,)
    for test_case in _TEST_CASES_
    for x_dtype in _X_DTYPES
]

# Tolerance map for different data types
_TOLERANCE_MAP = {
    InfiniDtype.F32: {"atol": 1e-3, "rtol": 1e-3},
    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
    InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3},
}

DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
def tensorInfo(data):
    """Debug helper: print layout/identity facts about a torch tensor."""
    ptr = data.data_ptr()
    print(
        "data: ",
        data.is_contiguous(),
        data.device,
        data.dtype,
        data.shape,
        data.stride(),
        ptr,
        hex(ptr),
    )
def torch_topksoftmax(router_logits, top_k, norm_topk_prob=False):
    """Reference top-k softmax: row softmax, keep best ``top_k`` entries.

    Returns (routing_weights, selected_experts); weights are renormalized
    per row when ``norm_topk_prob`` is set.
    """
    probs = F.softmax(router_logits, dim=1, dtype=torch.float)
    weights, experts = torch.topk(probs, top_k, dim=-1)
    if norm_topk_prob:
        # only diff with mixtral sparse moe block!
        weights /= weights.sum(dim=-1, keepdim=True)
    return weights, experts
def test(
    handle,
    device,
    x_shape,
    x_stride,
    topk,
    norm_topk_prob,
    x_dtype=InfiniDtype.F32,
    dtype=InfiniDtype.F16,
    sync=None,
):
    """Run one Topksoftmax case: call the library op and compare with torch.

    Uses a deterministic arange input so top-k indices are unambiguous.
    """
    print(
        f"Testing topksoftmax on {InfiniDeviceNames[device]} with x_shape:{x_shape} "
        f"x_stride:{x_stride} w_dtype:{InfiniDtypeNames[x_dtype]} dtype:{InfiniDtypeNames[dtype]}"
    )
    # Deterministic, strictly increasing input -> unique top-k ordering.
    data = torch.arange(0, x_shape[0] * x_shape[1]).reshape(x_shape)
    N, width = x_shape
    x = TestTensor(x_shape, data.stride(), x_dtype, device, scale=0.5, mode="manual", set_tensor=data)
    # print(x.torch_tensor())
    if sync is not None:
        sync()

    descriptor = infiniopOperatorDescriptor_t()
    check_error(
        LIBINFINIOP.infiniopCreateTopksoftmaxDescriptor(
            handle, ctypes.byref(descriptor), x.descriptor
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    for tensor in [x]:
        tensor.destroy_desc()

    workspace_size = c_uint64(0)
    check_error(
        LIBINFINIOP.infiniopGetTopksoftmaxWorkspaceSize(
            descriptor, ctypes.byref(workspace_size)
        )
    )
    workspace = TestWorkspace(workspace_size.value, x.device)

    # Output buffers the library writes into.
    values = torch.zeros((N, topk), dtype=torch.float32, device=torch_device_map[x.device])
    indices = torch.zeros((N, topk), dtype=torch.int32, device=torch_device_map[x.device])

    def lib_topksoftmax():
        # (descriptor, workspace, workspace_size, values, indices, x, topk, norm, stream)
        check_error(
            LIBINFINIOP.infiniopTopksoftmax(
                descriptor,
                workspace.data(),
                workspace_size.value,
                values.data_ptr(),
                indices.data_ptr(),
                x.data(),
                topk,
                norm_topk_prob,
                None,
            )
        )

    # Reference computed on a clone so the library input stays untouched.
    lable_values, lable_indices = torch_topksoftmax(x.torch_tensor().clone(), topk, norm_topk_prob=norm_topk_prob)
    lable_indices = lable_indices.to(dtype=torch.int32)

    lib_topksoftmax()

    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
    if DEBUG:
        debug(lable_values, values, atol=atol, rtol=rtol)
        debug(lable_indices, indices, atol=atol, rtol=rtol)
    assert torch.allclose(lable_values, values, atol=atol, rtol=rtol)
    assert torch.allclose(lable_indices, indices, atol=atol, rtol=rtol)

    # Profiling workflow
    if PROFILE:
        # fmt: off
        profile_operation("PyTorch", lambda: torch_topksoftmax(x.actual_tensor().clone(), topk), device, NUM_PRERUN, NUM_ITERATIONS)
        profile_operation("    lib", lambda: lib_topksoftmax(), device, NUM_PRERUN, NUM_ITERATIONS)
        # fmt: on
    check_error(LIBINFINIOP.infiniopDestroyTopksoftmaxDescriptor(descriptor))
if __name__ == "__main__":
    args = get_args()

    # Configure testing options
    DEBUG = args.debug
    PROFILE = args.profile
    NUM_PRERUN = args.num_prerun
    NUM_ITERATIONS = args.num_iterations

    # Execute tests
    for device in get_test_devices(args):
        test_operator(device, test, _TEST_CASES, _VALUE_DTYPES)

    print("\033[92mTest passed!\033[0m")
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment