Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
ac4aae48
Unverified
Commit
ac4aae48
authored
Dec 01, 2025
by
Shijie
Committed by
GitHub
Dec 01, 2025
Browse files
Merge branch 'main' into dev_topkrouter
parents
a15aa367
2f3f4076
Changes
33
Hide whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
463 additions
and
16 deletions
+463
-16
src/infiniop/ops/topkrouter/kunlun/topkrouter_kunlun.h
src/infiniop/ops/topkrouter/kunlun/topkrouter_kunlun.h
+8
-0
src/infiniop/ops/topkrouter/kunlun/topkrouter_kunlun.xpu
src/infiniop/ops/topkrouter/kunlun/topkrouter_kunlun.xpu
+108
-0
src/infiniop/ops/topkrouter/operator.cc
src/infiniop/ops/topkrouter/operator.cc
+15
-0
src/infiniop/sort/kunlun/heap.h
src/infiniop/sort/kunlun/heap.h
+264
-0
src/infinirt/infinirt.cc
src/infinirt/infinirt.cc
+1
-1
test/infinicore/framework/base.py
test/infinicore/framework/base.py
+6
-5
test/infinicore/framework/utils.py
test/infinicore/framework/utils.py
+15
-1
test/infinicore/ops/aminmax.py
test/infinicore/ops/aminmax.py
+2
-2
test/infinicore/ops/silu.py
test/infinicore/ops/silu.py
+0
-1
test/infinicore/ops/sort.py
test/infinicore/ops/sort.py
+5
-5
test/infiniop/topkrouter.py
test/infiniop/topkrouter.py
+3
-1
xmake.lua
xmake.lua
+35
-0
xmake/test.lua
xmake/test.lua
+1
-0
No files found.
src/infiniop/ops/topkrouter/kunlun/topkrouter_kunlun.h
0 → 100644
View file @
ac4aae48
#ifndef __TOPKROUTER_KUNLUN_H__
#define __TOPKROUTER_KUNLUN_H__

#include "../topkrouter.h"

// Expand the shared operator-descriptor declaration for the Kunlun backend.
// DESCRIPTOR is defined in ../topkrouter.h and stamps out the
// op::topkrouter::kunlun::Descriptor class declaration.
DESCRIPTOR(kunlun)

#endif // __TOPKROUTER_KUNLUN_H__
src/infiniop/ops/topkrouter/kunlun/topkrouter_kunlun.xpu
0 → 100644
View file @
ac4aae48
#include "../../../devices/kunlun/kunlun_common.h"
#include "../../../devices/kunlun/kunlun_handle.h"
#include "../../../devices/kunlun/kunlun_kernel_common.h"
#include "kernel.h"
#include "topkrouter_kunlun.h"
#include <memory>
#include <stdint.h>
namespace op::topkrouter::kunlun {

// Opaque backend state: holds a shared handle to the Kunlun device internals
// so they outlive the descriptor's users.
struct Descriptor::Opaque {
    std::shared_ptr<device::kunlun::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
    delete _opaque;
}

/// @brief Create a topk-router descriptor for the Kunlun backend.
/// @param handle    Device handle; must be a Kunlun handle.
/// @param desc_ptr  Receives the newly allocated descriptor on success.
/// @param x_desc    Input logits tensor descriptor.
/// @param correction_bias_desc  Bias tensor descriptor (currently unused here;
///        validation happens in TopkrouterInfo::create).
/// @return INFINI_STATUS_SUCCESS, or an error from info creation /
///         INFINI_STATUS_BAD_TENSOR_STRIDES for non-contiguous rows.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t x_desc,
    infiniopTensorDescriptor_t correction_bias_desc) {
    auto result = TopkrouterInfo::create(x_desc);
    CHECK_RESULT(result);
    auto info = result.take();
    // The kernel walks each row with unit stride; reject inputs whose
    // innermost dimension is not contiguous.
    if (info.x_strides[1] != 1) {
        return INFINI_STATUS_BAD_TENSOR_STRIDES;
    }
    *desc_ptr = new Descriptor(
        new Opaque{reinterpret_cast<device::kunlun::Handle *>(handle)->internal()},
        std::move(info),
        0, // no device workspace required
        handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}

// Dispatch the topkrouter kernel for the runtime dtype of the input.
// Returns INFINI_STATUS_BAD_TENSOR_DTYPE for unsupported dtypes.
template <int BLOCK_SIZE = 64>
infiniStatus_t launch_topkrouter(float *d_values_out, int *d_indices_out, const void *d_input, const float *d_correction_bias,
                                 const float routed_scaling_factor, const size_t N, const size_t width, const size_t topk, infiniDtype_t xtype,
                                 kunlunStream_t stream) {
    if (xtype == INFINI_DTYPE_F32) {
        topkrouter_kernel<float, BLOCK_SIZE, 256, 8, 4, 2>
            <<<N, BLOCK_SIZE, stream>>>(
                d_values_out,
                d_indices_out,
                (float *)d_input,
                (const float *)d_correction_bias,
                routed_scaling_factor,
                N,
                width,
                topk);
    } else if (xtype == INFINI_DTYPE_F16) {
        topkrouter_kernel<half, BLOCK_SIZE, 256, 8, 4, 2>
            <<<N, BLOCK_SIZE, stream>>>(
                d_values_out,
                d_indices_out,
                (half *)d_input,
                (const float *)d_correction_bias,
                routed_scaling_factor,
                N,
                width,
                topk);
    } else if (xtype == INFINI_DTYPE_BF16) {
        topkrouter_kernel<bfloat16_t, BLOCK_SIZE, 256, 8, 4, 2>
            <<<N, BLOCK_SIZE, stream>>>(
                d_values_out,
                d_indices_out,
                (bfloat16_t *)d_input,
                (const float *)d_correction_bias,
                routed_scaling_factor,
                N,
                width,
                topk);
    } else {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    return INFINI_STATUS_SUCCESS;
}

/// @brief Run topk routing on the Kunlun device.
/// @param workspace/workspace_size  Caller-provided workspace (none needed,
///        but the size contract is still checked).
/// @param values/indices  Output buffers for the selected expert scores/ids.
/// @param x               Input logits (dtype recorded in _info.xtype).
/// @param stream          Kunlun stream the kernel is enqueued on.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    float *values,
    int *indices,
    const void *x,
    const float *correction_bias,
    const float routed_scaling_factor,
    const size_t topk,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    size_t N = _info.N;
    size_t width = _info.width;
    auto kunlun_stream = reinterpret_cast<kunlunStream_t>(stream);
    // Fix: propagate the launch status. Previously the return value was
    // discarded and SUCCESS was returned even for unsupported dtypes.
    return launch_topkrouter<64>(values, indices, x, correction_bias,
                                 routed_scaling_factor, N, width, topk,
                                 _info.xtype, kunlun_stream);
}

} // namespace op::topkrouter::kunlun
src/infiniop/ops/topkrouter/operator.cc
View file @
ac4aae48
...
@@ -11,6 +11,9 @@
...
@@ -11,6 +11,9 @@
#ifdef ENABLE_METAX_API
#ifdef ENABLE_METAX_API
#include "metax/topkrouter_metax.h"
#include "metax/topkrouter_metax.h"
#endif
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/topkrouter_kunlun.h"
#endif
__C
infiniStatus_t
infiniopCreateTopkrouterDescriptor
(
infiniopHandle_t
handle
,
infiniopTopkrouterDescriptor_t
*
desc_ptr
,
__C
infiniStatus_t
infiniopCreateTopkrouterDescriptor
(
infiniopHandle_t
handle
,
infiniopTopkrouterDescriptor_t
*
desc_ptr
,
infiniopTensorDescriptor_t
x_desc
,
infiniopTensorDescriptor_t
x_desc
,
...
@@ -32,6 +35,9 @@ __C infiniStatus_t infiniopCreateTopkrouterDescriptor(infiniopHandle_t handle, i
...
@@ -32,6 +35,9 @@ __C infiniStatus_t infiniopCreateTopkrouterDescriptor(infiniopHandle_t handle, i
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_METAX_API
CREATE
(
INFINI_DEVICE_METAX
,
metax
);
CREATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#ifdef ENABLE_KUNLUN_API
CREATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
#endif
}
}
...
@@ -58,6 +64,9 @@ __C infiniStatus_t infiniopGetTopkrouterWorkspaceSize(infiniopTopkrouterDescript
...
@@ -58,6 +64,9 @@ __C infiniStatus_t infiniopGetTopkrouterWorkspaceSize(infiniopTopkrouterDescript
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_METAX_API
GET
(
INFINI_DEVICE_METAX
,
metax
);
GET
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#ifdef ENABLE_KUNLUN_API
GET
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
#endif
}
}
...
@@ -87,6 +96,9 @@ __C infiniStatus_t infiniopTopkrouter(infiniopTopkrouterDescriptor_t desc, void
...
@@ -87,6 +96,9 @@ __C infiniStatus_t infiniopTopkrouter(infiniopTopkrouterDescriptor_t desc, void
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_METAX_API
CALCULATE
(
INFINI_DEVICE_METAX
,
metax
);
CALCULATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#ifdef ENABLE_KUNLUN_API
CALCULATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
#endif
}
}
...
@@ -113,6 +125,9 @@ __C infiniStatus_t infiniopDestroyTopkrouterDescriptor(infiniopTopkrouterDescrip
...
@@ -113,6 +125,9 @@ __C infiniStatus_t infiniopDestroyTopkrouterDescriptor(infiniopTopkrouterDescrip
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_METAX_API
DESTROY
(
INFINI_DEVICE_METAX
,
metax
);
DESTROY
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#ifdef ENABLE_KUNLUN_API
DESTROY
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
#endif
}
}
...
...
src/infiniop/sort/kunlun/heap.h
0 → 100644
View file @
ac4aae48
#ifndef __INFINIOP_HEAP_KUNLUN_H__
#define __INFINIOP_HEAP_KUNLUN_H__
#include "xpu/kernel/xtdk_simd_xpu2.h"
// Exchange a key/value pair between two shared-memory (SM) slots.
template <typename TK, typename TV>
static __device__ inline void sm_swap_kv(_shared_ptr_ TK *k0, _shared_ptr_ TV *v0,
                                         _shared_ptr_ TK *k1, _shared_ptr_ TV *v1) {
    const TK key_saved = *k0;
    const TV val_saved = *v0;
    *k0 = *k1;
    *v0 = *v1;
    *k1 = key_saved;
    *v1 = val_saved;
}
// Sift the element at `idx` down a shared-memory min-heap of `heap_capacity`
// entries, restoring the heap property along one root-to-leaf path.
template <typename TK, typename TV>
static __device__ inline void update_sm_min_heap(_shared_ptr_ TK *heap_key,
                                                 _shared_ptr_ TV *heap_value,
                                                 int idx, int heap_capacity) {
    while (idx < heap_capacity) {
        const int left = idx * 2 + 1;
        const int right = idx * 2 + 2;
        int smallest = left;
        if (right >= heap_capacity) {
            if (left >= heap_capacity) {
                // no children: idx is a leaf, sifting is done
                break;
            }
            // only the left child exists
            smallest = left;
        } else {
            // both children exist: pick the one with the smaller key
            smallest = left + (heap_key[left] > heap_key[right]);
        }
        if (heap_key[idx] <= heap_key[smallest]) {
            break; // heap property already holds here
        }
        sm_swap_kv(&heap_key[idx], &heap_value[idx],
                   &heap_key[smallest], &heap_value[smallest]);
        idx = smallest;
    }
}
// Build a min-heap in place over the first `size` shared-memory key/value
// slots (Floyd's bottom-up heapify: sift down every internal node).
template <typename TK, typename TV>
static __device__ inline void make_sm_min_heap(_shared_ptr_ TK *heap_key,
                                               _shared_ptr_ TV *heap_value,
                                               int size) {
    for (int node = size / 2 - 1; node >= 0; --node) {
        update_sm_min_heap(heap_key, heap_value, node, size);
    }
}
// Heapsort pass over an existing shared-memory min-heap: repeatedly move the
// current minimum to the shrinking tail, leaving keys in descending order.
template <typename TK, typename TV>
static __device__ inline void sort_sm_min_heap(_shared_ptr_ TK *heap_key,
                                               _shared_ptr_ TV *heap_value,
                                               int heap_capacity) {
    for (int tail = heap_capacity - 1; tail > 0; --tail) {
        sm_swap_kv(&heap_key[0], &heap_value[0],
                   &heap_key[tail], &heap_value[tail]);
        update_sm_min_heap(heap_key, heap_value, 0, tail);
    }
}
// Sift the element at `idx` down a shared-memory max-heap of `heap_capacity`
// entries, restoring the heap property along one root-to-leaf path.
template <typename TK, typename TV>
static __device__ inline void update_sm_max_heap(_shared_ptr_ TK *heap_key,
                                                 _shared_ptr_ TV *heap_value,
                                                 int idx, int heap_capacity) {
    while (idx < heap_capacity) {
        const int left = idx * 2 + 1;
        const int right = idx * 2 + 2;
        int largest = left;
        if (right >= heap_capacity) {
            if (left >= heap_capacity) {
                // no children: idx is a leaf, sifting is done
                break;
            }
            // only the left child exists
            largest = left;
        } else {
            // both children exist: pick the one with the larger key
            largest = left + (heap_key[left] < heap_key[right]);
        }
        if (heap_key[idx] >= heap_key[largest]) {
            break; // heap property already holds here
        }
        sm_swap_kv(&heap_key[idx], &heap_value[idx],
                   &heap_key[largest], &heap_value[largest]);
        idx = largest;
    }
}
// Build a max-heap in place over the first `size` shared-memory key/value
// slots (Floyd's bottom-up heapify: sift down every internal node).
template <typename TK, typename TV>
static __device__ inline void make_sm_max_heap(_shared_ptr_ TK *heap_key,
                                               _shared_ptr_ TV *heap_value,
                                               int size) {
    for (int node = size / 2 - 1; node >= 0; --node) {
        update_sm_max_heap(heap_key, heap_value, node, size);
    }
}
// Heapsort pass over an existing shared-memory max-heap: repeatedly move the
// current maximum to the shrinking tail, leaving keys in ascending order.
template <typename TK, typename TV>
static __device__ inline void sort_sm_max_heap(_shared_ptr_ TK *heap_key,
                                               _shared_ptr_ TV *heap_value,
                                               int heap_capacity) {
    for (int tail = heap_capacity - 1; tail > 0; --tail) {
        sm_swap_kv(&heap_key[0], &heap_value[0],
                   &heap_key[tail], &heap_value[tail]);
        update_sm_max_heap(heap_key, heap_value, 0, tail);
    }
}
// Exchange a key/value pair between two local-memory (LM) slots.
template <typename TK, typename TV>
static __device__ inline void lm_swap_kv(TK *k0, TV *v0, TK *k1, TV *v1) {
    const TK key_saved = *k0;
    const TV val_saved = *v0;
    *k0 = *k1;
    *v0 = *v1;
    *k1 = key_saved;
    *v1 = val_saved;
}
// Sift the element at `idx` down a local-memory min-heap of `heap_capacity`
// entries, restoring the heap property along one root-to-leaf path.
template <typename TK, typename TV>
static __device__ inline void update_lm_min_heap(TK *heap_key, TV *heap_value,
                                                 int idx, int heap_capacity) {
    while (idx < heap_capacity) {
        const int left = idx * 2 + 1;
        const int right = idx * 2 + 2;
        int smallest = left;
        if (right >= heap_capacity) {
            if (left >= heap_capacity) {
                // no children: idx is a leaf, sifting is done
                break;
            }
            // only the left child exists
            smallest = left;
        } else {
            // both children exist: pick the one with the smaller key
            smallest = left + (heap_key[left] > heap_key[right]);
        }
        if (heap_key[idx] <= heap_key[smallest]) {
            break; // heap property already holds here
        }
        lm_swap_kv(&heap_key[idx], &heap_value[idx],
                   &heap_key[smallest], &heap_value[smallest]);
        idx = smallest;
    }
}
// Build a min-heap in place over the first `size` local-memory key/value
// slots (Floyd's bottom-up heapify: sift down every internal node).
template <typename TK, typename TV>
static __device__ inline void make_lm_min_heap(TK *heap_key, TV *heap_value,
                                               int size) {
    for (int node = size / 2 - 1; node >= 0; --node) {
        update_lm_min_heap(heap_key, heap_value, node, size);
    }
}
// Heapsort pass over an existing local-memory min-heap: repeatedly move the
// current minimum to the shrinking tail, leaving keys in descending order.
template <typename TK, typename TV>
static __device__ inline void sort_lm_min_heap(TK *heap_key, TV *heap_value,
                                               int heap_capacity) {
    for (int tail = heap_capacity - 1; tail > 0; --tail) {
        lm_swap_kv(&heap_key[0], &heap_value[0],
                   &heap_key[tail], &heap_value[tail]);
        update_lm_min_heap(heap_key, heap_value, 0, tail);
    }
}
// Sift the element at `idx` down a local-memory max-heap of `heap_capacity`
// entries, restoring the heap property along one root-to-leaf path.
template <typename TK, typename TV>
static __device__ inline void update_lm_max_heap(TK *heap_key, TV *heap_value,
                                                 int idx, int heap_capacity) {
    while (idx < heap_capacity) {
        const int left = idx * 2 + 1;
        const int right = idx * 2 + 2;
        int largest = left;
        if (right >= heap_capacity) {
            if (left >= heap_capacity) {
                // no children: idx is a leaf, sifting is done
                break;
            }
            // only the left child exists
            largest = left;
        } else {
            // both children exist: pick the one with the larger key
            largest = left + (heap_key[left] < heap_key[right]);
        }
        if (heap_key[idx] >= heap_key[largest]) {
            break; // heap property already holds here
        }
        lm_swap_kv(&heap_key[idx], &heap_value[idx],
                   &heap_key[largest], &heap_value[largest]);
        idx = largest;
    }
}
// Build a max-heap in place over the first `size` local-memory key/value
// slots (Floyd's bottom-up heapify: sift down every internal node).
template <typename TK, typename TV>
static __device__ inline void make_lm_max_heap(TK *heap_key, TV *heap_value,
                                               int size) {
    for (int node = size / 2 - 1; node >= 0; --node) {
        update_lm_max_heap(heap_key, heap_value, node, size);
    }
}
// Heapsort pass over an existing local-memory max-heap: repeatedly move the
// current maximum to the shrinking tail, leaving keys in ascending order.
template <typename TK, typename TV>
static __device__ inline void sort_lm_max_heap(TK *heap_key, TV *heap_value,
                                               int heap_capacity) {
    for (int tail = heap_capacity - 1; tail > 0; --tail) {
        lm_swap_kv(&heap_key[0], &heap_value[0],
                   &heap_key[tail], &heap_value[tail]);
        update_lm_max_heap(heap_key, heap_value, 0, tail);
    }
}
// Ceiling integer division: number of `b`-sized chunks needed to cover `a`.
template <typename TID>
__device__ TID roundup_div_p(TID a, TID b) {
    return (a + b - 1) / b;
}
// Branch-form minimum of two values.
template <typename T>
__device__ T min_p(T a, T b) {
    return a < b ? a : b;
}
// Split `len` elements across `nthreads` workers in `align`-sized chunks,
// writing this thread's half-open element range to [*start, *end).
// Chunks are distributed as evenly as possible: the first (block_cnt %
// nthreads) threads each get one extra chunk. Both bounds are clamped to
// `len` so the final (partial) chunk is handled correctly.
template <typename TID>
static __device__ inline void partition(int tid, int nthreads, TID len,
                                        int align, TID *start, TID *end) {
    const TID block_cnt = roundup_div_p<TID>(len, align);
    const TID remain_block = block_cnt % nthreads;
    const TID start_block = block_cnt / nthreads * static_cast<TID>(tid)
                          + min_p<TID>(tid, remain_block);
    const TID end_block = start_block + block_cnt / nthreads + (tid < remain_block);
    *start = min_p<TID>(start_block * align, len);
    *end = min_p<TID>(end_block * align, len);
}
// Element-wise cast from TX to TY in local memory.
// Primary template: deliberate no-op fallback for unsupported type pairs;
// real conversions are provided by the explicit specializations below.
template <typename TX, typename TY>
static __device__ void primitive_cast(const TX *x, TY *y, int len) {
    return;
}
// float -> int conversion, 16 lanes per iteration using XPU2 SIMD.
// vfloat2fix.rz converts float to fixed-point rounding toward zero;
// vstore_mask16.mz writes the 16 lanes selected by mask register mr1.
// NOTE(review): assumes `len` is a multiple of 16 (no tail handling) and
// that mr1 is pre-set to all-lanes by the caller/runtime — TODO confirm.
template <>
__device__ void primitive_cast(const float *x, int *y, int len) {
    for (int i = 0; i < len; i += 16) {
        float32x16_t Y = vload_lm_float32x16(x);
        __asm__ __volatile__(
            "vfloat2fix.rz vr0, %0\t\n"
            "vstore_mask16.mz vr0{mr1}, 0(%1)" ::"v"(Y),
            "r"(y)
            : "vr0");
        x += 16;
        y += 16;
    }
    // Make the stores visible before local memory is read again.
    mfence_lm();
}
// int -> float conversion, 16 lanes per iteration using XPU2 SIMD.
// vfix2float.rn converts fixed-point to float with round-to-nearest;
// vstore_mask16.mz writes the 16 lanes selected by mask register mr1.
// NOTE(review): assumes `len` is a multiple of 16 (no tail handling) and
// that mr1 is pre-set to all-lanes by the caller/runtime — TODO confirm.
template <>
__device__ void primitive_cast(const int *x, float *y, int len) {
    for (int i = 0; i < len; i += 16) {
        int32x16_t Y = vload_lm_int32x16(x);
        __asm__ __volatile__(
            "vfix2float.rn vr0, %0\t\n"
            "vstore_mask16.mz vr0{mr1}, 0(%1)" ::"v"(Y),
            "r"(y)
            : "vr0");
        x += 16;
        y += 16;
    }
    // Make the stores visible before local memory is read again.
    mfence_lm();
}
// Load 32 consecutive floats from local memory into two 16-lane vectors
// (vl = ptr[0..15], vh = ptr[16..31]) via the mr1-masked vector load builtin.
static __device__ inline void vload2_lm(const float *ptr, float32x16_t &vl, float32x16_t &vh) {
    vl = __builtin_xpu2_vload_mask16_mr1(ptr, 0);
    vh = __builtin_xpu2_vload_mask16_mr1(ptr + 16, 0);
}
// Store two 16-lane vectors as 32 consecutive floats in local memory
// (ptr[0..15] = vl, ptr[16..31] = vh).
static __device__ inline void vstore2_lm(float *ptr, float32x16_t &vl, float32x16_t &vh) {
    vstore_lm_float32x16(ptr, vl);
    vstore_lm_float32x16(ptr + 16, vh);
}
// float -> float "cast": a vectorized memcpy in local memory, 32 floats per
// iteration; a no-op when source and destination alias exactly.
// NOTE(review): assumes `len` is a multiple of 32 — any tail is untouched;
// confirm with callers.
template <>
__device__ void primitive_cast(const float *x, float *y, int len) {
    if (x == y) {
        return; // already in place, nothing to copy
    }
    float32x16_t lane_lo;
    float32x16_t lane_hi;
    for (int i = 0; i < len; i += 32) {
        vload2_lm(x + i, lane_lo, lane_hi);
        vstore2_lm(y + i, lane_lo, lane_hi);
    }
    // Make the stores visible before local memory is read again.
    mfence_lm();
}
#endif
src/infinirt/infinirt.cc
View file @
ac4aae48
...
@@ -23,7 +23,7 @@ __C infiniStatus_t infinirtGetAllDeviceCount(int *count_array) {
...
@@ -23,7 +23,7 @@ __C infiniStatus_t infinirtGetAllDeviceCount(int *count_array) {
return
INFINI_STATUS_NULL_POINTER
;
return
INFINI_STATUS_NULL_POINTER
;
}
}
for
(
size_t
i
=
0
;
i
<
INFINI_DEVICE_TYPE_COUNT
;
i
++
)
{
for
(
size_t
i
=
0
;
i
<
INFINI_DEVICE_TYPE_COUNT
;
i
++
)
{
if
(
i
==
INFINI_DEVICE_ILUVATAR
||
i
==
INFINI_DEVICE_
QY
||
i
==
INFINI_DEVICE_KUNLU
N
||
i
==
INFINI_DEVICE_
HYGON
)
{
if
(
i
==
INFINI_DEVICE_ILUVATAR
||
i
==
INFINI_DEVICE_
HYGO
N
||
i
==
INFINI_DEVICE_
QY
)
{
count_array
[
i
]
=
0
;
count_array
[
i
]
=
0
;
continue
;
continue
;
}
}
...
...
test/infinicore/framework/base.py
View file @
ac4aae48
...
@@ -13,6 +13,7 @@ from .datatypes import to_torch_dtype, to_infinicore_dtype
...
@@ -13,6 +13,7 @@ from .datatypes import to_torch_dtype, to_infinicore_dtype
from
.devices
import
InfiniDeviceNames
,
torch_device_map
from
.devices
import
InfiniDeviceNames
,
torch_device_map
from
.tensor
import
TensorSpec
,
TensorInitializer
from
.tensor
import
TensorSpec
,
TensorInitializer
from
.utils
import
(
from
.utils
import
(
clone_torch_tensor
,
create_test_comparator
,
create_test_comparator
,
infinicore_tensor_from_torch
,
infinicore_tensor_from_torch
,
)
)
...
@@ -321,7 +322,7 @@ class BaseOperatorTest(ABC):
...
@@ -321,7 +322,7 @@ class BaseOperatorTest(ABC):
for
item
in
input_sequence
:
for
item
in
input_sequence
:
if
isinstance
(
item
,
torch
.
Tensor
):
if
isinstance
(
item
,
torch
.
Tensor
):
if
clone
:
if
clone
:
cloned_item
=
item
.
clone
().
detach
(
)
cloned_item
=
clone_torch_tensor
(
item
)
infini_item
=
infinicore_tensor_from_torch
(
cloned_item
)
infini_item
=
infinicore_tensor_from_torch
(
cloned_item
)
cloned_tensors
.
append
(
cloned_item
)
cloned_tensors
.
append
(
cloned_item
)
else
:
else
:
...
@@ -340,7 +341,7 @@ class BaseOperatorTest(ABC):
...
@@ -340,7 +341,7 @@ class BaseOperatorTest(ABC):
if
isinstance
(
inp
,
torch
.
Tensor
):
if
isinstance
(
inp
,
torch
.
Tensor
):
# Clone only if this input will be used for comparison
# Clone only if this input will be used for comparison
if
comparison_target
==
i
:
if
comparison_target
==
i
:
cloned_inp
=
inp
.
clone
().
detach
(
)
cloned_inp
=
clone
_torch_tensor
(
inp
)
infini_tensor
=
infinicore_tensor_from_torch
(
cloned_inp
)
infini_tensor
=
infinicore_tensor_from_torch
(
cloned_inp
)
cloned_tensors
.
append
(
cloned_inp
)
cloned_tensors
.
append
(
cloned_inp
)
else
:
else
:
...
@@ -362,7 +363,7 @@ class BaseOperatorTest(ABC):
...
@@ -362,7 +363,7 @@ class BaseOperatorTest(ABC):
if
isinstance
(
value
,
torch
.
Tensor
):
if
isinstance
(
value
,
torch
.
Tensor
):
# Check if this tensor is used for output comparison
# Check if this tensor is used for output comparison
if
key
==
"out"
and
comparison_target
==
"out"
:
if
key
==
"out"
and
comparison_target
==
"out"
:
cloned_value
=
value
.
clone
().
detach
(
)
cloned_value
=
clone_torch_tensor
(
value
)
infini_kwargs
[
key
]
=
infinicore_tensor_from_torch
(
cloned_value
)
infini_kwargs
[
key
]
=
infinicore_tensor_from_torch
(
cloned_value
)
cloned_tensors
.
append
(
cloned_value
)
cloned_tensors
.
append
(
cloned_value
)
elif
key
==
"out"
and
isinstance
(
comparison_target
,
int
):
elif
key
==
"out"
and
isinstance
(
comparison_target
,
int
):
...
@@ -566,12 +567,12 @@ class BaseOperatorTest(ABC):
...
@@ -566,12 +567,12 @@ class BaseOperatorTest(ABC):
elif
comparison_target
==
"out"
:
elif
comparison_target
==
"out"
:
# Compare output tensor from kwargs (explicit output)
# Compare output tensor from kwargs (explicit output)
torch_comparison
=
kwargs
.
get
(
"out"
)
torch_comparison
=
kwargs
.
get
(
"out"
)
infini_comparison
=
infini_kwargs
.
get
(
"out"
)
infini_comparison
=
cloned_tensors
[
0
]
elif
isinstance
(
comparison_target
,
int
):
elif
isinstance
(
comparison_target
,
int
):
# Compare specific input tensor (in-place operation on input)
# Compare specific input tensor (in-place operation on input)
if
0
<=
comparison_target
<
len
(
inputs
):
if
0
<=
comparison_target
<
len
(
inputs
):
torch_comparison
=
inputs
[
comparison_target
]
torch_comparison
=
inputs
[
comparison_target
]
infini_comparison
=
infini_inputs
[
comparison_target
]
infini_comparison
=
cloned_tensors
[
0
]
else
:
else
:
raise
ValueError
(
raise
ValueError
(
f
"Invalid comparison target index:
{
comparison_target
}
"
f
"Invalid comparison target index:
{
comparison_target
}
"
...
...
test/infinicore/framework/utils.py
View file @
ac4aae48
...
@@ -118,6 +118,13 @@ def get_tolerance(tolerance_map, tensor_dtype, default_atol=0, default_rtol=1e-3
...
@@ -118,6 +118,13 @@ def get_tolerance(tolerance_map, tensor_dtype, default_atol=0, default_rtol=1e-3
return
tolerance
[
"atol"
],
tolerance
[
"rtol"
]
return
tolerance
[
"atol"
],
tolerance
[
"rtol"
]
def
clone_torch_tensor
(
torch_tensor
):
cloned
=
torch_tensor
.
clone
().
detach
()
if
not
torch_tensor
.
is_contiguous
():
cloned
=
rearrange_tensor
(
cloned
,
torch_tensor
.
stride
())
return
cloned
def
infinicore_tensor_from_torch
(
torch_tensor
):
def
infinicore_tensor_from_torch
(
torch_tensor
):
infini_device
=
infinicore
.
device
(
torch_tensor
.
device
.
type
,
0
)
infini_device
=
infinicore
.
device
(
torch_tensor
.
device
.
type
,
0
)
if
torch_tensor
.
is_contiguous
():
if
torch_tensor
.
is_contiguous
():
...
@@ -152,6 +159,10 @@ def convert_infinicore_to_torch(infini_result):
...
@@ -152,6 +159,10 @@ def convert_infinicore_to_torch(infini_result):
dtype
=
to_torch_dtype
(
infini_result
.
dtype
),
dtype
=
to_torch_dtype
(
infini_result
.
dtype
),
device
=
infini_result
.
device
.
type
,
device
=
infini_result
.
device
.
type
,
)
)
if
not
infini_result
.
is_contiguous
():
torch_result_from_infini
=
rearrange_tensor
(
torch_result_from_infini
,
infini_result
.
stride
()
)
temp_tensor
=
infinicore_tensor_from_torch
(
torch_result_from_infini
)
temp_tensor
=
infinicore_tensor_from_torch
(
torch_result_from_infini
)
temp_tensor
.
copy_
(
infini_result
)
temp_tensor
.
copy_
(
infini_result
)
return
torch_result_from_infini
return
torch_result_from_infini
...
@@ -223,7 +234,10 @@ def compare_results(
...
@@ -223,7 +234,10 @@ def compare_results(
return
result_equal
return
result_equal
# Convert infinicore result to PyTorch tensor for comparison
# Convert infinicore result to PyTorch tensor for comparison
torch_result_from_infini
=
convert_infinicore_to_torch
(
infini_result
)
if
isinstance
(
infini_result
,
torch
.
Tensor
):
torch_result_from_infini
=
infini_result
else
:
torch_result_from_infini
=
convert_infinicore_to_torch
(
infini_result
)
# Debug mode: detailed comparison
# Debug mode: detailed comparison
if
debug_mode
:
if
debug_mode
:
...
...
test/infinicore/ops/aminmax.py
View file @
ac4aae48
...
@@ -49,8 +49,8 @@ _TEST_CASES_DATA = [
...
@@ -49,8 +49,8 @@ _TEST_CASES_DATA = [
((
13
,
4
),
0
,
False
,
None
,
(
3
,),
(
3
,)),
((
13
,
4
),
0
,
False
,
None
,
(
3
,),
(
3
,)),
((
13
,
4
),
1
,
False
,
(
20
,
1
),
(
10
,),
(
10
,)),
((
13
,
4
),
1
,
False
,
(
20
,
1
),
(
10
,),
(
10
,)),
# 3D in-place cases
# 3D in-place cases
((
4
,
5
,
6
),
1
,
True
,
None
,
(
4
,
1
,
6
),
(
4
,
1
,
6
)),
((
4
,
5
,
6
),
1
,
True
,
None
,
(
6
,
6
,
1
),
(
6
,
6
,
1
)),
((
4
,
5
,
6
),
-
1
,
False
,
(
30
,
6
,
1
),
(
4
,
5
),
(
4
,
5
)),
((
4
,
5
,
6
),
-
1
,
False
,
(
30
,
6
,
1
),
(
5
,
1
),
(
5
,
1
)),
]
]
# Tolerance configuration
# Tolerance configuration
...
...
test/infinicore/ops/silu.py
View file @
ac4aae48
...
@@ -28,7 +28,6 @@ _TEST_CASES_DATA = [
...
@@ -28,7 +28,6 @@ _TEST_CASES_DATA = [
((
4
,
48
,
6
),
None
,
None
),
((
4
,
48
,
6
),
None
,
None
),
# Strided tensors
# Strided tensors
((
1
,
2048
),
(
4096
,
1
),
(
4096
,
1
)),
((
1
,
2048
),
(
4096
,
1
),
(
4096
,
1
)),
((
6
,
2560
),
(
2048
,
1
),
(
2560
,
1
)),
# Mixed cases
# Mixed cases
((
8
,
16
,
32
),
None
,
None
),
((
8
,
16
,
32
),
None
,
None
),
# Large tensors
# Large tensors
...
...
test/infinicore/ops/sort.py
View file @
ac4aae48
...
@@ -31,12 +31,12 @@ _TEST_CASES_DATA = [
...
@@ -31,12 +31,12 @@ _TEST_CASES_DATA = [
((
4
,
5
,
6
),
1
,
False
,
None
,
None
,
None
),
((
4
,
5
,
6
),
1
,
False
,
None
,
None
,
None
),
((
4
,
5
,
6
),
-
1
,
True
,
None
,
None
,
None
),
((
4
,
5
,
6
),
-
1
,
True
,
None
,
None
,
None
),
# 3D in-place cases
# 3D in-place cases
((
4
,
5
,
6
),
1
,
False
,
None
,
(
4
,
1
,
6
),
(
4
,
1
,
6
)),
((
4
,
5
,
6
),
1
,
False
,
None
,
(
30
,
6
,
1
),
(
30
,
6
,
1
)),
((
4
,
5
,
6
),
-
1
,
False
,
(
30
,
6
,
1
),
(
64
,
1
,
5
),
(
64
,
1
,
5
)),
((
4
,
5
,
6
),
-
1
,
False
,
(
30
,
6
,
1
),
(
30
,
6
,
1
),
(
30
,
6
,
1
)),
# Strided inputs and outputs
# Strided inputs and outputs
((
13
,
4
),
None
,
False
,
(
4
,
1
),
(
12
,
1
),
(
2
4
,
1
)),
((
13
,
4
),
None
,
False
,
(
4
,
1
),
(
4
,
1
),
(
4
,
1
)),
((
13
,
4
),
0
,
False
,
(
1
,
4
),
(
64
,
1
),
(
1
,
4
)),
((
13
,
4
),
0
,
False
,
(
1
3
,
1
),
(
13
,
1
),
(
1
3
,
1
)),
((
13
,
4
),
1
,
False
,
(
1
,
4
),
(
64
,
1
),
(
1
,
4
)),
((
13
,
4
),
1
,
False
,
(
1
3
,
1
),
(
13
,
1
),
(
1
3
,
1
)),
]
]
# Tolerance configuration
# Tolerance configuration
...
...
test/infiniop/topkrouter.py
View file @
ac4aae48
...
@@ -33,7 +33,8 @@ _TEST_CASES_ = [
...
@@ -33,7 +33,8 @@ _TEST_CASES_ = [
# w (weight) types
# w (weight) types
# Note: 'None' means the same as input dtype
# Note: 'None' means the same as input dtype
_X_DTYPES
=
[]
# [InfiniDtype.F32, InfiniDtype.BF16, InfiniDtype.F16]
# _X_DTYPES = [InfiniDtype.F32, InfiniDtype.BF16, InfiniDtype.F16]
_X_DTYPES
=
[]
# CPU CI
# x types used for testing
# x types used for testing
_VALUE_DTYPES
=
[
InfiniDtype
.
F32
]
_VALUE_DTYPES
=
[
InfiniDtype
.
F32
]
...
@@ -194,6 +195,7 @@ def test(
...
@@ -194,6 +195,7 @@ def test(
lib_topkrouter
()
lib_topkrouter
()
lable_values
,
lable_indices
=
torch_topkrouter
(
x
.
actual_tensor
(),
correction_bias
.
actual_tensor
(),
routed_scaling_factor
,
topk
)
lable_values
,
lable_indices
=
torch_topkrouter
(
x
.
actual_tensor
(),
correction_bias
.
actual_tensor
(),
routed_scaling_factor
,
topk
)
atol
,
rtol
=
get_tolerance
(
_TOLERANCE_MAP
,
dtype
)
atol
,
rtol
=
get_tolerance
(
_TOLERANCE_MAP
,
dtype
)
if
DEBUG
:
if
DEBUG
:
...
...
xmake.lua
View file @
ac4aae48
...
@@ -326,6 +326,7 @@ target("infiniccl")
...
@@ -326,6 +326,7 @@ target("infiniccl")
end
end
if
has_config
(
"qy-gpu"
)
then
if
has_config
(
"qy-gpu"
)
then
add_deps
(
"infiniccl-qy"
)
add_deps
(
"infiniccl-qy"
)
add_files
(
"build/.objs/infiniccl-qy/rules/qy.cuda/src/infiniccl/cuda/*.cu.o"
,
{
public
=
true
})
end
end
if
has_config
(
"moore-gpu"
)
then
if
has_config
(
"moore-gpu"
)
then
...
@@ -347,12 +348,45 @@ target("infiniccl")
...
@@ -347,12 +348,45 @@ target("infiniccl")
set_installdir
(
os.getenv
(
"INFINI_ROOT"
)
or
(
os.getenv
(
is_host
(
"windows"
)
and
"HOMEPATH"
or
"HOME"
)
..
"/.infini"
))
set_installdir
(
os.getenv
(
"INFINI_ROOT"
)
or
(
os.getenv
(
is_host
(
"windows"
)
and
"HOMEPATH"
or
"HOME"
)
..
"/.infini"
))
target_end
()
target_end
()
target
(
"infinicore_c_api"
)
target
(
"infinicore_c_api"
)
target
(
"infinicore_c_api"
)
set_kind
(
"phony"
)
set_kind
(
"phony"
)
add_deps
(
"infiniop"
,
"infinirt"
,
"infiniccl"
)
add_deps
(
"infiniop"
,
"infinirt"
,
"infiniccl"
)
after_build
(
function
(
target
)
print
(
YELLOW
..
"[Congratulations!] Now you can install the libraries with \"
xmake
install
\
""
..
NC
)
end
)
after_build
(
function
(
target
)
print
(
YELLOW
..
"[Congratulations!] Now you can install the libraries with \"
xmake
install
\
""
..
NC
)
end
)
target_end
()
target_end
()
target
(
"infinicore_cpp_api"
)
set_kind
(
"shared"
)
add_deps
(
"infiniop"
,
"infinirt"
,
"infiniccl"
)
set_languages
(
"cxx17"
)
local
INFINI_ROOT
=
os.getenv
(
"INFINI_ROOT"
)
or
(
os.getenv
(
is_host
(
"windows"
)
and
"HOMEPATH"
or
"HOME"
)
..
"/.infini"
)
add_includedirs
(
"include"
)
add_includedirs
(
INFINI_ROOT
..
"/include"
,
{
public
=
true
})
add_linkdirs
(
INFINI_ROOT
..
"/lib"
)
add_links
(
"infiniop"
,
"infinirt"
,
"infiniccl"
)
-- Add InfiniCore C++ source files (needed for RoPE and other nn modules)
add_files
(
"src/infinicore/*.cc"
)
add_files
(
"src/infinicore/context/*.cc"
)
add_files
(
"src/infinicore/context/*/*.cc"
)
add_files
(
"src/infinicore/tensor/*.cc"
)
add_files
(
"src/infinicore/nn/*.cc"
)
add_files
(
"src/infinicore/ops/*/*.cc"
)
set_installdir
(
INFINI_ROOT
)
add_installfiles
(
"include/infinicore/(**.h)"
,
{
prefixdir
=
"include/infinicore"
})
add_installfiles
(
"include/infinicore/(**.hpp)"
,
{
prefixdir
=
"include/infinicore"
})
add_installfiles
(
"include/infinicore/(**/*.h)"
,
{
prefixdir
=
"include/infinicore"
})
add_installfiles
(
"include/infinicore/(**/*.hpp)"
,{
prefixdir
=
"include/infinicore"
})
add_installfiles
(
"include/infinicore.h"
,
{
prefixdir
=
"include"
})
add_installfiles
(
"include/infinicore.hpp"
,
{
prefixdir
=
"include"
})
after_build
(
function
(
target
)
print
(
YELLOW
..
"[Congratulations!] Now you can install the libraries with \"
xmake
install
\
""
..
NC
)
end
)
target_end
()
target
(
"_infinicore"
)
target
(
"_infinicore"
)
add_packages
(
"boost"
)
add_packages
(
"boost"
)
if
is_mode
(
"debug"
)
then
if
is_mode
(
"debug"
)
then
...
@@ -378,6 +412,7 @@ target("_infinicore")
...
@@ -378,6 +412,7 @@ target("_infinicore")
add_files
(
"src/infinicore/context/*.cc"
)
add_files
(
"src/infinicore/context/*.cc"
)
add_files
(
"src/infinicore/context/*/*.cc"
)
add_files
(
"src/infinicore/context/*/*.cc"
)
add_files
(
"src/infinicore/tensor/*.cc"
)
add_files
(
"src/infinicore/tensor/*.cc"
)
add_files
(
"src/infinicore/nn/*.cc"
)
add_files
(
"src/infinicore/ops/*/*.cc"
)
add_files
(
"src/infinicore/ops/*/*.cc"
)
add_files
(
"src/infinicore/pybind11/**.cc"
)
add_files
(
"src/infinicore/pybind11/**.cc"
)
...
...
xmake/test.lua
View file @
ac4aae48
...
@@ -89,6 +89,7 @@ target("infinicore-test")
...
@@ -89,6 +89,7 @@ target("infinicore-test")
add_files
(
os
.
projectdir
()
..
"/src/infinicore/nn/*.cc"
)
add_files
(
os
.
projectdir
()
..
"/src/infinicore/nn/*.cc"
)
add_files
(
os
.
projectdir
()
..
"/src/infinicore-test/*.cc"
)
add_files
(
os
.
projectdir
()
..
"/src/infinicore-test/*.cc"
)
add_files
(
os
.
projectdir
()
..
"/src/infinicore-test/*/*.cc"
)
set_installdir
(
INFINI_ROOT
)
set_installdir
(
INFINI_ROOT
)
target_end
()
target_end
()
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment