Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
194e19bd
Commit
194e19bd
authored
Dec 22, 2025
by
baominghelly
Browse files
issue 787 - Merge from main and resolve conflict
parents
147a4ac7
215d1932
Changes
31
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
258 additions
and
58 deletions
+258
-58
src/infiniop/ops/topkrouter/metax/topkrouter_metax.maca
src/infiniop/ops/topkrouter/metax/topkrouter_metax.maca
+101
-0
src/infiniop/ops/topkrouter/nvidia/topkrouter_nvidia.cu
src/infiniop/ops/topkrouter/nvidia/topkrouter_nvidia.cu
+9
-1
src/infiniop/ops/topkrouter/operator.cc
src/infiniop/ops/topkrouter/operator.cc
+15
-0
src/infinirt/infinirt.cc
src/infinirt/infinirt.cc
+12
-0
src/infinirt/moore/infinirt_moore.cc
src/infinirt/moore/infinirt_moore.cc
+1
-18
test/infinicore/framework/__init__.py
test/infinicore/framework/__init__.py
+1
-3
test/infinicore/framework/utils/tensor_utils.py
test/infinicore/framework/utils/tensor_utils.py
+2
-0
test/infiniop/libinfiniop/utils.py
test/infiniop/libinfiniop/utils.py
+30
-6
test/infiniop/topkrouter.py
test/infiniop/topkrouter.py
+67
-30
xmake.lua
xmake.lua
+7
-0
xmake/nvidia.lua
xmake/nvidia.lua
+13
-0
No files found.
src/infiniop/ops/topkrouter/metax/topkrouter_metax.maca
0 → 100644
View file @
194e19bd
#include "../../../devices/metax/metax_common.h"
#include "../../../devices/metax/metax_kernel_common.h"
#include "topkrouter_metax.h"
#include <cfloat>
#include <cub/block/block_load.cuh>
#include <cub/block/block_radix_sort.cuh>
#include <cub/block/block_reduce.cuh>
#include <cub/block/block_store.cuh>
#include <cub/cub.cuh>
#include "../cuda/kernel.cuh"
namespace op::topkrouter::metax {
// Backend-private state for the Metax topkrouter descriptor.
// Holds a shared reference to the device handle's internals so they remain
// valid for the descriptor's whole lifetime (released in ~Descriptor()).
struct Descriptor::Opaque {
std::shared_ptr<device::metax::Handle::Internal> internal;
};
// Frees the backend-private Opaque state allocated in Descriptor::create().
Descriptor::~Descriptor() {
delete _opaque;
}
// Builds a topkrouter descriptor for the Metax backend.
//
// Parameters:
//   handle               - infiniop handle; must wrap a Metax device handle.
//   desc_ptr             - out: receives the newly allocated descriptor.
//   x_desc               - descriptor of the router-logits tensor (N x width).
//   correction_bias_desc - accepted but not inspected here (NOTE(review):
//                          presumably validated by the caller — confirm).
//
// Returns INFINI_STATUS_SUCCESS on success, a validation status otherwise.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t x_desc,
    infiniopTensorDescriptor_t correction_bias_desc) {
    // Derive and validate the op metadata from the logits tensor.
    auto info_result = TopkrouterInfo::create(x_desc);
    CHECK_RESULT(info_result);
    auto info = info_result.take();

    // The kernel assumes the innermost (expert) dimension is contiguous.
    if (info.x_strides[1] != 1) {
        return INFINI_STATUS_BAD_TENSOR_STRIDES;
    }

    auto *metax_handle = reinterpret_cast<device::metax::Handle *>(handle);
    *desc_ptr = new Descriptor(
        new Opaque{metax_handle->internal()},
        std::move(info),
        /*workspace_size=*/0,
        handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
namespace {

// Dispatches topkrouter_kernel on the element type of the input logits.
// Launches one block of BLOCK_SIZE threads per row (N rows of `width`
// logits each); top-k scores go to d_values_out and the selected expert
// indices to d_indices_out.
template <int BLOCK_SIZE = 128>
infiniStatus_t launch_topkrouter(float *d_values_out, int *d_indices_out, const void *d_input, const float *d_correction_bias,
                                 const float routed_scaling_factor, const size_t N, const size_t width, const size_t topk, infiniDtype_t xtype,
                                 hcStream_t stream) {
    const dim3 grid_dim(N);
    const dim3 block_dim(BLOCK_SIZE);
    // The kernel takes a mutable pointer, so shed const before the typed cast.
    void *input = const_cast<void *>(d_input);

    switch (xtype) {
    case INFINI_DTYPE_F32:
        topkrouter_kernel<float, BLOCK_SIZE><<<grid_dim, block_dim, 0, stream>>>(
            d_values_out, d_indices_out, static_cast<float *>(input),
            d_correction_bias, routed_scaling_factor, N, width, topk);
        break;
    case INFINI_DTYPE_F16:
        topkrouter_kernel<half, BLOCK_SIZE><<<grid_dim, block_dim, 0, stream>>>(
            d_values_out, d_indices_out, static_cast<half *>(input),
            d_correction_bias, routed_scaling_factor, N, width, topk);
        break;
    case INFINI_DTYPE_BF16:
        topkrouter_kernel<cuda_bfloat16, BLOCK_SIZE><<<grid_dim, block_dim, 0, stream>>>(
            d_values_out, d_indices_out, static_cast<cuda_bfloat16 *>(input),
            d_correction_bias, routed_scaling_factor, N, width, topk);
        break;
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    return INFINI_STATUS_SUCCESS;
}

} // namespace
// Runs the topkrouter op: for each of the N rows in `x` (router logits of
// `width` experts), writes the top-k routing scores to `values` (float) and
// the chosen expert indices to `indices` (int), scaled by
// `routed_scaling_factor` with `correction_bias` applied.
//
// Returns:
//   INFINI_STATUS_INSUFFICIENT_WORKSPACE if the caller's workspace is too small,
//   INFINI_STATUS_BAD_PARAM if width != 256 (the only supported configuration),
//   otherwise the status of the kernel dispatch (e.g. BAD_TENSOR_DTYPE for an
//   unsupported input dtype).
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    float *values,
    int *indices,
    const void *x,
    const float *correction_bias,
    const float routed_scaling_factor,
    const size_t topk,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    size_t N = _info.N;
    size_t width = _info.width;
    auto metax_stream = reinterpret_cast<hcStream_t>(stream);

    // Only a 256-wide expert dimension is supported by this kernel config.
    if (width != 256) {
        return INFINI_STATUS_BAD_PARAM;
    }
    // Bug fix: propagate the dispatch status instead of discarding it — the
    // original returned SUCCESS even when launch_topkrouter reported
    // INFINI_STATUS_BAD_TENSOR_DTYPE for an unsupported input dtype.
    return launch_topkrouter<256>(values, indices, x, correction_bias,
                                  routed_scaling_factor, N, width, topk,
                                  _info.xtype, metax_stream);
}
} // namespace op::topkrouter::metax
src/infiniop/ops/topkrouter/nvidia/topkrouter_nvidia.cu
View file @
194e19bd
...
...
@@ -2,9 +2,17 @@
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include "../cuda/kernel.cuh"
#include "topkrouter_nvidia.cuh"
#include <cfloat>
#include <cub/block/block_load.cuh>
#include <cub/block/block_radix_sort.cuh>
#include <cub/block/block_reduce.cuh>
#include <cub/block/block_store.cuh>
#include <cub/cub.cuh>
#include "../cuda/kernel.cuh"
namespace
op
::
topkrouter
::
nvidia
{
...
...
src/infiniop/ops/topkrouter/operator.cc
View file @
194e19bd
...
...
@@ -8,6 +8,9 @@
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
#include "nvidia/topkrouter_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "metax/topkrouter_metax.h"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/topkrouter_kunlun.h"
#endif
...
...
@@ -30,6 +33,9 @@ __C infiniStatus_t infiniopCreateTopkrouterDescriptor(infiniopHandle_t handle, i
#ifdef ENABLE_QY_API
CREATE
(
INFINI_DEVICE_QY
,
nvidia
);
#endif
#ifdef ENABLE_METAX_API
CREATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#ifdef ENABLE_KUNLUN_API
CREATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
...
...
@@ -56,6 +62,9 @@ __C infiniStatus_t infiniopGetTopkrouterWorkspaceSize(infiniopTopkrouterDescript
#ifdef ENABLE_QY_API
GET
(
INFINI_DEVICE_QY
,
nvidia
);
#endif
#ifdef ENABLE_METAX_API
GET
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#ifdef ENABLE_KUNLUN_API
GET
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
...
...
@@ -85,6 +94,9 @@ __C infiniStatus_t infiniopTopkrouter(infiniopTopkrouterDescriptor_t desc, void
#ifdef ENABLE_QY_API
CALCULATE
(
INFINI_DEVICE_QY
,
nvidia
);
#endif
#ifdef ENABLE_METAX_API
CALCULATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#ifdef ENABLE_KUNLUN_API
CALCULATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
...
...
@@ -111,6 +123,9 @@ __C infiniStatus_t infiniopDestroyTopkrouterDescriptor(infiniopTopkrouterDescrip
#ifdef ENABLE_QY_API
DESTROY
(
INFINI_DEVICE_QY
,
nvidia
);
#endif
#ifdef ENABLE_METAX_API
DESTROY
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#ifdef ENABLE_KUNLUN_API
DESTROY
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
...
...
src/infinirt/infinirt.cc
View file @
194e19bd
...
...
@@ -10,6 +10,8 @@
thread_local
infiniDevice_t
CURRENT_DEVICE_TYPE
=
INFINI_DEVICE_CPU
;
thread_local
int
CURRENT_DEVICE_ID
=
0
;
thread_local
infiniDevice_t
PREVIOUS_NON_CPU_DEVICE_TYPE
=
INFINI_DEVICE_TYPE_COUNT
;
thread_local
int
PREVIOUS_NON_CPU_DEVICE_ID
=
0
;
__C
infiniStatus_t
infinirtInit
()
{
#ifdef ENABLE_ASCEND_API
...
...
@@ -96,6 +98,16 @@ __C infiniStatus_t infinirtGetDeviceCount(infiniDevice_t device, int *count) {
}
__C
infiniStatus_t
infinirtSetDevice
(
infiniDevice_t
device
,
int
device_id
)
{
bool
skip_set
=
CURRENT_DEVICE_TYPE
==
INFINI_DEVICE_CPU
&&
device
==
PREVIOUS_NON_CPU_DEVICE_TYPE
&&
device_id
==
PREVIOUS_NON_CPU_DEVICE_ID
;
if
(
CURRENT_DEVICE_TYPE
!=
INFINI_DEVICE_CPU
)
{
PREVIOUS_NON_CPU_DEVICE_TYPE
=
CURRENT_DEVICE_TYPE
;
PREVIOUS_NON_CPU_DEVICE_ID
=
CURRENT_DEVICE_ID
;
}
if
(
skip_set
)
{
CURRENT_DEVICE_TYPE
=
device
;
CURRENT_DEVICE_ID
=
device_id
;
return
INFINI_STATUS_SUCCESS
;
}
INFINIRT_CALL_DEVICE_API_AND
(
device
,
setDevice
,
(
device_id
),
{
CURRENT_DEVICE_TYPE
=
device
;
CURRENT_DEVICE_ID
=
device_id
;
});
...
...
src/infinirt/moore/infinirt_moore.cc
View file @
194e19bd
#include "infinirt_moore.h"
#include "../../utils.h"
#include <chrono>
#include <musa_runtime.h>
#include <musa_runtime_api.h>
...
...
@@ -83,23 +82,7 @@ infiniStatus_t eventDestroy(infinirtEvent_t event) {
}
infiniStatus_t
eventElapsedTime
(
float
*
ms_ptr
,
infinirtEvent_t
start
,
infinirtEvent_t
end
)
{
// MUSA may not have direct musaEventElapsedTime API
// Use a fallback method: synchronize events and measure CPU time difference
// Note: This includes synchronization overhead, so timing may not be as accurate
// as native GPU event timing, but it allows benchmarking to work
// Synchronize start event and record CPU time
CHECK_MUSART
(
musaEventSynchronize
((
musaEvent_t
)
start
));
auto
start_cpu_time
=
std
::
chrono
::
steady_clock
::
now
();
// Synchronize end event and record CPU time
CHECK_MUSART
(
musaEventSynchronize
((
musaEvent_t
)
end
));
auto
end_cpu_time
=
std
::
chrono
::
steady_clock
::
now
();
// Calculate elapsed time in milliseconds
auto
duration
=
std
::
chrono
::
duration_cast
<
std
::
chrono
::
microseconds
>
(
end_cpu_time
-
start_cpu_time
);
*
ms_ptr
=
static_cast
<
float
>
(
duration
.
count
())
/
1000.0
f
;
CHECK_MUSART
(
musaEventElapsedTime
(
ms_ptr
,
(
musaEvent_t
)
start
,
(
musaEvent_t
)
end
));
return
INFINI_STATUS_SUCCESS
;
}
...
...
test/infinicore/framework/__init__.py
View file @
194e19bd
...
...
@@ -22,7 +22,6 @@ from .utils.compare_utils import (
)
from
.utils.json_utils
import
save_json_report
from
.utils.tensor_utils
import
(
infinicore_tensor_from_torch
,
convert_infinicore_to_torch
,
rearrange_tensor
,
...
...
@@ -37,7 +36,6 @@ __all__ = [
# Core types and classes
"BaseOperatorTest"
,
"CaseResult"
,
"ConsolePrinter"
,
"GenericTestRunner"
,
"InfiniDeviceEnum"
,
"InfiniDeviceNames"
,
...
...
@@ -47,7 +45,7 @@ __all__ = [
"TestCase"
,
"TestConfig"
,
"TestDriver"
,
"Test
Reporter
"
,
"Test
Summary
"
,
"TestRunner"
,
"TestTiming"
,
# Core functions
...
...
test/infinicore/framework/utils/tensor_utils.py
View file @
194e19bd
...
...
@@ -15,6 +15,8 @@ def synchronize_device(torch_device):
torch
.
npu
.
synchronize
()
elif
torch_device
==
"mlu"
:
torch
.
mlu
.
synchronize
()
elif
torch_device
==
"musa"
:
torch
.
musa
.
synchronize
()
# =================================================================
...
...
test/infiniop/libinfiniop/utils.py
View file @
194e19bd
...
...
@@ -71,17 +71,32 @@ class TestTensor(CTensor):
torch_shape
.
append
(
shape
[
i
])
if
mode
==
"random"
:
# For integer types, use randint instead of rand
if
dt
in
[
InfiniDtype
.
I8
,
InfiniDtype
.
I16
,
InfiniDtype
.
I32
,
InfiniDtype
.
I64
,
InfiniDtype
.
U8
,
InfiniDtype
.
U16
,
InfiniDtype
.
U32
,
InfiniDtype
.
U64
,
InfiniDtype
.
BYTE
,
InfiniDtype
.
BOOL
]:
if
dt
in
[
InfiniDtype
.
I8
,
InfiniDtype
.
I16
,
InfiniDtype
.
I32
,
InfiniDtype
.
I64
,
InfiniDtype
.
U8
,
InfiniDtype
.
U16
,
InfiniDtype
.
U32
,
InfiniDtype
.
U64
,
InfiniDtype
.
BYTE
,
InfiniDtype
.
BOOL
,
]:
randint_low
=
-
2000000000
if
randint_low
is
None
else
randint_low
randint_high
=
2000000000
if
randint_high
is
None
else
randint_high
self
.
_torch_tensor
=
torch
.
randint
(
randint_low
,
randint_high
,
torch_shape
,
dtype
=
to_torch_dtype
(
dt
),
device
=
torch_device_map
[
device
]
randint_low
,
randint_high
,
torch_shape
,
dtype
=
to_torch_dtype
(
dt
),
device
=
torch_device_map
[
device
],
)
else
:
self
.
_torch_tensor
=
torch
.
rand
(
torch_shape
,
dtype
=
to_torch_dtype
(
dt
),
device
=
torch_device_map
[
device
]
torch_shape
,
dtype
=
to_torch_dtype
(
dt
),
device
=
torch_device_map
[
device
],
)
elif
mode
==
"zeros"
:
self
.
_torch_tensor
=
torch
.
zeros
(
...
...
@@ -431,6 +446,8 @@ def synchronize_device(torch_device):
torch
.
npu
.
synchronize
()
elif
torch_device
==
"mlu"
:
torch
.
mlu
.
synchronize
()
elif
torch_device
==
"musa"
:
torch
.
musa
.
synchronize
()
def
debug
(
actual
,
desired
,
atol
=
0
,
rtol
=
1e-2
,
equal_nan
=
False
,
verbose
=
True
):
...
...
@@ -463,7 +480,14 @@ def debug(actual, desired, atol=0, rtol=1e-2, equal_nan=False, verbose=True):
def
filter_tensor_dtypes_by_device
(
device
,
tensor_dtypes
):
if
device
in
(
InfiniDeviceEnum
.
CPU
,
InfiniDeviceEnum
.
NVIDIA
,
InfiniDeviceEnum
.
METAX
,
InfiniDeviceEnum
.
ASCEND
,
InfiniDeviceEnum
.
ILUVATAR
,
InfiniDeviceEnum
.
CAMBRICON
):
if
device
in
(
InfiniDeviceEnum
.
CPU
,
InfiniDeviceEnum
.
NVIDIA
,
InfiniDeviceEnum
.
METAX
,
InfiniDeviceEnum
.
ASCEND
,
InfiniDeviceEnum
.
ILUVATAR
,
InfiniDeviceEnum
.
CAMBRICON
,
):
return
tensor_dtypes
else
:
# 过滤掉 torch.bfloat16
...
...
test/infiniop/topkrouter.py
View file @
194e19bd
...
...
@@ -19,7 +19,7 @@ from libinfiniop import (
InfiniDtypeNames
,
InfiniDeviceNames
,
infiniopOperatorDescriptor_t
,
torch_device_map
torch_device_map
,
)
# ==============================================================================
...
...
@@ -29,12 +29,14 @@ from libinfiniop import (
_TEST_CASES_
=
[
# x_shape, x_stride, topk, routed_scaling_factor
((
1
,
256
),
None
,
8
,
2.5
),
((
2
,
256
),
None
,
8
,
1.0
),
]
# w (weight) types
# Note: 'None' means the same as input dtype
# _X_DTYPES = [InfiniDtype.F32, InfiniDtype.BF16, InfiniDtype.F16]
_X_DTYPES
=
[]
# CPU CI
_X_DTYPES
=
[]
# CPU CI
# x types used for testing
_VALUE_DTYPES
=
[
InfiniDtype
.
F32
]
...
...
@@ -57,17 +59,34 @@ NUM_ITERATIONS = 1000
def
tensorInfo
(
data
):
print
(
"data: "
,
data
.
is_contiguous
(),
data
.
device
,
data
.
dtype
,
data
.
shape
,
data
.
stride
(),
data
.
data_ptr
(),
hex
(
data
.
data_ptr
()))
print
(
"data: "
,
data
.
is_contiguous
(),
data
.
device
,
data
.
dtype
,
data
.
shape
,
data
.
stride
(),
data
.
data_ptr
(),
hex
(
data
.
data_ptr
()),
)
class
DeepseekV3TopkRouter
(
nn
.
Module
):
def
__init__
(
self
,
correction_bias
,
routed_scaling_factor
:
float
=
2.5
,
topk
:
int
=
8
,
config
=
None
):
def
__init__
(
self
,
correction_bias
,
routed_scaling_factor
:
float
=
2.5
,
topk
:
int
=
8
,
config
=
None
,
):
super
().
__init__
()
self
.
config
=
config
self
.
top_k
=
topk
# config.num_experts_per_tok 8
assert
topk
==
8
self
.
n_routed_experts
=
256
# config.n_routed_experts
self
.
routed_scaling_factor
=
routed_scaling_factor
# config.routed_scaling_factor 2.5
self
.
routed_scaling_factor
=
(
routed_scaling_factor
# config.routed_scaling_factor 2.5
)
self
.
n_group
=
8
# config.n_group
self
.
topk_group
=
4
# config.topk_group
self
.
norm_topk_prob
=
True
# config.norm_topk_prob
...
...
@@ -81,14 +100,20 @@ class DeepseekV3TopkRouter(nn.Module):
@
torch
.
no_grad
()
def
get_topk_indices
(
self
,
scores
):
scores_for_choice
=
scores
.
view
(
-
1
,
self
.
n_routed_experts
)
+
self
.
e_score_correction_bias
.
unsqueeze
(
0
)
# Size([1, 256])
scores_for_choice
=
scores
.
view
(
-
1
,
self
.
n_routed_experts
)
+
self
.
e_score_correction_bias
.
unsqueeze
(
0
)
# Size([1, 256])
group_scores
=
(
scores_for_choice
.
view
(
-
1
,
self
.
n_group
,
self
.
n_routed_experts
//
self
.
n_group
)
scores_for_choice
.
view
(
-
1
,
self
.
n_group
,
self
.
n_routed_experts
//
self
.
n_group
)
.
topk
(
2
,
dim
=-
1
)[
0
]
.
sum
(
dim
=-
1
)
)
group_idx
=
torch
.
topk
(
group_scores
,
k
=
self
.
topk_group
,
dim
=-
1
,
sorted
=
True
)[
1
]
# Size([1, 4])
group_idx
=
torch
.
topk
(
group_scores
,
k
=
self
.
topk_group
,
dim
=-
1
,
sorted
=
True
)[
1
]
# Size([1, 4])
group_mask
=
torch
.
zeros_like
(
group_scores
)
# Size([1, 8])
group_mask
.
scatter_
(
1
,
group_idx
,
1
)
# Size([1, 8])
...
...
@@ -98,8 +123,12 @@ class DeepseekV3TopkRouter(nn.Module):
.
reshape
(
-
1
,
self
.
n_routed_experts
)
)
scores_for_choice
=
scores_for_choice
.
masked_fill
(
~
score_mask
.
bool
(),
0.0
)
# Size([1, 256])
topk_indices
=
torch
.
topk
(
scores_for_choice
,
k
=
self
.
top_k
,
dim
=-
1
,
sorted
=
True
)[
1
]
# Size([1, 8])
scores_for_choice
=
scores_for_choice
.
masked_fill
(
~
score_mask
.
bool
(),
0.0
)
# Size([1, 256])
topk_indices
=
torch
.
topk
(
scores_for_choice
,
k
=
self
.
top_k
,
dim
=-
1
,
sorted
=
True
)[
1
]
# Size([1, 8])
return
topk_indices
...
...
@@ -122,21 +151,23 @@ class DeepseekV3TopkRouter(nn.Module):
def
torch_topkrouter
(
router_logits
,
correction_bias
,
routed_scaling_factor
,
topk
):
lable_indices
,
lable_values
=
DeepseekV3TopkRouter
(
correction_bias
,
routed_scaling_factor
,
topk
)(
router_logits
)
lable_indices
,
lable_values
=
DeepseekV3TopkRouter
(
correction_bias
,
routed_scaling_factor
,
topk
)(
router_logits
)
lable_indices
=
lable_indices
.
to
(
torch
.
int32
)
return
lable_values
,
lable_indices
def
test
(
handle
,
device
,
x_shape
,
x_stride
,
topk
,
routed_scaling_factor
,
x_dtype
=
InfiniDtype
.
F32
,
dtype
=
InfiniDtype
.
F16
,
sync
=
None
,
handle
,
device
,
x_shape
,
x_stride
,
topk
,
routed_scaling_factor
,
x_dtype
=
InfiniDtype
.
F32
,
dtype
=
InfiniDtype
.
F16
,
sync
=
None
,
):
print
(
f
"Testing topkrouter on
{
InfiniDeviceNames
[
device
]
}
with x_shape:
{
x_shape
}
"
...
...
@@ -146,8 +177,12 @@ def test(
data
=
torch
.
arange
(
0
,
x_shape
[
0
]
*
x_shape
[
1
]).
reshape
(
x_shape
)
N
,
width
=
x_shape
x
=
TestTensor
(
x_shape
,
data
.
stride
(),
x_dtype
,
device
,
scale
=
5.0
,
bias
=-
5.0
,
mode
=
"random"
)
correction_bias
=
TestTensor
([
x_shape
[
1
]],
[
1
],
InfiniDtype
.
F32
,
device
,
mode
=
"random"
)
x
=
TestTensor
(
x_shape
,
data
.
stride
(),
x_dtype
,
device
,
scale
=
5.0
,
bias
=-
5.0
,
mode
=
"random"
)
correction_bias
=
TestTensor
(
[
x_shape
[
1
]],
[
1
],
InfiniDtype
.
F32
,
device
,
mode
=
"random"
)
if
sync
is
not
None
:
sync
()
...
...
@@ -155,10 +190,7 @@ def test(
descriptor
=
infiniopOperatorDescriptor_t
()
check_error
(
LIBINFINIOP
.
infiniopCreateTopkrouterDescriptor
(
handle
,
ctypes
.
byref
(
descriptor
),
x
.
descriptor
,
correction_bias
.
descriptor
handle
,
ctypes
.
byref
(
descriptor
),
x
.
descriptor
,
correction_bias
.
descriptor
)
)
...
...
@@ -174,8 +206,12 @@ def test(
)
workspace
=
TestWorkspace
(
workspace_size
.
value
,
x
.
device
)
values
=
torch
.
zeros
((
N
,
topk
),
dtype
=
torch
.
float32
,
device
=
torch_device_map
[
x
.
device
])
indices
=
torch
.
zeros
((
N
,
topk
),
dtype
=
torch
.
int32
,
device
=
torch_device_map
[
x
.
device
])
values
=
torch
.
zeros
(
(
N
,
topk
),
dtype
=
torch
.
float32
,
device
=
torch_device_map
[
x
.
device
]
)
indices
=
torch
.
zeros
(
(
N
,
topk
),
dtype
=
torch
.
int32
,
device
=
torch_device_map
[
x
.
device
]
)
def
lib_topkrouter
():
check_error
(
...
...
@@ -195,8 +231,9 @@ def test(
lib_topkrouter
()
lable_values
,
lable_indices
=
torch_topkrouter
(
x
.
actual_tensor
(),
correction_bias
.
actual_tensor
(),
routed_scaling_factor
,
topk
)
lable_values
,
lable_indices
=
torch_topkrouter
(
x
.
actual_tensor
(),
correction_bias
.
actual_tensor
(),
routed_scaling_factor
,
topk
)
atol
,
rtol
=
get_tolerance
(
_TOLERANCE_MAP
,
dtype
)
if
DEBUG
:
debug
(
lable_values
,
values
,
atol
=
atol
,
rtol
=
rtol
)
...
...
xmake.lua
View file @
194e19bd
...
...
@@ -66,6 +66,13 @@ if has_config("cudnn") then
add_defines
(
"ENABLE_CUDNN_API"
)
end
option
(
"cuda_arch"
)
set_showmenu
(
true
)
set_description
(
"Set CUDA GPU architecture (e.g. sm_90)"
)
set_values
(
"sm_50"
,
"sm_60"
,
"sm_70"
,
"sm_75"
,
"sm_80"
,
"sm_86"
,
"sm_89"
,
"sm_90"
,
"sm_90a"
)
set_category
(
"option"
)
option_end
()
-- 寒武纪
option
(
"cambricon-mlu"
)
set_default
(
false
)
...
...
xmake/nvidia.lua
View file @
194e19bd
...
...
@@ -3,6 +3,14 @@ if CUDNN_ROOT ~= nil then
add_includedirs
(
CUDNN_ROOT
..
"/include"
)
end
local
CUTLASS_ROOT
=
os.getenv
(
"CUTLASS_ROOT"
)
or
os.getenv
(
"CUTLASS_HOME"
)
or
os.getenv
(
"CUTLASS_PATH"
)
local
CUTE_ROOT
=
os.getenv
(
"CUTE_ROOT"
)
or
os.getenv
(
"CUTE_HOME"
)
or
os.getenv
(
"CUTE_PATH"
)
if
CUTLASS_ROOT
~=
nil
then
add_includedirs
(
CUTLASS_ROOT
)
add_includedirs
(
CUTE_ROOT
)
end
target
(
"infiniop-nvidia"
)
set_kind
(
"static"
)
add_deps
(
"infini-utils"
)
...
...
@@ -28,6 +36,11 @@ target("infiniop-nvidia")
target
:
add
(
"linkdirs"
,
path
.
directory
(
path
.
directory
(
nvcc_path
))
..
"/lib64/stubs"
)
target
:
add
(
"links"
,
"cuda"
)
local
cuda_arch
=
get_config
(
"cuda_arch"
)
if
cuda_arch
~=
nil
then
target
:
add
(
"cu-cxxflags"
,
"-arch="
,
cuda_arch
)
end
end
end
)
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment