Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
301cc55c
Unverified
Commit
301cc55c
authored
Jul 08, 2025
by
PanZezhong1725
Committed by
GitHub
Jul 08, 2025
Browse files
Merge pull request #303 from pengcheng888/issue/302
issue/302 - fix compile error of window system
parents
23077c42
b48d60ee
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
26 additions
and
17 deletions
+26
-17
README.md
README.md
+3
-0
src/infiniop/ops/random_sample/cuda/random_sample_kernel.cuh
src/infiniop/ops/random_sample/cuda/random_sample_kernel.cuh
+5
-5
src/infiniop/ops/rearrange/cuda/rearrange_cuda.cu
src/infiniop/ops/rearrange/cuda/rearrange_cuda.cu
+3
-3
src/infiniop/ops/rearrange/cuda/rearrange_kernel.cuh
src/infiniop/ops/rearrange/cuda/rearrange_kernel.cuh
+9
-9
xmake.lua
xmake.lua
+6
-0
No files found.
README.md
View file @
301cc55c
...
@@ -56,6 +56,7 @@ python scripts/install.py [XMAKE_CONFIG_FLAGS]
...
@@ -56,6 +56,7 @@ python scripts/install.py [XMAKE_CONFIG_FLAGS]
1.
项目配置
1.
项目配置
windows系统上,建议使用
`xmake v2.8.9`
编译项目。
-
查看当前配置
-
查看当前配置
```shell
```shell
...
@@ -73,6 +74,8 @@ python scripts/install.py [XMAKE_CONFIG_FLAGS]
...
@@ -73,6 +74,8 @@ python scripts/install.py [XMAKE_CONFIG_FLAGS]
```shell
```shell
# 英伟达
# 英伟达
# 可以指定 CUDA 路径, 一般环境变量为 `CUDA_HOME` 或者 `CUDA_ROOT`
# 可以指定 CUDA 路径, 一般环境变量为 `CUDA_HOME` 或者 `CUDA_ROOT`
# window系统:--cuda="%CUDA_HOME%"
# linux系统:--cuda=$CUDA_HOME
xmake f --nv-gpu=true --cuda=$CUDA_HOME -cv
xmake f --nv-gpu=true --cuda=$CUDA_HOME -cv
# 寒武纪
# 寒武纪
...
...
src/infiniop/ops/random_sample/cuda/random_sample_kernel.cuh
View file @
301cc55c
...
@@ -193,7 +193,7 @@ struct Algo {
...
@@ -193,7 +193,7 @@ struct Algo {
argMax_
(
argMax_
(
kv_pair
,
kv_pair
,
logits
,
logits
,
n
,
static_cast
<
int
>
(
n
)
,
workspace
,
workspace
,
workspace_size
,
stream
);
workspace_size
,
stream
);
castIdx
<<<
1
,
1
,
0
,
stream
>>>
((
Tidx
*
)
result
,
kv_pair
);
castIdx
<<<
1
,
1
,
0
,
stream
>>>
((
Tidx
*
)
result
,
kv_pair
);
...
@@ -232,20 +232,20 @@ struct Algo {
...
@@ -232,20 +232,20 @@ struct Algo {
auto
block
=
cub
::
Min
()((
size_t
)
block_size
,
n
);
auto
block
=
cub
::
Min
()((
size_t
)
block_size
,
n
);
auto
grid
=
(
n
+
block
-
1
)
/
block
;
auto
grid
=
(
n
+
block
-
1
)
/
block
;
// sort
// sort
fillIndices
<<<
grid
,
block
,
0
,
stream
>>>
(
indices
,
n
);
fillIndices
<<<
static_cast
<
unsigned
int
>
(
grid
)
,
static_cast
<
unsigned
int
>
(
block
)
,
0
,
stream
>>>
(
indices
,
static_cast
<
int
>
(
n
)
);
CHECK_CUDA
(
radixSort
(
CHECK_CUDA
(
radixSort
(
workspace_
,
workspace_size
,
workspace_
,
workspace_size
,
logits
,
sorted
,
logits
,
sorted
,
indices
,
indices_out
,
indices
,
indices_out
,
n
,
static_cast
<
int
>
(
n
)
,
stream
));
stream
));
// softmax
// softmax
partialSoftmaxKernel
<<<
grid
,
block
,
0
,
stream
>>>
(
sorted
,
n
,
temperature
);
partialSoftmaxKernel
<<<
static_cast
<
unsigned
int
>
(
grid
)
,
static_cast
<
unsigned
int
>
(
block
)
,
0
,
stream
>>>
(
sorted
,
static_cast
<
int
>
(
n
)
,
temperature
);
setSoftmaxMaxKernel
<<<
1
,
1
,
0
,
stream
>>>
(
sorted
);
setSoftmaxMaxKernel
<<<
1
,
1
,
0
,
stream
>>>
(
sorted
);
// sum
// sum
CHECK_CUDA
(
inclusiveSum
(
CHECK_CUDA
(
inclusiveSum
(
workspace_
,
workspace
,
workspace_
,
workspace
,
sorted
,
n
,
sorted
,
static_cast
<
int
>
(
n
)
,
stream
));
stream
));
// sample
// sample
randomSampleKernel
<<<
1
,
1
,
0
,
stream
>>>
(
randomSampleKernel
<<<
1
,
1
,
0
,
stream
>>>
(
...
...
src/infiniop/ops/rearrange/cuda/rearrange_cuda.cu
View file @
301cc55c
...
@@ -297,7 +297,7 @@ utils::Result<RearrangeParams> prepareRearrangeParams(const utils::RearrangeMeta
...
@@ -297,7 +297,7 @@ utils::Result<RearrangeParams> prepareRearrangeParams(const utils::RearrangeMeta
block_len
.
push_back
(
split_dims
[
j
].
num_per_block
);
block_len
.
push_back
(
split_dims
[
j
].
num_per_block
);
src_block_stride
.
push_back
(
dims
[
i
].
src_stride
);
src_block_stride
.
push_back
(
dims
[
i
].
src_stride
);
dst_block_stride
.
push_back
(
dims
[
i
].
dst_stride
);
dst_block_stride
.
push_back
(
dims
[
i
].
dst_stride
);
split_dims
[
j
].
array_struct_idx_block
=
block_dim
;
split_dims
[
j
].
array_struct_idx_block
=
static_cast
<
int
>
(
block_dim
)
;
block_dim
+=
1
;
block_dim
+=
1
;
block_len_total
*=
split_dims
[
j
].
num_per_block
;
block_len_total
*=
split_dims
[
j
].
num_per_block
;
}
}
...
@@ -316,7 +316,7 @@ utils::Result<RearrangeParams> prepareRearrangeParams(const utils::RearrangeMeta
...
@@ -316,7 +316,7 @@ utils::Result<RearrangeParams> prepareRearrangeParams(const utils::RearrangeMeta
grid_len
.
push_back
(
split_dims
[
j
].
num_per_grid
);
grid_len
.
push_back
(
split_dims
[
j
].
num_per_grid
);
src_grid_stride
.
push_back
(
dims
[
i
].
src_stride
*
split_dims
[
j
].
num_per_block
);
src_grid_stride
.
push_back
(
dims
[
i
].
src_stride
*
split_dims
[
j
].
num_per_block
);
dst_grid_stride
.
push_back
(
dims
[
i
].
dst_stride
*
split_dims
[
j
].
num_per_block
);
dst_grid_stride
.
push_back
(
dims
[
i
].
dst_stride
*
split_dims
[
j
].
num_per_block
);
split_dims
[
j
].
array_struct_idx_grid
=
grid_len
.
size
()
-
1
;
split_dims
[
j
].
array_struct_idx_grid
=
static_cast
<
int
>
(
grid_len
.
size
()
-
1
)
;
}
}
}
}
...
@@ -420,7 +420,7 @@ infiniStatus_t launchKernel(
...
@@ -420,7 +420,7 @@ infiniStatus_t launchKernel(
CHECK_OR_RETURN
(
cudaLaunchKernel
(
CHECK_OR_RETURN
(
cudaLaunchKernel
(
kernel_func
,
kernel_func
,
grid_size
,
BLOCK_SIZE
,
static_cast
<
unsigned
int
>
(
grid_size
)
,
static_cast
<
unsigned
int
>
(
BLOCK_SIZE
)
,
args
,
0
,
stream
)
args
,
0
,
stream
)
==
cudaSuccess
,
==
cudaSuccess
,
INFINI_STATUS_INTERNAL_ERROR
);
INFINI_STATUS_INTERNAL_ERROR
);
...
...
src/infiniop/ops/rearrange/cuda/rearrange_kernel.cuh
View file @
301cc55c
...
@@ -63,13 +63,13 @@ struct Constraint {
...
@@ -63,13 +63,13 @@ struct Constraint {
size_t remaining \
size_t remaining \
= blockIdx.x; \
= blockIdx.x; \
\
\
for (
ssize
_t i = grid_array_size - 1; i >= 0; i--) {
\
for (
ptrdiff
_t i = grid_array_size - 1; i >= 0; i--) { \
size_t idx = remaining % grid_len.a[i]; \
size_t idx = remaining % grid_len.a[i]; \
remaining /= grid_len.a[i]; \
remaining /= grid_len.a[i]; \
src_offset += idx * src_grid_stride.a[i]; \
src_offset += idx * src_grid_stride.a[i]; \
dst_offset += idx * dst_grid_stride.a[i]; \
dst_offset += idx * dst_grid_stride.a[i]; \
if (constraint_num > 0) { \
if (constraint_num > 0) { \
for (
ssize
_t j = 0; j < constraint_num; j++) {
\
for (
ptrdiff
_t j = 0; j < constraint_num; j++) { \
if (i == constraints.a[j].grid_idx) { \
if (i == constraints.a[j].grid_idx) { \
constraints_grid_idx_multiple[j] = idx * constraints.a[j].grid_div_block; \
constraints_grid_idx_multiple[j] = idx * constraints.a[j].grid_div_block; \
} \
} \
...
@@ -80,7 +80,7 @@ struct Constraint {
...
@@ -80,7 +80,7 @@ struct Constraint {
/* 将结果存入共享内存 */
\
/* 将结果存入共享内存 */
\
shared_src_offset = src_offset; \
shared_src_offset = src_offset; \
shared_dst_offset = dst_offset; \
shared_dst_offset = dst_offset; \
for (
ssize
_t j = 0; j < constraint_num; j++) {
\
for (
ptrdiff
_t j = 0; j < constraint_num; j++) { \
shared_constraints_grid_idx_multiple[j] = constraints_grid_idx_multiple[j]; \
shared_constraints_grid_idx_multiple[j] = constraints_grid_idx_multiple[j]; \
} \
} \
} \
} \
...
@@ -92,18 +92,18 @@ struct Constraint {
...
@@ -92,18 +92,18 @@ struct Constraint {
ptrdiff_t src_offset = shared_src_offset; \
ptrdiff_t src_offset = shared_src_offset; \
ptrdiff_t dst_offset = shared_dst_offset; \
ptrdiff_t dst_offset = shared_dst_offset; \
ARRAY_TYPE_SIZE constraints_grid_idx_multiple[constraint_num > 0 ? constraint_num : 1]; \
ARRAY_TYPE_SIZE constraints_grid_idx_multiple[constraint_num > 0 ? constraint_num : 1]; \
for (
ssize
_t j = 0; j < constraint_num; j++) {
\
for (
ptrdiff
_t j = 0; j < constraint_num; j++) { \
constraints_grid_idx_multiple[j] = shared_constraints_grid_idx_multiple[j]; \
constraints_grid_idx_multiple[j] = shared_constraints_grid_idx_multiple[j]; \
} \
} \
\
\
for (
ssize
_t i = block_array_size - 1; i >= 0; i--) {
\
for (
ptrdiff
_t i = block_array_size - 1; i >= 0; i--) { \
size_t idx = remaining % block_len.a[i]; \
size_t idx = remaining % block_len.a[i]; \
remaining /= block_len.a[i]; \
remaining /= block_len.a[i]; \
/* 计算偏移量 */
\
/* 计算偏移量 */
\
src_offset += idx * src_block_stride.a[i]; \
src_offset += idx * src_block_stride.a[i]; \
dst_offset += idx * dst_block_stride.a[i]; \
dst_offset += idx * dst_block_stride.a[i]; \
if (constraint_num > 0) { \
if (constraint_num > 0) { \
for (
ssize
_t j = 0; j < constraint_num; j++) {
\
for (
ptrdiff
_t j = 0; j < constraint_num; j++) { \
if (i == constraints.a[j].block_idx) { \
if (i == constraints.a[j].block_idx) { \
if (constraints_grid_idx_multiple[j] + idx >= constraints.a[j].total_len) { \
if (constraints_grid_idx_multiple[j] + idx >= constraints.a[j].total_len) { \
return; \
return; \
...
@@ -115,7 +115,7 @@ struct Constraint {
...
@@ -115,7 +115,7 @@ struct Constraint {
\
\
src_offset += remaining * src_block_stride.a[0]; \
src_offset += remaining * src_block_stride.a[0]; \
dst_offset += remaining * dst_block_stride.a[0]; \
dst_offset += remaining * dst_block_stride.a[0]; \
for (
ssize
_t j = 0; j < constraint_num; j++) {
\
for (
ptrdiff
_t j = 0; j < constraint_num; j++) { \
if (0 == constraints.a[j].block_idx) { \
if (0 == constraints.a[j].block_idx) { \
if (constraints_grid_idx_multiple[j] + remaining >= constraints.a[j].total_len) { \
if (constraints_grid_idx_multiple[j] + remaining >= constraints.a[j].total_len) { \
return; \
return; \
...
@@ -133,7 +133,7 @@ struct Constraint {
...
@@ -133,7 +133,7 @@ struct Constraint {
ptrdiff_t dst_offset = 0; \
ptrdiff_t dst_offset = 0; \
size_t remaining = blockIdx.x; \
size_t remaining = blockIdx.x; \
\
\
for (
ssize
_t i = grid_array_size - 1; i >= 0; i--) {
\
for (
ptrdiff
_t i = grid_array_size - 1; i >= 0; i--) { \
size_t idx = remaining % grid_len.a[i]; \
size_t idx = remaining % grid_len.a[i]; \
remaining /= grid_len.a[i]; \
remaining /= grid_len.a[i]; \
src_offset += idx * src_grid_stride.a[i]; \
src_offset += idx * src_grid_stride.a[i]; \
...
@@ -152,7 +152,7 @@ struct Constraint {
...
@@ -152,7 +152,7 @@ struct Constraint {
ptrdiff_t src_offset = shared_src_offset; \
ptrdiff_t src_offset = shared_src_offset; \
ptrdiff_t dst_offset = shared_dst_offset; \
ptrdiff_t dst_offset = shared_dst_offset; \
\
\
for (
ssize
_t i = block_array_size - 1; i > 0; i--) {
\
for (
ptrdiff
_t i = block_array_size - 1; i > 0; i--) { \
size_t idx = remaining % block_len.a[i]; \
size_t idx = remaining % block_len.a[i]; \
remaining /= block_len.a[i]; \
remaining /= block_len.a[i]; \
/* 计算偏移量 */
\
/* 计算偏移量 */
\
...
...
xmake.lua
View file @
301cc55c
...
@@ -12,6 +12,12 @@ if is_mode("debug") then
...
@@ -12,6 +12,12 @@ if is_mode("debug") then
add_defines
(
"DEBUG_MODE"
)
add_defines
(
"DEBUG_MODE"
)
end
end
if
is_plat
(
"windows"
)
then
set_runtimes
(
"MD"
)
add_ldflags
(
"/utf-8"
,
{
force
=
true
})
add_cxflags
(
"/utf-8"
,
{
force
=
true
})
end
-- CPU
-- CPU
option
(
"cpu"
)
option
(
"cpu"
)
set_default
(
true
)
set_default
(
true
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment