Unverified Commit 301cc55c authored by PanZezhong1725's avatar PanZezhong1725 Committed by GitHub
Browse files

Merge pull request #303 from pengcheng888/issue/302

issue/302 - fix compile errors on Windows systems
parents 23077c42 b48d60ee
...@@ -56,6 +56,7 @@ python scripts/install.py [XMAKE_CONFIG_FLAGS] ...@@ -56,6 +56,7 @@ python scripts/install.py [XMAKE_CONFIG_FLAGS]
1. 项目配置 1. 项目配置
windows系统上,建议使用`xmake v2.8.9`编译项目。
- 查看当前配置 - 查看当前配置
```shell ```shell
...@@ -73,6 +74,8 @@ python scripts/install.py [XMAKE_CONFIG_FLAGS] ...@@ -73,6 +74,8 @@ python scripts/install.py [XMAKE_CONFIG_FLAGS]
```shell ```shell
# 英伟达 # 英伟达
# 可以指定 CUDA 路径, 一般环境变量为 `CUDA_HOME` 或者 `CUDA_ROOT` # 可以指定 CUDA 路径, 一般环境变量为 `CUDA_HOME` 或者 `CUDA_ROOT`
# windows系统:--cuda="%CUDA_HOME%"
# linux系统:--cuda=$CUDA_HOME
xmake f --nv-gpu=true --cuda=$CUDA_HOME -cv xmake f --nv-gpu=true --cuda=$CUDA_HOME -cv
# 寒武纪 # 寒武纪
......
...@@ -193,7 +193,7 @@ struct Algo { ...@@ -193,7 +193,7 @@ struct Algo {
argMax_( argMax_(
kv_pair, kv_pair,
logits, logits,
n, static_cast<int>(n),
workspace, workspace,
workspace_size, stream); workspace_size, stream);
castIdx<<<1, 1, 0, stream>>>((Tidx *)result, kv_pair); castIdx<<<1, 1, 0, stream>>>((Tidx *)result, kv_pair);
...@@ -232,20 +232,20 @@ struct Algo { ...@@ -232,20 +232,20 @@ struct Algo {
auto block = cub::Min()((size_t)block_size, n); auto block = cub::Min()((size_t)block_size, n);
auto grid = (n + block - 1) / block; auto grid = (n + block - 1) / block;
// sort // sort
fillIndices<<<grid, block, 0, stream>>>(indices, n); fillIndices<<<static_cast<unsigned int>(grid), static_cast<unsigned int>(block), 0, stream>>>(indices, static_cast<int>(n));
CHECK_CUDA(radixSort( CHECK_CUDA(radixSort(
workspace_, workspace_size, workspace_, workspace_size,
logits, sorted, logits, sorted,
indices, indices_out, indices, indices_out,
n, static_cast<int>(n),
stream)); stream));
// softmax // softmax
partialSoftmaxKernel<<<grid, block, 0, stream>>>(sorted, n, temperature); partialSoftmaxKernel<<<static_cast<unsigned int>(grid), static_cast<unsigned int>(block), 0, stream>>>(sorted, static_cast<int>(n), temperature);
setSoftmaxMaxKernel<<<1, 1, 0, stream>>>(sorted); setSoftmaxMaxKernel<<<1, 1, 0, stream>>>(sorted);
// sum // sum
CHECK_CUDA(inclusiveSum( CHECK_CUDA(inclusiveSum(
workspace_, workspace, workspace_, workspace,
sorted, n, sorted, static_cast<int>(n),
stream)); stream));
// sample // sample
randomSampleKernel<<<1, 1, 0, stream>>>( randomSampleKernel<<<1, 1, 0, stream>>>(
......
...@@ -297,7 +297,7 @@ utils::Result<RearrangeParams> prepareRearrangeParams(const utils::RearrangeMeta ...@@ -297,7 +297,7 @@ utils::Result<RearrangeParams> prepareRearrangeParams(const utils::RearrangeMeta
block_len.push_back(split_dims[j].num_per_block); block_len.push_back(split_dims[j].num_per_block);
src_block_stride.push_back(dims[i].src_stride); src_block_stride.push_back(dims[i].src_stride);
dst_block_stride.push_back(dims[i].dst_stride); dst_block_stride.push_back(dims[i].dst_stride);
split_dims[j].array_struct_idx_block = block_dim; split_dims[j].array_struct_idx_block = static_cast<int>(block_dim);
block_dim += 1; block_dim += 1;
block_len_total *= split_dims[j].num_per_block; block_len_total *= split_dims[j].num_per_block;
} }
...@@ -316,7 +316,7 @@ utils::Result<RearrangeParams> prepareRearrangeParams(const utils::RearrangeMeta ...@@ -316,7 +316,7 @@ utils::Result<RearrangeParams> prepareRearrangeParams(const utils::RearrangeMeta
grid_len.push_back(split_dims[j].num_per_grid); grid_len.push_back(split_dims[j].num_per_grid);
src_grid_stride.push_back(dims[i].src_stride * split_dims[j].num_per_block); src_grid_stride.push_back(dims[i].src_stride * split_dims[j].num_per_block);
dst_grid_stride.push_back(dims[i].dst_stride * split_dims[j].num_per_block); dst_grid_stride.push_back(dims[i].dst_stride * split_dims[j].num_per_block);
split_dims[j].array_struct_idx_grid = grid_len.size() - 1; split_dims[j].array_struct_idx_grid = static_cast<int>(grid_len.size() - 1);
} }
} }
...@@ -420,7 +420,7 @@ infiniStatus_t launchKernel( ...@@ -420,7 +420,7 @@ infiniStatus_t launchKernel(
CHECK_OR_RETURN(cudaLaunchKernel( CHECK_OR_RETURN(cudaLaunchKernel(
kernel_func, kernel_func,
grid_size, BLOCK_SIZE, static_cast<unsigned int>(grid_size), static_cast<unsigned int>(BLOCK_SIZE),
args, 0, stream) args, 0, stream)
== cudaSuccess, == cudaSuccess,
INFINI_STATUS_INTERNAL_ERROR); INFINI_STATUS_INTERNAL_ERROR);
......
...@@ -63,13 +63,13 @@ struct Constraint { ...@@ -63,13 +63,13 @@ struct Constraint {
size_t remaining \ size_t remaining \
= blockIdx.x; \ = blockIdx.x; \
\ \
for (ssize_t i = grid_array_size - 1; i >= 0; i--) { \ for (ptrdiff_t i = grid_array_size - 1; i >= 0; i--) { \
size_t idx = remaining % grid_len.a[i]; \ size_t idx = remaining % grid_len.a[i]; \
remaining /= grid_len.a[i]; \ remaining /= grid_len.a[i]; \
src_offset += idx * src_grid_stride.a[i]; \ src_offset += idx * src_grid_stride.a[i]; \
dst_offset += idx * dst_grid_stride.a[i]; \ dst_offset += idx * dst_grid_stride.a[i]; \
if (constraint_num > 0) { \ if (constraint_num > 0) { \
for (ssize_t j = 0; j < constraint_num; j++) { \ for (ptrdiff_t j = 0; j < constraint_num; j++) { \
if (i == constraints.a[j].grid_idx) { \ if (i == constraints.a[j].grid_idx) { \
constraints_grid_idx_multiple[j] = idx * constraints.a[j].grid_div_block; \ constraints_grid_idx_multiple[j] = idx * constraints.a[j].grid_div_block; \
} \ } \
...@@ -80,7 +80,7 @@ struct Constraint { ...@@ -80,7 +80,7 @@ struct Constraint {
/* 将结果存入共享内存 */ \ /* 将结果存入共享内存 */ \
shared_src_offset = src_offset; \ shared_src_offset = src_offset; \
shared_dst_offset = dst_offset; \ shared_dst_offset = dst_offset; \
for (ssize_t j = 0; j < constraint_num; j++) { \ for (ptrdiff_t j = 0; j < constraint_num; j++) { \
shared_constraints_grid_idx_multiple[j] = constraints_grid_idx_multiple[j]; \ shared_constraints_grid_idx_multiple[j] = constraints_grid_idx_multiple[j]; \
} \ } \
} \ } \
...@@ -92,18 +92,18 @@ struct Constraint { ...@@ -92,18 +92,18 @@ struct Constraint {
ptrdiff_t src_offset = shared_src_offset; \ ptrdiff_t src_offset = shared_src_offset; \
ptrdiff_t dst_offset = shared_dst_offset; \ ptrdiff_t dst_offset = shared_dst_offset; \
ARRAY_TYPE_SIZE constraints_grid_idx_multiple[constraint_num > 0 ? constraint_num : 1]; \ ARRAY_TYPE_SIZE constraints_grid_idx_multiple[constraint_num > 0 ? constraint_num : 1]; \
for (ssize_t j = 0; j < constraint_num; j++) { \ for (ptrdiff_t j = 0; j < constraint_num; j++) { \
constraints_grid_idx_multiple[j] = shared_constraints_grid_idx_multiple[j]; \ constraints_grid_idx_multiple[j] = shared_constraints_grid_idx_multiple[j]; \
} \ } \
\ \
for (ssize_t i = block_array_size - 1; i >= 0; i--) { \ for (ptrdiff_t i = block_array_size - 1; i >= 0; i--) { \
size_t idx = remaining % block_len.a[i]; \ size_t idx = remaining % block_len.a[i]; \
remaining /= block_len.a[i]; \ remaining /= block_len.a[i]; \
/* 计算偏移量 */ \ /* 计算偏移量 */ \
src_offset += idx * src_block_stride.a[i]; \ src_offset += idx * src_block_stride.a[i]; \
dst_offset += idx * dst_block_stride.a[i]; \ dst_offset += idx * dst_block_stride.a[i]; \
if (constraint_num > 0) { \ if (constraint_num > 0) { \
for (ssize_t j = 0; j < constraint_num; j++) { \ for (ptrdiff_t j = 0; j < constraint_num; j++) { \
if (i == constraints.a[j].block_idx) { \ if (i == constraints.a[j].block_idx) { \
if (constraints_grid_idx_multiple[j] + idx >= constraints.a[j].total_len) { \ if (constraints_grid_idx_multiple[j] + idx >= constraints.a[j].total_len) { \
return; \ return; \
...@@ -115,7 +115,7 @@ struct Constraint { ...@@ -115,7 +115,7 @@ struct Constraint {
\ \
src_offset += remaining * src_block_stride.a[0]; \ src_offset += remaining * src_block_stride.a[0]; \
dst_offset += remaining * dst_block_stride.a[0]; \ dst_offset += remaining * dst_block_stride.a[0]; \
for (ssize_t j = 0; j < constraint_num; j++) { \ for (ptrdiff_t j = 0; j < constraint_num; j++) { \
if (0 == constraints.a[j].block_idx) { \ if (0 == constraints.a[j].block_idx) { \
if (constraints_grid_idx_multiple[j] + remaining >= constraints.a[j].total_len) { \ if (constraints_grid_idx_multiple[j] + remaining >= constraints.a[j].total_len) { \
return; \ return; \
...@@ -133,7 +133,7 @@ struct Constraint { ...@@ -133,7 +133,7 @@ struct Constraint {
ptrdiff_t dst_offset = 0; \ ptrdiff_t dst_offset = 0; \
size_t remaining = blockIdx.x; \ size_t remaining = blockIdx.x; \
\ \
for (ssize_t i = grid_array_size - 1; i >= 0; i--) { \ for (ptrdiff_t i = grid_array_size - 1; i >= 0; i--) { \
size_t idx = remaining % grid_len.a[i]; \ size_t idx = remaining % grid_len.a[i]; \
remaining /= grid_len.a[i]; \ remaining /= grid_len.a[i]; \
src_offset += idx * src_grid_stride.a[i]; \ src_offset += idx * src_grid_stride.a[i]; \
...@@ -152,7 +152,7 @@ struct Constraint { ...@@ -152,7 +152,7 @@ struct Constraint {
ptrdiff_t src_offset = shared_src_offset; \ ptrdiff_t src_offset = shared_src_offset; \
ptrdiff_t dst_offset = shared_dst_offset; \ ptrdiff_t dst_offset = shared_dst_offset; \
\ \
for (ssize_t i = block_array_size - 1; i > 0; i--) { \ for (ptrdiff_t i = block_array_size - 1; i > 0; i--) { \
size_t idx = remaining % block_len.a[i]; \ size_t idx = remaining % block_len.a[i]; \
remaining /= block_len.a[i]; \ remaining /= block_len.a[i]; \
/* 计算偏移量 */ \ /* 计算偏移量 */ \
......
...@@ -12,6 +12,12 @@ if is_mode("debug") then ...@@ -12,6 +12,12 @@ if is_mode("debug") then
add_defines("DEBUG_MODE") add_defines("DEBUG_MODE")
end end
if is_plat("windows") then
set_runtimes("MD")
add_ldflags("/utf-8", {force = true})
add_cxflags("/utf-8", {force = true})
end
-- CPU -- CPU
option("cpu") option("cpu")
set_default(true) set_default(true)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment