Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
17415721
Unverified
Commit
17415721
authored
Apr 24, 2025
by
PanZezhong1725
Committed by
GitHub
Apr 24, 2025
Browse files
Merge pull request #128 from pwhMass/rearrange
issue/131: rearrange 算子 - CUDA
parents
a16380fb
c98dac66
Changes
8
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
1002 additions
and
24 deletions
+1002
-24
src/infiniop/ops/rearrange/cpu/rearrange_cpu.cc
src/infiniop/ops/rearrange/cpu/rearrange_cpu.cc
+8
-10
src/infiniop/ops/rearrange/cuda/rearrange_cuda.cu
src/infiniop/ops/rearrange/cuda/rearrange_cuda.cu
+485
-0
src/infiniop/ops/rearrange/cuda/rearrange_cuda.cuh
src/infiniop/ops/rearrange/cuda/rearrange_cuda.cuh
+8
-0
src/infiniop/ops/rearrange/cuda/rearrange_kernel.cuh
src/infiniop/ops/rearrange/cuda/rearrange_kernel.cuh
+330
-0
src/infiniop/ops/rearrange/operator.cc
src/infiniop/ops/rearrange/operator.cc
+16
-0
src/utils/rearrange.cc
src/utils/rearrange.cc
+69
-0
src/utils/rearrange.h
src/utils/rearrange.h
+3
-0
test/infiniop/rearrange.py
test/infiniop/rearrange.py
+83
-14
No files found.
src/infiniop/ops/rearrange/cpu/rearrange_cpu.cc
View file @
17415721
...
...
@@ -15,20 +15,18 @@ infiniStatus_t Descriptor::create(
auto
handle
=
reinterpret_cast
<
device
::
cpu
::
Handle
*>
(
handle_
);
auto
dtype
=
y_desc
->
dtype
();
auto
ndim
=
y_desc
->
ndim
();
auto
shape
=
y_desc
->
shape
().
data
();
CHECK_API_OR
(
x_desc
->
dtype
(),
dtype
,
return
INFINI_STATUS_BAD_TENSOR_DTYPE
);
CHECK_API_OR
(
x_desc
->
ndim
(),
ndim
,
return
INFINI_STATUS_BAD_TENSOR_SHAPE
);
auto
y_shape
=
y_desc
->
shape
();
auto
x_shape
=
x_desc
->
shape
();
CHECK_OR_RETURN
(
x_desc
->
dtype
()
==
dtype
,
INFINI_STATUS_BAD_TENSOR_DTYPE
);
CHECK_OR_RETURN
(
x_desc
->
ndim
()
==
ndim
,
INFINI_STATUS_BAD_TENSOR_SHAPE
);
CHECK_SAME_SHAPE
(
x_shape
,
y_shape
);
for
(
size_t
i
=
0
;
i
<
ndim
;
++
i
)
{
CHECK_API_OR
(
x_desc
->
shape
()[
i
],
shape
[
i
],
return
INFINI_STATUS_BAD_TENSOR_SHAPE
);
}
auto
dst_strides
=
y_desc
->
strides
().
data
();
auto
src_strides
=
x_desc
->
strides
().
data
();
auto
dst_strides
=
y_desc
->
strides
();
auto
src_strides
=
x_desc
->
strides
();
auto
element_size
=
infiniSizeOf
(
dtype
);
auto
result
=
utils
::
RearrangeMeta
::
create
(
shape
,
dst_strides
,
src_strides
,
ndim
,
element_size
);
auto
result
=
utils
::
RearrangeMeta
::
create
(
y_
shape
.
data
()
,
dst_strides
.
data
()
,
src_strides
.
data
()
,
ndim
,
element_size
);
CHECK_RESULT
(
result
);
*
desc_ptr
=
new
Descriptor
(
...
...
src/infiniop/ops/rearrange/cuda/rearrange_cuda.cu
0 → 100644
View file @
17415721
#include "../../../devices/cuda/cuda_common.cuh"
#include "../../../devices/cuda/cuda_kernel_common.cuh"
#include "../../../tensor.h"
#include "rearrange_cuda.cuh"
#include "rearrange_kernel.cuh"
#include <algorithm>
#include <cmath>
#include <memory>
#include <stdint.h>
#include <vector>
namespace
op
::
rearrange
::
cuda
{
struct
Descriptor
::
Opaque
{
std
::
shared_ptr
<
device
::
cuda
::
Handle
::
Internal
>
internal
;
};
Descriptor
::~
Descriptor
()
{
delete
_opaque
;
}
infiniStatus_t
Descriptor
::
create
(
infiniopHandle_t
handle
,
Descriptor
**
desc_ptr
,
infiniopTensorDescriptor_t
y_desc
,
infiniopTensorDescriptor_t
x_desc
)
{
auto
dtype
=
y_desc
->
dtype
();
auto
ndim
=
y_desc
->
ndim
();
CHECK_OR_RETURN
(
x_desc
->
dtype
()
==
dtype
,
INFINI_STATUS_BAD_TENSOR_DTYPE
);
CHECK_OR_RETURN
(
x_desc
->
ndim
()
==
ndim
,
INFINI_STATUS_BAD_TENSOR_SHAPE
);
// 保存临时vector对象
auto
x_shape
=
x_desc
->
shape
();
auto
y_shape
=
y_desc
->
shape
();
auto
y_strides
=
y_desc
->
strides
();
auto
x_strides
=
x_desc
->
strides
();
CHECK_SAME_SHAPE
(
x_shape
,
y_shape
);
auto
meta
=
utils
::
RearrangeMeta
::
create
(
y_shape
.
data
(),
y_strides
.
data
(),
x_strides
.
data
(),
ndim
,
infiniSizeOf
(
dtype
));
CHECK_RESULT
(
meta
);
*
desc_ptr
=
new
Descriptor
(
std
::
move
(
*
meta
),
new
Opaque
{
reinterpret_cast
<
device
::
cuda
::
Handle
*>
(
handle
)
->
internal
()},
handle
->
device
,
handle
->
device_id
);
return
INFINI_STATUS_SUCCESS
;
}
// 维度信息结构
struct
Dim
{
size_t
len
;
ARRAY_TYPE_STRIDE
src_stride
;
ARRAY_TYPE_STRIDE
dst_stride
;
};
// 分割维度结构
struct
SplitDim
{
size_t
choose_idx
;
size_t
num_per_block
;
size_t
num_per_grid
;
int
array_struct_idx_block
;
int
array_struct_idx_grid
;
size_t
dim_len
;
};
/**
* 根据给定的元数据准备张量重排参数,该函数主要完成以下工作:
* 1. 根据原始元数据调整单元大小,获取更适合GPU处理的单元大小
* 2. 将维度分配为CUDA块(block)维度和网格(grid)维度:
* 该步骤是核心,目标是为每个block分配尽可能多的相对连续的数据进行处理,
* 对无法完整放入块的维度进行分割,并记录分割维度信息,用于防止kernel访问越界,最大化内存访问局部性和计算效率
*/
utils
::
Result
<
RearrangeParams
>
prepareRearrangeParams
(
const
utils
::
RearrangeMeta
&
original_meta
,
int
max_threads
)
{
RearrangeParams
params
;
// 获取更适合GPU处理的单元大小,这里使用2的幂次方
auto
meta_result
=
original_meta
.
distributeUnit
({
32
,
16
,
8
,
4
,
2
,
1
});
CHECK_RESULT
(
meta_result
);
const
utils
::
RearrangeMeta
&
meta
=
meta_result
.
take
();
// 获取维度信息
const
size_t
ndim
=
meta
.
ndim
();
const
size_t
unit
=
meta
.
unit
();
// 特殊情况:无维度,只需要简单复制
if
(
ndim
==
0
)
{
params
.
block_dim
=
0
;
params
.
block_len_total
=
1
;
params
.
block_len
=
{
static_cast
<
ARRAY_TYPE_SIZE
>
(
1
)};
params
.
src_block_stride
=
{
static_cast
<
ARRAY_TYPE_STRIDE
>
(
0
)};
params
.
dst_block_stride
=
{
static_cast
<
ARRAY_TYPE_STRIDE
>
(
0
)};
params
.
grid_len
=
{
static_cast
<
ARRAY_TYPE_SIZE
>
(
1
)};
params
.
src_grid_stride
=
{
static_cast
<
ARRAY_TYPE_STRIDE
>
(
0
)};
params
.
dst_grid_stride
=
{
static_cast
<
ARRAY_TYPE_STRIDE
>
(
0
)};
params
.
unit_size
=
unit
;
return
utils
::
Result
<
RearrangeParams
>
(
params
);
}
// 从元数据中提取必要的信息
const
ptrdiff_t
*
idx_strides
=
meta
.
idx_strides
();
const
ptrdiff_t
*
dst_strides
=
meta
.
dst_strides
();
const
ptrdiff_t
*
src_strides
=
meta
.
src_strides
();
// 准备维度信息
std
::
vector
<
Dim
>
dims
;
std
::
vector
<
size_t
>
shape
;
dims
.
reserve
(
ndim
);
shape
.
reserve
(
ndim
);
auto
prev_idx_stride
=
meta
.
count
();
for
(
size_t
i
=
0
;
i
<
ndim
;
++
i
)
{
size_t
len
=
prev_idx_stride
/
idx_strides
[
i
];
shape
.
push_back
(
len
);
dims
.
push_back
({
len
,
src_strides
[
i
],
dst_strides
[
i
]});
prev_idx_stride
=
idx_strides
[
i
];
}
// 计算src_strides的降序排序索引,类似于Rust版本中的src_strides_desc_idx
std
::
vector
<
size_t
>
src_strides_desc_idx
(
ndim
);
for
(
size_t
i
=
0
;
i
<
ndim
;
++
i
)
{
src_strides_desc_idx
[
i
]
=
i
;
}
std
::
sort
(
src_strides_desc_idx
.
begin
(),
src_strides_desc_idx
.
end
(),
[
&
dims
](
size_t
a
,
size_t
b
)
{
return
std
::
abs
(
dims
[
a
].
src_stride
)
>
std
::
abs
(
dims
[
b
].
src_stride
);
});
// 根据最大线程数选择block和grid维度
const
size_t
block_size
=
max_threads
;
std
::
vector
<
bool
>
block_dim_choose
(
ndim
,
false
);
// 初始化计数器
size_t
block_elements
=
1
;
size_t
block_src_elements
=
1
;
size_t
block_dst_elements
=
1
;
size_t
src_choose_idx
=
ndim
;
size_t
dst_choose_idx
=
ndim
;
// 用于存储分割维度信息
std
::
vector
<
SplitDim
>
split_dims
;
// 维度选择循环
while
(
src_choose_idx
>
0
&&
dst_choose_idx
>
0
)
{
// 获取当前需要处理的维度索引
size_t
src_idx
=
src_strides_desc_idx
[
src_choose_idx
-
1
];
size_t
dst_idx
=
dst_choose_idx
-
1
;
if
(
src_idx
==
dst_idx
)
{
// 源和目标维度相同,可以一起处理
size_t
idx
=
src_idx
;
size_t
len
=
shape
[
idx
];
// 检查是否可以将此维度完全添加到block中
if
(
block_elements
*
len
<=
block_size
)
{
// 选择此维度
block_dim_choose
[
idx
]
=
true
;
block_elements
*=
len
;
block_src_elements
*=
len
;
block_dst_elements
*=
len
;
src_choose_idx
--
;
dst_choose_idx
--
;
}
else
{
// 需要分割此维度
size_t
num_per_block
=
block_size
/
block_elements
;
// 确保num_per_block > 0且len >= num_per_block
if
(
num_per_block
>
0
&&
len
>=
num_per_block
&&
num_per_block
>
1
)
{
size_t
num_per_grid
=
(
len
+
num_per_block
-
1
)
/
num_per_block
;
// 向上取整
SplitDim
split_dim
=
{
idx
,
// choose_idx
num_per_block
,
// num_per_block
num_per_grid
,
// num_per_grid
0
,
// array_struct_idx_block (待更新)
0
,
// array_struct_idx_grid (待更新)
len
// 原始维度长度
};
split_dims
.
push_back
(
split_dim
);
}
break
;
}
}
else
{
// 源和目标维度不同,需要分别处理
// 计算块比例
double
src_div_dst
=
static_cast
<
double
>
(
block_src_elements
)
/
block_dst_elements
;
double
src_num_per_block
=
std
::
sqrt
(
block_size
/
(
double
)
block_elements
/
src_div_dst
);
double
dst_num_per_block
=
src_num_per_block
*
src_div_dst
;
size_t
src_current_dim_len
=
shape
[
src_idx
];
size_t
dst_current_dim_len
=
shape
[
dst_idx
];
if
(
static_cast
<
double
>
(
src_current_dim_len
)
<
src_num_per_block
)
{
// 源维度可以完全添加到block
block_dim_choose
[
src_idx
]
=
true
;
block_elements
*=
src_current_dim_len
;
block_src_elements
*=
src_current_dim_len
;
src_choose_idx
--
;
}
else
if
(
static_cast
<
double
>
(
dst_current_dim_len
)
<
dst_num_per_block
)
{
// 目标维度可以完全添加到block
block_dim_choose
[
dst_idx
]
=
true
;
block_elements
*=
dst_current_dim_len
;
block_dst_elements
*=
dst_current_dim_len
;
dst_choose_idx
--
;
}
else
{
// 需要分割源和目标维度
size_t
src_num_per_block_int
=
static_cast
<
size_t
>
(
std
::
floor
(
src_num_per_block
));
size_t
dst_num_per_block_int
=
static_cast
<
size_t
>
(
std
::
floor
(
dst_num_per_block
));
// 计算网格尺寸
size_t
src_num_per_grid
=
(
src_current_dim_len
+
src_num_per_block_int
-
1
)
/
src_num_per_block_int
;
// 向上取整
size_t
dst_num_per_grid
=
(
dst_current_dim_len
+
dst_num_per_block_int
-
1
)
/
dst_num_per_block_int
;
// 向上取整
// 处理源维度
if
(
src_num_per_block_int
>
1
)
{
if
(
src_num_per_grid
==
1
)
{
// 可以完全放入块
block_dim_choose
[
src_idx
]
=
true
;
block_elements
*=
src_current_dim_len
;
block_src_elements
*=
src_current_dim_len
;
src_choose_idx
--
;
}
else
{
// 需要分割
SplitDim
split_dim
=
{
src_idx
,
// choose_idx
src_num_per_block_int
,
// num_per_block
src_num_per_grid
,
// num_per_grid
0
,
// array_struct_idx_block (待更新)
0
,
// array_struct_idx_grid (待更新)
src_current_dim_len
// 原始维度长度
};
split_dims
.
push_back
(
split_dim
);
}
}
// 处理目标维度
if
(
dst_num_per_block_int
>
1
)
{
if
(
dst_num_per_grid
==
1
)
{
// 可以完全放入块
block_dim_choose
[
dst_idx
]
=
true
;
block_elements
*=
dst_current_dim_len
;
block_dst_elements
*=
dst_current_dim_len
;
dst_choose_idx
--
;
}
else
{
// 需要分割
SplitDim
split_dim
=
{
dst_idx
,
// choose_idx
dst_num_per_block_int
,
// num_per_block
dst_num_per_grid
,
// num_per_grid
0
,
// array_struct_idx_block (待更新)
0
,
// array_struct_idx_grid (待更新)
dst_current_dim_len
// 原始维度长度
};
split_dims
.
push_back
(
split_dim
);
}
}
break
;
}
}
}
// 准备block维度相关参数
size_t
block_dim
=
0
;
size_t
block_len_total
=
1
;
std
::
vector
<
ARRAY_TYPE_SIZE
>
block_len
;
std
::
vector
<
ARRAY_TYPE_STRIDE
>
src_block_stride
;
std
::
vector
<
ARRAY_TYPE_STRIDE
>
dst_block_stride
;
std
::
vector
<
ARRAY_TYPE_SIZE
>
grid_len
;
std
::
vector
<
ARRAY_TYPE_STRIDE
>
src_grid_stride
;
std
::
vector
<
ARRAY_TYPE_STRIDE
>
dst_grid_stride
;
// 处理block维度,填充block_len和block_stride
for
(
size_t
i
=
0
;
i
<
ndim
;
++
i
)
{
if
(
block_dim_choose
[
i
])
{
block_len
.
push_back
(
shape
[
i
]);
src_block_stride
.
push_back
(
dims
[
i
].
src_stride
);
dst_block_stride
.
push_back
(
dims
[
i
].
dst_stride
);
block_dim
+=
1
;
block_len_total
*=
shape
[
i
];
}
// 处理分割维度的block部分
for
(
size_t
j
=
0
;
j
<
split_dims
.
size
();
++
j
)
{
if
(
i
==
split_dims
[
j
].
choose_idx
)
{
block_len
.
push_back
(
split_dims
[
j
].
num_per_block
);
src_block_stride
.
push_back
(
dims
[
i
].
src_stride
);
dst_block_stride
.
push_back
(
dims
[
i
].
dst_stride
);
split_dims
[
j
].
array_struct_idx_block
=
block_dim
;
block_dim
+=
1
;
block_len_total
*=
split_dims
[
j
].
num_per_block
;
}
}
}
// 处理grid维度,填充grid_len和grid_stride
for
(
size_t
i
=
0
;
i
<
ndim
;
++
i
)
{
if
(
!
block_dim_choose
[
i
])
{
bool
is_split
=
false
;
// 检查是否是分割维度
for
(
size_t
j
=
0
;
j
<
split_dims
.
size
();
++
j
)
{
if
(
i
==
split_dims
[
j
].
choose_idx
)
{
is_split
=
true
;
grid_len
.
push_back
(
split_dims
[
j
].
num_per_grid
);
src_grid_stride
.
push_back
(
dims
[
i
].
src_stride
*
split_dims
[
j
].
num_per_block
);
dst_grid_stride
.
push_back
(
dims
[
i
].
dst_stride
*
split_dims
[
j
].
num_per_block
);
split_dims
[
j
].
array_struct_idx_grid
=
grid_len
.
size
()
-
1
;
}
}
// 如果不是分割维度,则作为完整的grid维度
if
(
!
is_split
)
{
grid_len
.
push_back
(
shape
[
i
]);
src_grid_stride
.
push_back
(
dims
[
i
].
src_stride
);
dst_grid_stride
.
push_back
(
dims
[
i
].
dst_stride
);
}
}
}
// 如果grid_len为空,添加一个默认值
if
(
grid_len
.
empty
())
{
grid_len
.
push_back
(
1
);
src_grid_stride
.
push_back
(
0
);
dst_grid_stride
.
push_back
(
0
);
}
// 处理约束条件 - 使用与Rust版本相似的逻辑
std
::
vector
<
Constraint
<
ARRAY_TYPE_SIZE
>>
constraints
;
// 限制最多处理2个约束条件
for
(
size_t
i
=
0
;
i
<
split_dims
.
size
();
++
i
)
{
if
(
split_dims
[
i
].
dim_len
%
split_dims
[
i
].
num_per_block
==
0
)
{
continue
;
}
Constraint
<
ARRAY_TYPE_SIZE
>
constraint
;
constraint
.
grid_idx
=
split_dims
[
i
].
array_struct_idx_grid
;
constraint
.
block_idx
=
split_dims
[
i
].
array_struct_idx_block
;
constraint
.
grid_div_block
=
split_dims
[
i
].
num_per_block
;
constraint
.
total_len
=
split_dims
[
i
].
dim_len
;
constraints
.
push_back
(
constraint
);
}
// 设置参数
params
.
block_dim
=
block_dim
;
params
.
block_len_total
=
block_len_total
;
params
.
block_len
=
block_len
;
params
.
src_block_stride
=
src_block_stride
;
params
.
dst_block_stride
=
dst_block_stride
;
params
.
grid_len
=
grid_len
;
params
.
src_grid_stride
=
src_grid_stride
;
params
.
dst_grid_stride
=
dst_grid_stride
;
params
.
constraints
=
constraints
;
params
.
unit_size
=
unit
;
return
utils
::
Result
<
RearrangeParams
>
(
params
);
}
// 带约束的内核启动模板函数
template
<
unsigned
int
BLOCK_SIZE
>
infiniStatus_t
launchKernel
(
void
*
y
,
const
void
*
x
,
size_t
grid_size
,
const
RearrangeParams
&
params
,
size_t
unit_size
,
cudaStream_t
stream
)
{
// 获取内核函数
RearrangeParams
params_copy
=
params
;
// 创建一个非const副本
auto
kernel_func_result
=
getRearrangeKernel
(
params_copy
);
CHECK_RESULT
(
kernel_func_result
);
auto
kernel_func
=
kernel_func_result
.
take
();
// 创建非const的临时变量
size_t
block_dim
=
params
.
block_dim
;
size_t
block_len_total
=
params
.
block_len_total
;
// 检查向量尺寸是否合理
if
(
params
.
block_len
.
size
()
<
block_dim
||
params
.
src_block_stride
.
size
()
<
block_dim
||
params
.
dst_block_stride
.
size
()
<
block_dim
)
{
return
INFINI_STATUS_BAD_PARAM
;
}
if
(
params
.
grid_len
.
empty
()
||
params
.
src_grid_stride
.
empty
()
||
params
.
dst_grid_stride
.
empty
())
{
return
INFINI_STATUS_BAD_PARAM
;
}
const
Constraint
<
ARRAY_TYPE_SIZE
>
*
constraints_data
;
auto
empty_constraints
=
Constraint
<
ARRAY_TYPE_SIZE
>
();
if
(
params
.
constraints
.
empty
())
{
constraints_data
=
&
empty_constraints
;
}
else
{
constraints_data
=
params
.
constraints
.
data
();
}
void
*
args
[]
=
{
&
y
,
&
x
,
&
block_dim
,
&
block_len_total
,
const_cast
<
void
*>
(
static_cast
<
const
void
*>
(
params
.
block_len
.
data
())),
const_cast
<
void
*>
(
static_cast
<
const
void
*>
(
params
.
src_block_stride
.
data
())),
const_cast
<
void
*>
(
static_cast
<
const
void
*>
(
params
.
dst_block_stride
.
data
())),
const_cast
<
void
*>
(
static_cast
<
const
void
*>
(
params
.
grid_len
.
data
())),
const_cast
<
void
*>
(
static_cast
<
const
void
*>
(
params
.
src_grid_stride
.
data
())),
const_cast
<
void
*>
(
static_cast
<
const
void
*>
(
params
.
dst_grid_stride
.
data
())),
const_cast
<
void
*>
(
static_cast
<
const
void
*>
(
constraints_data
))};
CHECK_OR_RETURN
(
cudaLaunchKernel
(
kernel_func
,
grid_size
,
BLOCK_SIZE
,
args
,
0
,
stream
)
==
cudaSuccess
,
INFINI_STATUS_INTERNAL_ERROR
);
return
INFINI_STATUS_SUCCESS
;
}
infiniStatus_t
Descriptor
::
calculate
(
void
*
y
,
const
void
*
x
,
void
*
stream
)
const
{
auto
cuda_stream
=
reinterpret_cast
<
cudaStream_t
>
(
stream
);
// 如果没有维度,直接进行内存拷贝
if
(
_meta
.
ndim
()
==
0
)
{
auto
err
=
cudaMemcpyAsync
(
y
,
x
,
_meta
.
unit
(),
cudaMemcpyDeviceToDevice
,
cuda_stream
);
if
(
err
!=
cudaSuccess
)
{
return
INFINI_STATUS_INTERNAL_ERROR
;
}
CHECK_OR_RETURN
(
cudaMemcpyAsync
(
y
,
x
,
_meta
.
unit
(),
cudaMemcpyDeviceToDevice
,
cuda_stream
)
==
cudaSuccess
,
INFINI_STATUS_INTERNAL_ERROR
);
return
INFINI_STATUS_SUCCESS
;
}
// 获取设备属性
int
max_threads
=
_opaque
->
internal
->
maxThreadsPerBlock
();
// 准备参数
auto
params_result
=
prepareRearrangeParams
(
_meta
,
std
::
min
(
CUDA_BLOCK_SIZE_1024
,
max_threads
));
CHECK_RESULT
(
params_result
);
auto
params
=
params_result
.
take
();
// 计算grid大小
size_t
grid_size
=
1
;
for
(
size_t
i
=
0
;
i
<
params
.
grid_len
.
size
();
++
i
)
{
grid_size
*=
params
.
grid_len
[
i
];
}
// 检查grid大小是否为0
if
(
grid_size
==
0
)
{
return
INFINI_STATUS_BAD_PARAM
;
}
// 根据设备属性选择合适的内核
infiniStatus_t
status
=
INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED
;
size_t
block_size
=
params
.
block_len_total
;
if
(
block_size
<=
CUDA_BLOCK_SIZE_512
)
{
status
=
launchKernel
<
CUDA_BLOCK_SIZE_512
>
(
y
,
x
,
grid_size
,
params
,
_meta
.
unit
(),
cuda_stream
);
}
else
if
(
block_size
<=
CUDA_BLOCK_SIZE_1024
)
{
status
=
launchKernel
<
CUDA_BLOCK_SIZE_1024
>
(
y
,
x
,
grid_size
,
params
,
_meta
.
unit
(),
cuda_stream
);
}
else
{
return
INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED
;
}
return
status
;
}
}
// namespace op::rearrange::cuda
src/infiniop/ops/rearrange/cuda/rearrange_cuda.cuh
0 → 100644
View file @
17415721
#ifndef __REARRANGE_CUDA_H__
#define __REARRANGE_CUDA_H__
#include "../rearrange.h"
DESCRIPTOR
(
cuda
)
#endif // __REARRANGE_CUDA_H__
src/infiniop/ops/rearrange/cuda/rearrange_kernel.cuh
0 → 100644
View file @
17415721
This diff is collapsed.
Click to expand it.
src/infiniop/ops/rearrange/operator.cc
View file @
17415721
...
...
@@ -6,6 +6,10 @@
#include "cpu/rearrange_cpu.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/rearrange_cuda.cuh"
#endif
__C
infiniStatus_t
infiniopCreateRearrangeDescriptor
(
infiniopHandle_t
handle
,
infiniopRearrangeDescriptor_t
*
desc_ptr
,
...
...
@@ -26,6 +30,10 @@ __C infiniStatus_t infiniopCreateRearrangeDescriptor(
CREATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_CUDA_API
CREATE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
...
...
@@ -50,6 +58,10 @@ __C infiniStatus_t infiniopRearrange(
CALCULATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_CUDA_API
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
...
...
@@ -71,6 +83,10 @@ __C infiniStatus_t infiniopDestroyRearrangeDescriptor(
DELETE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_CUDA_API
DELETE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
...
...
src/utils/rearrange.cc
View file @
17415721
...
...
@@ -138,4 +138,73 @@ void rearrange(
}
}
utils
::
Result
<
RearrangeMeta
>
RearrangeMeta
::
distributeUnit
(
const
std
::
vector
<
size_t
>
&
candidates
)
const
{
// 获取当前的unit大小
size_t
current_unit
=
_meta
[
0
];
// 寻找满足条件的unit值:当前unit能被其整除
size_t
new_unit
=
0
;
for
(
size_t
candidate
:
candidates
)
{
if
(
current_unit
%
candidate
==
0
)
{
new_unit
=
candidate
;
break
;
}
}
// 如果没找到合适的值,返回错误
if
(
new_unit
==
0
)
{
return
INFINI_STATUS_BAD_PARAM
;
}
// 如果找到的值就是当前unit,返回自身的副本
if
(
new_unit
==
current_unit
)
{
return
Result
<
RearrangeMeta
>
(
_meta
);
}
// 获取当前维度
size_t
ndim_value
=
this
->
ndim
();
// 创建新的布局数组
std
::
vector
<
ptrdiff_t
>
layout
(
2
+
(
ndim_value
+
1
)
*
3
,
0
);
// 设置新的unit值
layout
[
0
]
=
new_unit
;
// 计算扩展因子
ptrdiff_t
extra
=
current_unit
/
new_unit
;
// 计算步长指针的偏移量
ptrdiff_t
idx_offset
=
1
;
// 在新布局中设置相应的指针
ptrdiff_t
*
new_idx
=
layout
.
data
()
+
1
;
ptrdiff_t
*
new_dst
=
layout
.
data
()
+
2
+
(
ndim_value
+
1
);
ptrdiff_t
*
new_src
=
layout
.
data
()
+
2
+
(
ndim_value
+
1
)
*
2
;
// 复制并调整索引步长
// 索引步长需要重新计算
// 首先复制原来的索引步长
for
(
size_t
i
=
0
;
i
<
ndim_value
+
1
;
++
i
)
{
new_idx
[
i
]
=
_meta
[
idx_offset
+
i
]
*
extra
;
}
// 设置最后一个维度的步长为1
new_idx
[
ndim_value
+
1
]
=
1
;
// 复制目标步长数据,并添加新单元大小
for
(
size_t
i
=
0
;
i
<
ndim_value
;
++
i
)
{
new_dst
[
i
]
=
dst_strides
()[
i
];
}
new_dst
[
ndim_value
]
=
new_unit
;
// 复制源步长数据,并添加新单元大小
for
(
size_t
i
=
0
;
i
<
ndim_value
;
++
i
)
{
new_src
[
i
]
=
src_strides
()[
i
];
}
new_src
[
ndim_value
]
=
new_unit
;
return
Result
<
RearrangeMeta
>
(
layout
);
}
}
// namespace utils
src/utils/rearrange.h
View file @
17415721
...
...
@@ -28,6 +28,9 @@ public:
const
ptrdiff_t
*
src_strides
()
const
;
void
launch
(
void
*
dst
,
const
void
*
src
)
const
;
// 拆分 unit 到更小的规模以利于并行
utils
::
Result
<
RearrangeMeta
>
distributeUnit
(
const
std
::
vector
<
size_t
>
&
candidates
)
const
;
};
void
rearrange
(
...
...
test/infiniop/rearrange.py
View file @
17415721
...
...
@@ -17,19 +17,88 @@ from libinfiniop import (
profile_operation
,
)
def
row_major_strides
(
shape
):
"""生成张量的行优先(C风格)stride
Args:
shape: 张量形状
Returns:
行优先strides列表
"""
# 行优先 (C风格,从最后一维到第一维)
stride
=
1
strides
=
[
1
]
for
dim
in
reversed
(
shape
[
1
:]):
stride
*=
dim
strides
.
insert
(
0
,
stride
)
return
strides
def
column_major_strides
(
shape
):
"""生成张量的列优先(Fortran风格)stride
Args:
shape: 张量形状
Returns:
列优先strides列表
"""
# 列优先 (Fortran风格,从第一维到最后一维)
stride
=
1
strides
=
[
stride
]
for
dim
in
shape
[:
-
1
]:
stride
*=
dim
strides
.
append
(
stride
)
return
strides
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES
=
[
# ((src_shape, src_stride), (dst_shape, dst_stride))
(((
2
,
4
,
32
),
None
),
((
2
,
4
,
32
),
(
256
,
64
,
1
))),
(((
32
,
6
,
64
),
(
64
,
2560
,
1
)),
((
32
,
6
,
64
),
None
)),
(((
4
,
6
,
64
),
(
64
,
2560
,
1
)),
((
4
,
6
,
64
),
(
131072
,
64
,
1
))),
(((
1
,
32
,
64
),
(
2048
,
64
,
1
)),
((
1
,
32
,
64
),
(
2048
,
64
,
1
))),
(((
32
,
1
,
64
),
(
64
,
2560
,
1
)),
((
32
,
1
,
64
),
(
64
,
64
,
1
))),
(((
4
,
1
,
64
),
(
64
,
2560
,
1
)),
((
4
,
1
,
64
),
(
64
,
11264
,
1
))),
(((
64
,),
(
1
,)),
((
64
,),
(
1
,))),
# (shape, x_stride, y_stride)
(
(
2
,
4
,
64
),
# shape
(
2
,
4
,
8
),
# x_stride
(
512
,
128
,
2
)
# y_stride
),
(
(
100
,
100
),
# shape
(
1
,
100
),
# x_stride
(
100
,
1
)
# y_stride
),
(
(
4
,
4
),
# shape
(
1
,
4
),
# x_stride
(
4
,
1
)
# y_stride
),
(
(
4
,
6
,
64
),
# shape
(
64
,
4
*
64
,
1
),
# x_stride
(
6
*
64
,
64
,
1
)
# y_stride
),
(
(
2000
,
2000
),
# shape
(
1
,
2000
),
# x_stride
(
2000
,
1
)
# y_stride
),
(
(
2001
,
2001
),
# shape
(
1
,
2001
),
# x_stride
(
2001
,
1
)
# y_stride
),
(
(
3
,
4
,
7
,
53
,
9
),
# shape
row_major_strides
((
3
,
4
,
7
,
53
,
9
)),
# x_stride
column_major_strides
((
3
,
4
,
7
,
53
,
9
))
# y_stride
),
(
(
3
,
4
,
50
,
50
,
5
,
7
),
# shape
row_major_strides
((
3
,
4
,
50
,
50
,
5
,
7
)),
# x_stride
column_major_strides
((
3
,
4
,
50
,
50
,
5
,
7
))
# y_stride
),
]
# Data types used for testing
...
...
@@ -58,23 +127,23 @@ def test(
lib
,
handle
,
torch_device
,
x_
shape
,
shape
,
x_stride
,
y_shape
,
y_stride
,
dtype
=
torch
.
float16
,
):
print
(
f
"Testing Rerrange on
{
torch_device
}
with
x_
shape:
{
x_
shape
}
x_stride:
{
x_stride
}
y_shape:
{
y_shape
}
y_stride:
{
y_stride
}
dtype:
{
dtype
}
"
f
"Testing Rerrange on
{
torch_device
}
with shape:
{
shape
}
x_stride:
{
x_stride
}
y_stride:
{
y_stride
}
dtype:
{
dtype
}
"
)
x
=
torch
.
rand
(
x_
shape
,
dtype
=
dtype
).
to
(
torch_device
)
y
=
torch
.
zeros
(
y_
shape
,
dtype
=
dtype
).
to
(
torch_device
)
x
=
torch
.
rand
(
shape
,
dtype
=
dtype
).
to
(
torch_device
)
y
=
torch
.
zeros
(
shape
,
dtype
=
dtype
).
to
(
torch_device
)
x
,
y
=
[
rearrange_if_needed
(
tensor
,
stride
)
for
tensor
,
stride
in
zip
([
x
,
y
],
[
x_stride
,
y_stride
])
]
x_tensor
,
y_tensor
=
[
to_tensor
(
tensor
,
lib
)
for
tensor
in
[
x
,
y
]]
descriptor
=
infiniopRearrangeDescriptor_t
()
...
...
@@ -86,7 +155,7 @@ def test(
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for
tensor
in
[
x_tensor
,
y_tensor
]:
tensor
.
des
criptor
.
contents
.
invalidate
(
)
tensor
.
des
troyDesc
(
lib
)
def
lib_rearrange
():
check_error
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment