Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
17415721
Unverified
Commit
17415721
authored
Apr 24, 2025
by
PanZezhong1725
Committed by
GitHub
Apr 24, 2025
Browse files
Merge pull request #128 from pwhMass/rearrange
issue/131: rearrange 算子 - CUDA
parents
a16380fb
c98dac66
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
1002 additions
and
24 deletions
+1002
-24
src/infiniop/ops/rearrange/cpu/rearrange_cpu.cc
src/infiniop/ops/rearrange/cpu/rearrange_cpu.cc
+8
-10
src/infiniop/ops/rearrange/cuda/rearrange_cuda.cu
src/infiniop/ops/rearrange/cuda/rearrange_cuda.cu
+485
-0
src/infiniop/ops/rearrange/cuda/rearrange_cuda.cuh
src/infiniop/ops/rearrange/cuda/rearrange_cuda.cuh
+8
-0
src/infiniop/ops/rearrange/cuda/rearrange_kernel.cuh
src/infiniop/ops/rearrange/cuda/rearrange_kernel.cuh
+330
-0
src/infiniop/ops/rearrange/operator.cc
src/infiniop/ops/rearrange/operator.cc
+16
-0
src/utils/rearrange.cc
src/utils/rearrange.cc
+69
-0
src/utils/rearrange.h
src/utils/rearrange.h
+3
-0
test/infiniop/rearrange.py
test/infiniop/rearrange.py
+83
-14
No files found.
src/infiniop/ops/rearrange/cpu/rearrange_cpu.cc
View file @
17415721
...
@@ -15,20 +15,18 @@ infiniStatus_t Descriptor::create(
...
@@ -15,20 +15,18 @@ infiniStatus_t Descriptor::create(
auto
handle
=
reinterpret_cast
<
device
::
cpu
::
Handle
*>
(
handle_
);
auto
handle
=
reinterpret_cast
<
device
::
cpu
::
Handle
*>
(
handle_
);
auto
dtype
=
y_desc
->
dtype
();
auto
dtype
=
y_desc
->
dtype
();
auto
ndim
=
y_desc
->
ndim
();
auto
ndim
=
y_desc
->
ndim
();
auto
shape
=
y_desc
->
shape
().
data
();
CHECK_API_OR
(
x_desc
->
dtype
(),
dtype
,
return
INFINI_STATUS_BAD_TENSOR_DTYPE
);
auto
y_shape
=
y_desc
->
shape
();
CHECK_API_OR
(
x_desc
->
ndim
(),
ndim
,
return
INFINI_STATUS_BAD_TENSOR_SHAPE
);
auto
x_shape
=
x_desc
->
shape
();
CHECK_OR_RETURN
(
x_desc
->
dtype
()
==
dtype
,
INFINI_STATUS_BAD_TENSOR_DTYPE
);
CHECK_OR_RETURN
(
x_desc
->
ndim
()
==
ndim
,
INFINI_STATUS_BAD_TENSOR_SHAPE
);
CHECK_SAME_SHAPE
(
x_shape
,
y_shape
);
for
(
size_t
i
=
0
;
i
<
ndim
;
++
i
)
{
auto
dst_strides
=
y_desc
->
strides
();
CHECK_API_OR
(
x_desc
->
shape
()[
i
],
shape
[
i
],
return
INFINI_STATUS_BAD_TENSOR_SHAPE
);
auto
src_strides
=
x_desc
->
strides
();
}
auto
dst_strides
=
y_desc
->
strides
().
data
();
auto
src_strides
=
x_desc
->
strides
().
data
();
auto
element_size
=
infiniSizeOf
(
dtype
);
auto
element_size
=
infiniSizeOf
(
dtype
);
auto
result
=
utils
::
RearrangeMeta
::
create
(
shape
,
dst_strides
,
src_strides
,
ndim
,
element_size
);
auto
result
=
utils
::
RearrangeMeta
::
create
(
y_
shape
.
data
()
,
dst_strides
.
data
()
,
src_strides
.
data
()
,
ndim
,
element_size
);
CHECK_RESULT
(
result
);
CHECK_RESULT
(
result
);
*
desc_ptr
=
new
Descriptor
(
*
desc_ptr
=
new
Descriptor
(
...
...
src/infiniop/ops/rearrange/cuda/rearrange_cuda.cu
0 → 100644
View file @
17415721
#include "../../../devices/cuda/cuda_common.cuh"
#include "../../../devices/cuda/cuda_kernel_common.cuh"
#include "../../../tensor.h"
#include "rearrange_cuda.cuh"
#include "rearrange_kernel.cuh"
#include <algorithm>
#include <cmath>
#include <memory>
#include <stdint.h>
#include <vector>
namespace
op
::
rearrange
::
cuda
{
struct
Descriptor
::
Opaque
{
std
::
shared_ptr
<
device
::
cuda
::
Handle
::
Internal
>
internal
;
};
Descriptor
::~
Descriptor
()
{
delete
_opaque
;
}
infiniStatus_t
Descriptor
::
create
(
infiniopHandle_t
handle
,
Descriptor
**
desc_ptr
,
infiniopTensorDescriptor_t
y_desc
,
infiniopTensorDescriptor_t
x_desc
)
{
auto
dtype
=
y_desc
->
dtype
();
auto
ndim
=
y_desc
->
ndim
();
CHECK_OR_RETURN
(
x_desc
->
dtype
()
==
dtype
,
INFINI_STATUS_BAD_TENSOR_DTYPE
);
CHECK_OR_RETURN
(
x_desc
->
ndim
()
==
ndim
,
INFINI_STATUS_BAD_TENSOR_SHAPE
);
// 保存临时vector对象
auto
x_shape
=
x_desc
->
shape
();
auto
y_shape
=
y_desc
->
shape
();
auto
y_strides
=
y_desc
->
strides
();
auto
x_strides
=
x_desc
->
strides
();
CHECK_SAME_SHAPE
(
x_shape
,
y_shape
);
auto
meta
=
utils
::
RearrangeMeta
::
create
(
y_shape
.
data
(),
y_strides
.
data
(),
x_strides
.
data
(),
ndim
,
infiniSizeOf
(
dtype
));
CHECK_RESULT
(
meta
);
*
desc_ptr
=
new
Descriptor
(
std
::
move
(
*
meta
),
new
Opaque
{
reinterpret_cast
<
device
::
cuda
::
Handle
*>
(
handle
)
->
internal
()},
handle
->
device
,
handle
->
device_id
);
return
INFINI_STATUS_SUCCESS
;
}
// 维度信息结构
struct
Dim
{
size_t
len
;
ARRAY_TYPE_STRIDE
src_stride
;
ARRAY_TYPE_STRIDE
dst_stride
;
};
// 分割维度结构
struct
SplitDim
{
size_t
choose_idx
;
size_t
num_per_block
;
size_t
num_per_grid
;
int
array_struct_idx_block
;
int
array_struct_idx_grid
;
size_t
dim_len
;
};
/**
* 根据给定的元数据准备张量重排参数,该函数主要完成以下工作:
* 1. 根据原始元数据调整单元大小,获取更适合GPU处理的单元大小
* 2. 将维度分配为CUDA块(block)维度和网格(grid)维度:
* 该步骤是核心,目标是为每个block分配尽可能多的相对连续的数据进行处理,
* 对无法完整放入块的维度进行分割,并记录分割维度信息,用于防止kernel访问越界,最大化内存访问局部性和计算效率
*/
utils
::
Result
<
RearrangeParams
>
prepareRearrangeParams
(
const
utils
::
RearrangeMeta
&
original_meta
,
int
max_threads
)
{
RearrangeParams
params
;
// 获取更适合GPU处理的单元大小,这里使用2的幂次方
auto
meta_result
=
original_meta
.
distributeUnit
({
32
,
16
,
8
,
4
,
2
,
1
});
CHECK_RESULT
(
meta_result
);
const
utils
::
RearrangeMeta
&
meta
=
meta_result
.
take
();
// 获取维度信息
const
size_t
ndim
=
meta
.
ndim
();
const
size_t
unit
=
meta
.
unit
();
// 特殊情况:无维度,只需要简单复制
if
(
ndim
==
0
)
{
params
.
block_dim
=
0
;
params
.
block_len_total
=
1
;
params
.
block_len
=
{
static_cast
<
ARRAY_TYPE_SIZE
>
(
1
)};
params
.
src_block_stride
=
{
static_cast
<
ARRAY_TYPE_STRIDE
>
(
0
)};
params
.
dst_block_stride
=
{
static_cast
<
ARRAY_TYPE_STRIDE
>
(
0
)};
params
.
grid_len
=
{
static_cast
<
ARRAY_TYPE_SIZE
>
(
1
)};
params
.
src_grid_stride
=
{
static_cast
<
ARRAY_TYPE_STRIDE
>
(
0
)};
params
.
dst_grid_stride
=
{
static_cast
<
ARRAY_TYPE_STRIDE
>
(
0
)};
params
.
unit_size
=
unit
;
return
utils
::
Result
<
RearrangeParams
>
(
params
);
}
// 从元数据中提取必要的信息
const
ptrdiff_t
*
idx_strides
=
meta
.
idx_strides
();
const
ptrdiff_t
*
dst_strides
=
meta
.
dst_strides
();
const
ptrdiff_t
*
src_strides
=
meta
.
src_strides
();
// 准备维度信息
std
::
vector
<
Dim
>
dims
;
std
::
vector
<
size_t
>
shape
;
dims
.
reserve
(
ndim
);
shape
.
reserve
(
ndim
);
auto
prev_idx_stride
=
meta
.
count
();
for
(
size_t
i
=
0
;
i
<
ndim
;
++
i
)
{
size_t
len
=
prev_idx_stride
/
idx_strides
[
i
];
shape
.
push_back
(
len
);
dims
.
push_back
({
len
,
src_strides
[
i
],
dst_strides
[
i
]});
prev_idx_stride
=
idx_strides
[
i
];
}
// 计算src_strides的降序排序索引,类似于Rust版本中的src_strides_desc_idx
std
::
vector
<
size_t
>
src_strides_desc_idx
(
ndim
);
for
(
size_t
i
=
0
;
i
<
ndim
;
++
i
)
{
src_strides_desc_idx
[
i
]
=
i
;
}
std
::
sort
(
src_strides_desc_idx
.
begin
(),
src_strides_desc_idx
.
end
(),
[
&
dims
](
size_t
a
,
size_t
b
)
{
return
std
::
abs
(
dims
[
a
].
src_stride
)
>
std
::
abs
(
dims
[
b
].
src_stride
);
});
// 根据最大线程数选择block和grid维度
const
size_t
block_size
=
max_threads
;
std
::
vector
<
bool
>
block_dim_choose
(
ndim
,
false
);
// 初始化计数器
size_t
block_elements
=
1
;
size_t
block_src_elements
=
1
;
size_t
block_dst_elements
=
1
;
size_t
src_choose_idx
=
ndim
;
size_t
dst_choose_idx
=
ndim
;
// 用于存储分割维度信息
std
::
vector
<
SplitDim
>
split_dims
;
// 维度选择循环
while
(
src_choose_idx
>
0
&&
dst_choose_idx
>
0
)
{
// 获取当前需要处理的维度索引
size_t
src_idx
=
src_strides_desc_idx
[
src_choose_idx
-
1
];
size_t
dst_idx
=
dst_choose_idx
-
1
;
if
(
src_idx
==
dst_idx
)
{
// 源和目标维度相同,可以一起处理
size_t
idx
=
src_idx
;
size_t
len
=
shape
[
idx
];
// 检查是否可以将此维度完全添加到block中
if
(
block_elements
*
len
<=
block_size
)
{
// 选择此维度
block_dim_choose
[
idx
]
=
true
;
block_elements
*=
len
;
block_src_elements
*=
len
;
block_dst_elements
*=
len
;
src_choose_idx
--
;
dst_choose_idx
--
;
}
else
{
// 需要分割此维度
size_t
num_per_block
=
block_size
/
block_elements
;
// 确保num_per_block > 0且len >= num_per_block
if
(
num_per_block
>
0
&&
len
>=
num_per_block
&&
num_per_block
>
1
)
{
size_t
num_per_grid
=
(
len
+
num_per_block
-
1
)
/
num_per_block
;
// 向上取整
SplitDim
split_dim
=
{
idx
,
// choose_idx
num_per_block
,
// num_per_block
num_per_grid
,
// num_per_grid
0
,
// array_struct_idx_block (待更新)
0
,
// array_struct_idx_grid (待更新)
len
// 原始维度长度
};
split_dims
.
push_back
(
split_dim
);
}
break
;
}
}
else
{
// 源和目标维度不同,需要分别处理
// 计算块比例
double
src_div_dst
=
static_cast
<
double
>
(
block_src_elements
)
/
block_dst_elements
;
double
src_num_per_block
=
std
::
sqrt
(
block_size
/
(
double
)
block_elements
/
src_div_dst
);
double
dst_num_per_block
=
src_num_per_block
*
src_div_dst
;
size_t
src_current_dim_len
=
shape
[
src_idx
];
size_t
dst_current_dim_len
=
shape
[
dst_idx
];
if
(
static_cast
<
double
>
(
src_current_dim_len
)
<
src_num_per_block
)
{
// 源维度可以完全添加到block
block_dim_choose
[
src_idx
]
=
true
;
block_elements
*=
src_current_dim_len
;
block_src_elements
*=
src_current_dim_len
;
src_choose_idx
--
;
}
else
if
(
static_cast
<
double
>
(
dst_current_dim_len
)
<
dst_num_per_block
)
{
// 目标维度可以完全添加到block
block_dim_choose
[
dst_idx
]
=
true
;
block_elements
*=
dst_current_dim_len
;
block_dst_elements
*=
dst_current_dim_len
;
dst_choose_idx
--
;
}
else
{
// 需要分割源和目标维度
size_t
src_num_per_block_int
=
static_cast
<
size_t
>
(
std
::
floor
(
src_num_per_block
));
size_t
dst_num_per_block_int
=
static_cast
<
size_t
>
(
std
::
floor
(
dst_num_per_block
));
// 计算网格尺寸
size_t
src_num_per_grid
=
(
src_current_dim_len
+
src_num_per_block_int
-
1
)
/
src_num_per_block_int
;
// 向上取整
size_t
dst_num_per_grid
=
(
dst_current_dim_len
+
dst_num_per_block_int
-
1
)
/
dst_num_per_block_int
;
// 向上取整
// 处理源维度
if
(
src_num_per_block_int
>
1
)
{
if
(
src_num_per_grid
==
1
)
{
// 可以完全放入块
block_dim_choose
[
src_idx
]
=
true
;
block_elements
*=
src_current_dim_len
;
block_src_elements
*=
src_current_dim_len
;
src_choose_idx
--
;
}
else
{
// 需要分割
SplitDim
split_dim
=
{
src_idx
,
// choose_idx
src_num_per_block_int
,
// num_per_block
src_num_per_grid
,
// num_per_grid
0
,
// array_struct_idx_block (待更新)
0
,
// array_struct_idx_grid (待更新)
src_current_dim_len
// 原始维度长度
};
split_dims
.
push_back
(
split_dim
);
}
}
// 处理目标维度
if
(
dst_num_per_block_int
>
1
)
{
if
(
dst_num_per_grid
==
1
)
{
// 可以完全放入块
block_dim_choose
[
dst_idx
]
=
true
;
block_elements
*=
dst_current_dim_len
;
block_dst_elements
*=
dst_current_dim_len
;
dst_choose_idx
--
;
}
else
{
// 需要分割
SplitDim
split_dim
=
{
dst_idx
,
// choose_idx
dst_num_per_block_int
,
// num_per_block
dst_num_per_grid
,
// num_per_grid
0
,
// array_struct_idx_block (待更新)
0
,
// array_struct_idx_grid (待更新)
dst_current_dim_len
// 原始维度长度
};
split_dims
.
push_back
(
split_dim
);
}
}
break
;
}
}
}
// 准备block维度相关参数
size_t
block_dim
=
0
;
size_t
block_len_total
=
1
;
std
::
vector
<
ARRAY_TYPE_SIZE
>
block_len
;
std
::
vector
<
ARRAY_TYPE_STRIDE
>
src_block_stride
;
std
::
vector
<
ARRAY_TYPE_STRIDE
>
dst_block_stride
;
std
::
vector
<
ARRAY_TYPE_SIZE
>
grid_len
;
std
::
vector
<
ARRAY_TYPE_STRIDE
>
src_grid_stride
;
std
::
vector
<
ARRAY_TYPE_STRIDE
>
dst_grid_stride
;
// 处理block维度,填充block_len和block_stride
for
(
size_t
i
=
0
;
i
<
ndim
;
++
i
)
{
if
(
block_dim_choose
[
i
])
{
block_len
.
push_back
(
shape
[
i
]);
src_block_stride
.
push_back
(
dims
[
i
].
src_stride
);
dst_block_stride
.
push_back
(
dims
[
i
].
dst_stride
);
block_dim
+=
1
;
block_len_total
*=
shape
[
i
];
}
// 处理分割维度的block部分
for
(
size_t
j
=
0
;
j
<
split_dims
.
size
();
++
j
)
{
if
(
i
==
split_dims
[
j
].
choose_idx
)
{
block_len
.
push_back
(
split_dims
[
j
].
num_per_block
);
src_block_stride
.
push_back
(
dims
[
i
].
src_stride
);
dst_block_stride
.
push_back
(
dims
[
i
].
dst_stride
);
split_dims
[
j
].
array_struct_idx_block
=
block_dim
;
block_dim
+=
1
;
block_len_total
*=
split_dims
[
j
].
num_per_block
;
}
}
}
// 处理grid维度,填充grid_len和grid_stride
for
(
size_t
i
=
0
;
i
<
ndim
;
++
i
)
{
if
(
!
block_dim_choose
[
i
])
{
bool
is_split
=
false
;
// 检查是否是分割维度
for
(
size_t
j
=
0
;
j
<
split_dims
.
size
();
++
j
)
{
if
(
i
==
split_dims
[
j
].
choose_idx
)
{
is_split
=
true
;
grid_len
.
push_back
(
split_dims
[
j
].
num_per_grid
);
src_grid_stride
.
push_back
(
dims
[
i
].
src_stride
*
split_dims
[
j
].
num_per_block
);
dst_grid_stride
.
push_back
(
dims
[
i
].
dst_stride
*
split_dims
[
j
].
num_per_block
);
split_dims
[
j
].
array_struct_idx_grid
=
grid_len
.
size
()
-
1
;
}
}
// 如果不是分割维度,则作为完整的grid维度
if
(
!
is_split
)
{
grid_len
.
push_back
(
shape
[
i
]);
src_grid_stride
.
push_back
(
dims
[
i
].
src_stride
);
dst_grid_stride
.
push_back
(
dims
[
i
].
dst_stride
);
}
}
}
// 如果grid_len为空,添加一个默认值
if
(
grid_len
.
empty
())
{
grid_len
.
push_back
(
1
);
src_grid_stride
.
push_back
(
0
);
dst_grid_stride
.
push_back
(
0
);
}
// 处理约束条件 - 使用与Rust版本相似的逻辑
std
::
vector
<
Constraint
<
ARRAY_TYPE_SIZE
>>
constraints
;
// 限制最多处理2个约束条件
for
(
size_t
i
=
0
;
i
<
split_dims
.
size
();
++
i
)
{
if
(
split_dims
[
i
].
dim_len
%
split_dims
[
i
].
num_per_block
==
0
)
{
continue
;
}
Constraint
<
ARRAY_TYPE_SIZE
>
constraint
;
constraint
.
grid_idx
=
split_dims
[
i
].
array_struct_idx_grid
;
constraint
.
block_idx
=
split_dims
[
i
].
array_struct_idx_block
;
constraint
.
grid_div_block
=
split_dims
[
i
].
num_per_block
;
constraint
.
total_len
=
split_dims
[
i
].
dim_len
;
constraints
.
push_back
(
constraint
);
}
// 设置参数
params
.
block_dim
=
block_dim
;
params
.
block_len_total
=
block_len_total
;
params
.
block_len
=
block_len
;
params
.
src_block_stride
=
src_block_stride
;
params
.
dst_block_stride
=
dst_block_stride
;
params
.
grid_len
=
grid_len
;
params
.
src_grid_stride
=
src_grid_stride
;
params
.
dst_grid_stride
=
dst_grid_stride
;
params
.
constraints
=
constraints
;
params
.
unit_size
=
unit
;
return
utils
::
Result
<
RearrangeParams
>
(
params
);
}
// 带约束的内核启动模板函数
template
<
unsigned
int
BLOCK_SIZE
>
infiniStatus_t
launchKernel
(
void
*
y
,
const
void
*
x
,
size_t
grid_size
,
const
RearrangeParams
&
params
,
size_t
unit_size
,
cudaStream_t
stream
)
{
// 获取内核函数
RearrangeParams
params_copy
=
params
;
// 创建一个非const副本
auto
kernel_func_result
=
getRearrangeKernel
(
params_copy
);
CHECK_RESULT
(
kernel_func_result
);
auto
kernel_func
=
kernel_func_result
.
take
();
// 创建非const的临时变量
size_t
block_dim
=
params
.
block_dim
;
size_t
block_len_total
=
params
.
block_len_total
;
// 检查向量尺寸是否合理
if
(
params
.
block_len
.
size
()
<
block_dim
||
params
.
src_block_stride
.
size
()
<
block_dim
||
params
.
dst_block_stride
.
size
()
<
block_dim
)
{
return
INFINI_STATUS_BAD_PARAM
;
}
if
(
params
.
grid_len
.
empty
()
||
params
.
src_grid_stride
.
empty
()
||
params
.
dst_grid_stride
.
empty
())
{
return
INFINI_STATUS_BAD_PARAM
;
}
const
Constraint
<
ARRAY_TYPE_SIZE
>
*
constraints_data
;
auto
empty_constraints
=
Constraint
<
ARRAY_TYPE_SIZE
>
();
if
(
params
.
constraints
.
empty
())
{
constraints_data
=
&
empty_constraints
;
}
else
{
constraints_data
=
params
.
constraints
.
data
();
}
void
*
args
[]
=
{
&
y
,
&
x
,
&
block_dim
,
&
block_len_total
,
const_cast
<
void
*>
(
static_cast
<
const
void
*>
(
params
.
block_len
.
data
())),
const_cast
<
void
*>
(
static_cast
<
const
void
*>
(
params
.
src_block_stride
.
data
())),
const_cast
<
void
*>
(
static_cast
<
const
void
*>
(
params
.
dst_block_stride
.
data
())),
const_cast
<
void
*>
(
static_cast
<
const
void
*>
(
params
.
grid_len
.
data
())),
const_cast
<
void
*>
(
static_cast
<
const
void
*>
(
params
.
src_grid_stride
.
data
())),
const_cast
<
void
*>
(
static_cast
<
const
void
*>
(
params
.
dst_grid_stride
.
data
())),
const_cast
<
void
*>
(
static_cast
<
const
void
*>
(
constraints_data
))};
CHECK_OR_RETURN
(
cudaLaunchKernel
(
kernel_func
,
grid_size
,
BLOCK_SIZE
,
args
,
0
,
stream
)
==
cudaSuccess
,
INFINI_STATUS_INTERNAL_ERROR
);
return
INFINI_STATUS_SUCCESS
;
}
infiniStatus_t
Descriptor
::
calculate
(
void
*
y
,
const
void
*
x
,
void
*
stream
)
const
{
auto
cuda_stream
=
reinterpret_cast
<
cudaStream_t
>
(
stream
);
// 如果没有维度,直接进行内存拷贝
if
(
_meta
.
ndim
()
==
0
)
{
auto
err
=
cudaMemcpyAsync
(
y
,
x
,
_meta
.
unit
(),
cudaMemcpyDeviceToDevice
,
cuda_stream
);
if
(
err
!=
cudaSuccess
)
{
return
INFINI_STATUS_INTERNAL_ERROR
;
}
CHECK_OR_RETURN
(
cudaMemcpyAsync
(
y
,
x
,
_meta
.
unit
(),
cudaMemcpyDeviceToDevice
,
cuda_stream
)
==
cudaSuccess
,
INFINI_STATUS_INTERNAL_ERROR
);
return
INFINI_STATUS_SUCCESS
;
}
// 获取设备属性
int
max_threads
=
_opaque
->
internal
->
maxThreadsPerBlock
();
// 准备参数
auto
params_result
=
prepareRearrangeParams
(
_meta
,
std
::
min
(
CUDA_BLOCK_SIZE_1024
,
max_threads
));
CHECK_RESULT
(
params_result
);
auto
params
=
params_result
.
take
();
// 计算grid大小
size_t
grid_size
=
1
;
for
(
size_t
i
=
0
;
i
<
params
.
grid_len
.
size
();
++
i
)
{
grid_size
*=
params
.
grid_len
[
i
];
}
// 检查grid大小是否为0
if
(
grid_size
==
0
)
{
return
INFINI_STATUS_BAD_PARAM
;
}
// 根据设备属性选择合适的内核
infiniStatus_t
status
=
INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED
;
size_t
block_size
=
params
.
block_len_total
;
if
(
block_size
<=
CUDA_BLOCK_SIZE_512
)
{
status
=
launchKernel
<
CUDA_BLOCK_SIZE_512
>
(
y
,
x
,
grid_size
,
params
,
_meta
.
unit
(),
cuda_stream
);
}
else
if
(
block_size
<=
CUDA_BLOCK_SIZE_1024
)
{
status
=
launchKernel
<
CUDA_BLOCK_SIZE_1024
>
(
y
,
x
,
grid_size
,
params
,
_meta
.
unit
(),
cuda_stream
);
}
else
{
return
INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED
;
}
return
status
;
}
}
// namespace op::rearrange::cuda
src/infiniop/ops/rearrange/cuda/rearrange_cuda.cuh
0 → 100644
View file @
17415721
#ifndef __REARRANGE_CUDA_H__
#define __REARRANGE_CUDA_H__
#include "../rearrange.h"
DESCRIPTOR
(
cuda
)
#endif // __REARRANGE_CUDA_H__
src/infiniop/ops/rearrange/cuda/rearrange_kernel.cuh
0 → 100644
View file @
17415721
#ifndef __REARRANGE_CUDA_KERNEL_H__
#define __REARRANGE_CUDA_KERNEL_H__
#include "../../../devices/cuda/cuda_common.cuh"
#define ARRAY_TYPE_STRIDE ptrdiff_t
#define ARRAY_TYPE_SIZE size_t
// 与 DEFINE_KERNELS_BY_CONSTRAINT 耦合,需要同时修改
#define MAX_BLOCK_ARRAY_SIZE 5
#define MAX_GRID_ARRAY_SIZE 5
template
<
int
ArrSize
,
typename
ArrayType
>
struct
ArrayStruct
{
ArrayType
a
[
ArrSize
];
};
// 各个元素分别代表:[grid_idx, block_idx, grid的stride相对于block的倍数,总的len限制]
template
<
typename
ElementType
>
struct
Constraint
{
ElementType
grid_idx
;
ElementType
block_idx
;
ElementType
grid_div_block
;
ElementType
total_len
;
};
#define IF_CONSTRAINT_0 , const ArrayStruct<1, Constraint<ARRAY_TYPE_SIZE>> constraints
#define IF_CONSTRAINT_1 , const ArrayStruct<1, Constraint<ARRAY_TYPE_SIZE>> constraints
#define IF_CONSTRAINT_2 , const ArrayStruct<2, Constraint<ARRAY_TYPE_SIZE>> constraints
// 定义宏生成内核函数
#define DEFINE_REARRANGE_KERNEL(Tmem_type, constraint_num, block_array_size, grid_array_size) \
extern "C" __global__ void rearrange_unit_##Tmem_type##_block_##block_array_size##_grid_##grid_array_size##_constrain_##constraint_num( \
void *__restrict__ dst, \
const void *__restrict__ src, \
const size_t block_dim, \
const size_t block_len_total, \
const ArrayStruct<block_array_size, ARRAY_TYPE_SIZE> block_len, \
const ArrayStruct<block_array_size, ARRAY_TYPE_STRIDE> src_block_stride,
/* 字节单位的步长 */
\
const ArrayStruct<block_array_size, ARRAY_TYPE_STRIDE> dst_block_stride,
/* 字节单位的步长 */
\
const ArrayStruct<grid_array_size, ARRAY_TYPE_SIZE> grid_len, \
const ArrayStruct<grid_array_size, ARRAY_TYPE_STRIDE> src_grid_stride,
/* 字节单位的步长 */
\
const ArrayStruct<grid_array_size, ARRAY_TYPE_STRIDE> dst_grid_stride
/* 字节单位的步长 */
\
IF_CONSTRAINT_##constraint_num) { \
size_t remaining = threadIdx.x; \
if (remaining >= block_len_total) { \
return; \
} \
\
/* 声明共享内存 */
\
__shared__ ptrdiff_t shared_src_offset; \
__shared__ ptrdiff_t shared_dst_offset; \
\
if (constraint_num > 0) { \
__shared__ ARRAY_TYPE_SIZE shared_constraints_grid_idx_multiple[constraint_num > 0 ? constraint_num : 1]; \
\
if (threadIdx.x == 0) {
/* 只让0号线程计算 */
\
/* 计算当前block处理的数据在src和dst中的基础偏移(bytes) */
\
ptrdiff_t src_offset = 0; \
ptrdiff_t dst_offset = 0; \
ARRAY_TYPE_SIZE constraints_grid_idx_multiple[constraint_num > 0 ? constraint_num : 1]; \
\
size_t remaining \
= blockIdx.x; \
\
for (ssize_t i = grid_array_size - 1; i >= 0; i--) { \
size_t idx = remaining % grid_len.a[i]; \
remaining /= grid_len.a[i]; \
src_offset += idx * src_grid_stride.a[i]; \
dst_offset += idx * dst_grid_stride.a[i]; \
if (constraint_num > 0) { \
for (ssize_t j = 0; j < constraint_num; j++) { \
if (i == constraints.a[j].grid_idx) { \
constraints_grid_idx_multiple[j] = idx * constraints.a[j].grid_div_block; \
} \
} \
} \
} \
\
/* 将结果存入共享内存 */
\
shared_src_offset = src_offset; \
shared_dst_offset = dst_offset; \
for (ssize_t j = 0; j < constraint_num; j++) { \
shared_constraints_grid_idx_multiple[j] = constraints_grid_idx_multiple[j]; \
} \
} \
\
/* 确保所有线程都能看到共享内存中的值 */
\
__syncthreads(); \
\
/* 所有线程直接使用计算好的偏移值 */
\
ptrdiff_t src_offset = shared_src_offset; \
ptrdiff_t dst_offset = shared_dst_offset; \
ARRAY_TYPE_SIZE constraints_grid_idx_multiple[constraint_num > 0 ? constraint_num : 1]; \
for (ssize_t j = 0; j < constraint_num; j++) { \
constraints_grid_idx_multiple[j] = shared_constraints_grid_idx_multiple[j]; \
} \
\
for (ssize_t i = block_array_size - 1; i >= 0; i--) { \
size_t idx = remaining % block_len.a[i]; \
remaining /= block_len.a[i]; \
/* 计算偏移量 */
\
src_offset += idx * src_block_stride.a[i]; \
dst_offset += idx * dst_block_stride.a[i]; \
if (constraint_num > 0) { \
for (ssize_t j = 0; j < constraint_num; j++) { \
if (i == constraints.a[j].block_idx) { \
if (constraints_grid_idx_multiple[j] + idx >= constraints.a[j].total_len) { \
return; \
} \
} \
} \
} \
} \
\
src_offset += remaining * src_block_stride.a[0]; \
dst_offset += remaining * dst_block_stride.a[0]; \
for (ssize_t j = 0; j < constraint_num; j++) { \
if (0 == constraints.a[j].block_idx) { \
if (constraints_grid_idx_multiple[j] + remaining >= constraints.a[j].total_len) { \
return; \
} \
} \
} \
\
/* 执行数据拷贝,注意offset已经是字节偏移 */
\
*reinterpret_cast<Tmem_type *>(reinterpret_cast<char *>(dst) + dst_offset) = *reinterpret_cast<const Tmem_type *>(reinterpret_cast<const char *>(src) + src_offset); \
\
} else { \
if (threadIdx.x == 0) {
/* 只让0号线程计算 */
\
/* 计算当前block处理的数据在src和dst中的基础偏移(bytes) */
\
ptrdiff_t src_offset = 0; \
ptrdiff_t dst_offset = 0; \
size_t remaining = blockIdx.x; \
\
for (ssize_t i = grid_array_size - 1; i >= 0; i--) { \
size_t idx = remaining % grid_len.a[i]; \
remaining /= grid_len.a[i]; \
src_offset += idx * src_grid_stride.a[i]; \
dst_offset += idx * dst_grid_stride.a[i]; \
} \
\
/* 将结果存入共享内存 */
\
shared_src_offset = src_offset; \
shared_dst_offset = dst_offset; \
} \
\
/* 确保所有线程都能看到共享内存中的值 */
\
__syncthreads(); \
\
/* 所有线程直接使用计算好的偏移值 */
\
ptrdiff_t src_offset = shared_src_offset; \
ptrdiff_t dst_offset = shared_dst_offset; \
\
for (ssize_t i = block_array_size - 1; i > 0; i--) { \
size_t idx = remaining % block_len.a[i]; \
remaining /= block_len.a[i]; \
/* 计算偏移量 */
\
src_offset += idx * src_block_stride.a[i]; \
dst_offset += idx * dst_block_stride.a[i]; \
} \
\
src_offset += remaining * src_block_stride.a[0]; \
dst_offset += remaining * dst_block_stride.a[0]; \
\
/* 执行数据拷贝,注意offset已经是字节偏移 */
\
*reinterpret_cast<Tmem_type *>(reinterpret_cast<char *>(dst) + dst_offset) = *reinterpret_cast<const Tmem_type *>(reinterpret_cast<const char *>(src) + src_offset); \
} \
}
// 定义支持的约束条件数量组合
#define DEFINE_KERNELS_BY_CONSTRAINT(block_array_size, grid_array_size) \
DEFINE_KERNELS_BY_TYPE(0, block_array_size, grid_array_size) \
DEFINE_KERNELS_BY_TYPE(1, block_array_size, grid_array_size) \
DEFINE_KERNELS_BY_TYPE(2, block_array_size, grid_array_size)
// 定义支持的类型
#define DEFINE_KERNELS_BY_TYPE(constraint_num, block_array_size, grid_array_size) \
DEFINE_REARRANGE_KERNEL(uchar1, constraint_num, block_array_size, grid_array_size) \
DEFINE_REARRANGE_KERNEL(uchar2, constraint_num, block_array_size, grid_array_size) \
DEFINE_REARRANGE_KERNEL(float1, constraint_num, block_array_size, grid_array_size) \
DEFINE_REARRANGE_KERNEL(float2, constraint_num, block_array_size, grid_array_size) \
DEFINE_REARRANGE_KERNEL(float4, constraint_num, block_array_size, grid_array_size) \
DEFINE_REARRANGE_KERNEL(double4, constraint_num, block_array_size, grid_array_size)
// 与 MAX_BLOCK_ARRAY_SIZE 和 MAX_GRID_ARRAY_SIZE 耦合,需要同时修改
// 为1-5和1-5的所有组合生成内核
DEFINE_KERNELS_BY_CONSTRAINT
(
1
,
1
)
DEFINE_KERNELS_BY_CONSTRAINT
(
1
,
2
)
DEFINE_KERNELS_BY_CONSTRAINT
(
1
,
3
)
DEFINE_KERNELS_BY_CONSTRAINT
(
1
,
4
)
DEFINE_KERNELS_BY_CONSTRAINT
(
1
,
5
)
DEFINE_KERNELS_BY_CONSTRAINT
(
2
,
1
)
DEFINE_KERNELS_BY_CONSTRAINT
(
2
,
2
)
DEFINE_KERNELS_BY_CONSTRAINT
(
2
,
3
)
DEFINE_KERNELS_BY_CONSTRAINT
(
2
,
4
)
DEFINE_KERNELS_BY_CONSTRAINT
(
2
,
5
)
DEFINE_KERNELS_BY_CONSTRAINT
(
3
,
1
)
DEFINE_KERNELS_BY_CONSTRAINT
(
3
,
2
)
DEFINE_KERNELS_BY_CONSTRAINT
(
3
,
3
)
DEFINE_KERNELS_BY_CONSTRAINT
(
3
,
4
)
DEFINE_KERNELS_BY_CONSTRAINT
(
3
,
5
)
DEFINE_KERNELS_BY_CONSTRAINT
(
4
,
1
)
DEFINE_KERNELS_BY_CONSTRAINT
(
4
,
2
)
DEFINE_KERNELS_BY_CONSTRAINT
(
4
,
3
)
DEFINE_KERNELS_BY_CONSTRAINT
(
4
,
4
)
DEFINE_KERNELS_BY_CONSTRAINT
(
4
,
5
)
DEFINE_KERNELS_BY_CONSTRAINT
(
5
,
1
)
DEFINE_KERNELS_BY_CONSTRAINT
(
5
,
2
)
DEFINE_KERNELS_BY_CONSTRAINT
(
5
,
3
)
DEFINE_KERNELS_BY_CONSTRAINT
(
5
,
4
)
DEFINE_KERNELS_BY_CONSTRAINT
(
5
,
5
)
// 准备参数结构体
struct
RearrangeParams
{
std
::
vector
<
ARRAY_TYPE_SIZE
>
block_len
;
std
::
vector
<
ARRAY_TYPE_STRIDE
>
src_block_stride
;
std
::
vector
<
ARRAY_TYPE_STRIDE
>
dst_block_stride
;
std
::
vector
<
ARRAY_TYPE_SIZE
>
grid_len
;
std
::
vector
<
ARRAY_TYPE_STRIDE
>
src_grid_stride
;
std
::
vector
<
ARRAY_TYPE_STRIDE
>
dst_grid_stride
;
size_t
block_dim
;
size_t
block_len_total
;
std
::
vector
<
Constraint
<
ARRAY_TYPE_SIZE
>>
constraints
;
size_t
unit_size
;
};
utils
::
Result
<
void
*>
getRearrangeKernel
(
const
RearrangeParams
&
params
)
{
auto
grid_num
=
params
.
grid_len
.
size
();
auto
block_num
=
params
.
block_len
.
size
();
auto
constraint_num
=
params
.
constraints
.
size
();
auto
unit_size
=
params
.
unit_size
;
CHECK_OR_RETURN
(
grid_num
<=
MAX_GRID_ARRAY_SIZE
&&
grid_num
!=
0
,
INFINI_STATUS_BAD_PARAM
);
CHECK_OR_RETURN
(
block_num
<=
MAX_BLOCK_ARRAY_SIZE
&&
block_num
!=
0
,
INFINI_STATUS_BAD_PARAM
);
CHECK_OR_RETURN
(
constraint_num
<=
2
,
INFINI_STATUS_BAD_PARAM
);
auto
block_len
=
params
.
block_len
.
data
();
auto
src_block_stride
=
params
.
src_block_stride
.
data
();
auto
dst_block_stride
=
params
.
dst_block_stride
.
data
();
auto
grid_len
=
params
.
grid_len
.
data
();
auto
src_grid_stride
=
params
.
src_grid_stride
.
data
();
auto
dst_grid_stride
=
params
.
dst_grid_stride
.
data
();
auto
constrain
=
params
.
constraints
.
data
();
void
*
kernel_func
=
nullptr
;
#define GET_REARRANGE_KERNEL(Tmem_type, block_array_size, grid_array_size, constraint_num) \
kernel_func = (void *)rearrange_unit_##Tmem_type##_block_##block_array_size##_grid_##grid_array_size##_constrain_##constraint_num;
#define GET_REARRANGE_KERNEL_BY_TYPE(block_array_size, grid_array_size, constraint_num) \
switch (unit_size) { \
case 1: \
GET_REARRANGE_KERNEL(uchar1, block_array_size, grid_array_size, constraint_num); \
break; \
case 2: \
GET_REARRANGE_KERNEL(uchar2, block_array_size, grid_array_size, constraint_num); \
break; \
case 4: \
GET_REARRANGE_KERNEL(float1, block_array_size, grid_array_size, constraint_num); \
break; \
case 8: \
GET_REARRANGE_KERNEL(float2, block_array_size, grid_array_size, constraint_num); \
break; \
case 16: \
GET_REARRANGE_KERNEL(float4, block_array_size, grid_array_size, constraint_num); \
break; \
case 32: \
GET_REARRANGE_KERNEL(double4, block_array_size, grid_array_size, constraint_num); \
break; \
default: \
return INFINI_STATUS_BAD_PARAM; \
}
#define GET_REARRANGE_KERNEL_BY_CONSTRAINT(block_array_size, grid_array_size) \
switch (constraint_num) { \
case 0: \
GET_REARRANGE_KERNEL_BY_TYPE(block_array_size, grid_array_size, 0); \
break; \
case 1: \
GET_REARRANGE_KERNEL_BY_TYPE(block_array_size, grid_array_size, 1); \
break; \
case 2: \
GET_REARRANGE_KERNEL_BY_TYPE(block_array_size, grid_array_size, 2); \
break; \
}
#define GET_REARRANGE_KERNEL_BY_GRID_NUM(block_array_size) \
switch (grid_num) { \
case 1: \
GET_REARRANGE_KERNEL_BY_CONSTRAINT(block_array_size, 1); \
break; \
case 2: \
GET_REARRANGE_KERNEL_BY_CONSTRAINT(block_array_size, 2); \
break; \
case 3: \
GET_REARRANGE_KERNEL_BY_CONSTRAINT(block_array_size, 3); \
break; \
case 4: \
GET_REARRANGE_KERNEL_BY_CONSTRAINT(block_array_size, 4); \
break; \
case 5: \
GET_REARRANGE_KERNEL_BY_CONSTRAINT(block_array_size, 5); \
break; \
}
#define GET_REARRANGE_KERNEL_BY_BLOCK_NUM \
switch (block_num) { \
case 1: \
GET_REARRANGE_KERNEL_BY_GRID_NUM(1); \
break; \
case 2: \
GET_REARRANGE_KERNEL_BY_GRID_NUM(2); \
break; \
case 3: \
GET_REARRANGE_KERNEL_BY_GRID_NUM(3); \
break; \
case 4: \
GET_REARRANGE_KERNEL_BY_GRID_NUM(4); \
break; \
case 5: \
GET_REARRANGE_KERNEL_BY_GRID_NUM(5); \
break; \
}
GET_REARRANGE_KERNEL_BY_BLOCK_NUM
return
utils
::
Result
<
void
*>
(
kernel_func
);
}
#endif // __REARRANGE_CUDA_KERNEL_H__
src/infiniop/ops/rearrange/operator.cc
View file @
17415721
...
@@ -6,6 +6,10 @@
...
@@ -6,6 +6,10 @@
#include "cpu/rearrange_cpu.h"
#include "cpu/rearrange_cpu.h"
#endif
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/rearrange_cuda.cuh"
#endif
__C
infiniStatus_t
infiniopCreateRearrangeDescriptor
(
__C
infiniStatus_t
infiniopCreateRearrangeDescriptor
(
infiniopHandle_t
handle
,
infiniopHandle_t
handle
,
infiniopRearrangeDescriptor_t
*
desc_ptr
,
infiniopRearrangeDescriptor_t
*
desc_ptr
,
...
@@ -26,6 +30,10 @@ __C infiniStatus_t infiniopCreateRearrangeDescriptor(
...
@@ -26,6 +30,10 @@ __C infiniStatus_t infiniopCreateRearrangeDescriptor(
CREATE
(
INFINI_DEVICE_CPU
,
cpu
);
CREATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#endif
#ifdef ENABLE_CUDA_API
CREATE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
}
...
@@ -50,6 +58,10 @@ __C infiniStatus_t infiniopRearrange(
...
@@ -50,6 +58,10 @@ __C infiniStatus_t infiniopRearrange(
CALCULATE
(
INFINI_DEVICE_CPU
,
cpu
);
CALCULATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#endif
#ifdef ENABLE_CUDA_API
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
}
...
@@ -71,6 +83,10 @@ __C infiniStatus_t infiniopDestroyRearrangeDescriptor(
...
@@ -71,6 +83,10 @@ __C infiniStatus_t infiniopDestroyRearrangeDescriptor(
DELETE
(
INFINI_DEVICE_CPU
,
cpu
);
DELETE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#endif
#ifdef ENABLE_CUDA_API
DELETE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
}
...
...
src/utils/rearrange.cc
View file @
17415721
...
@@ -138,4 +138,73 @@ void rearrange(
...
@@ -138,4 +138,73 @@ void rearrange(
}
}
}
}
utils::Result<RearrangeMeta> RearrangeMeta::distributeUnit(const std::vector<size_t> &candidates) const {
    // Current unit size: element 0 of the packed meta layout.
    size_t current_unit = _meta[0];
    // Pick the first candidate that evenly divides the current unit.
    // Candidates are tried in order, so callers should list preferred
    // (typically larger) unit sizes first.
    size_t new_unit = 0;
    for (size_t candidate : candidates) {
        if (current_unit % candidate == 0) {
            new_unit = candidate;
            break;
        }
    }
    // No candidate divides the unit: report a bad-parameter error.
    if (new_unit == 0) {
        return INFINI_STATUS_BAD_PARAM;
    }
    // Unit already equals the chosen candidate: return a copy of this meta.
    if (new_unit == current_unit) {
        return Result<RearrangeMeta>(_meta);
    }
    // Current number of dimensions.
    size_t ndim_value = this->ndim();
    // New packed layout buffer, zero-initialized. Size 2 + (ndim+1)*3 leaves
    // room for: [unit][idx strides][dst strides][src strides] with one extra
    // dimension appended for the split-off inner unit.
    // NOTE(review): exact _meta encoding is defined elsewhere — the region
    // offsets below presumably mirror RearrangeMeta's constructor; confirm.
    std::vector<ptrdiff_t> layout(2 + (ndim_value + 1) * 3, 0);
    // Store the new (smaller) unit size.
    layout[0] = new_unit;
    // Expansion factor: how many new units make up one old unit.
    ptrdiff_t extra = current_unit / new_unit;
    // Offset of the idx-stride region inside _meta.
    ptrdiff_t idx_offset = 1;
    // Pointers into the three stride regions of the new layout.
    ptrdiff_t *new_idx = layout.data() + 1;
    ptrdiff_t *new_dst = layout.data() + 2 + (ndim_value + 1);
    ptrdiff_t *new_src = layout.data() + 2 + (ndim_value + 1) * 2;
    // Copy and rescale the index strides: each old index stride is
    // multiplied by `extra` because one old unit now spans `extra` new units.
    for (size_t i = 0; i < ndim_value + 1; ++i) {
        new_idx[i] = _meta[idx_offset + i] * extra;
    }
    // The appended innermost dimension advances one new unit per step.
    new_idx[ndim_value + 1] = 1;
    // Copy destination strides and append the new inner dimension's stride
    // (new_unit — presumably strides are measured in bytes; confirm).
    for (size_t i = 0; i < ndim_value; ++i) {
        new_dst[i] = dst_strides()[i];
    }
    new_dst[ndim_value] = new_unit;
    // Copy source strides and append the new inner dimension's stride.
    for (size_t i = 0; i < ndim_value; ++i) {
        new_src[i] = src_strides()[i];
    }
    new_src[ndim_value] = new_unit;
    return Result<RearrangeMeta>(layout);
}
}
// namespace utils
}
// namespace utils
src/utils/rearrange.h
View file @
17415721
...
@@ -28,6 +28,9 @@ public:
...
@@ -28,6 +28,9 @@ public:
const
ptrdiff_t
*
src_strides
()
const
;
const
ptrdiff_t
*
src_strides
()
const
;
void
launch
(
void
*
dst
,
const
void
*
src
)
const
;
void
launch
(
void
*
dst
,
const
void
*
src
)
const
;
// 拆分 unit 到更小的规模以利于并行
utils
::
Result
<
RearrangeMeta
>
distributeUnit
(
const
std
::
vector
<
size_t
>
&
candidates
)
const
;
};
};
void
rearrange
(
void
rearrange
(
...
...
test/infiniop/rearrange.py
View file @
17415721
...
@@ -17,19 +17,88 @@ from libinfiniop import (
...
@@ -17,19 +17,88 @@ from libinfiniop import (
profile_operation
,
profile_operation
,
)
)
def row_major_strides(shape):
    """Compute row-major (C-order) strides for a tensor shape.

    The innermost (last) dimension has stride 1; each earlier dimension's
    stride is the product of all later dimension sizes.

    Args:
        shape: sequence of dimension sizes.

    Returns:
        List of row-major strides, one per dimension. An empty shape
        yields an empty list (previously a spurious ``[1]`` was returned).
    """
    if not shape:
        return []
    # Walk dimensions from last to first, accumulating the running product.
    stride = 1
    strides = [1]
    for dim in reversed(shape[1:]):
        stride *= dim
        strides.insert(0, stride)
    return strides
def column_major_strides(shape):
    """Compute column-major (Fortran-order) strides for a tensor shape.

    The first dimension has stride 1; each later dimension's stride is the
    product of all earlier dimension sizes.

    Args:
        shape: sequence of dimension sizes.

    Returns:
        List of column-major strides, one per dimension. An empty shape
        yields an empty list (previously a spurious ``[1]`` was returned).
    """
    if not shape:
        return []
    # Walk dimensions from first to last, accumulating the running product.
    stride = 1
    strides = [stride]
    for dim in shape[:-1]:
        stride *= dim
        strides.append(stride)
    return strides
# ==============================================================================
# ==============================================================================
# Configuration (Internal Use Only)
# Configuration (Internal Use Only)
# ==============================================================================
# ==============================================================================
# These are not meant to be imported from other modules
# These are not meant to be imported from other modules
_TEST_CASES
=
[
_TEST_CASES
=
[
# ((src_shape, src_stride), (dst_shape, dst_stride))
# (shape, x_stride, y_stride)
(((
2
,
4
,
32
),
None
),
((
2
,
4
,
32
),
(
256
,
64
,
1
))),
(
(((
32
,
6
,
64
),
(
64
,
2560
,
1
)),
((
32
,
6
,
64
),
None
)),
(
2
,
4
,
64
),
# shape
(((
4
,
6
,
64
),
(
64
,
2560
,
1
)),
((
4
,
6
,
64
),
(
131072
,
64
,
1
))),
(
2
,
4
,
8
),
# x_stride
(((
1
,
32
,
64
),
(
2048
,
64
,
1
)),
((
1
,
32
,
64
),
(
2048
,
64
,
1
))),
(
512
,
128
,
2
)
# y_stride
(((
32
,
1
,
64
),
(
64
,
2560
,
1
)),
((
32
,
1
,
64
),
(
64
,
64
,
1
))),
),
(((
4
,
1
,
64
),
(
64
,
2560
,
1
)),
((
4
,
1
,
64
),
(
64
,
11264
,
1
))),
(
(((
64
,),
(
1
,)),
((
64
,),
(
1
,))),
(
100
,
100
),
# shape
(
1
,
100
),
# x_stride
(
100
,
1
)
# y_stride
),
(
(
4
,
4
),
# shape
(
1
,
4
),
# x_stride
(
4
,
1
)
# y_stride
),
(
(
4
,
6
,
64
),
# shape
(
64
,
4
*
64
,
1
),
# x_stride
(
6
*
64
,
64
,
1
)
# y_stride
),
(
(
2000
,
2000
),
# shape
(
1
,
2000
),
# x_stride
(
2000
,
1
)
# y_stride
),
(
(
2001
,
2001
),
# shape
(
1
,
2001
),
# x_stride
(
2001
,
1
)
# y_stride
),
(
(
3
,
4
,
7
,
53
,
9
),
# shape
row_major_strides
((
3
,
4
,
7
,
53
,
9
)),
# x_stride
column_major_strides
((
3
,
4
,
7
,
53
,
9
))
# y_stride
),
(
(
3
,
4
,
50
,
50
,
5
,
7
),
# shape
row_major_strides
((
3
,
4
,
50
,
50
,
5
,
7
)),
# x_stride
column_major_strides
((
3
,
4
,
50
,
50
,
5
,
7
))
# y_stride
),
]
]
# Data types used for testing
# Data types used for testing
...
@@ -58,23 +127,23 @@ def test(
...
@@ -58,23 +127,23 @@ def test(
lib
,
lib
,
handle
,
handle
,
torch_device
,
torch_device
,
x_
shape
,
shape
,
x_stride
,
x_stride
,
y_shape
,
y_stride
,
y_stride
,
dtype
=
torch
.
float16
,
dtype
=
torch
.
float16
,
):
):
print
(
print
(
f
"Testing Rerrange on
{
torch_device
}
with
x_
shape:
{
x_
shape
}
x_stride:
{
x_stride
}
y_shape:
{
y_shape
}
y_stride:
{
y_stride
}
dtype:
{
dtype
}
"
f
"Testing Rerrange on
{
torch_device
}
with shape:
{
shape
}
x_stride:
{
x_stride
}
y_stride:
{
y_stride
}
dtype:
{
dtype
}
"
)
)
x
=
torch
.
rand
(
x_
shape
,
dtype
=
dtype
).
to
(
torch_device
)
x
=
torch
.
rand
(
shape
,
dtype
=
dtype
).
to
(
torch_device
)
y
=
torch
.
zeros
(
y_
shape
,
dtype
=
dtype
).
to
(
torch_device
)
y
=
torch
.
zeros
(
shape
,
dtype
=
dtype
).
to
(
torch_device
)
x
,
y
=
[
x
,
y
=
[
rearrange_if_needed
(
tensor
,
stride
)
rearrange_if_needed
(
tensor
,
stride
)
for
tensor
,
stride
in
zip
([
x
,
y
],
[
x_stride
,
y_stride
])
for
tensor
,
stride
in
zip
([
x
,
y
],
[
x_stride
,
y_stride
])
]
]
x_tensor
,
y_tensor
=
[
to_tensor
(
tensor
,
lib
)
for
tensor
in
[
x
,
y
]]
x_tensor
,
y_tensor
=
[
to_tensor
(
tensor
,
lib
)
for
tensor
in
[
x
,
y
]]
descriptor
=
infiniopRearrangeDescriptor_t
()
descriptor
=
infiniopRearrangeDescriptor_t
()
...
@@ -86,7 +155,7 @@ def test(
...
@@ -86,7 +155,7 @@ def test(
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for
tensor
in
[
x_tensor
,
y_tensor
]:
for
tensor
in
[
x_tensor
,
y_tensor
]:
tensor
.
des
criptor
.
contents
.
invalidate
(
)
tensor
.
des
troyDesc
(
lib
)
def
lib_rearrange
():
def
lib_rearrange
():
check_error
(
check_error
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment