Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
a578ff93
Commit
a578ff93
authored
Oct 01, 2020
by
Chao Liu
Browse files
fixed a bug; tested more transformation, no scrach mem
parent
c15a3c09
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
369 additions
and
107 deletions
+369
-107
composable_kernel/include/kernel_algorithm/dummy_dynamic_transform_v2.hpp
...l/include/kernel_algorithm/dummy_dynamic_transform_v2.hpp
+249
-62
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp
...lude/tensor_description/dynamic_multi_index_transform.hpp
+2
-2
composable_kernel/include/tensor_description/dynamic_tensor_descriptor_v2.hpp
...clude/tensor_description/dynamic_tensor_descriptor_v2.hpp
+2
-1
composable_kernel/include/utility/array_helper.hpp
composable_kernel/include/utility/array_helper.hpp
+16
-8
composable_kernel/include/utility/config.amd.hpp.in
composable_kernel/include/utility/config.amd.hpp.in
+1
-1
composable_kernel/include/utility/print.hpp
composable_kernel/include/utility/print.hpp
+12
-0
driver/include/device_dummy_dynamic_transform_v2.hpp
driver/include/device_dummy_dynamic_transform_v2.hpp
+85
-31
driver/src/conv_driver.cpp
driver/src/conv_driver.cpp
+2
-2
No files found.
composable_kernel/include/kernel_algorithm/dummy_dynamic_transform_v2.hpp
View file @
a578ff93
...
@@ -7,44 +7,44 @@
...
@@ -7,44 +7,44 @@
namespace
ck
{
namespace
ck
{
template
<
typename
Wei
Desc
,
typename
In
Desc
,
typename
Out
Desc
>
template
<
typename
...
Wei
,
typename
...
In
,
typename
...
Out
>
__host__
__device__
constexpr
auto
__host__
__device__
constexpr
auto
map_convolution_into_gemm_fwd_v4r4
(
map_convolution_into_gemm_v2
(
const
WeiDesc
&
wei_k_c_y_x_global_desc
,
const
DynamicTensorDescriptor_v2
<
Wei
...
>
&
wei_k_c_y_x_global_desc
,
const
InDesc
&
in_n_c_hi_wi_global_desc
,
const
DynamicTensorDescriptor_v2
<
In
...
>
&
in_n_c_hi_wi_global_desc
,
const
OutDesc
&
out_n_k_ho_wo_global_desc
,
const
DynamicTensorDescriptor_v2
<
Out
...
>
&
out_n_k_ho_wo_global_desc
,
const
MultiIndex
<
2
>
conv_strides
,
const
MultiIndex
<
2
>
conv_strides
,
const
MultiIndex
<
2
>
conv_dilations
,
const
MultiIndex
<
2
>
conv_dilations
,
const
MultiIndex
<
2
>
in_left_pads
,
const
MultiIndex
<
2
>
in_left_pads
,
const
MultiIndex
<
2
>
in_right_pads
)
const
MultiIndex
<
2
>
in_right_pads
)
{
{
constexpr
auto
i
0
=
Number
<
0
>
{};
constexpr
auto
I
0
=
Number
<
0
>
{};
constexpr
auto
i
1
=
Number
<
1
>
{};
constexpr
auto
I
1
=
Number
<
1
>
{};
constexpr
auto
i
2
=
Number
<
2
>
{};
constexpr
auto
I
2
=
Number
<
2
>
{};
constexpr
auto
i
3
=
Number
<
3
>
{};
constexpr
auto
I
3
=
Number
<
3
>
{};
const
index_t
N
=
in_n_c_hi_wi_global_desc
.
GetLength
(
i
0
);
const
index_t
N
=
in_n_c_hi_wi_global_desc
.
GetLength
(
I
0
);
const
index_t
C
=
in_n_c_hi_wi_global_desc
.
GetLength
(
i
1
);
const
index_t
C
=
in_n_c_hi_wi_global_desc
.
GetLength
(
I
1
);
const
index_t
K
=
out_n_k_ho_wo_global_desc
.
GetLength
(
i
1
);
const
index_t
K
=
out_n_k_ho_wo_global_desc
.
GetLength
(
I
1
);
const
index_t
Y
=
wei_k_c_y_x_global_desc
.
GetLength
(
i
2
);
const
index_t
Y
=
wei_k_c_y_x_global_desc
.
GetLength
(
I
2
);
const
index_t
X
=
wei_k_c_y_x_global_desc
.
GetLength
(
i
3
);
const
index_t
X
=
wei_k_c_y_x_global_desc
.
GetLength
(
I
3
);
const
index_t
Hi
=
in_n_c_hi_wi_global_desc
.
GetLength
(
i
2
);
const
index_t
Hi
=
in_n_c_hi_wi_global_desc
.
GetLength
(
I
2
);
const
index_t
Wi
=
in_n_c_hi_wi_global_desc
.
GetLength
(
i
3
);
const
index_t
Wi
=
in_n_c_hi_wi_global_desc
.
GetLength
(
I
3
);
const
index_t
Ho
=
out_n_k_ho_wo_global_desc
.
GetLength
(
i
2
);
const
index_t
Ho
=
out_n_k_ho_wo_global_desc
.
GetLength
(
I
2
);
const
index_t
Wo
=
out_n_k_ho_wo_global_desc
.
GetLength
(
i
3
);
const
index_t
Wo
=
out_n_k_ho_wo_global_desc
.
GetLength
(
I
3
);
const
index_t
ConvStrideH
=
conv_strides
[
i
0
];
const
index_t
ConvStrideH
=
conv_strides
[
I
0
];
const
index_t
ConvStrideW
=
conv_strides
[
i
1
];
const
index_t
ConvStrideW
=
conv_strides
[
I
1
];
const
index_t
ConvDilationH
=
conv_dilations
[
i
0
];
const
index_t
ConvDilationH
=
conv_dilations
[
I
0
];
const
index_t
ConvDilationW
=
conv_dilations
[
i
1
];
const
index_t
ConvDilationW
=
conv_dilations
[
I
1
];
const
index_t
InLeftPadH
=
in_left_pads
[
i
0
];
const
index_t
InLeftPadH
=
in_left_pads
[
I
0
];
const
index_t
InLeftPadW
=
in_left_pads
[
i
1
];
const
index_t
InLeftPadW
=
in_left_pads
[
I
1
];
const
index_t
InRightPadH
=
in_right_pads
[
i
0
];
const
index_t
InRightPadH
=
in_right_pads
[
I
0
];
const
index_t
InRightPadW
=
in_right_pads
[
i
1
];
const
index_t
InRightPadW
=
in_right_pads
[
I
1
];
// input tensor
// input tensor
const
auto
in_n_c_hip_wip_global_desc
=
transform_dynamic_tensor_descriptor_v2
(
const
auto
in_n_c_hip_wip_global_desc
=
transform_dynamic_tensor_descriptor_v2
(
...
@@ -63,8 +63,8 @@ map_convolution_into_gemm_v2(const WeiDesc& wei_k_c_y_x_global_desc,
...
@@ -63,8 +63,8 @@ map_convolution_into_gemm_v2(const WeiDesc& wei_k_c_y_x_global_desc,
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
const
index_t
Hip
=
in_n_c_hip_wip_global_desc
.
GetLength
(
i
2
);
const
index_t
Hip
=
in_n_c_hip_wip_global_desc
.
GetLength
(
I
2
);
const
index_t
Wip
=
in_n_c_hip_wip_global_desc
.
GetLength
(
i
3
);
const
index_t
Wip
=
in_n_c_hip_wip_global_desc
.
GetLength
(
I
3
);
const
auto
in_n_c_y_ho_x_wo_global_desc
=
transform_dynamic_tensor_descriptor_v2
(
const
auto
in_n_c_y_ho_x_wo_global_desc
=
transform_dynamic_tensor_descriptor_v2
(
in_n_c_hip_wip_global_desc
,
in_n_c_hip_wip_global_desc
,
...
@@ -76,16 +76,127 @@ map_convolution_into_gemm_v2(const WeiDesc& wei_k_c_y_x_global_desc,
...
@@ -76,16 +76,127 @@ map_convolution_into_gemm_v2(const WeiDesc& wei_k_c_y_x_global_desc,
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
,
3
>
{},
Sequence
<
4
,
5
>
{}));
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
,
3
>
{},
Sequence
<
4
,
5
>
{}));
const
auto
in_gemmk_gemmn_global_desc
=
transform_dynamic_tensor_descriptor_v2
(
const
auto
in_gemmk
total
_gemmn_global_desc
=
transform_dynamic_tensor_descriptor_v2
(
in_n_c_y_ho_x_wo_global_desc
,
in_n_c_y_ho_x_wo_global_desc
,
make_tuple
(
DynamicMerge
<
3
>
{
make_multi_index
(
C
,
Y
,
X
)},
make_tuple
(
DynamicMerge
<
3
>
{
make_multi_index
(
C
,
Y
,
X
)},
DynamicMerge
<
3
>
{
make_multi_index
(
N
,
Ho
,
Wo
)}),
DynamicMerge
<
3
>
{
make_multi_index
(
N
,
Ho
,
Wo
)}),
make_tuple
(
Sequence
<
1
,
2
,
4
>
{},
Sequence
<
0
,
3
,
5
>
{}),
make_tuple
(
Sequence
<
1
,
2
,
4
>
{},
Sequence
<
0
,
3
,
5
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
return
make_tuple
(
in_gemmk_gemmn_global_desc
);
const
index_t
gemmktotal
=
in_gemmktotal_gemmn_global_desc
.
GetLength
(
I0
);
const
index_t
gemmn
=
in_gemmktotal_gemmn_global_desc
.
GetLength
(
I1
);
constexpr
index_t
GemmKPack
=
8
;
const
index_t
gemmk
=
gemmktotal
/
GemmKPack
;
const
auto
in_gemmk_gemmn_gemmkpack_global_desc
=
transform_dynamic_tensor_descriptor_v2
(
in_gemmktotal_gemmn_global_desc
,
make_tuple
(
DynamicUnMerge
<
2
>
{
make_multi_index
(
gemmk
,
GemmKPack
)},
DynamicPassThrough
{
gemmn
}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}),
make_tuple
(
Sequence
<
0
,
2
>
{},
Sequence
<
1
>
{}));
static_assert
(
decltype
(
in_gemmk_gemmn_gemmkpack_global_desc
)
::
GetNumOfDimension
()
==
3
,
"wrong!"
);
return
make_tuple
(
in_gemmk_gemmn_gemmkpack_global_desc
);
}
}
#if 0
template <typename... Wei, typename... In, typename... Out>
__host__ __device__ constexpr auto map_convolution_into_gemm_bwd_v4r1(
const DynamicTensorDescriptor_v2<Wei...>& wei_k_c_y_x_global_desc,
const DynamicTensorDescriptor_v2<In...>& in_n_c_hi_wi_global_desc,
const DynamicTensorDescriptor_v2<Out...>& out_n_k_ho_wo_global_desc,
const MultiIndex<2> conv_strides,
const MultiIndex<2> conv_dilations,
const MultiIndex<2> in_left_pads,
const MultiIndex<2> in_right_pads)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
const index_t N = in_n_c_hi_wi_global_desc.GetLength(I0);
const index_t C = in_n_c_hi_wi_global_desc.GetLength(I1);
const index_t K = out_n_k_ho_wo_global_desc.GetLength(I1);
const index_t Y = wei_k_c_y_x_global_desc.GetLength(I2);
const index_t X = wei_k_c_y_x_global_desc.GetLength(I3);
const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(I2);
const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(I3);
const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(I2);
const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(I3);
const index_t ConvStrideH = conv_strides[I0];
const index_t ConvStrideW = conv_strides[I1];
const index_t ConvDilationH = conv_dilations[I0];
const index_t ConvDilationW = conv_dilations[I1];
const index_t InLeftPadH = in_left_pads[I0];
const index_t InLeftPadW = in_left_pads[I1];
const index_t InRightPadH = in_right_pads[I0];
const index_t InRightPadW = in_right_pads[I1];
#if !CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK
constexpr bool out_skip_out_of_bound_check = false;
#else
constexpr bool out_skip_out_of_bound_check = true;
#endif
constexpr
auto
out_n_k_ydot_htilda_xdot_wtilda_global_desc
=
transform_tensor_descriptor_v2
(
out_n_k_ho_wo_global_desc
,
make_tuple
(
PassThrough
{
N
},
PassThrough
{
K
},
Embed
<
2
>
{
make_multi_index
(
YDot
,
HTilda
),
make_multi_index
(
-
ConvDilationH
/
GcdStrideDilationH
,
1
)},
Embed
<
2
>
{
make_multi_index
(
XDot
,
WTilda
),
make_multi_index
(
-
ConvDilationW
/
GcdStrideDilationW
,
1
)}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
,
3
>
{},
Sequence
<
4
,
5
>
{}));
constexpr
auto
out_n_k_ydot_htildaslice_xdot_wtildaslice_global_desc
=
transform_tensor_descriptor_v2
(
out_n_k_ydot_htilda_xdot_wtilda_global_desc
,
make_tuple
(
PassThrough
{
N
},
PassThrough
{
K
},
PassThrough
{
YDot
},
PassThrough
{
XDot
},
Slice
<
Sequence
<
HTilda
,
WTilda
>
,
Sequence
<
iHTildaLeft
,
iWTildaLeft
>
,
Sequence
<
iHTildaRight
,
iWTildaRight
>>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
4
>
{},
Sequence
<
3
,
5
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
4
>
{},
Sequence
<
3
,
5
>
{}));
constexpr
auto
out_n_k_ydotslice_htildaslice_xdotslice_wtildaslice_global_desc
=
transform_tensor_descriptor
(
out_n_k_ydot_htildaslice_xdot_wtildaslice_global_desc
,
make_tuple
(
PassThrough
<
N
>
{},
PassThrough
<
K
>
{},
PassThrough
<
HTildaSlice
>
{},
PassThrough
<
WTildaSlice
>
{},
Slice
<
Sequence
<
YDot
,
XDot
>
,
Sequence
<
0
,
0
>
,
Sequence
<
YDotSlice
,
XDotSlice
>>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
3
>
{},
Sequence
<
5
>
{},
Sequence
<
2
,
4
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
3
>
{},
Sequence
<
5
>
{},
Sequence
<
2
,
4
>
{}));
constexpr
auto
out_gemmk_gemmn_global_desc
=
transform_tensor_descriptor
(
out_n_k_ydotslice_htildaslice_xdotslice_wtildaslice_global_desc
,
make_tuple
(
Merge
<
Sequence
<
K
,
YDotSlice
,
XDotSlice
>>
{},
Merge
<
Sequence
<
N
,
HTildaSlice
,
WTildaSlice
>>
{}),
make_tuple
(
Sequence
<
1
,
2
,
4
>
{},
Sequence
<
0
,
3
,
5
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
return
make_tuple
(
in_gemmk_gemmn_gemmkpack_global_desc
);
}
#endif
template
<
index_t
BlockSize
>
template
<
index_t
BlockSize
>
struct
DummyDynamicTransform_v2_1
struct
DummyDynamicTransform_v2_1
{
{
...
@@ -102,13 +213,13 @@ struct DummyDynamicTransform_v2_1
...
@@ -102,13 +213,13 @@ struct DummyDynamicTransform_v2_1
const
MultiIndex
<
2
>
in_right_pads
)
const
const
MultiIndex
<
2
>
in_right_pads
)
const
{
{
const
auto
transformed_tensor_descs
=
const
auto
transformed_tensor_descs
=
map_convolution_into_gemm
_v2
(
move
(
wei_k_c_y_x_global_desc
),
map_convolution_into_gemm
(
move
(
wei_k_c_y_x_global_desc
),
move
(
in_n_c_hi_wi_global_desc
),
move
(
in_n_c_hi_wi_global_desc
),
move
(
out_n_k_ho_wo_global_desc
),
move
(
out_n_k_ho_wo_global_desc
),
conv_strides
,
conv_strides
,
conv_dilations
,
conv_dilations
,
in_left_pads
,
in_left_pads
,
in_right_pads
);
in_right_pads
);
const
auto
in_gemmk_gemmn_global_desc
=
transformed_tensor_descs
.
At
(
Number
<
0
>
{});
const
auto
in_gemmk_gemmn_global_desc
=
transformed_tensor_descs
.
At
(
Number
<
0
>
{});
...
@@ -302,31 +413,42 @@ struct DummyDynamicTransform_v2_1
...
@@ -302,31 +413,42 @@ struct DummyDynamicTransform_v2_1
};
};
template
<
index_t
BlockSize
>
template
<
index_t
BlockSize
>
struct
DummyDynamicTransform_v2_
2
struct
DummyDynamicTransform_v2_
fwd_v4r4
{
{
template
<
typename
TransformInDesc
>
template
<
typename
TransformInDesc
>
__device__
void
Run
(
index_t
*
const
__restrict__
p_wei_global
,
__device__
void
Run
(
index_t
*
const
__restrict__
p_wei_global
,
float
*
const
__restrict__
p_in_global
,
float
*
const
__restrict__
p_in_global
,
float
*
const
__restrict__
p_out_global
,
float
*
const
__restrict__
p_out_global
,
const
TransformInDesc
in_gemmk_gemmn_global_desc
)
const
const
TransformInDesc
in_gemmk_gemmn_
gemmkpack_
global_desc
)
const
{
{
MultiIndex
<
2
>
idx
;
MultiIndex
<
3
>
idx
;
// initialize idx
// initialize idx
static_for
<
0
,
2
,
1
>
{}([
&
](
auto
i
)
{
idx
(
i
)
=
p_wei_global
[
get_thread_local_1d_id
()
+
i
];
});
static_for
<
0
,
3
,
1
>
{}([
&
](
auto
i
)
{
idx
(
i
)
=
p_wei_global
[
get_thread_local_1d_id
()
+
i
];
});
const
index_t
niter
=
p_wei_global
[
10
];
const
index_t
niter
=
p_wei_global
[
10
];
auto
in_gemmk_gemmn_coord
=
auto
in_gemmk_gemmn_
gemmkpack_
coord
=
make_dynamic_tensor_coordinate_v2
(
in_gemmk_gemmn_global_desc
,
idx
);
make_dynamic_tensor_coordinate_v2
(
in_gemmk_gemmn_
gemmkpack_
global_desc
,
idx
);
const
auto
in_gemmk_gemmn_coord_step
=
make_dynamic_tensor_coordinate_step_v2
(
const
auto
in_gemmk_gemmn_gemmkpack_coord_step_0_0_1
=
in_gemmk_gemmn_global_desc
,
make_multi_index
(
1
,
0
));
make_dynamic_tensor_coordinate_step_v2
(
in_gemmk_gemmn_gemmkpack_global_desc
,
make_multi_index
(
0
,
0
,
1
));
const
auto
in_gemmk_gemmn_gemmkpack_coord_step_0_1_0
=
make_dynamic_tensor_coordinate_step_v2
(
in_gemmk_gemmn_gemmkpack_global_desc
,
make_multi_index
(
0
,
1
,
0
));
const
auto
in_gemmk_gemmn_gemmkpack_coord_step_1_0_0
=
make_dynamic_tensor_coordinate_step_v2
(
in_gemmk_gemmn_gemmkpack_global_desc
,
make_multi_index
(
1
,
0
,
0
));
// move (0, 0, 1)
for
(
index_t
iter
=
0
;
iter
<
niter
;
++
iter
)
for
(
index_t
iter
=
0
;
iter
<
niter
;
++
iter
)
{
{
move_dynamic_tensor_coordinate_v2
(
move_dynamic_tensor_coordinate_v2
(
in_gemmk_gemmn_gemmkpack_global_desc
,
in_gemmk_gemmn_global_desc
,
in_gemmk_gemmn_coord
,
in_gemmk_gemmn_coord_step
);
in_gemmk_gemmn_gemmkpack_coord
,
in_gemmk_gemmn_gemmkpack_coord_step_0_0_1
);
// write
// write
float
value
=
1
;
float
value
=
1
;
...
@@ -337,19 +459,84 @@ struct DummyDynamicTransform_v2_2
...
@@ -337,19 +459,84 @@ struct DummyDynamicTransform_v2_2
AddressSpace
::
Global
,
AddressSpace
::
Global
,
InMemoryDataOperation
::
Set
,
InMemoryDataOperation
::
Set
,
1
,
1
,
1
>
(
&
value
,
1
>
(
0
,
&
value
,
true
,
0
,
1
,
true
,
p_out_global
,
1
,
in_gemmk_gemmn_coord
.
GetOffset
(),
p_out_global
,
#if 0
in_gemmk_gemmn_gemmkpack_coord
.
GetOffset
(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(
#if 1
in_gemmk_gemmn_global_desc, in_gemmk_gemmn_coord),
coordinate_has_valid_offset_assuming_visible_index_is_valid
(
in_gemmk_gemmn_gemmkpack_global_desc
,
in_gemmk_gemmn_gemmkpack_coord
),
#else
#else
true
,
true
,
#endif
#endif
in_gemmk_gemmn_global_desc
.
GetElementSpaceSize
());
in_gemmk_gemmn_gemmkpack_global_desc
.
GetElementSpaceSize
());
}
// move (0, 1, 0)
for
(
index_t
iter
=
0
;
iter
<
niter
;
++
iter
)
{
move_dynamic_tensor_coordinate_v2
(
in_gemmk_gemmn_gemmkpack_global_desc
,
in_gemmk_gemmn_gemmkpack_coord
,
in_gemmk_gemmn_gemmkpack_coord_step_0_1_0
);
// write
float
value
=
1
;
transfer_data
<
float
,
1
,
AddressSpace
::
Vgpr
,
AddressSpace
::
Global
,
InMemoryDataOperation
::
Set
,
1
,
1
>
(
&
value
,
0
,
true
,
1
,
p_out_global
,
in_gemmk_gemmn_gemmkpack_coord
.
GetOffset
(),
#if 1
coordinate_has_valid_offset_assuming_visible_index_is_valid
(
in_gemmk_gemmn_gemmkpack_global_desc
,
in_gemmk_gemmn_gemmkpack_coord
),
#else
true
,
#endif
in_gemmk_gemmn_gemmkpack_global_desc
.
GetElementSpaceSize
());
}
// move (1, 0, 0)
for
(
index_t
iter
=
0
;
iter
<
niter
;
++
iter
)
{
move_dynamic_tensor_coordinate_v2
(
in_gemmk_gemmn_gemmkpack_global_desc
,
in_gemmk_gemmn_gemmkpack_coord
,
in_gemmk_gemmn_gemmkpack_coord_step_1_0_0
);
// write
float
value
=
1
;
transfer_data
<
float
,
1
,
AddressSpace
::
Vgpr
,
AddressSpace
::
Global
,
InMemoryDataOperation
::
Set
,
1
,
1
>
(
&
value
,
0
,
true
,
1
,
p_out_global
,
in_gemmk_gemmn_gemmkpack_coord
.
GetOffset
(),
#if 1
coordinate_has_valid_offset_assuming_visible_index_is_valid
(
in_gemmk_gemmn_gemmkpack_global_desc
,
in_gemmk_gemmn_gemmkpack_coord
),
#else
true
,
#endif
in_gemmk_gemmn_gemmkpack_global_desc
.
GetElementSpaceSize
());
}
}
}
}
};
};
...
...
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp
View file @
a578ff93
...
@@ -371,7 +371,7 @@ struct DynamicMerge
...
@@ -371,7 +371,7 @@ struct DynamicMerge
__host__
__device__
constexpr
DynamicMerge
(
const
LowerIndex
&
low_lengths
)
__host__
__device__
constexpr
DynamicMerge
(
const
LowerIndex
&
low_lengths
)
:
low_lengths_
{
low_lengths
},
:
low_lengths_
{
low_lengths
},
low_lengths_scan_
{
container_reverse_exclusive_scan
<
index_t
,
NDimLow
>
(
low_lengths_scan_
{
container_reverse_exclusive_scan
(
low_lengths
,
math
::
multiplies
<
index_t
>
{},
index_t
{
1
})},
low_lengths
,
math
::
multiplies
<
index_t
>
{},
index_t
{
1
})},
up_lengths_
{
make_multi_index
(
up_lengths_
{
make_multi_index
(
container_reduce
(
low_lengths
,
math
::
multiplies
<
index_t
>
(),
index_t
{
1
}))}
container_reduce
(
low_lengths
,
math
::
multiplies
<
index_t
>
(),
index_t
{
1
}))}
...
@@ -536,7 +536,7 @@ struct DynamicUnMerge
...
@@ -536,7 +536,7 @@ struct DynamicUnMerge
__host__
__device__
constexpr
void
CalculateLowerIndex
(
LowIdx
&
idx_low
,
__host__
__device__
constexpr
void
CalculateLowerIndex
(
LowIdx
&
idx_low
,
const
UpIdx
&
idx_up
)
const
const
UpIdx
&
idx_up
)
const
{
{
idx_low
(
Number
<
0
>
{})
=
idx_up
[
Number
<
NDimUp
>
{}];
idx_low
(
Number
<
0
>
{})
=
idx_up
[
Number
<
NDimUp
-
1
>
{}];
static_for
<
0
,
NDimUp
-
1
,
1
>
{}(
static_for
<
0
,
NDimUp
-
1
,
1
>
{}(
[
&
](
auto
i
)
{
idx_low
(
Number
<
0
>
{})
+=
idx_up
[
i
]
*
up_lengths_scan_
[
i
];
});
[
&
](
auto
i
)
{
idx_low
(
Number
<
0
>
{})
+=
idx_up
[
i
]
*
up_lengths_scan_
[
i
];
});
...
...
composable_kernel/include/tensor_description/dynamic_tensor_descriptor_v2.hpp
View file @
a578ff93
...
@@ -445,7 +445,8 @@ make_dynamic_tensor_coordinate_v2(const TensorDesc& tensor_desc, const VisibleIn
...
@@ -445,7 +445,8 @@ make_dynamic_tensor_coordinate_v2(const TensorDesc& tensor_desc, const VisibleIn
idx_hidden_pick_visible
=
idx_visible
;
idx_hidden_pick_visible
=
idx_visible
;
// calculate hidden index
// calculate hidden index
static_for
<
ntransform
-
1
,
-
1
,
-
1
>
{}([
&
tensor_desc
,
&
idx_hidden
](
auto
itran
)
{
static_for
<
ntransform
,
0
,
-
1
>
{}([
&
tensor_desc
,
&
idx_hidden
](
auto
itran_p1
)
{
auto
itran
=
itran_p1
-
Number
<
1
>
{};
const
auto
&
tran
=
tensor_desc
.
GetTransforms
().
At
(
itran
);
const
auto
&
tran
=
tensor_desc
.
GetTransforms
().
At
(
itran
);
constexpr
auto
dims_low
=
TensorDesc
::
GetLowerDimensionIdss
().
At
(
itran
);
constexpr
auto
dims_low
=
TensorDesc
::
GetLowerDimensionIdss
().
At
(
itran
);
constexpr
auto
dims_up
=
TensorDesc
::
GetUpperDimensionIdss
().
At
(
itran
);
constexpr
auto
dims_up
=
TensorDesc
::
GetUpperDimensionIdss
().
At
(
itran
);
...
...
composable_kernel/include/utility/array_helper.hpp
View file @
a578ff93
...
@@ -112,11 +112,17 @@ container_reverse_exclusive_scan(const Array<TData, NSize>& x, Reduce f, TData i
...
@@ -112,11 +112,17 @@ container_reverse_exclusive_scan(const Array<TData, NSize>& x, Reduce f, TData i
return
y
;
return
y
;
}
}
template
<
typename
TData
,
index_t
NSize
,
typename
Reduce
>
// Here should use StaticallyIndexedArray<TData, NSize>, instead of Tuple<Xs...>,
__host__
__device__
constexpr
auto
container_reverse_exclusive_scan
(
// although the former is the alias of the latter. This is because compiler cannot
const
StaticallyIndexedArray
<
TData
,
NSize
>&
x
,
Reduce
f
,
TData
init
)
// infer the NSize if using StaticallyIndexedArray<TData, NSize>
// TODO: how to fix this?
template
<
typename
...
Xs
,
typename
Reduce
,
typename
TData
>
__host__
__device__
constexpr
auto
container_reverse_exclusive_scan
(
const
Tuple
<
Xs
...
>&
x
,
Reduce
f
,
TData
init
)
{
{
StaticallyIndexedArray
<
TData
,
NSize
>
y
;
constexpr
index_t
NSize
=
sizeof
...(
Xs
);
Tuple
<
Xs
...
>
y
;
TData
r
=
init
;
TData
r
=
init
;
...
@@ -130,11 +136,13 @@ __host__ __device__ constexpr auto container_reverse_exclusive_scan(
...
@@ -130,11 +136,13 @@ __host__ __device__ constexpr auto container_reverse_exclusive_scan(
return
y
;
return
y
;
}
}
template
<
typename
TData
,
index_t
NSiz
e
,
typename
Reduce
>
template
<
typename
...
Xs
,
typename
Reduc
e
,
typename
TData
>
__host__
__device__
constexpr
auto
container_reverse_inclusive_scan
(
__host__
__device__
constexpr
auto
const
StaticallyIndexedArray
<
TData
,
NSize
>&
x
,
Reduce
f
,
TData
init
)
container_reverse_inclusive_scan
(
const
Tuple
<
Xs
...
>&
x
,
Reduce
f
,
TData
init
)
{
{
StaticallyIndexedArray
<
TData
,
NSize
>
y
;
constexpr
index_t
NSize
=
sizeof
...(
Xs
);
Tuple
<
Xs
...
>
y
;
TData
r
=
init
;
TData
r
=
init
;
...
...
composable_kernel/include/utility/config.amd.hpp.in
View file @
a578ff93
...
@@ -9,7 +9,7 @@
...
@@ -9,7 +9,7 @@
#define CK_UNSIGNED_INDEX_TYPE 0
#define CK_UNSIGNED_INDEX_TYPE 0
// multi index
// multi index
#define CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX
1
#define CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX
0
// device backend
// device backend
#define CK_DEVICE_BACKEND_AMD 1
#define CK_DEVICE_BACKEND_AMD 1
...
...
composable_kernel/include/utility/print.hpp
View file @
a578ff93
...
@@ -14,6 +14,7 @@ __host__ __device__ void print_array(const char* s, T a)
...
@@ -14,6 +14,7 @@ __host__ __device__ void print_array(const char* s, T a)
using
data_type
=
decltype
(
a
.
At
(
Number
<
0
>
{}));
using
data_type
=
decltype
(
a
.
At
(
Number
<
0
>
{}));
constexpr
index_t
nsize
=
a
.
Size
();
constexpr
index_t
nsize
=
a
.
Size
();
#if 0
if constexpr(is_same<data_type, uint32_t>{})
if constexpr(is_same<data_type, uint32_t>{})
{
{
printf("%s size %u, {", s, nsize);
printf("%s size %u, {", s, nsize);
...
@@ -32,6 +33,11 @@ __host__ __device__ void print_array(const char* s, T a)
...
@@ -32,6 +33,11 @@ __host__ __device__ void print_array(const char* s, T a)
static_for<0, nsize, 1>{}([&a](auto i) constexpr { printf("%d, ", bool{a[i]}); });
static_for<0, nsize, 1>{}([&a](auto i) constexpr { printf("%d, ", bool{a[i]}); });
printf("}\n");
printf("}\n");
}
}
#else
printf
(
"%s size %d, {"
,
s
,
nsize
);
static_for
<
0
,
nsize
,
1
>
{}([
&
a
](
auto
i
)
constexpr
{
printf
(
"%d, "
,
int32_t
{
a
[
i
]});
});
printf
(
"}
\n
"
);
#endif
}
}
template
<
typename
T
>
template
<
typename
T
>
...
@@ -40,6 +46,7 @@ __host__ __device__ void print_array_v2(const char* s, T a)
...
@@ -40,6 +46,7 @@ __host__ __device__ void print_array_v2(const char* s, T a)
using
data_type
=
decltype
(
a
.
At
(
Number
<
0
>
{}));
using
data_type
=
decltype
(
a
.
At
(
Number
<
0
>
{}));
constexpr
index_t
nsize
=
a
.
Size
();
constexpr
index_t
nsize
=
a
.
Size
();
#if 0
if constexpr(is_same<data_type, uint32_t>{})
if constexpr(is_same<data_type, uint32_t>{})
{
{
printf("%s size %u, {", s, nsize);
printf("%s size %u, {", s, nsize);
...
@@ -52,6 +59,11 @@ __host__ __device__ void print_array_v2(const char* s, T a)
...
@@ -52,6 +59,11 @@ __host__ __device__ void print_array_v2(const char* s, T a)
static_for<0, nsize, 1>{}([&a](auto i) constexpr { printf("[%d] %d, ", i.value, a[i]); });
static_for<0, nsize, 1>{}([&a](auto i) constexpr { printf("[%d] %d, ", i.value, a[i]); });
printf("}\n");
printf("}\n");
}
}
#else
printf
(
"%s size %d, {"
,
s
,
nsize
);
static_for
<
0
,
nsize
,
1
>
{}([
&
a
](
auto
i
)
constexpr
{
printf
(
"[%d] %d, "
,
i
.
value
,
a
[
i
]);
});
printf
(
"}
\n
"
);
#endif
}
}
}
// namespace ck
}
// namespace ck
...
...
driver/include/device_dummy_dynamic_transform_v2.hpp
View file @
a578ff93
...
@@ -40,36 +40,89 @@ void device_dummy_dynamic_transform_v2(InDesc,
...
@@ -40,36 +40,89 @@ void device_dummy_dynamic_transform_v2(InDesc,
const
auto
in_left_pads
=
to_multi_index
(
InLeftPads
{});
const
auto
in_left_pads
=
to_multi_index
(
InLeftPads
{});
const
auto
in_right_pads
=
to_multi_index
(
InRightPads
{});
const
auto
in_right_pads
=
to_multi_index
(
InRightPads
{});
const
auto
tensor_descs
=
map_convolution_into_gemm_
v2
(
wei_kcyx_desc
,
const
auto
tensor_descs
=
map_convolution_into_gemm_
fwd_v4r4
(
wei_kcyx_desc
,
in_nchw_desc
,
in_nchw_desc
,
out_nkhw_desc
,
out_nkhw_desc
,
conv_strides
,
conv_strides
,
conv_dilations
,
conv_dilations
,
in_left_pads
,
in_left_pads
,
in_right_pads
);
in_right_pads
);
const
auto
in_gemmk_gemmn_global_desc
=
tensor_descs
.
At
(
Number
<
0
>
{});
const
auto
in_gemmk_gemmn_
gemmkpack_
global_desc
=
tensor_descs
.
At
(
Number
<
0
>
{});
// test on cpu
// test on cpu
{
{
auto
in_gemmk_gemmn_
coord
=
auto
in_gemmk_gemmn_
gemmkpack_coord
=
make_dynamic_tensor_coordinate_v2
(
make_dynamic_tensor_coordinate_v2
(
in_gemmk_gemmn_global_desc
,
make_multi_index
(
0
,
0
));
in_gemmk_gemmn
_gemmkpack
_global_desc
,
make_multi_index
(
0
,
0
,
0
));
const
auto
in_gemmk_gemmn_coord_step
=
make_dynamic_tensor_coordinate_step_v2
(
const
auto
in_gemmk_gemmn_gemmkpack_coord_step_0_0_1
=
in_gemmk_gemmn_global_desc
,
make_multi_index
(
1
,
0
));
make_dynamic_tensor_coordinate_step_v2
(
in_gemmk_gemmn_gemmkpack_global_desc
,
make_multi_index
(
0
,
0
,
1
));
print_array
(
"do_tansforms: "
,
in_gemmk_gemmn_coord_step
.
do_transforms_
);
print_array_v2
(
"do_tansforms 0 0 1: "
,
in_gemmk_gemmn_gemmkpack_coord_step_0_0_1
.
do_transforms_
);
for
(
index_t
iter
=
0
;
iter
<
10
;
++
iter
)
for
(
index_t
iter
=
0
;
iter
<
10
;
++
iter
)
{
{
printf
(
"iter %d
\n
"
,
iter
);
printf
(
"iter %d
\n
"
,
iter
);
print_array
(
"idx: "
,
in_gemmk_gemmn_coord
.
GetIndex
());
print_array
_v2
(
"idx: "
,
in_gemmk_gemmn_
gemmkpack_
coord
.
GetIndex
());
print_array
(
"hidden idx: "
,
in_gemmk_gemmn_coord
.
GetHiddenIndex
());
print_array
_v2
(
"hidden idx: "
,
in_gemmk_gemmn_
gemmkpack_
coord
.
GetHiddenIndex
());
printf
(
"offset: %d
\n
"
,
in_gemmk_gemmn_coord
.
GetOffset
());
printf
(
"offset: %d
\n
"
,
in_gemmk_gemmn_
gemmkpack_
coord
.
GetOffset
());
printf
(
"
\n
"
);
printf
(
"
\n
"
);
move_dynamic_tensor_coordinate_v2
(
move_dynamic_tensor_coordinate_v2
(
in_gemmk_gemmn_gemmkpack_global_desc
,
in_gemmk_gemmn_global_desc
,
in_gemmk_gemmn_coord
,
in_gemmk_gemmn_coord_step
);
in_gemmk_gemmn_gemmkpack_coord
,
in_gemmk_gemmn_gemmkpack_coord_step_0_0_1
);
}
}
{
auto
in_gemmk_gemmn_gemmkpack_coord
=
make_dynamic_tensor_coordinate_v2
(
in_gemmk_gemmn_gemmkpack_global_desc
,
make_multi_index
(
0
,
0
,
0
));
const
auto
in_gemmk_gemmn_gemmkpack_coord_step_0_1_0
=
make_dynamic_tensor_coordinate_step_v2
(
in_gemmk_gemmn_gemmkpack_global_desc
,
make_multi_index
(
0
,
1
,
0
));
print_array_v2
(
"do_tansforms 0 1 0: "
,
in_gemmk_gemmn_gemmkpack_coord_step_0_1_0
.
do_transforms_
);
for
(
index_t
iter
=
0
;
iter
<
10
;
++
iter
)
{
printf
(
"iter %d
\n
"
,
iter
);
print_array_v2
(
"idx: "
,
in_gemmk_gemmn_gemmkpack_coord
.
GetIndex
());
print_array_v2
(
"hidden idx: "
,
in_gemmk_gemmn_gemmkpack_coord
.
GetHiddenIndex
());
printf
(
"offset: %d
\n
"
,
in_gemmk_gemmn_gemmkpack_coord
.
GetOffset
());
printf
(
"
\n
"
);
move_dynamic_tensor_coordinate_v2
(
in_gemmk_gemmn_gemmkpack_global_desc
,
in_gemmk_gemmn_gemmkpack_coord
,
in_gemmk_gemmn_gemmkpack_coord_step_0_1_0
);
}
}
{
auto
in_gemmk_gemmn_gemmkpack_coord
=
make_dynamic_tensor_coordinate_v2
(
in_gemmk_gemmn_gemmkpack_global_desc
,
make_multi_index
(
0
,
0
,
0
));
const
auto
in_gemmk_gemmn_gemmkpack_coord_step_1_0_0
=
make_dynamic_tensor_coordinate_step_v2
(
in_gemmk_gemmn_gemmkpack_global_desc
,
make_multi_index
(
1
,
0
,
0
));
print_array_v2
(
"do_tansforms 1 0 0: "
,
in_gemmk_gemmn_gemmkpack_coord_step_1_0_0
.
do_transforms_
);
for
(
index_t
iter
=
0
;
iter
<
10
;
++
iter
)
{
printf
(
"iter %d
\n
"
,
iter
);
print_array_v2
(
"idx: "
,
in_gemmk_gemmn_gemmkpack_coord
.
GetIndex
());
print_array_v2
(
"hidden idx: "
,
in_gemmk_gemmn_gemmkpack_coord
.
GetHiddenIndex
());
printf
(
"offset: %d
\n
"
,
in_gemmk_gemmn_gemmkpack_coord
.
GetOffset
());
printf
(
"
\n
"
);
move_dynamic_tensor_coordinate_v2
(
in_gemmk_gemmn_gemmkpack_global_desc
,
in_gemmk_gemmn_gemmkpack_coord
,
in_gemmk_gemmn_gemmkpack_coord_step_1_0_0
);
}
}
}
}
...
@@ -123,19 +176,20 @@ void device_dummy_dynamic_transform_v2(InDesc,
...
@@ -123,19 +176,20 @@ void device_dummy_dynamic_transform_v2(InDesc,
in_left_pads,
in_left_pads,
in_right_pads);
in_right_pads);
#else
#else
launch_kernel
(
run_gridwise_operation
<
DummyDynamicTransform_v2_2
<
BlockSize
>
,
launch_kernel
(
index_t
*
const
,
run_gridwise_operation
<
DummyDynamicTransform_v2_fwd_v4r4
<
BlockSize
>
,
float
*
const
,
index_t
*
const
,
float
*
const
,
float
*
const
,
const
decltype
(
in_gemmk_gemmn_global_desc
)
>
,
float
*
const
,
dim3
(
GridSize
),
const
decltype
(
in_gemmk_gemmn_gemmkpack_global_desc
)
>
,
dim3
(
BlockSize
),
dim3
(
GridSize
),
0
,
dim3
(
BlockSize
),
0
,
0
,
static_cast
<
index_t
*>
(
wei_kcyx_device_buf
.
GetDeviceBuffer
()),
0
,
static_cast
<
float
*>
(
in_nchw_device_buf
.
GetDeviceBuffer
()),
static_cast
<
index_t
*>
(
wei_kcyx_device_buf
.
GetDeviceBuffer
()),
static_cast
<
float
*>
(
out_nkhw_device_buf
.
GetDeviceBuffer
()),
static_cast
<
float
*>
(
in_nchw_device_buf
.
GetDeviceBuffer
()),
in_gemmk_gemmn_global_desc
);
static_cast
<
float
*>
(
out_nkhw_device_buf
.
GetDeviceBuffer
()),
in_gemmk_gemmn_gemmkpack_global_desc
);
#endif
#endif
}
}
}
}
...
...
driver/src/conv_driver.cpp
View file @
a578ff93
...
@@ -549,7 +549,7 @@ int main(int argc, char* argv[])
...
@@ -549,7 +549,7 @@ int main(int argc, char* argv[])
#endif
#endif
}
}
#if
1
#if
0
device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
in_nchw,
in_nchw,
wei_kcyx_desc,
wei_kcyx_desc,
...
@@ -573,7 +573,7 @@ int main(int argc, char* argv[])
...
@@ -573,7 +573,7 @@ int main(int argc, char* argv[])
LeftPads
{},
LeftPads
{},
RightPads
{},
RightPads
{},
nrepeat
);
nrepeat
);
#elif
1
#elif
0
device_dummy_static_transform
(
in_nchw_desc
,
device_dummy_static_transform
(
in_nchw_desc
,
in_nchw
,
in_nchw
,
wei_kcyx_desc
,
wei_kcyx_desc
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment