Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
e4b77dcf
"dist/vscode:/vscode.git/clone" did not exist on "7a79845151c044ff3982bc45e2251b7aaf7343d5"
Commit
e4b77dcf
authored
Jun 04, 2021
by
Jing Zhang
Browse files
testing
parent
58ee3f13
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
55 additions
and
28 deletions
+55
-28
composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp
...tion_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp
+1
-1
composable_kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp
...kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp
+2
-2
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_xdlops.hpp
...include/tensor_operation/gridwise_dynamic_gemm_xdlops.hpp
+10
-12
composable_kernel/include/utility/math.hpp
composable_kernel/include/utility/math.hpp
+2
-2
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp
...tion_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp
+38
-9
driver/src/conv_driver.cpp
driver/src/conv_driver.cpp
+1
-1
script/cmake-rocm.sh
script/cmake-rocm.sh
+1
-1
No files found.
composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp
View file @
e4b77dcf
...
...
@@ -158,7 +158,7 @@ transform_forward_convolution_into_gemm_v4r4_xdlops_nchw_kcyx_nkhw_pad(
constexpr
auto
wei_gemmk0_gemmm_gemmk1_global_move_slice_window_iterator_hacks
=
Sequence
<
0
,
0
,
0
,
0
,
0
>
{};
#if
0
#if
1
// hack to control index calculation when iterating over in_gemmk0_gemmn_gemmk1_global tensor
constexpr
auto
in_gemmk0_gemmn_gemmk1_global_iterator_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
0
,
0
>
{},
...
...
composable_kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp
View file @
e4b77dcf
...
...
@@ -211,7 +211,7 @@ struct BlockwiseGemmXdlops_km_kn_m0m1m2n_v1
Sequence
<
1
,
MRepeat
,
1
,
KPack
>
,
Sequence
<
0
,
1
,
2
,
3
>
,
3
,
1
,
//
KPack,
KPack
,
1
>
;
using
BThreadCopy
=
ThreadwiseDynamicTensorSliceTransfer_v4
<
FloatAB
,
...
...
@@ -221,7 +221,7 @@ struct BlockwiseGemmXdlops_km_kn_m0m1m2n_v1
Sequence
<
1
,
NRepeat
,
1
,
KPack
>
,
Sequence
<
0
,
1
,
2
,
3
>
,
3
,
1
,
//
KPack,
KPack
,
1
>
;
AThreadCopy
a_thread_copy_
;
...
...
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_xdlops.hpp
View file @
e4b77dcf
...
...
@@ -141,7 +141,7 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v1
{
__host__
__device__
static
constexpr
index_t
GetSharedMemoryNumberOfByte
()
{
constexpr
auto
max_lds_align
=
KPack
;
constexpr
auto
max_lds_align
=
Number
<
KPack
>
{}
;
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
...
...
@@ -204,7 +204,7 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v1
__builtin_amdgcn_readfirstlane
(
block_work_idx
[
I1
]
*
NPerBlock
);
// lds max alignment
constexpr
auto
max_lds_align
=
KPack
;
constexpr
auto
max_lds_align
=
Number
<
KPack
>
{}
;
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
...
...
@@ -229,11 +229,11 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v1
decltype
(
a_k0_m_k1_global_desc
),
decltype
(
a_k0_m_k1_block_desc
),
ABlockTransferSrcAccessOrder
,
Sequence
<
2
,
0
,
1
>
,
2
,
//
ABlockTransferSrcVectorDim,
Sequence
<
1
,
0
,
2
>
,
ABlockTransferSrcVectorDim
,
2
,
1
,
//
ABlockTransferSrcScalarPerVector,
1
,
//
ABlockTransferDstScalarPerVector_KPack,
ABlockTransferSrcScalarPerVector
,
ABlockTransferDstScalarPerVector_KPack
,
1
,
1
,
AThreadTransferSrcResetCoordinateAfterRun
,
...
...
@@ -256,11 +256,11 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v1
decltype
(
b_k0_n_k1_global_desc
),
decltype
(
b_k0_n_k1_block_desc
),
BBlockTransferSrcAccessOrder
,
Sequence
<
2
,
0
,
1
>
,
1
,
//
BBlockTransferSrcVectorDim,
Sequence
<
1
,
0
,
2
>
,
BBlockTransferSrcVectorDim
,
2
,
1
,
//
BBlockTransferSrcScalarPerVector,
1
,
//
BBlockTransferDstScalarPerVector_KPack,
BBlockTransferSrcScalarPerVector
,
BBlockTransferDstScalarPerVector_KPack
,
1
,
1
,
BThreadTransferSrcResetCoordinateAfterRun
,
...
...
@@ -282,8 +282,6 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v1
NPerBlock
%
(
NPerWave
*
NRepeat
)
==
0
,
"wrong!"
);
static_assert
(
KPack
==
1
,
""
);
constexpr
auto
a_k0_m0_m1_k1_block_desc
=
transform_dynamic_tensor_descriptor
(
a_k0_m_k1_block_desc
,
make_tuple
(
make_pass_through_transform
(
Number
<
KPerBlock
>
{}),
...
...
composable_kernel/include/utility/math.hpp
View file @
e4b77dcf
...
...
@@ -61,7 +61,7 @@ struct integer_divide_ceiler
{
static_assert
(
is_same
<
T
,
index_t
>
{}
||
is_same
<
T
,
int
>
{},
"wrong type"
);
return
(
a
+
b
-
1
)
/
b
;
return
(
a
+
b
-
Number
<
1
>
{}
)
/
b
;
}
};
...
...
@@ -74,7 +74,7 @@ __host__ __device__ constexpr auto integer_divide_floor(X x, Y y)
template
<
class
X
,
class
Y
>
__host__
__device__
constexpr
auto
integer_divide_ceil
(
X
x
,
Y
y
)
{
return
(
x
+
y
-
1
)
/
y
;
return
(
x
+
y
-
Number
<
1
>
{}
)
/
y
;
}
template
<
class
X
,
class
Y
>
...
...
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp
View file @
e4b77dcf
...
...
@@ -79,10 +79,38 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw
const
auto
in_right_pads
=
sequence_to_tuple_of_number
(
InRightPads
{});
#endif
#if 1
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
GemmMPerBlock
=
128
;
constexpr
index_t
GemmNPerBlock
=
128
;
constexpr
index_t
GemmKPerBlock
=
4
;
constexpr
index_t
GemmMPerWave
=
64
;
constexpr
index_t
GemmNPerWave
=
64
;
constexpr
index_t
GemmKPack
=
4
;
constexpr
index_t
MRepeat
=
1
;
constexpr
index_t
NRepeat
=
1
;
using
GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1
=
Sequence
<
1
,
2
,
4
>
;
using
GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
GemmABlockTransferSrcScalarPerVector_GemmK
=
4
;
constexpr
index_t
GemmABlockTransferDstScalarPerVector_KPack
=
4
;
using
GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1
=
Sequence
<
1
,
2
,
4
>
;
using
GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
GemmBBlockTransferSrcScalarPerVector_GemmN
=
1
;
constexpr
index_t
GemmBBlockTransferDstScalarPerVector_KPack
=
4
;
constexpr
index_t
GemmCThreadTransferDstScalarPerVector_GemmN1
=
1
;
#else
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
GemmMPerBlock
=
128
;
constexpr
index_t
GemmNPerBlock
=
256
;
constexpr
index_t
GemmKPerBlock
=
16
;
constexpr
index_t
GemmMPerWave
=
64
;
...
...
@@ -90,21 +118,22 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw
constexpr
index_t
GemmKPack
=
1
;
constexpr
index_t
MRepeat
=
1
;
constexpr
index_t
NRepeat
=
1
;
constexpr
index_t
NRepeat
=
2
;
using
GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1
=
Sequence
<
4
,
2
,
GemmKPack
>
;
using
GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1
=
Sequence
<
4
,
2
,
1
>
;
using
GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
GemmABlockTransferSrcScalarPerVector_GemmK
=
1
;
constexpr
index_t
GemmABlockTransferDstScalarPerVector_KPack
=
1
;
using
GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1
=
Sequence
<
2
,
4
,
GemmKPack
>
;
using
GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1
=
Sequence
<
8
,
32
,
1
>
;
using
GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1
=
Sequence
<
4
,
4
,
1
>
;
using
GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
GemmBBlockTransferSrcScalarPerVector_GemmN
=
1
;
constexpr
index_t
GemmBBlockTransferDstScalarPerVector_KPack
=
1
;
constexpr
index_t
GemmCThreadTransferDstScalarPerVector_GemmN1
=
1
;
#endif
const
auto
descs
=
transform_forward_convolution_into_gemm_v4r4_xdlops_nchw_kcyx_nkhw_pad
<
TInWei
,
...
...
@@ -152,7 +181,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw
GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1
,
GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1
,
Sequence
<
0
,
2
,
1
>
,
Sequence
<
0
,
2
,
1
>
,
Sequence
<
1
,
0
,
2
>
,
1
,
GemmBBlockTransferSrcScalarPerVector_GemmN
,
GemmBBlockTransferDstScalarPerVector_KPack
,
...
...
driver/src/conv_driver.cpp
View file @
e4b77dcf
...
...
@@ -24,7 +24,7 @@ int main(int argc, char* argv[])
{
using
namespace
ck
;
#if
1
#if
0
constexpr index_t N = 256;
constexpr index_t C = 256;
constexpr index_t HI = 16;
...
...
script/cmake-rocm.sh
View file @
e4b77dcf
...
...
@@ -10,7 +10,7 @@ cmake
-D
CMAKE_INSTALL_PREFIX
=
${
MY_PROJECT_INSTALL
}
\
-D
CMAKE_BUILD_TYPE
=
Release
\
-D
DEVICE_BACKEND
=
"AMD"
\
-D
CMAKE_CXX_FLAGS
=
"-O3 --amdgpu-target=gfx908 -gline-tables-only -save-temps=
$CWD
-ftemplate-backtrace-limit=0"
\
-D
CMAKE_CXX_FLAGS
=
"-O3 --amdgpu-target=gfx908 -gline-tables-only -save-temps=
$CWD
-ftemplate-backtrace-limit=0
-mllvm --amdgpu-spill-vgpr-to-agpr=0
"
\
-D
CMAKE_CXX_COMPILER
=
/opt/rocm/bin/hipcc
\
-D
CMAKE_PREFIX_PATH
=
"/opt/rocm"
\
-D
CMAKE_VERBOSE_MAKEFILE:BOOL
=
ON
\
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment