gaoqiong / composable_kernel · commit caa91db0 (parent 269adde8)
Authored Mar 17, 2021 by root

Commit message: clean code

Showing 6 changed files, with 43 additions and 61 deletions (+43 -61):
  composable_kernel/include/tensor_operation/blockwise_gemm_v3.hpp                         +2  -1
  composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_v2.hpp                  +24 -47
  composable_kernel/include/tensor_operation/threadwise_gemm_v3.hpp                        +11 -7
  driver/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp  +3  -3
  driver/src/conv_driver.cpp                                                               +2  -2
  script/cmake-rocm3.7.sh                                                                  +1  -1
composable_kernel/include/tensor_operation/blockwise_gemm_v3.hpp (+2 -1)

@@ -160,7 +160,8 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
 #if 1
         a_thread_copy.Run(p_a_block + a_block_mtx.CalculateOffset(make_tuple(cyx_begin, 0)) +
                               mMyThreadOffsetA,
-                          p_a_thread);
+                          p_a_thread +
+                              b_thread_mtx.CalculateOffset(make_tuple(cyx_begin, 0, 0, 0)));
 #else
         for(index_t i = 0; i < a_thread_mtx.GetElementSpaceSize(); i++)
             p_a_thread[i] = 1;
 #endif
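The change above writes each A sub-tile to a per-iteration offset inside p_a_thread instead of always to its start. A hypothetical illustration of the offset arithmetic (mine, not the library's implementation): for a packed descriptor, CalculateOffset collapses a multi-index into one linear offset via Horner's rule, which is why "p_a_block + a_block_mtx.CalculateOffset(make_tuple(cyx_begin, 0))" addresses the sub-tile starting at row cyx_begin.

    #include <array>
    #include <cstddef>

    template <std::size_t N>
    constexpr std::size_t calculate_offset(const std::array<std::size_t, N>& lengths,
                                           const std::array<std::size_t, N>& idx)
    {
        std::size_t offset = 0;
        for(std::size_t d = 0; d < N; ++d)
            offset = offset * lengths[d] + idx[d]; // packed layout: innermost dimension has stride 1
        return offset;
    }

    // offset of element (2, 1) in a packed 8x4 tensor is 2 * 4 + 1 = 9
    static_assert(calculate_offset<2>({8, 4}, {2, 1}) == 9, "");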
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_v2.hpp (+24 -47)

@@ -102,6 +102,18 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
         const index_t k_block_work_id  = get_block_1d_id() / hw_block_work_num;
         const index_t hw_block_work_id = get_block_1d_id() - k_block_work_id * hw_block_work_num;
+#else
+        // Hack: this force result into SGPR
+        const index_t m_block_work_num  = __builtin_amdgcn_readfirstlane(K / KPerBlock);
+        const index_t h_block_work_num  = __builtin_amdgcn_readfirstlane(H / HPerBlock);
+        const index_t w_block_work_num  = __builtin_amdgcn_readfirstlane(W / WPerBlock);
+        const index_t hw_block_work_num = h_block_work_num * w_block_work_num;
+        const index_t k_block_work_id =
+            __builtin_amdgcn_readfirstlane(get_block_1d_id() / hw_block_work_num);
+        const index_t hw_block_work_id = get_block_1d_id() - k_block_work_id * hw_block_work_num;
+#endif

         const index_t h_block_work_id = hw_block_work_id / w_block_work_num;
         const index_t w_block_work_id = hw_block_work_id - h_block_work_id * w_block_work_num;
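A minimal HIP sketch (mine, not the library's code) of what the "force result into SGPR" hack does: the block id is uniform across a wavefront, but the compiler cannot always prove that, so the quotient may be kept per-lane in vector registers. __builtin_amdgcn_readfirstlane broadcasts lane 0's value, pinning the result to a scalar register.

    #include <hip/hip_runtime.h>

    __device__ int uniform_block_coord(int hw_block_work_num)
    {
        // same value in every lane, now known to the compiler to live in an SGPR
        return __builtin_amdgcn_readfirstlane(static_cast<int>(blockIdx.x) / hw_block_work_num);
    }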
@@ -110,23 +122,18 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
         static_assert(KPerBlock == KPerThread, "");

+        const auto k_thread_id = 0;
         const auto h_thread_id = get_thread_local_1d_id() / w_num_threads;
         const auto w_thread_id = get_thread_local_1d_id() % w_num_threads;
-#else
-        // Hack: this force result into SGPR
-        const index_t m_block_work_num  = __builtin_amdgcn_readfirstlane(K / KPerBlock);
-        const index_t hw_block_work_num = __builtin_amdgcn_readfirstlane(N / HWPerBlock);
-        const index_t k_block_work_id =
-            __builtin_amdgcn_readfirstlane(get_block_1d_id() / hw_block_work_num);
-        const index_t hw_block_work_id = get_block_1d_id() - k_block_work_id * hw_block_work_num;
-#endif

-        const index_t m_block_data_on_global = k_block_work_id * KPerBlock;
+        const index_t k_block_data_on_global = k_block_work_id * KPerBlock;
         const index_t h_block_data_on_global = h_block_work_id * HPerBlock;
         const index_t w_block_data_on_global = w_block_work_id * WPerBlock;

+        const index_t k_thread_data_on_global = k_block_data_on_global + k_thread_id * KPerThread;
+        const index_t h_thread_data_on_global = h_block_data_on_global + h_thread_id * HPerThread;
+        const index_t w_thread_data_on_global = w_block_data_on_global + w_thread_id * WPerThread;

         // lds max alignment
         constexpr auto max_lds_align =
             math::lcm(Number<ABlockTransferDstScalarPerVector_M>{}, Number<KPerThread>{});
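A worked example of the id arithmetic consolidated above (hypothetical numbers, plain host C++ for intuition): a 1-D thread id is split by div/mod, and each thread's global origin is the block origin plus thread_id * PerThread, since every thread owns an HPerThread x WPerThread patch.

    #include <cassert>

    int main()
    {
        const int w_num_threads = 8;                  // assumed WPerBlock / WPerThread
        const int HPerThread = 2, WPerThread = 2;

        const int tid         = 13;                   // stand-in for get_thread_local_1d_id()
        const int h_thread_id = tid / w_num_threads;  // 13 / 8 = 1
        const int w_thread_id = tid % w_num_threads;  // 13 % 8 = 5

        const int h_block_data_on_global = 32, w_block_data_on_global = 64;
        assert(h_block_data_on_global + h_thread_id * HPerThread == 34);
        assert(w_block_data_on_global + w_thread_id * WPerThread == 74);
        return 0;
    }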
@@ -167,20 +174,20 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
                                                    AThreadTransferSrcResetCoordinateAfterRun,
                                                    true>(
                 a_cyx_k_global_desc,
-                make_multi_index(0, m_block_data_on_global),
+                make_multi_index(0, k_block_data_on_global),
                 a_cyx_k_block_desc,
                 make_multi_index(0, 0));

         constexpr auto b_cyx_n_h_w_thread_desc =
             make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(
-                Number<CYXPerThread>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{}));
+                Number<CYXPerBlock>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{}));

         using ThreadwiseTensorSliceTransferB =
             ThreadwiseDynamicTensorSliceTransfer_v2<Float,
                                                     Float,
                                                     decltype(b_cyx_n_h_w_global_desc),
                                                     decltype(b_cyx_n_h_w_thread_desc),
-                                                    Sequence<CYXPerThread, 1, HPerThread, WPerThread>,
+                                                    Sequence<CYXPerBlock, 1, HPerThread, WPerThread>,
                                                     Sequence<3, 2, 0, 1>, // BBlockTransferSrcAccessOrder,
                                                     3,                    // BBlockTransferSrcVectorDim,
                                                     1,                    // BBlockTransferSrcScalarPerVector,
@@ -192,10 +199,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
         ThreadwiseTensorSliceTransferB b_threadwise_transfer(
             b_cyx_n_h_w_global_desc,
             make_multi_index(0,
-                             0,
-                             h_block_data_on_global + h_thread_id * HPerThread,
-                             w_block_data_on_global + w_thread_id * WPerThread));
+                             0, h_thread_data_on_global, w_thread_data_on_global));

         // c_thread_mtx definition: this is a mess
         // TODO:: more elegent way of defining c_thread_mtx
@@ -229,13 +233,8 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
         // register allocation for output
         AccFloat p_c_thread[c_k_n_h_w_thread_desc.GetElementSpaceSize()];

-        for(index_t i = 0; i < c_k_n_h_w_thread_desc.GetElementSpaceSize(); i++)
-        {
-            p_c_thread[i] = 0;
-        }
-
         // zero out threadwise output
-        // threadwise_matrix_set_zero_v2(c_k_n_h_w_thread_desc, p_c_thread);
+        threadwise_matrix_set_zero_v3(c_k_n_h_w_thread_desc, p_c_thread);

         constexpr auto a_block_slice_copy_step  = make_multi_index(CYXPerBlock, 0);
         constexpr auto b_thread_slice_copy_step = make_multi_index(CYXPerBlock, 0, 0, 0);
@@ -272,10 +271,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
 #if 0
                 __syncthreads();

-                //index_t sum = 0;
-                //for(index_t i = 0; i < b_cyx_n_h_w_thread_desc.GetElementSpaceSize(); i++)
-                //sum += p_b_thread_double[i];
-
                 p_c_thread[0] += p_b_thread_double[0] + p_b_thread_double[1] + p_b_thread_double[2];
                 p_c_thread[0] += p_b_thread_double[3] + p_b_thread_double[4] + p_b_thread_double[5];
                 p_c_thread[0] += p_b_thread_double[6] + p_b_thread_double[7] + p_b_thread_double[8];
@@ -407,24 +402,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
             make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(
                 Number<KPerThread>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{}));

-        // calculate origin of thread input tensor on global memory
-        // blockwise GEMM c matrix starting index
-#if 0
-        const auto c_thread_mtx_on_block =
-            blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());
-
-        const index_t m_thread_data_on_global =
-            m_block_data_on_global + c_thread_mtx_on_block.row;
-
-        const index_t n_thread_data_on_global =
-            n_block_data_on_global + c_thread_mtx_on_block.col;
-#endif
-        const index_t m_thread_data_on_global = m_block_data_on_global;
-
-        const index_t h_thread_data_on_global =
-            h_block_data_on_global + h_thread_id * HPerThread;
-
-        const index_t w_thread_data_on_global =
-            w_block_data_on_global + w_thread_id * WPerThread;
-
         // hack to control index calculation when iterating over c_k_n_h_w_global tensor
         constexpr auto c_k_n_h_w_global_tensor_iterator_hacks = CGlobalIteratorHacks{};
@@ -444,7 +421,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
                 true>(
                 c_k_n_h_w_global_desc,
                 make_multi_index(
-                    m_thread_data_on_global, 0, h_thread_data_on_global, w_thread_data_on_global))
+                    k_thread_data_on_global, 0, h_thread_data_on_global, w_thread_data_on_global))
                 .Run(c_k_n_h_w_thread_desc,
                      make_tuple(I0, I0, I0, I0),
                      p_c_thread,
composable_kernel/include/tensor_operation/threadwise_gemm_v3.hpp (+11 -7)

@@ -12,18 +12,22 @@ __device__ void threadwise_matrix_set_zero_v3(Desc, Float* __restrict__ p_thread
     static_assert(Desc::IsKnownAtCompileTime(), "wrong! Desc should be known at compile-time");

     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};

     constexpr auto desc = Desc{};

-    constexpr auto M = desc.GetLength(I0);
+    constexpr auto K = desc.GetLength(I0);
     constexpr auto N = desc.GetLength(I1);
+    constexpr auto H = desc.GetLength(I2);
+    constexpr auto W = desc.GetLength(I3);

-    static_for<0, M, 1>{}([&](auto i) {
-        static_for<0, N, 1>{}([&](auto j) {
-            constexpr auto offset = desc.CalculateOffset(make_tuple(i, j));
-            p_thread[offset] = Float(0);
+    static_for<0, K, 1>{}([&](auto i) {
+        static_for<0, H, 1>{}([&](auto j) {
+            static_for<0, W, 1>{}([&](auto k) {
+                constexpr auto offset = desc.CalculateOffset(make_tuple(i, 0, j, k));
+                p_thread[offset] = Float(0);
+            });
         });
     });
 }
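Assumed semantics of static_for (a sketch, not the library's code): it invokes the functor once per index with a compile-time integral constant, so the loop nest above is fully unrolled and every CalculateOffset folds to a constant, unlike the runtime for-loop this commit removes from gridwise_dynamic_gemm_v2.hpp.

    #include <type_traits>
    #include <utility>

    template <int First, int Last, int Step, typename F>
    constexpr void static_for_sketch(F&& f)
    {
        if constexpr(First < Last)
        {
            f(std::integral_constant<int, First>{});
            static_for_sketch<First + Step, Last, Step>(std::forward<F>(f));
        }
    }

    float tile[2 * 3];

    void zero_tile() // zero a 2x3 compile-time tile
    {
        static_for_sketch<0, 2, 1>([&](auto i) {
            static_for_sketch<0, 3, 1>([&](auto j) {
                tile[i * 3 + j] = 0.0f; // the index is a compile-time constant
            });
        });
    }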
driver/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp (+3 -3)

@@ -71,12 +71,12 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(InDesc
     constexpr index_t BlockSize = 64;

     constexpr index_t KPerBlock   = 16;
-    constexpr index_t HPerBlock   = 8;
+    constexpr index_t HPerBlock   = 16;
     constexpr index_t WPerBlock   = 16;
     constexpr index_t CYXPerBlock = 4;

-    constexpr index_t KPerThread   = 16;
-    constexpr index_t HPerThread   = 1;
+    constexpr index_t KPerThread   = KPerBlock;
+    constexpr index_t HPerThread   = 2;
     constexpr index_t WPerThread   = 2;
     constexpr index_t CYXPerThread = 4;
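A sanity check on the retuned tile shape (my reading, based on the h_thread_id / w_thread_id arithmetic in gridwise_dynamic_gemm_v2.hpp): one thread covers an HPerThread x WPerThread patch, so both the old and the new shapes keep the thread count equal to BlockSize = 64.

    constexpr int BlockSize = 64;
    constexpr int WPerBlock = 16, WPerThread = 2;

    static_assert((8 / 1) * (WPerBlock / WPerThread) == BlockSize,  // old: HPerBlock 8,  HPerThread 1
                  "old shape: 8 x 8 threads");
    static_assert((16 / 2) * (WPerBlock / WPerThread) == BlockSize, // new: HPerBlock 16, HPerThread 2
                  "new shape: 8 x 8 threads");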
driver/src/conv_driver.cpp (+2 -2)

@@ -82,8 +82,8 @@ int main(int argc, char* argv[])
 #elif 1
     constexpr index_t N  = 1;
     constexpr index_t C  = 4;
-    constexpr index_t HI = 1080;
-    constexpr index_t WI = 1920;
+    constexpr index_t HI = 1024;
+    constexpr index_t WI = 2048;
     constexpr index_t K  = 16;
     constexpr index_t Y  = 3;
     constexpr index_t X  = 3;
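A likely motivation for the new problem size (an inference, not stated in the commit): the grid decomposition computes h_block_work_num = H / HPerBlock with no remainder handling, so the spatial size must tile evenly. With HPerBlock raised to 16, 1080 no longer qualifies while 1024 and 2048 do (the exact H and W also depend on padding, which this page does not show).

    static_assert(1024 % 16 == 0 && 2048 % 16 == 0, "new HI and WI tile evenly by 16");
    static_assert(1080 % 16 != 0, "old HI = 1080 does not tile evenly by 16");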
script/cmake-rocm3.7.sh (+1 -1)

@@ -10,7 +10,7 @@ cmake
 -D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \
 -D CMAKE_BUILD_TYPE=Release \
 -D DEVICE_BACKEND="AMD" \
--D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -gline-tables-only -save-temps=$CWD -ftemplate-backtrace-limit=0" \
+-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -save-temps=$CWD -ftemplate-backtrace-limit=0" \
 -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
 -D CMAKE_PREFIX_PATH="/opt/rocm" \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
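Note: -gline-tables-only is the clang option that emits line-table-only debug info; dropping it presumably just trims debug metadata from the -save-temps output, since the flags are otherwise unchanged.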