gaoqiong / composable_kernel
"vscode:/vscode.git/clone" did not exist on "8b57e715a763b6234053d5b33ddd98db82bc7ae0"
Commit 415a4a5b authored Apr 23, 2021 by Chao Liu

updating v5r1

parent 32d485dd
Showing 9 changed files with 836 additions and 22 deletions (+836 -22)

composable_kernel/include/tensor_operation/blockwise_gemm_v2.hpp                          +13  -4
composable_kernel/include/tensor_operation/blockwise_gemm_v3.hpp                          +176 -0
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_v2.hpp                   +479 -0
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_set.hpp        +3   -0
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp   +6   -3
composable_kernel/include/tensor_operation/threadwise_gemm_v2.hpp                         +2   -3
composable_kernel/include/tensor_operation/threadwise_gemm_v3.hpp                         +152 -7
composable_kernel/include/utility/config.amd.hpp.in                                       +2   -2
driver/src/conv_driver.cpp                                                                +3   -3
composable_kernel/include/tensor_operation/blockwise_gemm_v2.hpp
...
...
@@ -7,6 +7,7 @@
namespace ck {

#if 0
// blockwise GEMM: C[M, N] += transpose(A[K, M]) * B[K, N]
// A and B are visible to the whole block, C is distributed among each thread
// If the following numbers are powers of 2, index calculation shall be greatly reduced:
...
@@ -364,12 +365,20 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v1
#endif
    }
};
#endif

// blockwise GEMM:
// C[M, N] += transpose(A[K, M]) * B[K, N]
// A and B are visible to the whole block, C is distributed among each thread
// If the following numbers are powers of 2, index calculation shall be greatly reduced:
//   MPerThreadSubC, NPerThreadSubC, MLevel0ThreadCluster, NLevel0ThreadCluster,
//   MLevel1ThreadCluster, NLevel1ThreadCluster
// Assume:
//   1. A:
//      1. BlockMatrixA is known at compile-time
//      2. ABlockBuffer is DynamicBuffer
//   2. B:
//      1. BlockMatrixB is known at compile-time
//      2. BBlockBuffer is DynamicBuffer
//   3. C:
//      1. ThreadMatrixC is known at compile-time
//      2. CThreadBuffer is StaticBuffer
template <index_t BlockSize,
          typename FloatA,
          typename FloatB,
...
...
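For reference, the operation these comments describe, C[M, N] += transpose(A[K, M]) * B[K, N], can be cross-checked against a minimal host-side sketch. The packed row-major layouts below are an assumption made only for illustration; they are not the kernel's actual descriptors or buffer types.

#include <vector>
#include <cstddef>

// Minimal host reference for C[M, N] += transpose(A[K, M]) * B[K, N].
// A is stored K-major as A[k][m], B as B[k][n], C as C[m][n] (assumed row-major).
void reference_gemm_km_kn_mn(const std::vector<float>& a, // K x M
                             const std::vector<float>& b, // K x N
                             std::vector<float>& c,       // M x N, accumulated into
                             std::size_t K, std::size_t M, std::size_t N)
{
    for(std::size_t k = 0; k < K; ++k)
        for(std::size_t m = 0; m < M; ++m)
            for(std::size_t n = 0; n < N; ++n)
                c[m * N + n] += a[k * M + m] * b[k * N + n];
}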
composable_kernel/include/tensor_operation/blockwise_gemm_v3.hpp
...
...
@@ -6,6 +6,7 @@
namespace ck {

#if 0
// blockwise GEMM: C[M, N] += transpose(A[K, M]) * B[K, N]
// A and B are visible to the whole block, C is distributed among each thread
// If the following numbers are powers of 2, index calculation shall be greatly reduced:
...
@@ -199,6 +200,181 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
        Run_naive(p_a_block, p_b_thread, p_c_thread);
    }
};
#else
// blockwise GEMM: C[M, N] += transpose(A[K, M]) * B[K, N]
// A and B are visible to the whole block, C is distributed among each thread
// If the following numbers are powers of 2, index calculation shall be greatly reduced:
//   KPerThread, HPerThread, MLevel0ThreadCluster, NLevel0ThreadCluster,
//   MLevel1ThreadCluster, NLevel1ThreadCluster
template <index_t BlockSize,
          typename FloatA,
          typename FloatB,
          typename FloatC,
          typename BlockMatrixA,
          typename BlockMatrixB,
          typename ThreadMatrixC,
          index_t KPerThread,
          index_t HPerThread,
          index_t WPerThread,
          index_t EPerThreadLoop,
          index_t ThreadGemmADataPerRead_K,
          index_t ThreadGemmBDataPerRead_W>
struct BlockwiseGemm_km_kn_m0m1n0n1_v3
{
    struct MatrixIndex
    {
        index_t k;
        index_t h;
        index_t w;
    };

    MatrixIndex c_thread_begin_mtx_idx_;

    // HACK: fix this @Jing Zhang
    static constexpr index_t KPerThreadSubC = 4;

    static constexpr auto a_thread_mtx_ = make_dynamic_naive_tensor_descriptor_packed_v2(
        make_tuple(Number<EPerThreadLoop>{}, Number<KPerThreadSubC>{}));

    static constexpr auto b_thread_mtx_ = make_dynamic_naive_tensor_descriptor_packed_v2(
        make_tuple(Number<EPerThreadLoop>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{}));

    static constexpr auto c_thread_mtx_ = make_dynamic_naive_tensor_descriptor_packed_v2(
        make_tuple(Number<KPerThreadSubC>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{}));

    using AThreadCopy = ThreadwiseDynamicTensorSliceTransfer_v4<FloatA,
                                                                FloatA,
                                                                BlockMatrixA,
                                                                decltype(a_thread_mtx_),
                                                                Sequence<EPerThreadLoop, KPerThreadSubC>,
                                                                Sequence<0, 1>,
                                                                1,
                                                                ThreadGemmADataPerRead_K,
                                                                AddressSpace::Generic,
                                                                AddressSpace::Vgpr,
                                                                1>;

    AThreadCopy a_thread_copy_;

    __device__ BlockwiseGemm_km_kn_m0m1n0n1_v3()
        : c_thread_begin_mtx_idx_{GetBeginOfThreadMatrixC(get_thread_local_1d_id())},
          a_thread_copy_{make_tuple(0, c_thread_begin_mtx_idx_.k * KPerThread)}
    {
        static_assert(BlockMatrixA::IsKnownAtCompileTime() && BlockMatrixB::IsKnownAtCompileTime() &&
                          ThreadMatrixC::IsKnownAtCompileTime(),
                      "wrong! Desc should be known at compile-time");

        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};

        static_assert(BlockMatrixA{}.GetLength(I0) == BlockMatrixB{}.GetLength(I0),
                      "wrong! K dimension not consistent\n");

        constexpr index_t K = BlockMatrixA{}.GetLength(I1); // A is transposed
        constexpr index_t N = BlockMatrixB{}.GetLength(I1);
        constexpr index_t H = BlockMatrixB{}.GetLength(I2);
        constexpr index_t W = BlockMatrixB{}.GetLength(I3);

        static_assert(K % KPerThread == 0 && H % HPerThread == 0 && W % WPerThread == 0,
                      "wrong! Cannot evenly divide work among\n");

        constexpr auto KThreadCluster = K / KPerThread;
        constexpr auto HThreadCluster = H / HPerThread;
        constexpr auto WThreadCluster = W / WPerThread;

        static_assert(BlockSize == KThreadCluster * HThreadCluster * WThreadCluster,
                      "wrong! wrong blocksize\n");
    }

    __device__ static constexpr auto GetThreadMatrixCLengths()
    {
        return Sequence<KPerThread, 1, HPerThread, WPerThread>{};
    }

    __device__ static MatrixIndex GetBeginOfThreadMatrixC(index_t thread_id)
    {
        constexpr index_t H = BlockMatrixB{}.GetLength(Number<2>{});
        constexpr index_t W = BlockMatrixB{}.GetLength(Number<3>{});

        constexpr auto num_w_threads  = W / WPerThread;
        constexpr auto num_h_threads  = H / HPerThread;
        constexpr auto num_hw_threads = num_w_threads * num_h_threads;

        index_t k_thread_id  = thread_id / num_hw_threads;
        index_t hw_thread_id = thread_id % num_hw_threads;

        index_t h_thread_id = hw_thread_id / num_w_threads;
        index_t w_thread_id = hw_thread_id % num_w_threads;

        return MatrixIndex{k_thread_id, h_thread_id, w_thread_id};
    }

    __device__ void Run(const FloatA* p_a_block, const FloatB* p_b_thread, FloatC* p_c_thread) const
    {
        auto a_block_buf = make_dynamic_buffer(p_a_block);

        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};

        constexpr auto a_block_mtx = BlockMatrixA{};

        constexpr auto EPerBlock = a_block_mtx.GetLength(I0);

        // HACK: fix this @Jing Zhang
        constexpr auto HoPerThreadSubC = 2;
        constexpr auto WoPerThreadSubC = 2;

        static_assert(KPerThread % KPerThreadSubC == 0, "");
        static_assert(HPerThread % HoPerThreadSubC == 0, "");
        static_assert(WPerThread % WoPerThreadSubC == 0, "");

        // thread A, B for GEMM
        FloatA p_a_thread[a_thread_mtx_.GetElementSpaceSize()];

        auto a_thread_buf = make_dynamic_buffer(p_a_thread);
        auto b_thread_buf = make_dynamic_buffer(p_b_thread);
        auto c_thread_buf = make_dynamic_buffer(p_c_thread);

        constexpr auto threadwise_gemm = ThreadwiseGemm_km_kn_mn_v3<FloatA,
                                                                    FloatB,
                                                                    FloatC,
                                                                    decltype(a_thread_mtx_),
                                                                    decltype(b_thread_mtx_),
                                                                    decltype(c_thread_mtx_),
                                                                    HoPerThreadSubC,
                                                                    WoPerThreadSubC>{};

        static_for<0, EPerBlock, EPerThreadLoop>{}([&](auto e_begin) {
            static_for<0, KPerThread, KPerThreadSubC>{}([&](auto k_begin) {
                a_thread_copy_.Run(a_block_mtx,
                                   make_tuple(e_begin, k_begin),
                                   a_block_buf,
                                   a_thread_mtx_,
                                   make_tuple(I0, I0),
                                   a_thread_buf);

                static_for<0, HPerThread, HoPerThreadSubC>{}([&](auto h_begin) {
                    static_for<0, WPerThread, WoPerThreadSubC>{}([&](auto w_begin) {
                        threadwise_gemm.Run(a_thread_buf,
                                            make_tuple(I0, I0),
                                            b_thread_buf,
                                            make_tuple(e_begin, I0, h_begin, w_begin),
                                            c_thread_buf,
                                            make_tuple(k_begin, I0, h_begin, w_begin));
                    });
                });
            });
        });
    }
};
#endif

} // namespace ck
#endif
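The thread-to-tile mapping in GetBeginOfThreadMatrixC is a plain mixed-radix decomposition of the flat thread id into (k, h, w) cluster coordinates. A host-side sketch of the same arithmetic follows; the concrete sizes are made-up values chosen only so that BlockSize == KThreadCluster * HThreadCluster * WThreadCluster holds.

#include <cstdio>

// Mirror of the (k, h, w) decomposition used by GetBeginOfThreadMatrixC,
// with illustrative sizes: H = 8, W = 8, HPerThread = 2, WPerThread = 2.
int main()
{
    const int H = 8, W = 8, HPerThread = 2, WPerThread = 2;

    const int num_w_threads  = W / WPerThread;                 // 4
    const int num_h_threads  = H / HPerThread;                 // 4
    const int num_hw_threads = num_w_threads * num_h_threads;  // 16

    for(int thread_id = 0; thread_id < 32; ++thread_id)
    {
        const int k_thread_id  = thread_id / num_hw_threads;
        const int hw_thread_id = thread_id % num_hw_threads;
        const int h_thread_id  = hw_thread_id / num_w_threads;
        const int w_thread_id  = hw_thread_id % num_w_threads;

        std::printf("thread %2d -> (k=%d, h=%d, w=%d)\n",
                    thread_id, k_thread_id, h_thread_id, w_thread_id);
    }
    return 0;
}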
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_v2.hpp
...
...
@@ -11,6 +11,7 @@
namespace ck {

#if 0
template <index_t BlockSize,
          typename FloatAB,
          typename FloatAcc,
...
@@ -462,6 +463,484 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
                integral_constant<bool, HasDoubleTailKBlockLoop>{});
    }
};
#else
template <index_t BlockSize,
          typename FloatAB,
          typename FloatAcc,
          typename FloatC,
          InMemoryDataOperation CGlobalMemoryDataOperation,
          typename AGlobalDesc,
          typename BGlobalDesc,
          typename CGlobalDesc,
          index_t KPerBlock,
          index_t HoPerBlock,
          index_t WoPerBlock,
          index_t EPerBlock,
          index_t KPerThread,
          index_t HoPerThread,
          index_t WoPerThread,
          index_t EPerThread,
          typename ABlockTransferThreadSliceLengths_E_K,
          typename ABlockTransferThreadClusterLengths_E_K,
          typename ABlockTransferThreadClusterArrangeOrder,
          typename ABlockTransferSrcAccessOrder,
          index_t ABlockTransferSrcVectorDim,
          index_t ABlockTransferSrcScalarPerVector,
          index_t ABlockTransferDstScalarPerVector_K,
          bool AThreadTransferSrcResetCoordinateAfterRun,
          typename BBlockTransferSrcAccessOrder,
          index_t BBlockTransferSrcVectorDim,
          index_t BBlockTransferSrcScalarPerVector,
          bool BThreadTransferSrcResetCoordinateAfterRun,
          typename CThreadTransferSrcDstAccessOrder,
          index_t CThreadTransferSrcDstVectorDim,
          index_t CThreadTransferDstScalarPerVector,
          typename AGlobalIteratorHacks,
          typename BGlobalIteratorHacks,
          typename CGlobalIteratorHacks,
          typename AGlobalMoveSliceWindowIteratorHacks,
          typename BGlobalMoveSliceWindowIteratorHacks>
struct GridwiseDynamicGemm_km_kn_mn_v3
{
    __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
    {
        constexpr auto E = EPerBlock * 3 * 3;

        constexpr auto max_lds_align =
            math::lcm(Number<ABlockTransferDstScalarPerVector_K>{}, Number<KPerBlock>{});

        // A matrix in LDS memory, dst of blockwise copy
        // be careful of LDS alignment
        constexpr auto a_e_k_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
            make_tuple(Number<E>{}, Number<KPerBlock>{}), max_lds_align);

        // LDS allocation for A and B: be careful of alignment
        constexpr auto a_block_space_size =
            math::integer_least_multiple(a_e_k_desc.GetElementSpaceSize(), max_lds_align);

        return a_block_space_size * sizeof(FloatAB);
    }

    template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
    __device__ void Run(const AGlobalDesc& a_e_k_global_desc,
                        const FloatAB* __restrict__ p_a_global,
                        const BGlobalDesc& b_e_n_ho_wo_global_desc,
                        const FloatAB* __restrict__ p_b_global,
                        const CGlobalDesc& c_k_n_ho_wo_global_desc,
                        FloatC* __restrict__ p_c_global,
                        FloatAB* __restrict__ p_shared_block,
                        integral_constant<bool, HasMainKBlockLoop>,
                        integral_constant<bool, HasDoubleTailKBlockLoop>) const
    {
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};

        constexpr auto E = EPerBlock * 3 * 3;

        // const auto E = a_e_k_global_desc.GetLength(I0);
        const auto K  = a_e_k_global_desc.GetLength(I1);
        const auto N  = b_e_n_ho_wo_global_desc.GetLength(I1);
        const auto Ho = b_e_n_ho_wo_global_desc.GetLength(I2);
        const auto Wo = b_e_n_ho_wo_global_desc.GetLength(I3);

        // divide block work by [M, N]
#if 0
        const auto k_block_work_num   = K / Number<KPerBlock>{};
        const auto ho_block_work_num  = Ho / Number<HoPerBlock>{};
        const auto wo_block_work_num  = Wo / Number<WoPerBlock>{};
        const auto hwo_block_work_num = ho_block_work_num * wo_block_work_num;

        const index_t k_block_work_id   = get_block_1d_id() / hwo_block_work_num;
        const index_t hwo_block_work_id = get_block_1d_id() - k_block_work_id * hwo_block_work_num;

        const index_t ho_block_work_id = hwo_block_work_id / wo_block_work_num;
        const index_t wo_block_work_id = hwo_block_work_id - ho_block_work_id * wo_block_work_num;
#else
        // Hack: this forces the result into SGPR
        const index_t k_block_work_num   = __builtin_amdgcn_readfirstlane(K / KPerBlock);
        const index_t ho_block_work_num  = __builtin_amdgcn_readfirstlane(Ho / HoPerBlock);
        const index_t wo_block_work_num  = __builtin_amdgcn_readfirstlane(Wo / WoPerBlock);
        const index_t hwo_block_work_num = ho_block_work_num * wo_block_work_num;

        const index_t k_block_work_id =
            __builtin_amdgcn_readfirstlane(get_block_1d_id() / hwo_block_work_num);
        const index_t hwo_block_work_id = get_block_1d_id() - k_block_work_id * hwo_block_work_num;

        const index_t ho_block_work_id =
            __builtin_amdgcn_readfirstlane(hwo_block_work_id / wo_block_work_num);
        const index_t wo_block_work_id = hwo_block_work_id - ho_block_work_id * wo_block_work_num;
#endif

        // lds max alignment
        constexpr auto max_lds_align =
            math::lcm(Number<ABlockTransferDstScalarPerVector_K>{}, Number<KPerBlock>{});

        // A matrix in LDS memory, dst of blockwise copy
        // be careful of LDS alignment
        constexpr auto a_e_k_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
            make_tuple(Number<EPerBlock>{}, Number<KPerBlock>{}), max_lds_align);

        constexpr auto a_e_k_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
            make_tuple(Number<E>{}, Number<KPerBlock>{}), max_lds_align);

        // B matrix in LDS memory, dst of blockwise copy
        // be careful of LDS alignment
        constexpr auto b_e_n_ho_wo_block_desc = make_dynamic_naive_tensor_descriptor_packed_v2(
            make_tuple(Number<EPerBlock>{}, Number<1>{}, Number<HoPerBlock>{}, Number<WoPerBlock>{}));

        // c_thread_mtx definition: this is a mess
        // TODO: more elegant way of defining c_thread_mtx
        constexpr auto c_k_n_ho_wo_thread_desc = make_dynamic_naive_tensor_descriptor_packed_v2(
            make_tuple(Number<KPerThread>{}, Number<1>{}, Number<HoPerThread>{}, Number<WoPerThread>{}));

        const auto blockwise_gemm =
            BlockwiseGemm_km_kn_m0m1n0n1_v3<BlockSize,
                                            FloatAB,
                                            FloatAB,
                                            FloatAcc,
                                            decltype(a_e_k_block_desc),
                                            decltype(b_e_n_ho_wo_block_desc),
                                            decltype(c_k_n_ho_wo_thread_desc),
                                            KPerThread,
                                            HoPerThread,
                                            WoPerThread,
                                            EPerThread,
                                            ABlockTransferSrcScalarPerVector,
                                            ABlockTransferDstScalarPerVector_K>{};

        auto c_thread_mtx_index = blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());

        const auto k_thread_id  = c_thread_mtx_index.k;
        const auto ho_thread_id = c_thread_mtx_index.h;
        const auto wo_thread_id = c_thread_mtx_index.w;

        const index_t k_block_data_on_global  = k_block_work_id * KPerBlock;
        const index_t ho_block_data_on_global = ho_block_work_id * HoPerBlock;
        const index_t wo_block_data_on_global = wo_block_work_id * WoPerBlock;

        const index_t ho_thread_data_on_global = ho_block_data_on_global + ho_thread_id * HoPerThread;
        const index_t wo_thread_data_on_global = wo_block_data_on_global + wo_thread_id * WoPerThread;

        // A matrix blockwise copy
        auto a_blockwise_copy =
            BlockwiseDynamicTensorSliceTransfer_v4<BlockSize,
                                                   InMemoryDataOperation::Set,
                                                   Sequence<E, KPerBlock>,
                                                   ABlockTransferThreadSliceLengths_E_K,
                                                   ABlockTransferThreadClusterLengths_E_K,
                                                   ABlockTransferThreadClusterArrangeOrder,
                                                   FloatAB,
                                                   FloatAB,
                                                   decltype(a_e_k_global_desc),
                                                   decltype(a_e_k_desc),
                                                   ABlockTransferSrcAccessOrder,
                                                   Sequence<0, 1>,
                                                   ABlockTransferSrcVectorDim,
                                                   1,
                                                   ABlockTransferSrcScalarPerVector,
                                                   ABlockTransferDstScalarPerVector_K,
                                                   AddressSpace::Global,
                                                   AddressSpace::Lds,
                                                   1,
                                                   1,
                                                   AThreadTransferSrcResetCoordinateAfterRun,
                                                   true>(
                a_e_k_global_desc,
                make_multi_index(0, k_block_data_on_global),
                a_e_k_desc,
                make_multi_index(0, 0));

        constexpr auto b_e_n_ho_wo_thread_desc = make_dynamic_naive_tensor_descriptor_packed_v2(
            make_tuple(Number<EPerBlock>{}, Number<1>{}, Number<HoPerThread>{}, Number<WoPerThread>{}));

        auto b_threadwise_transfer =
            ThreadwiseDynamicTensorSliceTransfer_v2<FloatAB,
                                                    FloatAB,
                                                    decltype(b_e_n_ho_wo_global_desc),
                                                    decltype(b_e_n_ho_wo_thread_desc),
                                                    Sequence<EPerBlock, 1, HoPerThread, WoPerThread>,
                                                    BBlockTransferSrcAccessOrder,
                                                    BBlockTransferSrcVectorDim,
                                                    BBlockTransferSrcScalarPerVector,
                                                    AddressSpace::Global,
                                                    AddressSpace::Vgpr,
                                                    InMemoryDataOperation::Set,
                                                    1,
                                                    true>(
                b_e_n_ho_wo_global_desc,
                make_multi_index(0, 0, ho_thread_data_on_global, wo_thread_data_on_global));

        FloatAB* p_a_block = p_shared_block;

#if 0
        // register allocation for output
        FloatAcc p_c_thread[c_k_n_ho_wo_thread_desc.GetElementSpaceSize()];

        // zero out threadwise output
        threadwise_matrix_set_zero_v3(c_k_n_ho_wo_thread_desc, p_c_thread);
#else
        // register allocation for output
        FloatAcc p_c_thread[c_k_n_ho_wo_thread_desc.GetElementSpaceSize()];

        auto c_thread_buf = make_dynamic_buffer(p_c_thread);

        ThreadwiseDynamicTensorSliceSet_v1<FloatAcc,
                                           decltype(c_k_n_ho_wo_thread_desc),
                                           Sequence<KPerThread, 1, HoPerThread, WoPerThread>>{}
            .Run(c_k_n_ho_wo_thread_desc, make_tuple(I0, I0, I0, I0), c_thread_buf, FloatAcc{0});
#endif

        constexpr auto b_thread_slice_copy_step = make_multi_index(EPerBlock, 0, 0, 0);

        // hack to control index calculation when iterating over A and B matrix for threadwise copy
        constexpr auto a_e_k_global_iterator_hacks       = AGlobalIteratorHacks{};
        constexpr auto b_e_n_ho_wo_global_iterator_hacks = BGlobalIteratorHacks{};

        // hack to control index calculation when moving slice window for A and B matrix for
        // threadwise copy
        constexpr auto a_e_k_global_move_slice_window_iterator_hack =
            AGlobalMoveSliceWindowIteratorHacks{};
        constexpr auto b_e_n_ho_wo_global_move_slice_window_iterator_hack =
            BGlobalMoveSliceWindowIteratorHacks{};

#if 0
        constexpr auto b_thread_space_size = b_e_n_ho_wo_thread_desc.GetElementSpaceSize();

        FloatAB p_b_thread[b_thread_space_size * 2];

        FloatAB* p_b_thread_double = p_b_thread;
#elif 1
        constexpr auto b_thread_space_size = b_e_n_ho_wo_thread_desc.GetElementSpaceSize();

        FloatAB p_b_thread[b_thread_space_size * 2];

        auto b_thread_even_buf = make_dynamic_buffer(p_b_thread);
        auto b_thread_odd_buf  = make_dynamic_buffer(p_b_thread + b_thread_space_size);
#else
        StaticBuffer<FloatAB, b_e_n_ho_wo_thread_desc.GetElementSpaceSize()> a_thread_even_buf,
            a_thread_odd_buf;
#endif

        // LDS double buffer: preload data into LDS
        {
            a_blockwise_copy.RunRead(a_e_k_global_desc, p_a_global, a_e_k_global_iterator_hacks);

            b_threadwise_transfer.Run(b_e_n_ho_wo_global_desc,
                                      p_b_global,
                                      b_e_n_ho_wo_thread_desc,
                                      make_tuple(I0, I0, I0, I0),
                                      b_thread_even_buf.p_data_,
                                      b_e_n_ho_wo_global_iterator_hacks);

            a_blockwise_copy.RunWrite(a_e_k_desc, p_a_block);
        }

        __syncthreads();

        index_t b_block_data_begin = 0;

        if constexpr(HasMainKBlockLoop)
        {
            // LDS double buffer: main body
            // use Do-While loop instead of For loop to simplify control flow
            do
            {
                // even iteration
                b_threadwise_transfer.MoveSrcSliceWindow(b_e_n_ho_wo_global_desc,
                                                         b_thread_slice_copy_step);

                b_threadwise_transfer.Run(b_e_n_ho_wo_global_desc,
                                          p_b_global,
                                          b_e_n_ho_wo_thread_desc,
                                          make_tuple(I0, I0, I0, I0),
                                          b_thread_odd_buf.p_data_,
                                          b_e_n_ho_wo_global_iterator_hacks);

                // LDS double buffer: GEMM on current data
                blockwise_gemm.Run(
                    p_a_block + a_e_k_block_desc.CalculateOffset(make_tuple(b_block_data_begin, 0)),
                    b_thread_even_buf.p_data_,
                    p_c_thread);

                b_block_data_begin += EPerBlock;

                b_threadwise_transfer.MoveSrcSliceWindow(b_e_n_ho_wo_global_desc,
                                                         b_thread_slice_copy_step);

                b_threadwise_transfer.Run(b_e_n_ho_wo_global_desc,
                                          p_b_global,
                                          b_e_n_ho_wo_thread_desc,
                                          make_tuple(I0, I0, I0, I0),
                                          b_thread_even_buf.p_data_,
                                          b_e_n_ho_wo_global_iterator_hacks);

                // LDS double buffer: GEMM on current data
                blockwise_gemm.Run(
                    p_a_block + a_e_k_block_desc.CalculateOffset(make_tuple(b_block_data_begin, 0)),
                    b_thread_odd_buf.p_data_,
                    p_c_thread);

                b_block_data_begin += EPerBlock;

            } while(b_block_data_begin < E - 2 * EPerBlock);
        }

        // LDS double buffer: tail
        if constexpr(HasDoubleTailKBlockLoop) // if has 2 iterations left
        {
            b_threadwise_transfer.MoveSrcSliceWindow(b_e_n_ho_wo_global_desc,
                                                     b_thread_slice_copy_step);

            b_threadwise_transfer.Run(b_e_n_ho_wo_global_desc,
                                      p_b_global,
                                      b_e_n_ho_wo_thread_desc,
                                      make_tuple(I0, I0, I0, I0),
                                      b_thread_odd_buf.p_data_,
                                      b_e_n_ho_wo_global_iterator_hacks);

            // LDS double buffer: GEMM on 2nd-last data
            blockwise_gemm.Run(
                p_a_block + a_e_k_block_desc.CalculateOffset(make_tuple(b_block_data_begin, 0)),
                b_thread_even_buf.p_data_,
                p_c_thread);

            b_block_data_begin += EPerBlock;

            // LDS double buffer: GEMM on last data
            blockwise_gemm.Run(
                p_a_block + a_e_k_block_desc.CalculateOffset(make_tuple(b_block_data_begin, 0)),
                b_thread_odd_buf.p_data_,
                p_c_thread);
        }
        else // if has 1 iteration left
        {
            // LDS double buffer: GEMM on last data
            blockwise_gemm.Run(
                p_a_block + a_e_k_block_desc.CalculateOffset(make_tuple(b_block_data_begin, 0)),
                b_thread_even_buf.p_data_,
                p_c_thread);
        }

        // output: register to global memory
        {
            // hack to control index calculation when iterating over c_k_n_ho_wo_global tensor
            constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks = CGlobalIteratorHacks{};

            const index_t k_thread_data_on_global =
                k_block_data_on_global + k_thread_id * KPerThread;

            ThreadwiseDynamicTensorSliceTransfer_v1r3<FloatAcc,
                                                      FloatC,
                                                      decltype(c_k_n_ho_wo_thread_desc),
                                                      decltype(c_k_n_ho_wo_global_desc),
                                                      Sequence<KPerThread, 1, HoPerThread, WoPerThread>,
                                                      CThreadTransferSrcDstAccessOrder,
                                                      CThreadTransferSrcDstVectorDim,
                                                      CThreadTransferDstScalarPerVector,
                                                      AddressSpace::Vgpr,
                                                      AddressSpace::Global,
                                                      CGlobalMemoryDataOperation,
                                                      1,
                                                      true>(
                c_k_n_ho_wo_global_desc,
                make_multi_index(
                    k_thread_data_on_global, 0, ho_thread_data_on_global, wo_thread_data_on_global))
                .Run(c_k_n_ho_wo_thread_desc,
                     make_tuple(I0, I0, I0, I0),
                     c_thread_buf,
                     c_k_n_ho_wo_global_desc,
                     p_c_global,
                     c_k_n_ho_wo_global_tensor_iterator_hacks);
        }
    }

    // pass tensor descriptor by reference
    template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
    __device__ void Run(const AGlobalDesc& a_e_k_global_desc,
                        const FloatAB* __restrict__ p_a_global,
                        const BGlobalDesc& b_e_n_ho_wo_global_desc,
                        const FloatAB* __restrict__ p_b_global,
                        const CGlobalDesc& c_k_n_ho_wo_global_desc,
                        FloatC* __restrict__ p_c_global,
                        integral_constant<bool, HasMainKBlockLoop>,
                        integral_constant<bool, HasDoubleTailKBlockLoop>) const
    {
        constexpr index_t shared_block_size = GetSharedMemoryNumberOfByte() / sizeof(FloatAB);

        __shared__ FloatAB p_shared_block[shared_block_size];

        Run(a_e_k_global_desc,
            p_a_global,
            b_e_n_ho_wo_global_desc,
            p_b_global,
            c_k_n_ho_wo_global_desc,
            p_c_global,
            p_shared_block,
            integral_constant<bool, HasMainKBlockLoop>{},
            integral_constant<bool, HasDoubleTailKBlockLoop>{});
    }

    // pass tensor descriptors by their pointers
    template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
    __device__ void Run(const AGlobalDesc* p_a_e_k_global_desc,
                        const FloatAB* __restrict__ p_a_global,
                        const BGlobalDesc* p_b_e_n_ho_wo_global_desc,
                        const FloatAB* __restrict__ p_b_global,
                        const CGlobalDesc* p_c_k_n_ho_wo_global_desc,
                        FloatC* __restrict__ p_c_global,
                        integral_constant<bool, HasMainKBlockLoop>,
                        integral_constant<bool, HasDoubleTailKBlockLoop>) const
    {
        const auto a_e_k_global_desc       = *p_a_e_k_global_desc;
        const auto b_e_n_ho_wo_global_desc = *p_b_e_n_ho_wo_global_desc;
        const auto c_k_n_ho_wo_global_desc = *p_c_k_n_ho_wo_global_desc;

        Run(a_e_k_global_desc,
            p_a_global,
            b_e_n_ho_wo_global_desc,
            p_b_global,
            c_k_n_ho_wo_global_desc,
            p_c_global,
            integral_constant<bool, HasMainKBlockLoop>{},
            integral_constant<bool, HasDoubleTailKBlockLoop>{});
    }

    // pass tensor descriptors by void*
    template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
    __device__ void Run(const void* p_a_e_k_global_desc,
                        const FloatAB* __restrict__ p_a_global,
                        const void* p_b_e_n_ho_wo_global_desc,
                        const FloatAB* __restrict__ p_b_global,
                        const void* p_c_k_n_ho_wo_global_desc,
                        FloatC* __restrict__ p_c_global,
                        integral_constant<bool, HasMainKBlockLoop>,
                        integral_constant<bool, HasDoubleTailKBlockLoop>) const
    {
        const auto a_e_k_global_desc = *reinterpret_cast<const AGlobalDesc*>(p_a_e_k_global_desc);
        const auto b_e_n_ho_wo_global_desc =
            *reinterpret_cast<const BGlobalDesc*>(p_b_e_n_ho_wo_global_desc);
        const auto c_k_n_ho_wo_global_desc =
            *reinterpret_cast<const CGlobalDesc*>(p_c_k_n_ho_wo_global_desc);

        Run(a_e_k_global_desc,
            p_a_global,
            b_e_n_ho_wo_global_desc,
            p_b_global,
            c_k_n_ho_wo_global_desc,
            p_c_global,
            integral_constant<bool, HasMainKBlockLoop>{},
            integral_constant<bool, HasDoubleTailKBlockLoop>{});
    }
};
#endif

} // namespace ck
#endif
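The control flow of the LDS double-buffered main loop above (prefetch into one thread-local B buffer while doing GEMM on the other, then swap, with a two-slice or one-slice tail) can be seen in isolation with a small host-side sketch. The buffer contents, the fetch/gemm lambdas, and the sizes below are placeholders chosen only so the double-tail branch applies; they are not the kernel's real data or API.

#include <vector>
#include <cstdio>

// Host-side sketch of the even/odd double-buffer pattern in the main K-block loop.
int main()
{
    const int E = 40, EPerBlock = 4; // assumed sizes: E / EPerBlock slices, even count
    std::vector<int> even_buf(EPerBlock), odd_buf(EPerBlock);

    auto fetch = [](std::vector<int>& buf, int begin) { // stands in for b_threadwise_transfer.Run
        for(int i = 0; i < (int)buf.size(); ++i) buf[i] = begin + i;
    };
    auto gemm = [](const std::vector<int>& buf) {       // stands in for blockwise_gemm.Run
        std::printf("gemm on slice starting at %d\n", buf[0]);
    };

    int b_block_data_begin = 0;
    fetch(even_buf, 0); // preload

    do
    {
        fetch(odd_buf, b_block_data_begin + EPerBlock); // prefetch next slice
        gemm(even_buf);                                 // compute on current slice
        b_block_data_begin += EPerBlock;

        fetch(even_buf, b_block_data_begin + EPerBlock);
        gemm(odd_buf);
        b_block_data_begin += EPerBlock;
    } while(b_block_data_begin < E - 2 * EPerBlock);

    // two slices remain: mirrors the HasDoubleTailKBlockLoop branch
    fetch(odd_buf, b_block_data_begin + EPerBlock);
    gemm(even_buf);
    gemm(odd_buf);
    return 0;
}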
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_set.hpp
...
...
@@ -28,7 +28,10 @@ struct ThreadwiseDynamicTensorSliceSet_v1
        static_assert(Desc::IsKnownAtCompileTime(),
                      "wrong! SrcDesc and DstDesc need to known at compile-time");

#if 0
        // TODO: turn this on when v5r1 is updated to get rid of array
        static_assert(Buffer::IsStaticBuffer(), "wrong! DstBuffer need to be StaticBuffer");
#endif

        static_assert(is_known_at_compile_time<remove_cv_t<remove_reference_t<OriginIdx>>>::value,
                      "wrong! OriginIdx need to be known at compile-time");
...
...
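ThreadwiseDynamicTensorSliceSet_v1 is what the gridwise kernel above uses to zero-initialize its per-thread C accumulator. A reduced host-side sketch of the same idea, filling every element of a small packed K x 1 x H x W slice with one value (the packed-stride offset arithmetic here is a simplification for illustration, not the descriptor machinery itself):

#include <array>
#include <cstddef>

// Fill a packed K x 1 x H x W thread-local slice with a single value,
// mirroring the zero-initialization of the C accumulator.
template <typename T, std::size_t K, std::size_t H, std::size_t W>
void slice_set(std::array<T, K * 1 * H * W>& buf, T value)
{
    for(std::size_t k = 0; k < K; ++k)
        for(std::size_t h = 0; h < H; ++h)
            for(std::size_t w = 0; w < W; ++w)
                buf[(k * H + h) * W + w] = value; // packed offset; the N dimension is 1
}

// usage: std::array<float, 4 * 1 * 2 * 2> c_thread{}; slice_set(c_thread, 0.0f);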
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp
...
...
@@ -1399,7 +1399,10 @@ struct ThreadwiseDynamicTensorSliceTransfer_v4
                          remove_cv_t<remove_reference_t<DstData>>>::value,
            "wrong! SrcBuffer or DstBuffer data type is wrong");

#if 0
        // turn this on after v5r1 is updated
        static_assert(DstBuffer::IsStaticBuffer(), "wrong! DstBuffer need to be StaticBuffer");
#endif

        static_assert(
            is_known_at_compile_time<remove_cv_t<remove_reference_t<SrcRefToOriginDisplacement>>>::value &&
...
@@ -1413,8 +1416,8 @@ struct ThreadwiseDynamicTensorSliceTransfer_v4
        constexpr auto dst_desc = remove_cv_t<remove_reference_t<DstDesc>>{};

        // SrcOriginToRefDistance and DstOriginToRefDistance are known at compile-time
-       constexpr auto src_ref_to_origin_disp_idx = SrcRefToOriginDisplacement{};
-       constexpr auto dst_ref_to_origin_disp_idx = DstRefToOriginDisplacement{};
+       constexpr auto src_ref_to_origin_disp_idx = to_multi_index(SrcRefToOriginDisplacement{});
+       constexpr auto dst_ref_to_origin_disp_idx = to_multi_index(DstRefToOriginDisplacement{});

        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
...
@@ -1469,7 +1472,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v4
                // src coordinate
-               constexpr auto src_ref_to_data_disp_idx =
-                   to_multi_index(src_ref_to_origin_disp_idx + data_to_origin_disp_idx);
+               constexpr auto src_ref_to_data_disp_idx =
+                   src_ref_to_origin_disp_idx + data_to_origin_disp_idx;

                constexpr auto src_ref_to_data_disp_coord_iterator =
                    make_dynamic_tensor_coordinate_iterator(src_desc, src_ref_to_data_disp_idx);
...
...
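The change in this file converts the compile-time displacement constants with to_multi_index(...) up front, so that the later sum src_ref_to_origin_disp_idx + data_to_origin_disp_idx already operates on multi-indices and no longer needs its own conversion. A generic stand-in for that idea, turning a compile-time integer pack into an array usable in ordinary element-wise arithmetic (the names and types below are illustrative, not CK's API):

#include <array>
#include <cstddef>

// Illustrative stand-in for to_multi_index: a compile-time integer pack
// (akin to a Sequence<...>) becomes an index array.
template <int... Is>
struct IntPack {};

template <int... Is>
constexpr std::array<int, sizeof...(Is)> to_index_array(IntPack<Is...>)
{
    std::array<int, sizeof...(Is)> arr{{Is...}};
    return arr;
}

// element-wise addition of two index arrays
template <std::size_t N>
constexpr std::array<int, N> add(const std::array<int, N>& a, const std::array<int, N>& b)
{
    std::array<int, N> r{};
    for(std::size_t i = 0; i < N; ++i)
        r[i] = a[i] + b[i];
    return r;
}

// usage:
// constexpr auto disp = add(to_index_array(IntPack<0, 1, 2, 3>{}),
//                           to_index_array(IntPack<1, 0, 0, 0>{}));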
composable_kernel/include/tensor_operation/threadwise_gemm_v2.hpp
...
...
@@ -6,7 +6,7 @@
namespace ck {

-#if 1
+#if 0
template <typename Float, typename Desc>
__device__ void threadwise_matrix_set_zero_v2(Desc, Float* __restrict__ p_thread)
{
...
@@ -174,8 +174,7 @@ struct ThreadwiseGemm_km_kn_mn_v1
// Element of matrix can be vectorized data
// Assume:
//   1. ADesc, BDesc, CDesc are known at compile-time
-//   2. ABuffer, BBuffer, CBuffer are static buffer
-//   3. AOriginIdx, BOriginIdx, COriginIdx are known at compile-time
+//   2. AOriginIdx, BOriginIdx, COriginIdx are known at compile-time
template <typename FloatA,
          typename FloatB,
          typename FloatC,
...
...
composable_kernel/include/tensor_operation/threadwise_gemm_v3.hpp
...
...
@@ -6,6 +6,7 @@
namespace ck {

#if 0
template <typename Float, typename Desc>
__device__ void threadwise_matrix_set_zero_v3(Desc, Float* __restrict__ p_thread)
{
...
@@ -31,10 +32,15 @@ __device__ void threadwise_matrix_set_zero_v3(Desc, Float* __restrict__ p_thread)
        });
    });
}
#endif

#if 0
// C[M, N] += transpose(A[K, M]) * B[K, N]
// Element of matrix can be vectorized data
-template <typename ADesc,
+template <typename FloatA,
+          typename FloatB,
+          typename FloatC,
+          typename ADesc,
           typename BDesc,
           typename CDesc,
           index_t H,
...
@@ -44,8 +50,7 @@ template <typename ADesc,
                          bool>::type = false>
struct ThreadwiseGemm_km_kn_mn_v3
{
-   template <typename FloatA, typename FloatB, typename FloatC>
-   __device__ static void Run_source(const FloatA* p_a, const FloatB* p_b, FloatC* p_c)
+   __device__ static void Run(const FloatA* p_a, const FloatB* p_b, FloatC* p_c)
    {
        static_assert(ADesc::IsKnownAtCompileTime() && BDesc::IsKnownAtCompileTime() &&
                          CDesc::IsKnownAtCompileTime(),
...
@@ -132,13 +137,153 @@ struct ThreadwiseGemm_km_kn_mn_v3
            });
        });
    }
-   template <typename FloatA, typename FloatB, typename FloatC>
-   __device__ static void Run(const FloatA* p_a, const FloatB* p_b, FloatC* p_c)
-   {
-       Run_source(p_a, p_b, p_c);
-   }
};
#else
// C[M, N] += transpose(A[K, M]) * B[K, N]
// Element of matrix can be vectorized data
// Assume:
//   1. ADesc, BDesc, CDesc are known at compile-time
//   2. AOriginIdx, BOriginIdx, COriginIdx are known at compile-time
template <typename FloatA,
          typename FloatB,
          typename FloatC,
          typename ADesc,
          typename BDesc,
          typename CDesc,
          index_t H,
          index_t W,
          typename std::enable_if<ADesc::IsKnownAtCompileTime() && BDesc::IsKnownAtCompileTime() &&
                                      CDesc::IsKnownAtCompileTime(),
                                  bool>::type = false>
struct ThreadwiseGemm_km_kn_mn_v3
{
    template <typename ABuffer,
              typename AOriginIdx,
              typename BBuffer,
              typename BOriginIdx,
              typename CBuffer,
              typename COriginIdx>
    __device__ static void Run(const ABuffer& a_buf,
                               AOriginIdx,
                               const BBuffer& b_buf,
                               BOriginIdx,
                               CBuffer& c_buf,
                               COriginIdx)
    {
        static_assert(ADesc::IsKnownAtCompileTime() && BDesc::IsKnownAtCompileTime() &&
                          CDesc::IsKnownAtCompileTime(),
                      "wrong! Desc should be known at compile-time");

        static_assert(is_known_at_compile_time<remove_cv_t<remove_reference_t<AOriginIdx>>>::value &&
                          is_known_at_compile_time<remove_cv_t<remove_reference_t<BOriginIdx>>>::value &&
                          is_known_at_compile_time<remove_cv_t<remove_reference_t<COriginIdx>>>::value,
                      "wrong! AOriginIdx, BOriginIdx, COringinIdx should be known at compile-time");

        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};

        constexpr auto E = ADesc{}.GetLength(I0);
        constexpr auto K = ADesc{}.GetLength(I1);

        constexpr auto a_origin_idx = to_multi_index(AOriginIdx{});
        constexpr auto b_origin_idx = to_multi_index(BOriginIdx{});
        constexpr auto c_origin_idx = to_multi_index(COriginIdx{});

        static_for<0, E, 1>{}([&](auto e) {
            static_for<0, K, 1>{}([&](auto k) {
                constexpr index_t a_offset = ADesc{}.CalculateOffset(a_origin_idx + make_tuple(e, k));

                if constexpr(H == 2 && W == 2)
                {
                    constexpr index_t b_offset_0 = BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 0, 0));
                    constexpr index_t b_offset_1 = BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 0, 1));
                    constexpr index_t b_offset_2 = BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 1, 0));
                    constexpr index_t b_offset_3 = BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 1, 1));

                    constexpr index_t c_offset_0 = CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 0, 0));
                    constexpr index_t c_offset_1 = CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 0, 1));
                    constexpr index_t c_offset_2 = CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 1, 0));
                    constexpr index_t c_offset_3 = CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 1, 1));

                    amd_assembly_outer_product_1x4(a_buf[Number<a_offset>{}],
                                                   b_buf[Number<b_offset_0>{}],
                                                   b_buf[Number<b_offset_1>{}],
                                                   b_buf[Number<b_offset_2>{}],
                                                   b_buf[Number<b_offset_3>{}],
                                                   c_buf(Number<c_offset_0>{}),
                                                   c_buf(Number<c_offset_1>{}),
                                                   c_buf(Number<c_offset_2>{}),
                                                   c_buf(Number<c_offset_3>{}));
                }
                else if constexpr(H == 4 && W == 1)
                {
                    constexpr index_t b_offset_0 = BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 0, 0));
                    constexpr index_t b_offset_1 = BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 1, 0));
                    constexpr index_t b_offset_2 = BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 2, 0));
                    constexpr index_t b_offset_3 = BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 3, 0));

                    constexpr index_t c_offset_0 = CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 0, 0));
                    constexpr index_t c_offset_1 = CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 1, 0));
                    constexpr index_t c_offset_2 = CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 2, 0));
                    constexpr index_t c_offset_3 = CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 3, 0));

                    amd_assembly_outer_product_1x4(a_buf[Number<a_offset>{}],
                                                   b_buf[Number<b_offset_0>{}],
                                                   b_buf[Number<b_offset_1>{}],
                                                   b_buf[Number<b_offset_2>{}],
                                                   b_buf[Number<b_offset_3>{}],
                                                   c_buf(Number<c_offset_0>{}),
                                                   c_buf(Number<c_offset_1>{}),
                                                   c_buf(Number<c_offset_2>{}),
                                                   c_buf(Number<c_offset_3>{}));
                }
                else
                {
                    static_for<0, H, 1>{}([&](auto h) {
                        static_for<0, W, 1>{}([&](auto w) {
                            constexpr index_t b_offset =
                                BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, h, w));
                            constexpr index_t c_offset =
                                CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, h, w));

#if 0
                            c_buf(Number<c_offset>{}) += inner_product_with_conversion<FloatC>{}(
                                a_buf[Number<a_offset>{}], b_buf[Number<b_offset>{}]);
#else
                            amd_assembly_inner_product(a_buf[Number<a_offset>{}],
                                                       b_buf[Number<b_offset>{}],
                                                       c_buf(Number<c_offset>{}));
#endif
                        });
                    });
                }
            });
        });
    }
};
#endif

} // namespace ck
#endif
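The H == 2 && W == 2 and H == 4 && W == 1 fast paths hand a 1x4 tile to amd_assembly_outer_product_1x4. A plain C++ stand-in for what such a 1x4 outer-product-accumulate step computes, assuming the intrinsic's semantics are c_i += a * b_i; the inline-assembly version exists to pin the v_mac/v_fmac instruction selection and scheduling, which this sketch does not attempt.

// Portable stand-in for a 1x4 outer-product accumulate step:
// one A element multiplied against four B elements, accumulated into four C elements.
inline void outer_product_1x4_ref(float a,
                                  float b0, float b1, float b2, float b3,
                                  float& c0, float& c1, float& c2, float& c3)
{
    c0 += a * b0;
    c1 += a * b1;
    c2 += a * b2;
    c3 += a * b3;
}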
composable_kernel/include/utility/config.amd.hpp.in
...
...
@@ -14,7 +14,7 @@
#define CK_DEVICE_BACKEND_AMD 1

// GPU ID
-#if 1
+#if 0
#define CK_AMD_GPU_GFX906 1
#elif 0
#define CK_AMD_GPU_GFX908 1
...
@@ -28,7 +28,7 @@
#endif

// launch bounds
-#define CK_USE_LAUNCH_BOUNDS 0
+#define CK_USE_LAUNCH_BOUNDS 1

#ifdef CK_USE_LAUNCH_BOUNDS
#define CK_MAX_THREAD_PER_BLOCK 256
...
...
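The second hunk switches CK_USE_LAUNCH_BOUNDS on, with CK_MAX_THREAD_PER_BLOCK set to 256. In HIP/CUDA code a toggle like this is typically forwarded to a __launch_bounds__ annotation on the kernels; the sketch below shows that common wiring, with KERNEL_LAUNCH_BOUNDS and some_kernel being illustrative names rather than CK's actual macros.

// Sketch only: how a launch-bounds toggle is usually applied to a kernel.
#define CK_USE_LAUNCH_BOUNDS 1
#define CK_MAX_THREAD_PER_BLOCK 256

#if CK_USE_LAUNCH_BOUNDS
#define KERNEL_LAUNCH_BOUNDS __launch_bounds__(CK_MAX_THREAD_PER_BLOCK)
#else
#define KERNEL_LAUNCH_BOUNDS
#endif

// The annotation promises the compiler the kernel never launches with more than
// CK_MAX_THREAD_PER_BLOCK threads per block, which can reduce register pressure.
__global__ void KERNEL_LAUNCH_BOUNDS some_kernel(float* out)
{
    out[threadIdx.x] = static_cast<float>(threadIdx.x);
}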
driver/src/conv_driver.cpp
...
...
@@ -64,7 +64,7 @@ int main(int argc, char* argv[])
    using LeftPads  = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;

-#elif 0
+#elif 1
    constexpr index_t N  = 1;
    constexpr index_t C  = 16;
    constexpr index_t HI = 1080;
...
@@ -630,7 +630,7 @@ int main(int argc, char* argv[])
    print_array("ConvStrides", to_multi_index(ConvStrides{}));
    print_array("ConvDilations", to_multi_index(ConvDilations{}));

-#if 1
+#if 0
    using in_data_t                  = float;
    constexpr index_t in_vector_size = 1;
    using acc_data_t                 = float;
...
@@ -724,7 +724,7 @@ int main(int argc, char* argv[])
                                                                    LeftPads{},
                                                                    RightPads{},
                                                                    nrepeat);
-#elif 1
+#elif 0
    device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw<in_data_t,
                                                                         in_vector_size,
                                                                         acc_data_t,
...
...
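The driver hunks above select a different problem configuration (N = 1, C = 16, HI = 1080) and a different device implementation branch. For cross-checking such configurations, the output spatial size of a convolution follows the usual relation; the filter size, padding, stride, and dilation values below are illustrative only and are not taken from this diff.

#include <cstdio>

// Standard output-size relation for one spatial dimension of a convolution.
int main()
{
    const int HI = 1080, Y = 3, LeftPad = 0, RightPad = 0, ConvStride = 1, ConvDilation = 1;

    const int YEff = (Y - 1) * ConvDilation + 1;                     // effective filter height
    const int Ho   = (HI + LeftPad + RightPad - YEff) / ConvStride + 1;

    std::printf("Ho = %d\n", Ho); // 1078 for these numbers
    return 0;
}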