gaoqiong / composable_kernel · Commits

Commit a037693f, authored Dec 01, 2021 by ltqin
Merge branch 'develop' into conv_splitk_f32
Parents: 0694d6ed, 4041850f
Changes: 38
Showing 20 changed files with 2745 additions and 434 deletions (+2745 −434)
CMakeLists.txt (+1 −0)
composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v3.hpp (+92 −100)
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v3.hpp (+1920 −0)
composable_kernel/include/tensor_operation/threadwise_gemm_dlops_v3.hpp (+104 −96)
composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp (+35 −0)
composable_kernel/include/utility/amd_buffer_addressing.hpp (+24 −16)
composable_kernel/include/utility/amd_inline_asm.hpp (+14 −14)
composable_kernel/include/utility/amd_xdlops.hpp (+4 −4)
composable_kernel/include/utility/config.hpp (+23 −2)
composable_kernel/include/utility/data_type.hpp (+3 −3)
composable_kernel/include/utility/dynamic_buffer.hpp (+40 −0)
composable_kernel/include/utility/inner_product.hpp (+2 −2)
composable_kernel/include/utility/magic_division.hpp (+1 −1)
composable_kernel/include/utility/statically_indexed_array.hpp (+44 −0)
composable_kernel/include/utility/type.hpp (+9 −1)
example/1_gemm_xdl/gemm_xdl.cpp (+4 −5)
host/driver_offline/CMakeLists.txt (+9 −0)
host/driver_offline/include/device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp (+220 −0)
host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp (+196 −0)
host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp (+0 −190)
CMakeLists.txt

@@ -200,3 +200,4 @@ enable_cppcheck(
 add_subdirectory(host)
 add_subdirectory(example)
 add_subdirectory(profiler)
+add_subdirectory(test)
composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v3.hpp

@@ -10,99 +10,99 @@ template <index_t BlockSize,
Summary of the changes to BlockwiseGemmDlops_km_kn_m0m1n0n1_v3:
- The descriptor template parameters are renamed: BlockMatrixA -> ABlockDesc_E1_K1_E2, BlockMatrixB -> BBlockDesc_E1_N_Ho_Wo_E2, ThreadMatrixC -> CThreadDesc_K_N_Ho_Wo. The KPerThread/HPerThread/WPerThread value parameters are dropped, a KPerThreadLoop parameter is added, and the ThreadGemmADataPerRead_K / ThreadGemmBDataPerRead_W per-read widths are adjusted.
- The MatrixIndex {k, h, w} helper struct is removed; MultiIndex-based AIndex/BIndex/CIndex aliases and Number<> constants I0..I4 are introduced, and E1, KPerBlock, E2, HoPerBlock, WoPerBlock, KPerThread, HoPerThread and WoPerThread are now read from the descriptors at compile time.
- The thread-local descriptors a_thread_mtx_, b_thread_mtx_ and c_thread_mtx_ gain an E2 dimension and use KPerThreadLoop/HoPerThread/WoPerThread extents.
- The constructor now stores CIndex c_thread_origin_data_idx_ (formerly MatrixIndex c_thread_begin_mtx_idx_), obtained from GetBeginOfCThreadDesc_K_N_Ho_Wo (formerly GetBeginOfThreadMatrixC), and initializes a_thread_copy_ at (0, c_thread_origin_data_idx_[I0] * KPerThread, 0). The compile-time static_asserts now check E-dimension consistency between the A and B block descriptors and even divisibility of KPerBlock/HoPerBlock/WoPerBlock by the per-thread sizes.
- GetThreadMatrixCLengths() becomes GetCThreadDesc_K_N_Ho_WoLengths(), returning Sequence<KPerThread, I1, HoPerThread, WoPerThread>.
- GetBeginOfCThreadDesc_K_N_Ho_Wo(thread_id) computes the (k, n, h, w) thread-cluster index through make_single_stage_tensor_adaptor(make_merge_transform(make_tuple(K0, N0, H0, W0))) and CalculateBottomIndex(make_multi_index(thread_id)), replacing the hand-written division/modulo on thread_id.

@@ -116,19 +116,7 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
@@ -139,42 +127,46 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
- In Run(), the A block descriptor is taken from ABlockDesc_E1_K1_E2; the outer loops now run over E1 in EPerThreadLoop steps (previously EPerBlock) and over KPerThread in KPerThreadLoop steps (previously KPerThreadSubC); the per-iteration A copy uses three-dimensional (e_begin, k_begin, I0) source coordinates.
- The h_begin/w_begin sub-loops around the threadwise GEMM call are removed: the threadwise GEMM is invoked once per (e, k) step with A origin (I0, I0, I0), B origin (e_begin, I0, I0, I0, I0) and C origin (k_begin, I0, I0, I0), and the spatial sub-tiling is handled inside ThreadwiseGemmDlops_km_kn_mn_v3.
- MoveASliceWindow(const BlockMatrixA&, ...) is renamed MoveABlockSliceWindow(...) and moves the source slice window on ABlockDesc_E1_K1_E2.
- The private AThreadCopy type (ThreadwiseTensorSliceTransfer_v4) now transfers a Sequence<EPerThreadLoop, KPerThreadLoop, E2> slice with E2 as the vector dimension.
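The rewritten GetBeginOfCThreadDesc_K_N_Ho_Wo maps a linear thread id onto (k, n, ho, wo) thread-cluster coordinates through a merge-transform adaptor rather than hand-written division and modulo. A minimal host-side sketch of the same decomposition, assuming a plain row-major merge order and hypothetical K0/N0/H0/W0 values; it is an illustration, not CK's adaptor machinery:

#include <array>
#include <cstdio>

// Decompose a linear thread id into (k, n, h, w) cluster coordinates,
// mirroring what the merge-transform adaptor in the diff computes.
std::array<int, 4> decompose_thread_id(int thread_id, int K0, int N0, int H0, int W0)
{
    const int w = thread_id % W0;
    thread_id /= W0;
    const int h = thread_id % H0;
    thread_id /= H0;
    const int n = thread_id % N0;
    thread_id /= N0;
    const int k = thread_id; // must stay below K0 for a valid block size
    (void)K0;
    return {k, n, h, w};
}

int main()
{
    // Example cluster: K0 = 2, N0 = 1, H0 = 4, W0 = 8 (64 threads per block)
    for (int tid : {0, 7, 8, 31, 63})
    {
        auto idx = decompose_thread_id(tid, 2, 1, 4, 8);
        std::printf("tid %2d -> k %d n %d h %d w %d\n", tid, idx[0], idx[1], idx[2], idx[3]);
    }
}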
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v3.hpp (new file, mode 100644)
This diff is collapsed.
composable_kernel/include/tensor_operation/threadwise_gemm_dlops_v3.hpp

@@ -9,21 +9,22 @@ namespace ck {
- The header comment for ThreadwiseGemmDlops_km_kn_mn_v3 ("C[M, N] += transpose(A[K, M]) * B[K, N]", elements may be vectorized data) now states that AThreadDesc_E1_K_E2, BThreadDesc_E1_N_Ho_Wo_E2 and CThreadDesc_K_N_Ho_Wo (instead of ADesc, BDesc, CDesc) must be known at compile time, along with AOriginIdx/BOriginIdx/COriginIdx.
- The descriptor template parameters are renamed accordingly, the index_t H and index_t W parameters are removed, and the enable_if compile-time check is updated to the new descriptor names.

@@ -37,8 +38,10 @@ struct ThreadwiseGemmDlops_km_kn_mn_v3
- Run()'s static_assert on compile-time descriptors uses the new descriptor names.

@@ -54,102 +57,107 @@ struct ThreadwiseGemmDlops_km_kn_mn_v3
- E1, K and E2 are now read from AThreadDesc_E1_K_E2 and Ho, Wo from BThreadDesc_E1_N_Ho_Wo_E2 (previously E and K came from ADesc and the spatial sizes from the H/W template parameters).
- When Ho % 2 == 0 and Wo % 2 == 0, the loop nest iterates k, then h and w in SubHW = 2 steps, then e1 and e2, computes the four B offsets (h, w), (h, w+1), (h+1, w), (h+1, w+1) and the matching four C offsets, and accumulates them with amd_assembly_outer_product_1x4; this replaces the previous compile-time H == 2 && W == 2 and H == 4 && W == 1 special cases.
- Otherwise, the fallback path iterates k, h, w, e1, e2 and accumulates one element at a time with c_buf(Number<c_offset>{}) += inner_product_with_conversion<FloatC>{}(a_buf[Number<a_offset>{}], b_buf[Number<b_offset>{}]), replacing the earlier inner_product / amd_assembly_inner_product call.
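For reference, the accumulation this threadwise GEMM performs can be written as one plain loop nest. The sketch below assumes packed row-major thread buffers, omits the N dimension of size 1 and the 2x2 sub-tiling, and is an illustration of the math only, not CK code:

#include <vector>
#include <cstdio>

// Reference of the threadwise GEMM accumulation:
// C[k][h][w] += sum_{e1, e2} A[e1][k][e2] * B[e1][h][w][e2]
void threadwise_gemm_ref(const std::vector<float>& a, // [E1][K][E2]
                         const std::vector<float>& b, // [E1][Ho][Wo][E2]
                         std::vector<float>& c,       // [K][Ho][Wo]
                         int E1, int K, int E2, int Ho, int Wo)
{
    for (int e1 = 0; e1 < E1; ++e1)
        for (int k = 0; k < K; ++k)
            for (int h = 0; h < Ho; ++h)
                for (int w = 0; w < Wo; ++w)
                    for (int e2 = 0; e2 < E2; ++e2)
                        c[(k * Ho + h) * Wo + w] +=
                            a[(e1 * K + k) * E2 + e2] *
                            b[((e1 * Ho + h) * Wo + w) * E2 + e2];
}

int main()
{
    const int E1 = 2, K = 2, E2 = 4, Ho = 2, Wo = 2;
    std::vector<float> a(E1 * K * E2, 1.0f), b(E1 * Ho * Wo * E2, 0.5f), c(K * Ho * Wo, 0.0f);
    threadwise_gemm_ref(a, b, c, E1, K, E2, Ho, Wo);
    std::printf("c[0] = %f\n", c[0]); // expect E1 * E2 * 1.0 * 0.5 = 4.0
}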
composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp

@@ -217,6 +217,22 @@ struct ThreadwiseTensorSliceTransfer_v1r3
- A new branch for DstInMemOp == InMemoryDataOperationEnum_t::Add is added after the Set path: it loads the current destination vector into a temporary (typename vector_type_maker<DstData, DstScalarPerVector>::type), adds it into dst_vector lane by lane with a static_for over DstScalarPerVector, and writes dst_vector back with dst_buf.template Set<dst_vector_t>(dst_coord_.GetOffset(), is_dst_valid, ...).

@@ -666,6 +682,25 @@ struct ThreadwiseTensorSliceTransfer_v2
- A MoveSrcSliceWindow overload taking an extra SrcMoveSliceWindowStepHack argument is added (the src_slice_origin_step_idx needs to be known at compile time for performance reasons). Like the existing overload, it adds GetSrcCoordinateResetStep() to the step when the source coordinate was not reset by RunRead(), but it builds the coordinate step with make_tensor_coordinate_step(src_desc, adjusted_step_idx, src_move_slice_window_step_hack) before calling move_tensor_coordinate(src_desc, src_coord_, adjusted_step).
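The new Add branch is a read-modify-write store: it fetches the current destination values, accumulates the source into them, and writes the sum back. A scalar sketch of that behaviour with the buffer and vector types simplified to plain arrays (the helper below is hypothetical, not the CK transfer class):

#include <vector>
#include <cstdio>

// Scalar sketch of the InMemoryDataOperationEnum_t::Add path:
// load existing destination values, accumulate, store back.
void store_with_add(std::vector<float>& dst, std::size_t offset,
                    const float* src, std::size_t n, bool is_dst_valid)
{
    if (!is_dst_valid)
        return;
    for (std::size_t i = 0; i < n; ++i)
        dst[offset + i] += src[i]; // read-modify-write instead of a plain Set
}

int main()
{
    std::vector<float> dst(8, 1.0f);
    const float src[4] = {1, 2, 3, 4};
    store_with_add(dst, 2, src, 4, true);
    std::printf("dst[2..5] = %g %g %g %g\n", dst[2], dst[3], dst[4], dst[5]);
}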
composable_kernel/include/utility/amd_buffer_addressing.hpp

Hunks @@ -268,14 +268,14 @@ through @@ -495,7 +495,7 @@ amd_buffer_load_impl
- All reinterpretations of the raw llvm_amdgcn_raw_buffer_load_* results switch from as_type<...> to bit_cast<...>: fp32x2/fp32x4 loads cast to double and double2_t (and into the two halves of a vector_type<double, 4>), fp32x4 to half8_t, i32x4 to ushort8_t, and i16/i32/i32x2/i32x4 to int8x2_t/int8x4_t/int8x8_t/int8x16_t.

Hunks @@ -521,7 +521,7 @@ through @@ -728,7 +736,7 @@ amd_buffer_store_impl
- The store payloads are likewise converted with bit_cast: float2_t/float4_t for the fp32 stores that mimic fp64, and int16_t/int32_t/int32x2_t/int32x4_t for the ushort and int8 paths.
- For half_t with N == 8, the previous pair of llvm_amdgcn_raw_buffer_store_fp16x4 calls is disabled under #if 0 and a single llvm_amdgcn_raw_buffer_store_fp32x4(bit_cast<float4_t>(src_thread_data), dst_wave_buffer_resource, dst_thread_addr_offset, dst_wave_addr_offset, 0) is used under #else.
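The load helpers fetch raw 32-bit lanes and then reinterpret them as the requested element type; bit_cast only changes how that reinterpretation is spelled. A portable sketch of the idea using std::memcpy, with float2_t/double modeled as a plain float[2] and double (illustration only):

#include <cstring>
#include <cstdio>

// Reinterpret two fp32 lanes (as loaded by a raw buffer load) as one double,
// the same idea as CK's bit_cast but written with std::memcpy.
double reinterpret_two_floats_as_double(const float (&f)[2])
{
    static_assert(sizeof(f) == sizeof(double), "sizes must match for a bit cast");
    double d;
    std::memcpy(&d, f, sizeof(d));
    return d;
}

int main()
{
    const double value = 3.141592653589793;
    float lanes[2];
    std::memcpy(lanes, &value, sizeof(value)); // pretend these came from two fp32 buffer loads
    std::printf("round-tripped: %.15f\n", reinterpret_two_floats_as_double(lanes));
}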
composable_kernel/include/utility/amd_inline_asm.hpp

@@ -211,14 +211,14 @@ amd_assembly_outer_product_1x2 and @@ -244,20 +244,20 @@ amd_assembly_outer_product_1x4
- The v_dot4_i32_i8 inline-assembly operands and the __builtin_amdgcn_sdot4 fallbacks now pass bit_cast<int32_t>(a), bit_cast<int32_t>(b0) ... bit_cast<int32_t>(b3) instead of as_type<int32_t>(...); the accumulator operands c0..c3 are unchanged.
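Both the inline-assembly path and the __builtin_amdgcn_sdot4 fallback compute the same operation: accumulate the 4-way dot product of two packed int8x4 values into a 32-bit accumulator. A host-side reference of that operation (not the GPU intrinsic itself):

#include <cstdint>
#include <cstring>
#include <cstdio>

// Reference of v_dot4_i32_i8 / __builtin_amdgcn_sdot4:
// c += sum_i a[i] * b[i] over four signed 8-bit lanes packed into 32 bits.
int32_t sdot4_ref(uint32_t a_packed, uint32_t b_packed, int32_t c)
{
    int8_t a[4], b[4];
    std::memcpy(a, &a_packed, 4);
    std::memcpy(b, &b_packed, 4);
    for (int i = 0; i < 4; ++i)
        c += int32_t(a[i]) * int32_t(b[i]);
    return c;
}

int main()
{
    const int8_t a[4] = {1, -2, 3, -4}, b[4] = {5, 6, 7, 8};
    uint32_t ap, bp;
    std::memcpy(&ap, a, 4);
    std::memcpy(&bp, b, 4);
    std::printf("dot = %d\n", sdot4_ref(ap, bp, 0)); // 5 - 12 + 21 - 32 = -18
}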
composable_kernel/include/utility/amd_xdlops.hpp

@@ -340,8 +340,8 @@ intrin_mfma_i32_32x32x8i8<32, 32> and @@ -359,8 +359,8 @@ intrin_mfma_i32_16x16x16i8<16, 16>
- The Run() wrappers for llvm_intrin_amdgcn_mfma_i32_32x32x8i8 and llvm_intrin_amdgcn_mfma_i32_16x16x16i8 now pass bit_cast<int>(reg_a) and bit_cast<int>(reg_b) instead of as_type<int>(...); the accumulator handling via reg_c.template AsType<int32x16_t / int32x4_t>() is unchanged.
composable_kernel/include/utility/config.hpp

@@ -96,9 +96,22 @@
- A new switch CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR 0 is added after the CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE / _BY_VOID_POINTER macros.
- CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION (merge transformation uses magic-number division) is now wrapped in #ifndef / #endif.
- Two new feature switches are introduced:
  // use __builtin_memcpy instead of pointer cast to access a vector from pointer of scalar
  #ifndef CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
  #define CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS 0
  #endif
  // use __builtin_memcpy instead of union to do bit_cast
  #ifndef CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST
  #define CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST 1
  #endif

@@ -118,7 +131,7 @@
- The comment above CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE is changed from "workaround for compiler crash when using buffer load/store for i8" to "workaround for compiler generating inefficient ds_write instructions".

@@ -133,7 +146,15 @@ namespace ck {
- InMemoryDataOperationEnum_t gains an Add enumerator (now Set, AtomicAdd, Add), and a new enum ActivTypeEnum_t { None = 0, LeakyRelu, Sigmoid } is added before the index-type definitions.
composable_kernel/include/utility/data_type.hpp

@@ -1081,11 +1081,11 @@ struct NumericLimits<half_t>
- Min(), Max() and Lowest() now return bit_cast<half_t>(binary_min), bit_cast<half_t>(binary_max) and bit_cast<half_t>(binary_lowest) instead of as_type<half_t>(...); the bit patterns binary_max = 0x7BFF and binary_lowest = 0xFBFF are unchanged.
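The half_t limits are defined directly by their IEEE fp16 bit patterns and materialized with bit_cast. The small host-side decoder below is written only to show which values 0x7BFF and 0xFBFF correspond to; it is an illustration, not part of CK:

#include <cstdint>
#include <cmath>
#include <cstdio>

// Decode an IEEE fp16 bit pattern on the host.
float decode_fp16(uint16_t bits)
{
    const int sign = (bits >> 15) & 1;
    const int exp  = (bits >> 10) & 0x1F;
    const int frac = bits & 0x3FF;
    float mag;
    if (exp == 0)       mag = std::ldexp(float(frac), -24);                 // subnormal
    else if (exp == 31) mag = frac ? NAN : INFINITY;                        // inf / nan
    else                mag = std::ldexp(1.0f + frac / 1024.0f, exp - 15);  // normal
    return sign ? -mag : mag;
}

int main()
{
    std::printf("0x7BFF -> %g (half max)\n",    decode_fp16(0x7BFF)); // 65504
    std::printf("0xFBFF -> %g (half lowest)\n", decode_fp16(0xFBFF)); // -65504
}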
composable_kernel/include/utility/dynamic_buffer.hpp

Hunks @@ -83,12 +83,28 @@, @@ -117,7 +133,13 @@, @@ -126,7 +148,13 @@, @@ -201,7 +229,13 @@ and @@ -210,7 +244,13 @@ struct DynamicBuffer
- Each vector read and write in DynamicBuffer gains a CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS path. Reads copy the element into a local temporary with X tmp; __builtin_memcpy(&tmp, &(p_data_[i]), sizeof(X)); and return is_valid_element ? tmp : X{0} (or X{invalid_element_value_}); writes copy a local X tmp = x; into the buffer with __builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X)).
- The previous *c_style_pointer_cast<const X*>(&p_data_[i]) reads and *c_style_pointer_cast<X*>(&p_data_[i]) = x writes are kept under #else, including inside the int8 ds_write workaround branch.
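The memcpy path replaces a type-punning pointer cast with a byte copy that the compiler can still lower to a single vector load or store. A minimal sketch of the read side, using a stand-in Vec4 type instead of CK's vector_type (names here are illustrative only):

#include <cstring>
#include <cstdio>

// Stand-in for a 4-wide vector element type.
struct Vec4 { float v[4]; };

// Read a vector-sized chunk out of a scalar array with std::memcpy instead of
// casting the element pointer to a vector pointer; invalid elements read as zero.
Vec4 load_vec4(const float* p_data, std::size_t i, bool is_valid)
{
    Vec4 tmp{};
    if (is_valid)
        std::memcpy(&tmp, &p_data[i], sizeof(Vec4)); // avoids the aliasing-unsafe cast
    return tmp;
}

int main()
{
    float data[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    Vec4 x = load_vec4(data, 2, true);
    std::printf("%g %g %g %g\n", x.v[0], x.v[1], x.v[2], x.v[3]);
}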
composable_kernel/include/utility/inner_product.hpp

@@ -144,9 +144,9 @@ inner_product<int8x4_t, int8x4_t, int32_t>
- The v_dot4_i32_i8 inline-assembly operands and the __builtin_amdgcn_sdot4 fallback now use bit_cast<int32_t>(a) and bit_cast<int32_t>(b) instead of as_type<int32_t>(...); the generic vector_type<int8_t, 4> fallback path is unchanged.
composable_kernel/include/utility/magic_division.hpp

@@ -125,7 +125,7 @@ struct MagicDivision
- In DoMagicDivision(int32_t dividend_i32, uint32_t multiplier, uint32_t shift), the dividend is now reinterpreted with bit_cast<uint32_t>(dividend_i32) instead of as_type<uint32_t>(...); the result is still (tmp + dividend_u32) >> shift with tmp = __umulhi(dividend_u32, multiplier).
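DoMagicDivision evaluates q = (umulhi(n, multiplier) + n) >> shift. The host-side sketch below checks that formula against ordinary division; the multiplier/shift derivation in make_magic follows the standard round-up construction and is an assumption made for illustration, not CK's own magic-number computation:

#include <cstdint>
#include <cassert>
#include <cstdio>

// Host emulation of __umulhi: high 32 bits of a 32x32 -> 64-bit product.
static uint32_t umulhi(uint32_t a, uint32_t b) { return uint32_t((uint64_t(a) * b) >> 32); }

// Round-up magic-number construction (assumed, for small divisors): shift = ceil(log2(d)),
// multiplier = low 32 bits of ceil(2^(32+shift) / d); the implicit 2^32 term becomes "+ n".
static void make_magic(uint32_t divisor, uint32_t& multiplier, uint32_t& shift)
{
    assert(divisor >= 1);
    shift = 0;
    while ((uint64_t(1) << shift) < divisor)
        ++shift;
    const uint64_t m64 = ((uint64_t(1) << (32 + shift)) + divisor - 1) / divisor;
    multiplier = uint32_t(m64);
}

// Same formula as DoMagicDivision; valid for non-negative int32 dividends.
static uint32_t magic_div(uint32_t n, uint32_t multiplier, uint32_t shift)
{
    return (umulhi(n, multiplier) + n) >> shift;
}

int main()
{
    for (uint32_t d : {1u, 3u, 7u, 10u, 640u})
    {
        uint32_t multiplier, shift;
        make_magic(d, multiplier, shift);
        for (uint32_t n : {0u, 1u, 99u, 12345u, 2147483647u})
            assert(magic_div(n, multiplier, shift) == n / d);
        std::printf("d=%u  multiplier=0x%08X  shift=%u\n", d, multiplier, shift);
    }
}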
composable_kernel/include/utility/statically_indexed_array.hpp

@@ -54,5 +54,49 @@ __host__ __device__ constexpr auto make_statically_indexed_array()
After make_statically_indexed_array(), a new StaticallyIndexedArray_v2 container is added before the close of namespace ck:

template <typename T, index_t N>
struct StaticallyIndexedArray_v2
{
    __host__ __device__ constexpr StaticallyIndexedArray_v2() = default;

    __host__ __device__ static constexpr index_t Size() { return N; }

    // read access
    template <index_t I>
    __host__ __device__ constexpr const auto& At(Number<I>) const
    {
        static_assert(I < N, "wrong! out of range");
        return data_[I];
    }

    // write access
    template <index_t I>
    __host__ __device__ constexpr auto& At(Number<I>)
    {
        static_assert(I < N, "wrong! out of range");
        return data_[I];
    }

    // read access
    template <index_t I>
    __host__ __device__ constexpr const auto& operator[](Number<I> i) const
    {
        return At(i);
    }

    // write access
    template <index_t I>
    __host__ __device__ constexpr auto& operator()(Number<I> i)
    {
        return At(i);
    }

    __host__ __device__ static constexpr bool IsStaticBuffer() { return true; }

    T data_[N];
};
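StaticallyIndexedArray_v2 indexes a fixed-size array with compile-time Number<I> tags, so an out-of-range index is a compile error rather than a runtime bug. A self-contained sketch of the same pattern using std::integral_constant in place of ck::Number (an illustration, not the CK type):

#include <type_traits>
#include <cstdio>

// Compile-time-indexed array: the index is carried in the tag type.
template <typename T, int N>
struct StaticArrayV2Sketch
{
    static constexpr int Size() { return N; }

    template <int I>
    constexpr const T& At(std::integral_constant<int, I>) const
    {
        static_assert(I < N, "wrong! out of range");
        return data_[I];
    }

    template <int I>
    constexpr T& At(std::integral_constant<int, I>)
    {
        static_assert(I < N, "wrong! out of range");
        return data_[I];
    }

    T data_[N];
};

int main()
{
    StaticArrayV2Sketch<float, 4> a{{1.f, 2.f, 3.f, 4.f}};
    constexpr std::integral_constant<int, 2> I2{};
    a.At(I2) = 42.f;                       // write access, index checked at compile time
    std::printf("a[2] = %g\n", a.At(I2));  // read access
}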
composable_kernel/include/utility/type.hpp

@@ -32,8 +32,15 @@ template <typename T> and @@ -41,6 +48,7 @@
- The size-checked reinterpretation helper as_type(X x) is renamed to bit_cast(const X& x). When CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST is enabled it now copies the object representation with Y y; __builtin_memcpy(&y, &x, sizeof(X)); return y; the previous union AsType { X x; ... }; return AsType{x}.y; implementation is kept under #else.
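With CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST enabled, bit_cast copies the object representation byte for byte, the same idea as C++20 std::bit_cast, and avoids union-based type punning. A standalone sketch of that approach (not the CK header itself):

#include <cstdint>
#include <cstring>
#include <cstdio>

// Size-checked reinterpretation of the object representation via memcpy;
// compilers typically lower this to a plain register move.
template <typename Y, typename X>
Y bit_cast_sketch(const X& x)
{
    static_assert(sizeof(X) == sizeof(Y), "bit_cast requires equal sizes");
    Y y;
    std::memcpy(&y, &x, sizeof(X));
    return y;
}

int main()
{
    const float f = 1.0f;
    const uint32_t bits = bit_cast_sketch<uint32_t>(f);
    std::printf("bits of 1.0f = 0x%08X\n", bits); // 0x3F800000
}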
example/1_gemm_xdl/gemm_xdl.cpp

@@ -9,7 +9,6 @@
- The #include "gemm_common.hpp" line is removed from the include list.

@@ -139,12 +138,12 @@ int main(int argc, char* argv[])
- The tensor-initialization switch now instantiates the generators with explicit data types: GeneratorTensor_2<ADataType>{-5, 5} and GeneratorTensor_2<BDataType>{-5, 5} for case 1, and GeneratorTensor_3<ADataType>{0.0, 1.0} and GeneratorTensor_3<BDataType>{-0.5, 0.5} for the default case, replacing the untyped GeneratorTensor_2{-5, 5} and GeneratorTensor_3<float>{...} calls.
host/driver_offline/CMakeLists.txt

@@ -13,16 +13,25 @@ include_directories(BEFORE
- Three new driver sources are declared: CONV_FWD_DRIVER_OFFLINE_NCHWC_SOURCE (src/conv_fwd_driver_offline_nchwc.cpp), CONV_ADD_FWD_DRIVER_OFFLINE_NCHWC_SOURCE (src/conv_add_fwd_driver_offline_nchwc.cpp) and CONV_MAXPOOL_FWD_DRIVER_OFFLINE_NCHWC_SOURCE (src/conv_maxpool_fwd_driver_offline_nchwc.cpp).
- Matching executables conv_fwd_driver_offline_nchwc, conv_add_fwd_driver_offline_nchwc and conv_maxpool_fwd_driver_offline_nchwc are added with add_executable() and linked with target_link_libraries(... PRIVATE host_tensor); the existing conv_fwd, conv_bwd, conv_wrw and gemm driver targets are unchanged.
host/driver_offline/include/device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
0 → 100644
View file @
a037693f
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"

template <typename TInWei,
          typename TAcc,
          typename TOut,
          ck::ActivTypeEnum_t activ_type,
          typename InLengths,
          typename WeiLengths,
          typename AddLengths,
          typename OutLengths,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads>
void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1(
    const InLengths& in_n_c0_hi_wi_c1_lengths,
    const WeiLengths& wei_k_c0_y_x_c1_lengths,
    const AddLengths& add_n_k0_hox2_wox2_k1_lengths,
    const OutLengths& out_n_k0_ho_wo_k1_lengths,
    const ConvStrides& conv_strides,
    const ConvDilations& conv_dilations,
    const InLeftPads& in_left_pads,
    const InRightPads& in_right_pads,
    const Tensor<TInWei>& in_n_c0_hi_wi_c1,
    const Tensor<TInWei>& wei_k_c0_y_x_c1,
    const Tensor<TOut>& bias_k0_k1,
    const Tensor<TOut>& add_n_k0_hox2_wox2_k1,
    Tensor<TOut>& add_n_k0_hox2_wox2_k1_out,
    ck::index_t nrepeat)
{
    using namespace ck;

    std::cout << __func__ << std::endl;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
    constexpr auto I4 = Number<4>{};

    const auto N  = out_n_k0_ho_wo_k1_lengths[I0];
    const auto K0 = out_n_k0_ho_wo_k1_lengths[I1];
    const auto Ho = out_n_k0_ho_wo_k1_lengths[I2];
    const auto Wo = out_n_k0_ho_wo_k1_lengths[I3];
    const auto K1 = out_n_k0_ho_wo_k1_lengths[I4];

    const auto C0 = in_n_c0_hi_wi_c1_lengths[I1];
    const auto Hi = in_n_c0_hi_wi_c1_lengths[I2];
    const auto Wi = in_n_c0_hi_wi_c1_lengths[I3];
    const auto C1 = in_n_c0_hi_wi_c1_lengths[I4];

    const auto K = wei_k_c0_y_x_c1_lengths[I0];
    const auto Y = wei_k_c0_y_x_c1_lengths[I2];
    const auto X = wei_k_c0_y_x_c1_lengths[I3];

    const auto Hox2 = add_n_k0_hox2_wox2_k1_lengths[I2];
    const auto Wox2 = add_n_k0_hox2_wox2_k1_lengths[I3];

    DeviceMem in_n_c0_hi_wi_c1_device_buf(sizeof(TInWei) * in_n_c0_hi_wi_c1.mDesc.GetElementSpace());
    DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) * wei_k_c0_y_x_c1.mDesc.GetElementSpace());
    DeviceMem bias_k0_k1_device_buf(sizeof(TOut) * bias_k0_k1.mDesc.GetElementSpace());
    DeviceMem add_n_k0_hox2_wox2_k1_device_buf(sizeof(TOut) *
                                               add_n_k0_hox2_wox2_k1.mDesc.GetElementSpace());

    in_n_c0_hi_wi_c1_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data());
    wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data());
    bias_k0_k1_device_buf.ToDevice(bias_k0_k1.mData.data());
    add_n_k0_hox2_wox2_k1_device_buf.ToDevice(add_n_k0_hox2_wox2_k1.mData.data());

    constexpr index_t InWeiVectorSize = 8;

    if(C1 % InWeiVectorSize != 0)
    {
        throw std::runtime_error("wrong! C1 cannot be divided by InWeiVectorSize");
    }

#if 0
    constexpr index_t BlockSize = 256;

    constexpr index_t KPerBlock  = 32;
    constexpr index_t HoPerBlock = 8;
    constexpr index_t WoPerBlock = 64;

    constexpr index_t E1         = C0 * 9;
    constexpr index_t E2         = 1;
    constexpr index_t E1PerBlock = C0;

    constexpr index_t KPerThread  = 16;
    constexpr index_t HoPerThread = 2;
    constexpr index_t WoPerThread = 2;
    constexpr index_t EPerThread  = 1;

    using ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2   = Sequence<1, 9, 1, E2>;
    using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 = Sequence<1, E1PerBlock, KPerBlock, 1>;

    constexpr index_t ABlockTransferSrcScalarPerVector_E2  = E2;
    constexpr index_t ABlockTransferDstScalarPerVector_E2  = E2;
    constexpr index_t BThreadTransferSrcScalarPerVector_E2 = E2;
    constexpr index_t CThreadTransferDstScalarPerVector_K  = K1;
#elif 1
    constexpr auto BlockSize = 64;

    constexpr auto KPerBlock  = 8;
    constexpr auto HoPerBlock = 8;
    constexpr auto WoPerBlock = 32;

    constexpr auto E1         = 2 * 9;
    constexpr auto E2         = 1;
    constexpr auto K2         = 2;
    constexpr auto E1PerBlock = 2;

    constexpr auto KPerThread  = KPerBlock;
    constexpr auto HoPerThread = 2;
    constexpr auto WoPerThread = 2;
    constexpr auto EPerThread  = 1;

    using ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2   = Sequence<1, 9, 1, 1, E2>;
    using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 = Sequence<1, E1PerBlock, 1, KPerBlock, 1>;

    constexpr auto ABlockTransferSrcScalarPerVector_E2  = E2;
    constexpr auto ABlockTransferDstScalarPerVector_E2  = E2;
    constexpr auto BThreadTransferSrcScalarPerVector_E2 = E2;
    constexpr auto CThreadTransferDstScalarPerVector_K  = InWeiVectorSize;
#endif

    const auto in_n_c0_hi_wi_c1_desc =
        make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi, E2));
    const auto wei_k_c0_y_x_c1_desc =
        make_naive_tensor_descriptor_packed(make_tuple(K, C0, Y, X, E2));
    const auto add_n_k0_hox2_wox2_k1_desc =
        make_naive_tensor_descriptor_packed(make_tuple(N, K0, Hox2, Wox2, K1));
    const auto out_n_k0_ho_wo_k1_desc =
        make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1));

    constexpr auto conv_driver =
        DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_add<
            BlockSize,
            typename vector_type<TInWei, InWeiVectorSize>::type,
            TAcc,
            TOut,
            E1,
            E2,
            K2,
            KPerBlock,
            HoPerBlock,
            WoPerBlock,
            E1PerBlock,
            KPerThread,
            HoPerThread,
            WoPerThread,
            EPerThread,
            ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
            ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
            ABlockTransferSrcScalarPerVector_E2,
            ABlockTransferDstScalarPerVector_E2,
            BThreadTransferSrcScalarPerVector_E2,
            CThreadTransferDstScalarPerVector_K,
            activ_type>{};

    std::cerr << "conv_bias_activ_resize_add_input_"
              << "n" << N << "c" << C0 << "h" << Hi << "w" << Wi << "c" << C1
              << "_filter_k" << K << "c" << C0 << "y" << Y << "x" << X << "c" << C1
              << "_addout_n" << N << "k" << K0 << "h" << Ho * 2 << "w" << Wo * 2 << "k" << K1
              << std::endl;

    for(int i = 0; i < 5; i++)
    {
        const auto ave_time =
            conv_driver.Run(wei_k_c0_y_x_c1_desc,
                            in_n_c0_hi_wi_c1_desc,
                            out_n_k0_ho_wo_k1_desc,
                            add_n_k0_hox2_wox2_k1_desc,
                            conv_strides,
                            conv_dilations,
                            in_left_pads,
                            in_right_pads,
                            static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
                                wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()),
                            static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
                                in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()),
                            static_cast<TOut*>(bias_k0_k1_device_buf.GetDeviceBuffer()),
                            static_cast<TOut*>(add_n_k0_hox2_wox2_k1_device_buf.GetDeviceBuffer()),
                            nrepeat);

        {
            float perf = static_cast<float>(std::size_t(2) * N * K * Ho * Wo * C0 * C1 * Y * X) /
                         (std::size_t(1000) * 1000 * 1000) / ave_time;

            std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
                      << std::endl;
        }
    }

    add_n_k0_hox2_wox2_k1_device_buf.ToDevice(add_n_k0_hox2_wox2_k1.mData.data());

    conv_driver.Run(wei_k_c0_y_x_c1_desc,
                    in_n_c0_hi_wi_c1_desc,
                    out_n_k0_ho_wo_k1_desc,
                    add_n_k0_hox2_wox2_k1_desc,
                    conv_strides,
                    conv_dilations,
                    in_left_pads,
                    in_right_pads,
                    static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
                        wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()),
                    static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
                        in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()),
                    static_cast<TOut*>(bias_k0_k1_device_buf.GetDeviceBuffer()),
                    static_cast<TOut*>(add_n_k0_hox2_wox2_k1_device_buf.GetDeviceBuffer()),
                    0);

    add_n_k0_hox2_wox2_k1_device_buf.FromDevice(add_n_k0_hox2_wox2_k1_out.mData.data());
}
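For reference, the TFlop/s number printed inside the timing loop above comes from counting two floating-point operations (one multiply and one add) per multiply-accumulate and converting the millisecond timing to seconds. The following is a minimal editorial sketch of that calculation, assuming a standalone helper named conv_fwd_tflops; it is not part of this commit.

#include <cstddef>

// Effective TFLOP/s for a forward convolution timed in milliseconds, matching the
// perf expression used by the drivers: flop / 1e9 / t_ms == flop / 1e12 / t_s.
inline float conv_fwd_tflops(std::size_t N, std::size_t K, std::size_t Ho, std::size_t Wo,
                             std::size_t C0, std::size_t C1, std::size_t Y, std::size_t X,
                             float ave_time_ms)
{
    const float flop = static_cast<float>(std::size_t(2) * N * K * Ho * Wo * C0 * C1 * Y * X);
    return flop / (std::size_t(1000) * 1000 * 1000) / ave_time_ms;
}

Called as conv_fwd_tflops(N, K, Ho, Wo, C0, C1, Y, X, ave_time), it reproduces the perf value logged above.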
host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
0 → 100644
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"

template <typename TInWei,
          typename TAcc,
          typename TOut,
          ck::ActivTypeEnum_t activ_type,
          typename InLengths,
          typename WeiLengths,
          typename OutLengths,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads>
void device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1(
    const InLengths& in_n_c0_hi_wi_c1_lengths,
    const WeiLengths& wei_k_c0_y_x_c1_lengths,
    const OutLengths& out_n_k0_ho_wo_k1_lengths,
    const ConvStrides& conv_strides,
    const ConvDilations& conv_dilations,
    const InLeftPads& in_left_pads,
    const InRightPads& in_right_pads,
    const Tensor<TInWei>& in_n_c0_hi_wi_c1,
    const Tensor<TInWei>& wei_k_c0_y_x_c1,
    const Tensor<TOut>& bias_k0_k1,
    Tensor<TOut>& out_n_k0_ho_wo_k1,
    ck::index_t nrepeat)
{
    using namespace ck;

    std::cout << __func__ << std::endl;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
    constexpr auto I4 = Number<4>{};

    const auto N  = out_n_k0_ho_wo_k1_lengths[I0];
    const auto K0 = out_n_k0_ho_wo_k1_lengths[I1];
    const auto Ho = out_n_k0_ho_wo_k1_lengths[I2];
    const auto Wo = out_n_k0_ho_wo_k1_lengths[I3];
    const auto K1 = out_n_k0_ho_wo_k1_lengths[I4];

    const auto C0 = in_n_c0_hi_wi_c1_lengths[I1];
    const auto Hi = in_n_c0_hi_wi_c1_lengths[I2];
    const auto Wi = in_n_c0_hi_wi_c1_lengths[I3];
    const auto C1 = in_n_c0_hi_wi_c1_lengths[I4];

    const auto K = wei_k_c0_y_x_c1_lengths[I0];
    const auto Y = wei_k_c0_y_x_c1_lengths[I2];
    const auto X = wei_k_c0_y_x_c1_lengths[I3];

    DeviceMem in_n_c0_hi_wi_c1_device_buf(sizeof(TInWei) * in_n_c0_hi_wi_c1.mDesc.GetElementSpace());
    DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) * wei_k_c0_y_x_c1.mDesc.GetElementSpace());
    DeviceMem bias_k0_k1_device_buf(sizeof(TOut) * bias_k0_k1.mDesc.GetElementSpace());
    DeviceMem out_n_k0_ho_wo_k1_device_buf(sizeof(TOut) * out_n_k0_ho_wo_k1.mDesc.GetElementSpace());

    in_n_c0_hi_wi_c1_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data());
    wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data());
    bias_k0_k1_device_buf.ToDevice(bias_k0_k1.mData.data());

    constexpr index_t InWeiVectorSize = 8;

    if(C1 % InWeiVectorSize != 0)
    {
        throw std::runtime_error("wrong! C1 cannot be divided by InWeiVectorSize");
    }

#if 0
    constexpr index_t BlockSize = 256;

    constexpr index_t KPerBlock  = 32;
    constexpr index_t HoPerBlock = 8;
    constexpr index_t WoPerBlock = 64;

    constexpr index_t E1         = C0 * 9;
    constexpr index_t E2         = 1;
    constexpr index_t E1PerBlock = C0;

    constexpr index_t KPerThread  = 16;
    constexpr index_t HoPerThread = 2;
    constexpr index_t WoPerThread = 2;
    constexpr index_t EPerThread  = 1;

    using ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2   = Sequence<1, 9, 1, E2>;
    using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 = Sequence<1, E1PerBlock, KPerBlock, 1>;

    constexpr index_t ABlockTransferSrcScalarPerVector_E2  = E2;
    constexpr index_t ABlockTransferDstScalarPerVector_E2  = E2;
    constexpr index_t BThreadTransferSrcScalarPerVector_E2 = E2;
    constexpr index_t CThreadTransferDstScalarPerVector_K  = K1;
#elif 1
    constexpr index_t BlockSize = 64;

    constexpr index_t KPerBlock  = 8;
    constexpr index_t HoPerBlock = 8;
    constexpr index_t WoPerBlock = 32;

    constexpr index_t E1         = 2 * 9;
    constexpr index_t E2         = 1;
    constexpr index_t K2         = 2;
    constexpr index_t E1PerBlock = 2;

    constexpr index_t KPerThread  = KPerBlock;
    constexpr index_t HoPerThread = 2;
    constexpr index_t WoPerThread = 2;
    constexpr index_t EPerThread  = 1;

    using ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2   = Sequence<1, 9, 1, 1, E2>;
    using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 = Sequence<1, E1PerBlock, 1, KPerBlock, 1>;

    constexpr index_t ABlockTransferSrcScalarPerVector_E2  = E2;
    constexpr index_t ABlockTransferDstScalarPerVector_E2  = E2;
    constexpr index_t BThreadTransferSrcScalarPerVector_E2 = E2;
    constexpr index_t CThreadTransferDstScalarPerVector_K  = InWeiVectorSize;
#endif

    if(KPerThread % InWeiVectorSize != 0)
    {
        throw std::runtime_error("wrong! KPerThread cannot be divided by InWeiVectorSize");
    }

    const auto in_n_c0_hi_wi_c1_desc =
        make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi, E2));
    const auto wei_k_c0_y_x_c1_desc =
        make_naive_tensor_descriptor_packed(make_tuple(K, C0, Y, X, E2));
    const auto out_n_k0_ho_wo_k1_desc =
        make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1));

    constexpr auto conv_driver =
        DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_outpad<
            BlockSize,
            typename vector_type<TInWei, InWeiVectorSize>::type,
            TAcc,
            TOut,
            E1,
            E2,
            K2,
            KPerBlock,
            HoPerBlock,
            WoPerBlock,
            E1PerBlock,
            KPerThread,
            HoPerThread,
            WoPerThread,
            EPerThread,
            ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
            ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
            ABlockTransferSrcScalarPerVector_E2,
            ABlockTransferDstScalarPerVector_E2,
            BThreadTransferSrcScalarPerVector_E2,
            CThreadTransferDstScalarPerVector_K,
            activ_type>{};

    std::cerr << "conv_bias_activ_input_"
              << "n" << N << "c" << C0 << "h" << Hi << "w" << Wi << "c" << C1
              << "_filter_k" << K << "c" << C0 << "y" << Y << "x" << X << "c" << C1
              << "_convout_n" << N << "k" << K0 << "h" << Ho << "w" << Wo << "k" << K1
              << std::endl;

    for(int i = 0; i < 5; i++)
    {
        const auto ave_time =
            conv_driver.Run(wei_k_c0_y_x_c1_desc,
                            in_n_c0_hi_wi_c1_desc,
                            out_n_k0_ho_wo_k1_desc,
                            conv_strides,
                            conv_dilations,
                            in_left_pads,
                            in_right_pads,
                            static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
                                wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()),
                            static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
                                in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()),
                            static_cast<TOut*>(bias_k0_k1_device_buf.GetDeviceBuffer()),
                            static_cast<TOut*>(out_n_k0_ho_wo_k1_device_buf.GetDeviceBuffer()),
                            nrepeat);

        {
            float perf = static_cast<float>(std::size_t(2) * N * K * Ho * Wo * C0 * C1 * Y * X) /
                         (std::size_t(1000) * 1000 * 1000) / ave_time;

            std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
                      << std::endl;
        }
    }

    out_n_k0_ho_wo_k1_device_buf.FromDevice(out_n_k0_ho_wo_k1.mData.data());
}
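Both of the new drivers reinterpret the TInWei device buffers as typename vector_type<TInWei, InWeiVectorSize>::type* before launching the kernel, which is why they reject shapes whose innermost dimension does not pack into whole vectors of InWeiVectorSize elements. The sketch below only illustrates that constraint; the helper name, the include path, and the float/8 instantiation mentioned in the comment are assumptions, not code from this commit.

#include <stdexcept>
#include "data_type.hpp" // assumed include for ck::vector_type and ck::index_t

// Hypothetical helper: number of whole InWeiVectorSize-wide vectors in the
// innermost (fastest-varying) dimension; throws, like the drivers above, when
// the length does not divide evenly, e.g. check_vector_packing<float, 8>(C1).
template <typename TInWei, ck::index_t InWeiVectorSize>
ck::index_t check_vector_packing(ck::index_t innermost_length)
{
    using VecT = typename ck::vector_type<TInWei, InWeiVectorSize>::type;
    static_assert(sizeof(VecT) == sizeof(TInWei) * InWeiVectorSize,
                  "vector_type is expected to be a packed vector of TInWei");

    if(innermost_length % InWeiVectorSize != 0)
    {
        throw std::runtime_error("wrong! length cannot be divided by InWeiVectorSize");
    }

    return innermost_length / InWeiVectorSize;
}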
host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp
deleted
100644 → 0
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp"
#include "driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp"

template <typename TInWei,
          ck::index_t InWeiVectorSize,
          typename TAcc,
          typename TOut,
          typename InLengths,
          typename WeiLengths,
          typename OutLengths,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads>
void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw(
    const InLengths& in_n_c_hi_wi_lengths,
    const WeiLengths& wei_k_c_y_x_lengths,
    const OutLengths& out_n_k_ho_wo_lengths,
    const ConvStrides& conv_strides,
    const ConvDilations& conv_dilations,
    const InLeftPads& in_left_pads,
    const InRightPads& in_right_pads,
    const Tensor<TInWei>& in_n_c_hi_wi,
    const Tensor<TInWei>& wei_k_c_y_x,
    Tensor<TOut>& out_n_k_ho_wo,
    ck::index_t /* nrepeat */)
{
    using namespace ck;

    std::cout << __func__ << std::endl;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    const auto N = out_n_k_ho_wo_lengths[I0];
    const auto K = out_n_k_ho_wo_lengths[I1];
    const auto C = wei_k_c_y_x_lengths[I1];

    const auto Hi = in_n_c_hi_wi_lengths[I2];
    const auto Wi = in_n_c_hi_wi_lengths[I3];

    const auto Ho = out_n_k_ho_wo_lengths[I2];
    const auto Wo = out_n_k_ho_wo_lengths[I3];

    const auto Y = wei_k_c_y_x_lengths[I2];
    const auto X = wei_k_c_y_x_lengths[I3];

    const auto C0 = C / Number<InWeiVectorSize>{};
    const auto C1 = Number<InWeiVectorSize>{};

    const auto K0 = K / Number<InWeiVectorSize>{};
    const auto K1 = Number<InWeiVectorSize>{};

    Tensor<TInWei> in_n_c0_hi_wi_c1(
        HostTensorDescriptor(std::initializer_list<index_t>{N, C0, Hi, Wi, C1}));
    Tensor<TInWei> wei_k_c0_y_x_c1(
        HostTensorDescriptor(std::initializer_list<index_t>{K, C0, Y, X, C1}));
    Tensor<TOut> out_n_k0_ho_wo_k1(
        HostTensorDescriptor(std::initializer_list<index_t>{N, K0, Ho, Wo, K1}));

    auto f_nchw2nc0hwc1 = [&](auto n, auto hi, auto wi, auto c) {
        in_n_c0_hi_wi_c1(n, c / InWeiVectorSize, hi, wi, c % InWeiVectorSize) =
            in_n_c_hi_wi(n, c, hi, wi);
    };

    auto f_kcyx2kc0yxc1 = [&](auto k, auto y, auto x, auto c) {
        wei_k_c0_y_x_c1(k, c / InWeiVectorSize, y, x, c % InWeiVectorSize) =
            wei_k_c_y_x(k, c, y, x);
    };

    make_ParallelTensorFunctor(f_nchw2nc0hwc1, N, Hi, Wi, C)();
    make_ParallelTensorFunctor(f_kcyx2kc0yxc1, K, Y, X, C)();

    DeviceMem in_n_c0_hi_wi_c1_device_buf(sizeof(TInWei) * in_n_c0_hi_wi_c1.mDesc.GetElementSpace());
    DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) * wei_k_c0_y_x_c1.mDesc.GetElementSpace());
    DeviceMem out_n_k0_ho_wo_k1_device_buf(sizeof(TOut) * out_n_k0_ho_wo_k1.mDesc.GetElementSpace());

    in_n_c0_hi_wi_c1_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data());
    wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data());

    const auto in_n_c0_hi_wi_desc = make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi));
    const auto wei_k_c0_y_x_desc  = make_naive_tensor_descriptor_packed(make_tuple(K, C0, Y, X));
    const auto out_n_k0_ho_wo_k1_desc =
        make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1));

#if 1
    // cdata = 64, BlockSize = 64, 16x8x32x4
    constexpr index_t BlockSize = 64;

    constexpr index_t KPerBlock  = 16;
    constexpr index_t HoPerBlock = 8;
    constexpr index_t WoPerBlock = 32;
    constexpr index_t EPerBlock  = 1;

    constexpr index_t KPerThread  = KPerBlock;
    constexpr index_t HoPerThread = 2;
    constexpr index_t WoPerThread = 2;
    constexpr index_t EPerThread  = EPerBlock;

    using ABlockTransferThreadSliceLengths_E_K   = Sequence<3, 1>;
    using ABlockTransferThreadClusterLengths_E_K = Sequence<3 * EPerBlock, KPerBlock>;

    constexpr index_t ABlockTransferSrcScalarPerVector_E   = 1;
    constexpr index_t ABlockTransferDstScalarPerVector_K   = 1;
    constexpr index_t BThreadTransferSrcScalarPerVector_W  = 1;
    constexpr index_t CThreadTransferDstScalarPerVector_W  = 16;

    static_assert(KPerThread % CThreadTransferDstScalarPerVector_W == 0, "");
#else
    constexpr index_t BlockSize = 64;

    constexpr index_t KPerBlock  = 16;
    constexpr index_t HoPerBlock = 8;
    constexpr index_t WoPerBlock = 32;
    constexpr index_t EPerBlock  = 1;

    constexpr index_t KPerThread  = 16;
    constexpr index_t HoPerThread = 2;
    constexpr index_t WoPerThread = 2;
    constexpr index_t EPerThread  = EPerBlock;

    using ABlockTransferThreadSliceLengths_E_K   = Sequence<9, 1>;
    using ABlockTransferThreadClusterLengths_E_K = Sequence<EPerBlock, 16>;

    constexpr index_t ABlockTransferSrcScalarPerVector_E   = 1;
    constexpr index_t ABlockTransferDstScalarPerVector_K   = 1;
    constexpr index_t BThreadTransferSrcScalarPerVector_W  = 1;
    constexpr index_t CThreadTransferDstScalarPerVector_W  = K1;

    static_assert(KPerThread % CThreadTransferDstScalarPerVector_W == 0, "");
#endif

    constexpr auto conv_driver =
#if 0
        DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_pad
#else
        DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outpad
#endif
        <BlockSize,
         typename vector_type<TInWei, InWeiVectorSize>::type,
         TAcc,
         TOut,
         KPerBlock,
         HoPerBlock,
         WoPerBlock,
         EPerBlock,
         KPerThread,
         HoPerThread,
         WoPerThread,
         EPerThread,
         ABlockTransferThreadSliceLengths_E_K,
         ABlockTransferThreadClusterLengths_E_K,
         ABlockTransferSrcScalarPerVector_E,
         ABlockTransferDstScalarPerVector_K,
         BThreadTransferSrcScalarPerVector_W,
         CThreadTransferDstScalarPerVector_W>{};

    conv_driver.Run(wei_k_c0_y_x_desc,
                    in_n_c0_hi_wi_desc,
                    out_n_k0_ho_wo_k1_desc,
                    conv_strides,
                    conv_dilations,
                    in_left_pads,
                    in_right_pads,
                    static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
                        wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()),
                    static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
                        in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()),
                    static_cast<TOut*>(out_n_k0_ho_wo_k1_device_buf.GetDeviceBuffer()));

    out_n_k0_ho_wo_k1_device_buf.FromDevice(out_n_k0_ho_wo_k1.mData.data());

    auto f_nk0hwk1_to_nkhw = [&](auto n, auto k, auto ho, auto wo) {
        out_n_k_ho_wo(n, k, ho, wo) =
            out_n_k0_ho_wo_k1(n, k / InWeiVectorSize, ho, wo, k % InWeiVectorSize);
    };

    make_ParallelTensorFunctor(f_nk0hwk1_to_nkhw, N, K, Ho, Wo)();
}