yangql / composable_kernel-1 / Commits / 33d1e0e2

Commit 33d1e0e2 authored Jun 17, 2019 by Chao Liu

    refactoring for miopen

parent b1cb48a0

Changes: 33
Showing 20 changed files with 122 additions and 150 deletions
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp  +1 -1
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp  +1 -1
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp  +1 -1
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp  +1 -1
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp  +1 -1
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp  +1 -1
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp  +3 -3
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp  +1 -1
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp  +1 -1
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp  +1 -1
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp  +7 -18
composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp  +4 -7
composable_kernel/include/tensor_operation/blockwise_2d_tensor_op.hpp  +4 -4
composable_kernel/include/tensor_operation/blockwise_3d_tensor_op.hpp  +1 -1
composable_kernel/include/tensor_operation/blockwise_4d_tensor_op.hpp  +1 -1
composable_kernel/include/tensor_operation/blockwise_batched_gemm.hpp  +19 -19
composable_kernel/include/tensor_operation/blockwise_gemm.hpp  +32 -16
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp  +35 -64
composable_kernel/include/tensor_operation/blockwise_tensor_slice_copy.hpp  +3 -4
composable_kernel/include/tensor_operation/threadwise_4d_tensor_op.hpp  +4 -4
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp

@@ -264,7 +264,7 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
 #if 1
             blockwise_batch_gemm.Run
 #else
-            blockwise_batch_gemm.Run_asm
+            blockwise_batch_gemm.Run_amd_asm
 #endif
             (p_wei_block + wei_c_y_x_k_block_desc.GetOffsetFromMultiIndex(0, y, x, 0),
              p_in_block + in_c_h_w_n_block_desc.GetOffsetFromMultiIndex(0, y, x, 0),
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp

@@ -191,7 +191,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
 #if 0
             return blockwise_batch_gemm.Run(Xs...);
 #elif 0
-            return blockwise_batch_gemm.Run_asm(Xs...);
+            return blockwise_batch_gemm.Run_amd_asm(Xs...);
 #else
             return blockwise_batch_gemm.Run_asm_v2(Xs...);
 #endif
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp

@@ -206,7 +206,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
 #if 1
             return blockwise_batch_gemm.Run(Xs...);
 #elif 0
-            return blockwise_batch_gemm.Run_asm(Xs...);
+            return blockwise_batch_gemm.Run_amd_asm(Xs...);
 #else
             return blockwise_batch_gemm.Run_asm_v2(Xs...);
 #endif
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp

@@ -201,7 +201,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
 #if 1
             return blockwise_batch_gemm.Run(Xs...);
 #elif 0
-            return blockwise_batch_gemm.Run_asm(Xs...);
+            return blockwise_batch_gemm.Run_amd_asm(Xs...);
 #else
             return blockwise_batch_gemm.Run_asm_v2(Xs...);
 #endif
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp

@@ -203,7 +203,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer
 #if 1
             return blockwise_batch_gemm.Run(Xs...);
 #elif 0
-            return blockwise_batch_gemm.Run_asm(Xs...);
+            return blockwise_batch_gemm.Run_amd_asm(Xs...);
 #else
             return blockwise_batch_gemm.Run_asm_v2(Xs...);
 #endif
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp

@@ -238,7 +238,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
 #elif 0
             blockwise_gemm.Run_RegisterDoubleBuffer
 #elif 1
-            blockwise_gemm.Run_asm
+            blockwise_gemm.Run_amd_asm
 #endif
             (p_wei_block + wei_cyxk_block_desc.GetOffsetFromMultiIndex(0, y, x, 0),
              p_in_block + y * Wi + x,
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp

@@ -271,7 +271,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
 #elif 0
             blockwise_gemm.Run_RegisterDoubleBuffer
 #elif 0
-            blockwise_gemm.Run_asm
+            blockwise_gemm.Run_amd_asm
 #endif
             (p_wei_block_now + wei_cyxk_block_desc.GetOffsetFromMultiIndex(0, y, x, 0),

@@ -313,7 +313,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
 #elif 0
             blockwise_gemm.Run_RegisterDoubleBuffer
 #elif 0
-            blockwise_gemm.Run_asm
+            blockwise_gemm.Run_amd_asm
 #endif
             (p_wei_block_double + wei_cyxk_block_desc.GetOffsetFromMultiIndex(0, y, x, 0),

@@ -339,7 +339,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
 #elif 0
             blockwise_gemm.Run_RegisterDoubleBuffer
 #elif 0
-            blockwise_gemm.Run_asm
+            blockwise_gemm.Run_amd_asm
 #endif
             (p_wei_block_double + wei_block_space +
                  wei_cyxk_block_desc.GetOffsetFromMultiIndex(0, y, x, 0),
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp

@@ -217,7 +217,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
 #if 1
             return blockwise_gemm.Run(Xs...);
 #else
-            return blockwise_gemm.Run_asm(Xs...);
+            return blockwise_gemm.Run_amd_asm(Xs...);
 #endif
         };
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp

@@ -214,7 +214,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw_lds_double_buffer
 #if 1
             return blockwise_gemm.Run(Xs...);
 #else
-            return blockwise_gemm.Run_asm(Xs...);
+            return blockwise_gemm.Run_amd_asm(Xs...);
 #endif
         };
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp

@@ -247,7 +247,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
 #if 1
             return blockwise_gemm.Run(Xs...);
 #else
-            return blockwise_gemm.Run_asm(Xs...);
+            return blockwise_gemm.Run_amd_asm(Xs...);
 #endif
         };
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp

@@ -9,10 +9,6 @@
 #include "blockwise_gemm.hpp"
 #include "threadwise_generic_tensor_slice_copy.hpp"
 
-#ifndef CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM
-#define CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM 1
-#endif
-
 namespace ck {
 
 // define B = merge(N0, Ho, Wo)

@@ -239,15 +235,6 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer
                 GemmDataPerReadA,
                 GemmDataPerReadB>{};
 
-        // choose GEMM implementation here
-        const auto run_blockwise_gemm = [&](auto... Xs) {
-#if CK_USE_AMD_INLINE_ASM && CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM
-            return blockwise_gemm.Run_asm(Xs...);
-#else
-            return blockwise_gemm.Run(Xs...);
-#endif
-        };
-
         // LDS allocation for input and weight: be careful of alignment
         constexpr index_t max_align = math::lcm(InBlockCopyDstDataPerWrite_N2,
                                                 WeiBlockCopyDstDataPerWrite_K,

@@ -255,9 +242,11 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer
                                                 GemmDataPerReadB);
 
-        constexpr index_t in_block_space =
-            in_e_n1_b_n2_block_desc.GetElementSpace(Number<max_align>{});
+        constexpr index_t in_block_space =
+            math::integer_divide_ceil(in_e_n1_b_n2_block_desc.GetElementSpace(), max_align) * max_align;
 
-        constexpr index_t wei_block_space =
-            wei_e_k_block_desc.GetElementSpace(Number<max_align>{});
+        constexpr index_t wei_block_space =
+            math::integer_divide_ceil(wei_e_k_block_desc.GetElementSpace(), max_align) * max_align;
 
         __shared__ Float p_in_block_double[2 * in_block_space];
         __shared__ Float p_wei_block_double[2 * wei_block_space];

@@ -309,7 +298,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer
                                                         p_wei_register_clipboard);
 
             // LDS double buffer: GEMM on current data
-            run_blockwise_gemm(p_wei_block_now, p_in_block_now, p_out_thread);
+            blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread);
 
             // LDS double buffer: store next data to LDS
             blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,

@@ -336,7 +325,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer
                                                         p_wei_register_clipboard);
 
             // LDS double buffer: GEMM on current data
-            run_blockwise_gemm(p_wei_block_double, p_in_block_double, p_out_thread);
+            blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread);
 
             // LDS double buffer: store next data to LDS
             blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,

@@ -348,7 +337,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer
             __syncthreads();
 
             // LDS double buffer: GEMM on current data
-            run_blockwise_gemm(p_wei_block_double + wei_block_space,
+            blockwise_gemm.Run(p_wei_block_double + wei_block_space,
                                p_in_block_double + in_block_space,
                                p_out_thread);
         }
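Note on the in_block_space / wei_block_space change above: alignment rounding now happens at the call site instead of inside GetElementSpace(). Below is a minimal standalone sketch of that rounding; integer_divide_ceil here is a local stand-in for ck::math::integer_divide_ceil, and the numbers are illustrative, not taken from the commit.

#include <cstddef>
#include <cstdio>

// Stand-in for ck::math::integer_divide_ceil: smallest q with q * divisor >= dividend.
constexpr std::size_t integer_divide_ceil(std::size_t dividend, std::size_t divisor)
{
    return (dividend + divisor - 1) / divisor;
}

// Round a raw element count up to a multiple of `align`, as the kernel now does
// before sizing its double-buffered LDS arrays.
constexpr std::size_t aligned_space(std::size_t element_space, std::size_t align)
{
    return integer_divide_ceil(element_space, align) * align;
}

int main()
{
    // e.g. a block descriptor occupying 1000 elements, padded to a float4-friendly multiple of 4
    static_assert(aligned_space(1000, 4) == 1000, "already aligned");
    static_assert(aligned_space(1001, 4) == 1004, "rounded up to the next multiple");
    std::printf("%zu\n", aligned_space(1001, 4));
}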
composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp

@@ -107,15 +107,12 @@ struct ConstantTensorDescriptor
         return accumulate_on_sequence(Lengths{}, math::multiplies<index_t>{}, Number<1>{});
     }
 
-    template <class Align = Number<1>>
-    __host__ __device__ static constexpr index_t GetElementSpace(Align align = Align{})
+    __host__ __device__ static constexpr index_t GetElementSpace()
     {
-        // This is WRONG! align shouldbe applied to the last memory rank, not the last tensor
-        // dimension
         constexpr index_t element_space_unaligned = accumulate_on_sequence(
             (GetLengths() - Number<1>{}) * GetStrides(), math::plus<index_t>{}, Number<1>{});
 
-        return align.Get() * ((element_space_unaligned + align.Get() - 1) / align.Get());
+        return element_space_unaligned;
     }
 
     // emulate constexpr lambda

@@ -234,7 +231,7 @@ struct ConstantTensorDescriptor
         // do carry check in reversed order, starting from lowest dimension
         // don't check the highest dimension
         static_for<0, nDim, 1>{}([&](auto IDimReverse) {
-            constexpr index_t idim = nDim - 1 - IDimReverse.Get();
+            constexpr index_t idim = nDim - 1 - IDimReverse;
             constexpr auto IDim = Number<idim>{};
 
             if(carry)

@@ -260,7 +257,7 @@ struct ConstantTensorDescriptor
         // do borrow check in reversed order, starting from lowest dimension
         // don't check the highest dimension
        static_for<0, nDim, 1>{}([&](auto IDimReverse) {
-            constexpr index_t idim = nDim - 1 - IDimReverse.Get();
+            constexpr index_t idim = nDim - 1 - IDimReverse;
             constexpr auto IDim = Number<idim>{};
 
             if(borrow)
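After this change GetElementSpace() returns only the unaligned element space, i.e. the offset of the last element plus one: 1 + sum over dimensions of (length_i - 1) * stride_i. Here is a small sketch of that formula using plain arrays rather than the header's Sequence / accumulate_on_sequence machinery; it is illustrative only.

#include <array>
#include <cstddef>

// Unaligned element space of a tensor given per-dimension lengths and strides:
// the offset of the last element plus one.
template <std::size_t NDim>
constexpr std::size_t element_space_unaligned(const std::array<std::size_t, NDim>& lengths,
                                              const std::array<std::size_t, NDim>& strides)
{
    std::size_t space = 1;
    for(std::size_t i = 0; i < NDim; ++i)
        space += (lengths[i] - 1) * strides[i];
    return space;
}

int main()
{
    // A packed 2x3x4 tensor has strides {12, 4, 1}, so its element space is exactly 24.
    constexpr std::array<std::size_t, 3> lengths{{2, 3, 4}};
    constexpr std::array<std::size_t, 3> strides{{12, 4, 1}};
    static_assert(element_space_unaligned(lengths, strides) == 24, "packed case");
}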
composable_kernel/include/tensor_operation/blockwise_2d_tensor_op.hpp

@@ -286,7 +286,7 @@ struct Blockwise2dTensorCopy2
     __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
     {
-        static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
+        static_assert(is_same<Float, float>{}, "wrong! only support float!\n");
 
         using Float4 = float4;
         using Float2 = float2;

@@ -565,7 +565,7 @@ struct Blockwise2dTensorCopy3
     __device__ constexpr index_t GetRegisterClipboardSize() const
     {
-        static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
+        static_assert(is_same<Float, float>{}, "wrong! only support float!\n");
 
         constexpr auto I0 = Number<0>{};
         constexpr auto I1 = Number<1>{};

@@ -714,7 +714,7 @@ struct Blockwise2dTensorCopy3
                 *(reinterpret_cast<const vector_t*>(&p_src[mSrcMyThreadOffset +
                 iloop * src_loop_stride]));
 #else
-            static_assert(is_same<float, Float>::value && DataPerRead == 4,
+            static_assert(is_same<float, Float>{} && DataPerRead == 4,
                           "global_load is only for float4");
 
             global_load(reinterpret_cast<vector_t&>(p_clipboard[iloop * DataPerRead]),

@@ -773,7 +773,7 @@ struct Blockwise2dTensorCopy3
             *(reinterpret_cast<vector_t*>(&p_dst[mDstMyThreadOffset + iloop * dst_loop_stride]) =
                 *(reinterpret_cast<const vector_t*>(&p_clipboard[iloop * DataPerRead]);
 #else
-            static_assert(is_same<float, Float>::value && DataPerRead == 4,
+            static_assert(is_same<float, Float>{} && DataPerRead == 4,
                           "ds_write_b128 is only for float4");
 
             ds_write_b128(reinterpret_cast<const vector_t&>(p_clipboard[iloop * DataPerRead]),
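The ::value to {} rewrite that recurs throughout this commit relies on the trait being an integral_constant-style type whose value-initialized instance converts to bool in a constant expression. A minimal sketch of the equivalence, shown with std::is_same rather than ck's own is_same:

#include <type_traits>

template <class Float>
void check()
{
    // Both spellings are usable in static_assert:
    //  - ::value reads the static data member,
    //  - {} value-initializes the trait and uses its constexpr operator bool().
    static_assert(std::is_same<Float, float>::value, "wrong! only support float!");
    static_assert(std::is_same<Float, float>{}, "wrong! only support float!");
}

int main()
{
    check<float>(); // compiles; check<double>() would trip both asserts
}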
composable_kernel/include/tensor_operation/blockwise_3d_tensor_op.hpp

@@ -239,7 +239,7 @@ struct Blockwise3dTensorCopy3
     __device__ static constexpr index_t GetRegisterClipboardSize()
     {
-        static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
+        static_assert(is_same<Float, float>{}, "wrong! only support float!\n");
 
         constexpr auto I0 = Number<0>{};
         constexpr auto I1 = Number<1>{};
composable_kernel/include/tensor_operation/blockwise_4d_tensor_op.hpp

@@ -598,7 +598,7 @@ struct Blockwise4dTensorCopy3
     __device__ constexpr index_t GetRegisterClipboardSize() const
     {
-        static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
+        static_assert(is_same<Float, float>{}, "wrong! only support float!\n");
 
         constexpr auto I0 = Number<0>{};
         constexpr auto I1 = Number<1>{};
composable_kernel/include/tensor_operation/blockwise_batched_gemm.hpp

@@ -295,9 +295,9 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
 #if CK_USE_AMD_INLINE_ASM
     template <class FloatA, class FloatB, class FloatC>
-    __device__ void Run_asm(const FloatA* __restrict__ p_a_block,
-                            const FloatB* __restrict__ p_b_block,
-                            FloatC* __restrict__ p_c_thread) const
+    __device__ void Run_amd_asm(const FloatA* __restrict__ p_a_block,
+                                const FloatB* __restrict__ p_b_block,
+                                FloatC* __restrict__ p_c_thread) const
     {
         constexpr auto a_block_mtx = BlockMatrixA{};
         constexpr auto b_block_mtx = BlockMatrixB{};

@@ -330,19 +330,19 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
         constexpr index_t NPerLevel1Cluster = NPerThreadSubC * NLevel0Cluster * NLevel1Cluster;
 
         // assertion for inline asm
-        static_assert(is_same<FloatA, float>::value && is_same<FloatB, float>::value &&
-                          is_same<FloatC, float>::value,
-                      "Run_asm only deal with float\n");
+        static_assert(is_same<FloatA, float>{} && is_same<FloatB, float>{} &&
+                          is_same<FloatC, float>{},
+                      "Run_amd_asm only deal with float\n");
 
         static_assert(MPerThreadSubC == 4 && NPerThreadSubC == 4 && KPerThreadLoop == 1 &&
                           MPerThread == 8 && NPerThread == 8,
-                      "Run_asm cannot deal with this GEMM shape yet\n");
+                      "Run_amd_asm cannot deal with this GEMM shape yet\n");
 
-        static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_asm only do float4 read\n");
+        static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_amd_asm only do float4 read\n");
 
-        static_assert(BlockMatrixStrideA == 0 && BatchPerThread == 1,
-                      "Run_asm can only deal with BlockMatrixStrideA == 0 && BatchPerThread == 1 for now\n");
+        static_assert(BlockMatrixStrideA == 0 && BatchPerThread == 1,
+                      "Run_amd_asm can only deal with BlockMatrixStrideA == 0 && BatchPerThread == "
+                      "1 for now\n");
 
         using Float4 = vector_type<float, 4>::MemoryType;

@@ -421,19 +421,19 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
         constexpr index_t NPerLevel1Cluster = NPerThreadSubC * NLevel0Cluster * NLevel1Cluster;
 
         // assertion for inline asm
-        static_assert(is_same<FloatA, float>::value && is_same<FloatB, float>::value &&
-                          is_same<FloatC, float>::value,
-                      "Run_asm only deal with float\n");
+        static_assert(is_same<FloatA, float>{} && is_same<FloatB, float>{} &&
+                          is_same<FloatC, float>{},
+                      "Run_amd_asm only deal with float\n");
 
         static_assert(MPerThreadSubC == 4 && NPerThreadSubC == 4 && KPerThreadLoop == 1 &&
                           MPerThread == 8 && NPerThread == 8,
-                      "Run_asm cannot deal with this GEMM shape yet\n");
+                      "Run_amd_asm cannot deal with this GEMM shape yet\n");
 
-        static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_asm only do float4 read\n");
+        static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_amd_asm only do float4 read\n");
 
-        static_assert(BlockMatrixStrideA == 0 && BatchPerThread == 1,
-                      "Run_asm can only deal with BlockMatrixStrideA == 0 && BatchPerThread == 1 for now\n");
+        static_assert(BlockMatrixStrideA == 0 && BatchPerThread == 1,
+                      "Run_amd_asm can only deal with BlockMatrixStrideA == 0 && BatchPerThread == "
+                      "1 for now\n");
 
         using Float4 = vector_type<float, 4>::MemoryType;
composable_kernel/include/tensor_operation/blockwise_gemm.hpp

@@ -5,6 +5,10 @@
 #include "ConstantMatrixDescriptor.hpp"
 #include "threadwise_gemm.hpp"
 
+#ifndef CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM
+#define CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM 1
+#endif
+
 namespace ck {
 
 // if following number are power of 2, index calculation shall be greatly reduced:

@@ -51,7 +55,8 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
                 N % (NPerThreadSubC * NLevel0Cluster * NLevel1Cluster) == 0,
             "wrong! Cannot evenly divide work among\n");
 
-        static_assert(is_same_type(ThreadMatrixC::GetLengths(), GetThreadMatrixCLengths()),
+        static_assert(std::is_same<decltype(ThreadMatrixC::GetLengths()),
+                                   decltype(GetThreadMatrixCLengths())>{},
                       "wrong! ThreadMatrixC lengths is wrong");
 
         auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id());

@@ -115,11 +120,10 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
     }
 
 #if CK_USE_AMD_INLINE_ASM
-    // TODO: this is not working correctly
     template <class FloatA, class FloatB, class FloatC>
-    __device__ void Run_asm(const FloatA* __restrict__ p_a_block,
-                            const FloatB* __restrict__ p_b_block,
-                            FloatC* __restrict__ p_c_thread) const
+    __device__ void Run_amd_asm(const FloatA* __restrict__ p_a_block,
+                                const FloatB* __restrict__ p_b_block,
+                                FloatC* __restrict__ p_c_thread) const
     {
         constexpr auto True  = integral_constant<bool, true>{};
         constexpr auto False = integral_constant<bool, false>{};

@@ -156,15 +160,15 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
         constexpr index_t NPerLevel1Cluster = NPerThreadSubC * NLevel0Cluster * NLevel1Cluster;
 
         // assertion for inline asm
-        static_assert(is_same<FloatA, float>::value && is_same<FloatB, float>::value &&
-                          is_same<FloatC, float>::value,
-                      "Run_asm only deal with float\n");
+        static_assert(is_same<FloatA, float>{} && is_same<FloatB, float>{} &&
+                          is_same<FloatC, float>{},
+                      "Run_amd_asm only deal with float");
 
         static_assert(MPerThreadSubC == 4 && NPerThreadSubC == 4 && KPerThreadLoop == 1 &&
                           MPerThread == 8 && NPerThread == 8,
-                      "Run_asm cannot deal with this GEMM shape yet\n");
+                      "Run_amd_asm cannot deal with this GEMM shape yet");
 
-        static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_asm only do float4 read\n");
+        static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_amd_asm only do float4 read");
 
         using Float4 = vector_type<float, 4>::MemoryType;

@@ -200,9 +204,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
 #endif
 
     template <class FloatA, class FloatB, class FloatC>
-    __device__ void Run(const FloatA* const __restrict__ p_a_block,
-                        const FloatB* const __restrict__ p_b_block,
-                        FloatC* const __restrict__ p_c_thread) const
+    __device__ void Run_source(const FloatA* const __restrict__ p_a_block,
+                               const FloatB* const __restrict__ p_b_block,
+                               FloatC* const __restrict__ p_c_thread) const
     {
         constexpr auto True  = integral_constant<bool, true>{};
         constexpr auto False = integral_constant<bool, false>{};

@@ -291,9 +295,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
     }
 
     template <class FloatA, class FloatB, class FloatC>
-    __device__ void Run_RegisterDoubleBuffer(FloatA* const p_a_block,
-                                             FloatB* const p_b_block,
-                                             FloatC* p_c_thread) const
+    __device__ void RunRegisterDoubleBuffer_source(FloatA* const p_a_block,
+                                                   FloatB* const p_b_block,
+                                                   FloatC* p_c_thread) const
     {
         constexpr auto True  = integral_constant<bool, true>{};
         constexpr auto False = integral_constant<bool, false>{};

@@ -427,6 +431,18 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
                            p_c_thread);
         }
     }
+
+    template <class FloatA, class FloatB, class FloatC>
+    __device__ void Run(const FloatA* __restrict__ p_a_block,
+                        const FloatB* __restrict__ p_b_block,
+                        FloatC* __restrict__ p_c_thread) const
+    {
+#if CK_USE_AMD_INLINE_ASM && CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM
+        Run_amd_asm(p_a_block, p_b_block, p_c_thread);
+#else
+        Run_source(p_a_block, p_b_block, p_c_thread);
+#endif
+    }
 };
 
 } // namespace ck
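The last hunk above gives blockwise_gemm a single public Run() that forwards to either the inline-asm path or the generic C++ path, selected by CK_USE_AMD_INLINE_ASM && CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM, so callers no longer pick an implementation themselves. A compilable sketch of that dispatch pattern follows; the class, macro, and member names here are illustrative stand-ins, not the library's.

#ifndef USE_FAST_PATH
#define USE_FAST_PATH 1 // default-on, can be overridden per translation unit
#endif

#include <cstdio>

struct BlockwiseGemmSketch
{
    // Plain C++ fallback implementation.
    void Run_source(const float* a, const float* b, float* c) const
    {
        std::puts("generic path");
        c[0] = a[0] * b[0];
    }

    // Hand-tuned implementation (stands in for the inline-asm variant).
    void Run_fast(const float* a, const float* b, float* c) const
    {
        std::puts("tuned path");
        c[0] = a[0] * b[0];
    }

    // Callers only ever see Run(); the implementation choice is a compile-time switch.
    void Run(const float* a, const float* b, float* c) const
    {
#if USE_FAST_PATH
        Run_fast(a, b, c);
#else
        Run_source(a, b, c);
#endif
    }
};

int main()
{
    float a = 2.0f, b = 3.0f, c = 0.0f;
    BlockwiseGemmSketch{}.Run(&a, &b, &c);
    std::printf("%f\n", c);
}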
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp

@@ -6,6 +6,10 @@
 #include "ConstantMergedTensorDescriptor.hpp"
 #include "threadwise_generic_tensor_slice_copy.hpp"
 
+#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
+#endif
+
 namespace ck {
 
 // slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor

@@ -91,7 +95,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
         constexpr auto repeat_lengths = SliceLengths{} / data_per_cluster_per_dims;
 
-        // for now, only support SubLengths.Get() == 1 on a merged dimension that constains
+        // for now, only support SubLengths == 1 on a merged dimension that constains
         // multiple original dimensions
         static_for<0, nDim, 1>{}([&](auto IDim_) {
             constexpr auto IDim = decltype(IDim_){};

@@ -121,7 +125,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
         // partial offset on each dimension
         static_for<0, nDim, 1>{}([&](auto IDim_) {
             constexpr auto IDim = decltype(IDim_){};
-            constexpr index_t idim = IDim.Get();
+            constexpr index_t idim = IDim;
 
             constexpr auto src_partial_original_dims =
                 SrcDesc::GetContainedOriginalDimensions(IDim);

@@ -135,7 +139,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
         static_for<0, nDim, 1>{}([&](auto IDim_) {
             constexpr auto IDim = decltype(IDim_){};
-            constexpr index_t idim = IDim.Get();
+            constexpr index_t idim = IDim;
 
             constexpr auto dst_partial_original_dims =
                 DstDesc::GetContainedOriginalDimensions(IDim);

@@ -153,38 +157,6 @@ struct BlockwiseGenericTensorSliceCopy_v1
         mThreadDstOffset = accumulate_on_array(
             mThreadDstPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
-
-#if 0
-        if(get_block_1d_id() == 0)
-        {
-            printf("id %5u %5u: "
-                   "src_block_data_multi_id_begin: %u %u %u %u, "
-                   "thread_cluster_multi_id: %u %u %u %u, "
-                   "data_cluster_multi_id: %u %u %u %u, "
-                   "thread_data_multi_id_begin: %u %u %u %u, "
-                   "mThreadSrcOffset %u, mThreadDstOffset %u \n",
-                   get_block_1d_id(),
-                   get_thread_local_1d_id(),
-                   src_block_data_multi_id_begin[0],
-                   src_block_data_multi_id_begin[1],
-                   src_block_data_multi_id_begin[2],
-                   src_block_data_multi_id_begin[3],
-                   thread_cluster_multi_id[0],
-                   thread_cluster_multi_id[1],
-                   thread_cluster_multi_id[2],
-                   thread_cluster_multi_id[3],
-                   data_cluster_multi_id[0],
-                   data_cluster_multi_id[1],
-                   data_cluster_multi_id[2],
-                   data_cluster_multi_id[3],
-                   thread_data_multi_id_begin[0],
-                   thread_data_multi_id_begin[1],
-                   thread_data_multi_id_begin[2],
-                   thread_data_multi_id_begin[3],
-                   mThreadSrcOffset,
-                   mThreadDstOffset);
-        }
-#endif
     }
 
     __device__ static constexpr index_t GetRegisterClipboardSize()

@@ -210,19 +182,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
             make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths);
 
         static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
-#if 0
-            constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});
-
-            const auto src_thread_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;
-            const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
-
-            const index_t src_offset =
-                SrcDesc{}.GetOffsetFromMultiIndex(src_thread_data_multi_id_begin);
-            const index_t clipboard_offset =
-                thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
-#else
-            // HIP compiler performs better with these codes
+#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
             constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
 
             constexpr auto src_thread_data_multi_id_begin =

@@ -236,6 +196,18 @@ struct BlockwiseGenericTensorSliceCopy_v1
             constexpr index_t clipboard_offset =
                 thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
+#else
+            constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});
+
+            const auto src_thread_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;
+            const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
+
+            const index_t src_offset =
+                SrcDesc{}.GetOffsetFromMultiIndex(src_thread_data_multi_id_begin);
+            const index_t clipboard_offset =
+                thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
+#endif
 
             threadwise_generic_tensor_slice_copy_v1(SrcDesc{},

@@ -263,18 +235,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
             make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths);
 
         static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
-#if 0
-            constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});
-
-            const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
-            const auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;
-
-            const index_t clipboard_offset =
-                thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
-            const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id_begin);
-#else
-            // HIP compiler performs better with these codes
+#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
             constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
 
             constexpr auto clipboard_data_multi_id_begin =

@@ -287,6 +248,17 @@ struct BlockwiseGenericTensorSliceCopy_v1
             constexpr index_t dst_offset =
                 DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id_begin);
+#else
+            constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});
+
+            const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
+            const auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;
+
+            const index_t clipboard_offset =
+                thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
+            const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id_begin);
+#endif
 
             threadwise_generic_tensor_slice_copy_v1(thread_tensor_desc,

@@ -310,7 +282,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
     }
 
     // When moving the slicing windows along a merged dimension, if the strides of the
-    // contained (by the merged dimension) original dimensions are in descending order,
+    // contained (by the merged dimension) original dimensions are not in descending order,
     // then there is no guarantee that the new offset will be larger than the old offset
    // for movement in positive direction (vice versue for movement in negative direction).
    // As a result, there is the possiblity that the offset calculation may result in

@@ -323,7 +295,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
                                           Number<IDim_>,
                                           Number<StepSize>,
                                           integral_constant<bool, PositiveDirection> direction)
     {
         constexpr auto IDim = Number<IDim_>{};
-        constexpr index_t idim = IDim.Get();
+        constexpr index_t idim = IDim;
 
         static_if<SrcDesc::ContainMultipleOriginalDimensions(IDim)>{}([&](auto fwd) {
             // logic for a merged dimension, also works for non-merged dimension, but its logic may

@@ -350,8 +322,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
             constexpr auto I = decltype(I_){};
             constexpr index_t idim_original = src_partial_original_dims.Get(I);
 
-            mThreadSrcOriginalMultiId(idim_original) = new_src_partial_original_multi_id[I.Get()];
+            mThreadSrcOriginalMultiId(idim_original) = new_src_partial_original_multi_id[I];
         });
 
         // calculate new partial offset on this merged dimension
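In this file the old "#if 0 ... #else" toggle becomes a named, default-on feature macro, and the branch it selects keeps the per-repeat offsets constexpr, while the fallback computes the same arithmetic at run time ("HIP compiler performs better with these codes"). Below is a small, deliberately one-dimensional sketch of that compile-time versus run-time distinction; every name in it is illustrative and not taken from the library.

#include <cstddef>

// Compile-time variant: when the repeat index is a template parameter, the offsets
// fold into constants with no run-time arithmetic.
template <std::size_t RepeatId, std::size_t DataPerCluster, std::size_t SubLength>
struct RepeatOffsets
{
    static constexpr std::size_t src_offset       = RepeatId * DataPerCluster;
    static constexpr std::size_t clipboard_offset = RepeatId * SubLength;
};

// Run-time fallback: same arithmetic, evaluated per call.
inline std::size_t src_offset_runtime(std::size_t repeat_id, std::size_t data_per_cluster)
{
    return repeat_id * data_per_cluster;
}

static_assert(RepeatOffsets<3, 16, 4>::src_offset == 48, "folded at compile time");
static_assert(RepeatOffsets<3, 16, 4>::clipboard_offset == 12, "folded at compile time");

int main()
{
    // Both variants agree; the difference is only when the multiplication happens.
    return static_cast<int>(src_offset_runtime(3, 16)) - 48; // returns 0
}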
composable_kernel/include/tensor_operation/blockwise_tensor_slice_copy.hpp

@@ -49,7 +49,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
             make_ConstantTensorDescriptor_packed(thread_cluster_lengths);
 
         // sanity check: data type
-        static_assert(is_same<Float, float>::value, "wrong! only support float for now!\n");
+        static_assert(is_same<Float, float>{}, "wrong! only support float for now!\n");
 
         // sanity check: nDim
         static_assert(SrcDesc::GetNumOfDimension() == nDim &&

@@ -121,12 +121,11 @@ struct BlockwiseTensorSliceReorderCopy_v3
             reorder_array_given_old2new(thread_multi_id, map_thread_cluster_2_src_cluster);
 
         static_for<0, nDim, 1>{}([&](auto IDim) {
-            constexpr auto I = decltype(IDim){};
-            constexpr index_t i = I.Get();
+            constexpr index_t idim = IDim;
 
             // compiler: will it really compute index here, or be merged with
             // GetOffsetFromMultiIndex and
             // optimized away???
-            src_data_multi_id(i) *= src_sub_lengths.Get(I);
+            src_data_multi_id(idim) *= src_sub_lengths.Get(IDim);
         });
 
         // compiler: will it really compute index here, or be merged with GetOffsetFromMultiIndex
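The IDim.Get() to IDim rewrite in this file (and elsewhere in the commit) assumes the Number-like index wrapper converts to index_t in a constant expression. Here is a minimal sketch of such a wrapper; it mirrors the std::integral_constant pattern and is not ck's actual Number definition, and the index_t alias below is only for illustration.

#include <cstddef>

using index_t = std::size_t; // illustrative alias, not ck's index_t

template <index_t N>
struct Number
{
    static constexpr index_t mValue = N;
    constexpr index_t Get() const { return N; }
    constexpr operator index_t() const { return N; } // enables `constexpr index_t idim = IDim;`
};

int main()
{
    constexpr auto IDim = Number<2>{};
    constexpr index_t via_get        = IDim.Get(); // old spelling
    constexpr index_t via_conversion = IDim;       // new spelling
    static_assert(via_get == via_conversion, "both spellings yield the same value");
}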
composable_kernel/include/tensor_operation/threadwise_4d_tensor_op.hpp

@@ -26,16 +26,16 @@ __device__ void threadwise_4d_tensor_shift_down(Desc, Float* __restrict__ p, IDi
     constexpr index_t nshift = NShift::mValue;
 
     constexpr index_t did0_end =
-        is_same<decltype(I0), IDim>::value ? desc.GetLength(I0) - nshift : desc.GetLength(I0);
+        is_same<decltype(I0), IDim>{} ? desc.GetLength(I0) - nshift : desc.GetLength(I0);
 
     constexpr index_t did1_end =
-        is_same<decltype(I1), IDim>::value ? desc.GetLength(I1) - nshift : desc.GetLength(I1);
+        is_same<decltype(I1), IDim>{} ? desc.GetLength(I1) - nshift : desc.GetLength(I1);
 
     constexpr index_t did2_end =
-        is_same<decltype(I2), IDim>::value ? desc.GetLength(I2) - nshift : desc.GetLength(I2);
+        is_same<decltype(I2), IDim>{} ? desc.GetLength(I2) - nshift : desc.GetLength(I2);
 
     constexpr index_t did3_end =
-        is_same<decltype(I3), IDim>::value ? desc.GetLength(I3) - nshift : desc.GetLength(I3);
+        is_same<decltype(I3), IDim>{} ? desc.GetLength(I3) - nshift : desc.GetLength(I3);
 
     for(index_t did0 = 0; did0 < did0_end; ++did0)
     {