gaoqiong / composable_kernel · Commits · e288457d

Unverified commit e288457d, authored Mar 30, 2023 by zjing14; committed via GitHub on Mar 30, 2023.

    Merge branch 'develop' into lib_gemm_softmax_gemm_type

Parents: 10b11916, fde6d274
Changes: 94 files in the full commit; this page shows 20 changed files with 287 additions and 445 deletions (+287, -445).
Files shown on this page:

  example/40_conv2d_fwd_quantization/CMakeLists.txt                                                  +5   -0
  example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp        +6   -2
  example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp          +7   -2
  example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp        +87  -0
  example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp          +85  -0
  example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp                  +5   -1
  example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp                    +6   -1
  example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp       +6   -2
  example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp         +7   -2
  example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp                 +5   -1
  example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp                   +6   -1
  example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perchannel_quantization_example.inc         +1   -2
  example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perlayer_quantization_example.inc           +1   -2
  example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc              +1   -2
  example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc                +1   -2
  include/ck/ck.hpp                                                                                  +1   -1
  include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp                      +1   -1
  include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp                   +54  -412
  include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp   +1   -10
  include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp    +1   -1
example/40_conv2d_fwd_quantization/CMakeLists.txt

@@ -14,3 +14,8 @@ add_example_executable(example_conv2d_fwd_xdl_bias_relu_perlayer_quantization_in
 add_example_executable(example_conv2d_fwd_dl_bias_relu_perchannel_quantization_int8 conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp)
 add_example_executable(example_conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8 conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp)
+
+# Conv + bias + tanh perlayer quantization
+add_example_executable(example_conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8 conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp)
+# Conv + bias + tanh perchannel quantization
+add_example_executable(example_conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8 conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp)
example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp

@@ -76,6 +76,10 @@ using DeviceGroupedConvNDFwdInstance =
     5,  // CThreadTransferSrcDstVectorDim
     4>; // CThreadTransferDstScalarPerVector

-#include "run_conv2d_fwd_bias_relu_perchannel_quantization_example.inc"
+#include "run_conv2d_fwd_bias_perchannel_quantization_example.inc"

-int main() { run_conv2d_fwd_bias_relu_perchannel_quantization_example(); };
+int main()
+{
+    const auto out_element_op = OutElementOp{ActivationOp{}};
+    run_conv2d_fwd_bias_perchannel_quantization_example(out_element_op);
+};
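Note: the change above is the pattern this commit applies to every example in 40_conv2d_fwd_quantization — the output elementwise operator (the quantization epilogue) is no longer hard-coded inside the shared run_*.inc runner; each example's main() now builds it and passes it in. A minimal sketch of the shape of that refactor (run_example and the struct bodies are hypothetical stand-ins; only the call shape follows the diff):

    // Sketch of the refactor pattern, with placeholder names.
    // The shared runner (in the *.inc file) is parameterized on the epilogue:
    int run_example(const OutElementOp& out_element_op);

    int main()
    {
        // Built by the caller instead of inside the runner:
        const auto out_element_op = OutElementOp{ActivationOp{}};
        return run_example(out_element_op);
    }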
example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp

@@ -74,6 +74,11 @@ using DeviceGroupedConvNDFwdInstance =
     5,  // CThreadTransferSrcDstVectorDim
     4>; // CThreadTransferDstScalarPerVector

-#include "run_conv2d_fwd_bias_relu_perlayer_quantization_example.inc"
+#include "run_conv2d_fwd_bias_perlayer_quantization_example.inc"

-int main() { run_conv2d_fwd_bias_relu_perlayer_quantization_example(); }
+int main()
+{
+    float requant_scale = 0.5f;
+    const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}};
+    run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op);
+}
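Note: in the per-layer variants the requantization scale is a single float for the whole tensor (0.5f here is only a test value; in practice it typically folds the input, weight, and output quantization scales). The relu example passes one scale; the tanh examples added below pass two. Reading the element-op name Add_Mul_Activation_Mul_Clamp literally, a plausible reference computation is the following sketch — inferred from the name, not copied from the CK source:

    // Hedged sketch of a per-layer int8 requantization epilogue (assumption).
    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    int8_t requant_perlayer(int32_t acc, int32_t bias, float scale_acc,
                            float scale_out, float (*activation)(float))
    {
        float y = static_cast<float>(acc + bias) * scale_acc; // Add, Mul
        y       = activation(y) * scale_out;                  // Activation, Mul
        y       = std::clamp(y, -128.0f, 127.0f);             // Clamp to int8 range
        return static_cast<int8_t>(std::round(y));
    }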
example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp (new file, 0 → 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include "common.hpp"

#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"

using InDataType           = int8_t;
using WeiDataType          = int8_t;
using BiasDataType         = int32_t;
using RequantScaleDataType = float;
using AccDataType          = int32_t;
using OutDataType          = int8_t;

template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using PassThrough  = ck::tensor_operation::element_wise::PassThrough;
using InElementOp  = PassThrough;
using WeiElementOp = PassThrough;
using ActivationOp = ck::tensor_operation::element_wise::TanH;
using OutElementOp = ck::tensor_operation::element_wise::Add_Mul2_Activation_Mul_Clamp<ActivationOp>;

static constexpr auto ConvSpec =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;

static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;

template <ck::index_t NDimSpatial,
          typename InLayout,
          typename WeiLayout,
          typename BiasLayout,
          typename RequantScaleLayout,
          typename OutLayout>
using DeviceGroupedConvNDFwdInstance =
    ck::tensor_operation::device::DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<
        NDimSpatial,
        InDataType,
        WeiDataType,
        ck::Tuple<BiasDataType, RequantScaleDataType>,
        OutDataType,
        AccDataType,
        InLayout,
        WeiLayout,
        ck::Tuple<BiasLayout, RequantScaleLayout>,
        OutLayout,
        InElementOp,
        WeiElementOp,
        OutElementOp,
        ConvSpec,            // ConvForwardSpecialization
        GemmSpec,            // GemmSpecialization
        256,                 // BlockSize
        128,                 // MPerBlock
        128,                 // NPerBlock
        16,                  // K0PerBlock
        4,                   // K1
        4,                   // M1PerThread
        4,                   // N1PerThread
        1,                   // KPerThread
        S<8, 2>,             // M1N1ThreadClusterM1Xs
        S<8, 2>,             // M1N1ThreadClusterN1Xs
        S<8, 1, 1, 4>,       // ABlockTransferThreadSliceLengths_K0_M0_M1_K1
        S<2, 1, 128, 1>,     // ABlockTransferThreadClusterLengths_K0_M0_M1_K1
        S<1, 2, 0, 3>,       // ABlockTransferThreadClusterArrangeOrder
        S<1, 2, 0, 3>,       // ABlockTransferSrcAccessOrder
        S<4, 1, 1, 4>,       // ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1
        S<1, 2, 0, 3>,       // ABlockTransferSrcVectorTensorContiguousDimOrder
        S<1, 1, 1, 4>,       // ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1
        S<8, 1, 1, 4>,       // BBlockTransferThreadSliceLengths_K0_N0_N1_K1
        S<2, 1, 128, 1>,     // BBlockTransferThreadClusterLengths_K0_N0_N1_K1
        S<1, 2, 0, 3>,       // BBlockTransferThreadClusterArrangeOrder
        S<1, 2, 0, 3>,       // BBlockTransferSrcAccessOrder
        S<4, 1, 1, 4>,       // BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1
        S<1, 2, 0, 3>,       // BBlockTransferSrcVectorTensorContiguousDimOrder
        S<1, 1, 1, 4>,       // BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1
        S<0, 1, 2, 3, 4, 5>, // CThreadTransferSrcDstAccessOrder
        5,                   // CThreadTransferSrcDstVectorDim
        4>;                  // CThreadTransferDstScalarPerVector

#include "run_conv2d_fwd_bias_perchannel_quantization_example.inc"

int main()
{
    float scale_z_inv = 0.5f;
    const auto out_element_op = OutElementOp{scale_z_inv, ActivationOp{}};
    run_conv2d_fwd_bias_perchannel_quantization_example(out_element_op);
};
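Note: the per-channel variant differs structurally from the per-layer one — the requantization scale is a per-output-channel tensor supplied as a second auxiliary input (ck::Tuple<BiasDataType, RequantScaleDataType> / ck::Tuple<BiasLayout, RequantScaleLayout>), and "Mul2" in Add_Mul2_Activation_Mul_Clamp is read here as the multiply by that second input. A hedged per-element sketch (an assumption based on the names, not the CK implementation):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <vector>

    // Per-channel epilogue sketch: one scale per output channel k, plus the
    // scalar scale_z_inv that main() passes to OutElementOp above.
    void requant_perchannel(const std::vector<int32_t>& acc,
                            const std::vector<int32_t>& bias,
                            const std::vector<float>& scale, // per-channel scales
                            float scale_z_inv,
                            std::vector<int8_t>& out)
    {
        for(std::size_t k = 0; k < acc.size(); ++k)
        {
            float y = static_cast<float>(acc[k] + bias[k]) * scale[k]; // Add, Mul2
            y       = std::tanh(y) * scale_z_inv;                      // TanH, Mul
            out[k]  = static_cast<int8_t>(
                std::round(std::clamp(y, -128.0f, 127.0f)));           // Clamp
        }
    }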
example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp (new file, 0 → 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include "common.hpp"

#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"

using InDataType   = int8_t;
using WeiDataType  = int8_t;
using BiasDataType = int32_t;
using AccDataType  = int32_t;
using OutDataType  = int8_t;

template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using PassThrough  = ck::tensor_operation::element_wise::PassThrough;
using InElementOp  = PassThrough;
using WeiElementOp = PassThrough;
using ActivationOp = ck::tensor_operation::element_wise::TanH;
using OutElementOp = ck::tensor_operation::element_wise::Add_Mul_Activation_Mul_Clamp<ActivationOp>;

static constexpr auto ConvSpec =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;

static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;

template <ck::index_t NDimSpatial,
          typename InLayout,
          typename WeiLayout,
          typename BiasLayout,
          typename OutLayout>
using DeviceGroupedConvNDFwdInstance =
    ck::tensor_operation::device::DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<
        NDimSpatial,
        InDataType,
        WeiDataType,
        ck::Tuple<BiasDataType>,
        OutDataType,
        AccDataType,
        InLayout,
        WeiLayout,
        ck::Tuple<BiasLayout>,
        OutLayout,
        InElementOp,
        WeiElementOp,
        OutElementOp,
        ConvSpec,            // ConvForwardSpecialization
        GemmSpec,            // GemmSpecialization
        256,                 // BlockSize
        128,                 // MPerBlock
        128,                 // NPerBlock
        16,                  // K0PerBlock
        4,                   // K1
        4,                   // M1PerThread
        4,                   // N1PerThread
        1,                   // KPerThread
        S<8, 2>,             // M1N1ThreadClusterM1Xs
        S<8, 2>,             // M1N1ThreadClusterN1Xs
        S<8, 1, 1, 4>,       // ABlockTransferThreadSliceLengths_K0_M0_M1_K1
        S<2, 1, 128, 1>,     // ABlockTransferThreadClusterLengths_K0_M0_M1_K1
        S<1, 2, 0, 3>,       // ABlockTransferThreadClusterArrangeOrder
        S<1, 2, 0, 3>,       // ABlockTransferSrcAccessOrder
        S<4, 1, 1, 4>,       // ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1
        S<1, 2, 0, 3>,       // ABlockTransferSrcVectorTensorContiguousDimOrder
        S<1, 1, 1, 4>,       // ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1
        S<8, 1, 1, 4>,       // BBlockTransferThreadSliceLengths_K0_N0_N1_K1
        S<2, 1, 128, 1>,     // BBlockTransferThreadClusterLengths_K0_N0_N1_K1
        S<1, 2, 0, 3>,       // BBlockTransferThreadClusterArrangeOrder
        S<1, 2, 0, 3>,       // BBlockTransferSrcAccessOrder
        S<4, 1, 1, 4>,       // BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1
        S<1, 2, 0, 3>,       // BBlockTransferSrcVectorTensorContiguousDimOrder
        S<1, 1, 1, 4>,       // BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1
        S<0, 1, 2, 3, 4, 5>, // CThreadTransferSrcDstAccessOrder
        5,                   // CThreadTransferSrcDstVectorDim
        4>;                  // CThreadTransferDstScalarPerVector

#include "run_conv2d_fwd_bias_perlayer_quantization_example.inc"

int main()
{
    float scale_acc   = 0.5f;
    float scale_z_inv = 0.5f;
    const auto out_element_op = OutElementOp{scale_z_inv, scale_acc, ActivationOp{}};
    run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op);
}
example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp

@@ -76,4 +76,8 @@ using DeviceGroupedConvNDFwdInstance =
 #include "run_conv2d_fwd_perchannel_quantization_example.inc"

-int main() { run_conv2d_fwd_perchannel_quantization_example(); }
+int main()
+{
+    const auto out_element_op = OutElementOp{ActivationOp{}};
+    run_conv2d_fwd_perchannel_quantization_example(out_element_op);
+}
example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp

@@ -71,4 +71,9 @@ using DeviceGroupedConvNDFwdInstance =
 #include "run_conv2d_fwd_perlayer_quantization_example.inc"

-int main() { run_conv2d_fwd_perlayer_quantization_example(); }
+int main()
+{
+    float requant_scale = 0.5f;
+    const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}};
+    run_conv2d_fwd_perlayer_quantization_example(out_element_op);
+}
example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp

@@ -80,6 +80,10 @@ using DeviceGroupedConvNDFwdInstance =
     S<1, 64, 1, 4>,
     8>;

-#include "run_conv2d_fwd_bias_relu_perchannel_quantization_example.inc"
+#include "run_conv2d_fwd_bias_perchannel_quantization_example.inc"

-int main() { run_conv2d_fwd_bias_relu_perchannel_quantization_example(); };
+int main()
+{
+    const auto out_element_op = OutElementOp{ActivationOp{}};
+    run_conv2d_fwd_bias_perchannel_quantization_example(out_element_op);
+};
example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp

@@ -78,6 +78,11 @@ using DeviceGroupedConvNDFwdInstance =
     S<1, 64, 1, 4>,
     8>;

-#include "run_conv2d_fwd_bias_relu_perlayer_quantization_example.inc"
+#include "run_conv2d_fwd_bias_perlayer_quantization_example.inc"

-int main() { run_conv2d_fwd_bias_relu_perlayer_quantization_example(); }
+int main()
+{
+    float requant_scale = 0.5f;
+    const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}};
+    run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op);
+}
example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp

@@ -80,4 +80,8 @@ using DeviceGroupedConvNDFwdInstance =
 #include "run_conv2d_fwd_perchannel_quantization_example.inc"

-int main() { run_conv2d_fwd_perchannel_quantization_example(); }
+int main()
+{
+    const auto out_element_op = OutElementOp{ActivationOp{}};
+    run_conv2d_fwd_perchannel_quantization_example(out_element_op);
+}
example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp

@@ -75,4 +75,9 @@ using DeviceGroupedConvNDFwdInstance =
 #include "run_conv2d_fwd_perlayer_quantization_example.inc"

-int main() { run_conv2d_fwd_perlayer_quantization_example(); }
+int main()
+{
+    float requant_scale = 0.5f;
+    const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}};
+    run_conv2d_fwd_perlayer_quantization_example(out_element_op);
+}
example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_relu_perchannel_quantization_example.inc
→ example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perchannel_quantization_example.inc (renamed)

@@ -167,7 +167,7 @@ bool run_grouped_conv_fwd(bool do_verification,
     return (pass ? 0 : 1);
 }

-int run_conv2d_fwd_bias_relu_perchannel_quantization_example()
+int run_conv2d_fwd_bias_perchannel_quantization_example(const OutElementOp& out_element_op)
 {
     bool do_verification = true;
     bool time_kernel     = true;

@@ -189,7 +189,6 @@ int run_conv2d_fwd_bias_relu_perchannel_quantization_example()
     const auto in_element_op  = InElementOp{};
     const auto wei_element_op = WeiElementOp{};
-    const auto out_element_op = OutElementOp{ActivationOp{}};

     using InLayout  = ck::tensor_layout::convolution::GNHWC;
     using WeiLayout = ck::tensor_layout::convolution::GKYXC;
example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_relu_perlayer_quantization_example.inc
→ example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perlayer_quantization_example.inc (renamed)

@@ -155,7 +155,7 @@ bool run_grouped_conv_fwd(bool do_verification,
     return (pass ? 0 : 1);
 }

-int run_conv2d_fwd_bias_relu_perlayer_quantization_example()
+int run_conv2d_fwd_bias_perlayer_quantization_example(const OutElementOp& out_element_op)
 {
     bool do_verification = true;
     bool time_kernel     = true;

@@ -177,7 +177,6 @@ int run_conv2d_fwd_bias_relu_perlayer_quantization_example()
     const auto in_element_op  = InElementOp{};
     const auto wei_element_op = WeiElementOp{};
-    const auto out_element_op = OutElementOp{0.5f, ActivationOp{}};

     using InLayout  = ck::tensor_layout::convolution::GNHWC;
     using WeiLayout = ck::tensor_layout::convolution::GKYXC;
example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc

@@ -157,7 +157,7 @@ bool run_grouped_conv_fwd(bool do_verification,
     return (pass ? 0 : 1);
 }

-int run_conv2d_fwd_perchannel_quantization_example()
+int run_conv2d_fwd_perchannel_quantization_example(const OutElementOp& out_element_op)
 {
     bool do_verification = true;
     bool time_kernel     = true;

@@ -179,7 +179,6 @@ int run_conv2d_fwd_perchannel_quantization_example()
     const auto in_element_op  = InElementOp{};
     const auto wei_element_op = WeiElementOp{};
-    const auto out_element_op = OutElementOp{ActivationOp{}};

     using InLayout  = ck::tensor_layout::convolution::GNHWC;
     using WeiLayout = ck::tensor_layout::convolution::GKYXC;
example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc

@@ -139,7 +139,7 @@ bool run_grouped_conv_fwd(bool do_verification,
     return (pass ? 0 : 1);
 }

-int run_conv2d_fwd_perlayer_quantization_example()
+int run_conv2d_fwd_perlayer_quantization_example(const OutElementOp& out_element_op)
 {
     bool do_verification = true;
     bool time_kernel     = false;

@@ -161,7 +161,6 @@ int run_conv2d_fwd_perlayer_quantization_example()
     const auto in_element_op  = InElementOp{};
     const auto wei_element_op = WeiElementOp{};
-    const auto out_element_op = OutElementOp{0.5f, ActivationOp{}};

     using InLayout  = ck::tensor_layout::convolution::GNHWC;
     using WeiLayout = ck::tensor_layout::convolution::GKYXC;
include/ck/ck.hpp

@@ -36,7 +36,7 @@
 #elif defined(__gfx1030__) // for GPU code
 #define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
 #elif defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) // for GPU code
-#define CK_BUFFER_RESOURCE_3RD_DWORD 0x10020000
+#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31004000
 #endif

 // FMA instruction
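Note: CK_BUFFER_RESOURCE_3RD_DWORD is the fourth 32-bit word (dword 3) of the 128-bit buffer resource descriptor CK assembles for buffer_load/buffer_store; its bit layout is architecture-specific, which is why each gfx family carries its own constant, and this commit corrects the gfx1100/1101/1102 value. A hedged sketch of how such a constant slots into a descriptor (the field names and helper are assumptions, not the CK source):

    #include <cstdint>

    struct BufferResource // 4 dwords = 128 bits
    {
        uint64_t base;   // dwords 0-1: base address (plus stride bits)
        uint32_t range;  // dword 2: addressable size in bytes
        uint32_t config; // dword 3: arch-specific format/config bits
    };

    BufferResource make_buffer_resource(const void* p, uint32_t bytes)
    {
        // 0x31004000 is the corrected gfx11 value introduced by this commit.
        return BufferResource{reinterpret_cast<uint64_t>(p), bytes, 0x31004000u};
    }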
include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp

 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once
include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp

@@ -73,157 +73,18 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
     static constexpr auto I2 = Number<2>{};
     static constexpr auto I3 = Number<3>{};

-    static constexpr auto K1Number = Number<K1>{};
-
-    static auto MakeAGridDescriptor_KBatch_K0_M_K1(
-        index_t M, index_t K, index_t StrideA, int KBatch, int KPad)
-    {
-        assert(KPad % (K1 * KBatch) == 0);
-        const index_t K0 = KPad / (K1 * KBatch);
-
-        const auto a_grid_desc_m_k = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1));
-            }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, ALayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA));
-            }
-        }();
-
-        const auto a_grid_desc_m_kpad = transform_tensor_descriptor(
-            a_grid_desc_m_k,
-            make_tuple(make_pass_through_transform(M), make_right_pad_transform(K, KPad - K)),
-            make_tuple(Sequence<0>{}, Sequence<1>{}),
-            make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-        if constexpr(GemmSpec == GemmSpecialization::MNPadding)
-        {
-            const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock;
-            return transform_tensor_descriptor(
-                a_grid_desc_m_kpad,
-                make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1Number)),
-                           make_right_pad_transform(M, PadM)),
-                make_tuple(Sequence<1>{}, Sequence<0>{}),
-                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
-        }
-        else
-        {
-            return transform_tensor_descriptor(
-                a_grid_desc_m_kpad,
-                make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1Number)),
-                           make_pass_through_transform(M)),
-                make_tuple(Sequence<1>{}, Sequence<0>{}),
-                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
-        }
-    }
-
-    static auto MakeBGridDescriptor_KBatch_K0_N_K1(
-        index_t K, index_t N, index_t StrideB, int KBatch, int KPad)
-    {
-        assert(KPad % (K1 * KBatch) == 0);
-        const index_t K0 = KPad / (K1 * KBatch);
-
-        const auto b_grid_desc_k_n = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(StrideB, I1));
-            }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(I1, StrideB));
-            }
-        }();
-
-        const auto b_grid_desc_kpad_n = transform_tensor_descriptor(
-            b_grid_desc_k_n,
-            make_tuple(make_right_pad_transform(K, KPad - K), make_pass_through_transform(N)),
-            make_tuple(Sequence<0>{}, Sequence<1>{}),
-            make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-        if constexpr(GemmSpec == GemmSpecialization::MNPadding)
-        {
-            const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock;
-            return transform_tensor_descriptor(
-                b_grid_desc_kpad_n,
-                make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1Number)),
-                           make_right_pad_transform(N, PadN)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
-        }
-        else
-        {
-            return transform_tensor_descriptor(
-                b_grid_desc_kpad_n,
-                make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1Number)),
-                           make_pass_through_transform(N)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
-        }
-    }
-
-    static auto MakeCGridDescriptor_M_N(index_t M, index_t N, index_t StrideC)
-    {
-        const auto c_grid_desc_m_n = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1));
-            }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC));
-            }
-        }();
-
-        if constexpr(GemmSpec == GemmSpecialization::MNPadding)
-        {
-            const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock;
-            const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock;
-            return transform_tensor_descriptor(
-                c_grid_desc_m_n,
-                make_tuple(make_right_pad_transform(M, PadM), make_right_pad_transform(N, PadN)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else
-        {
-            return transform_tensor_descriptor(
-                c_grid_desc_m_n,
-                make_tuple(make_pass_through_transform(M), make_pass_through_transform(N)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-    }
-
-    static auto GetKPad(index_t K, index_t KBatch)
-    {
-        const index_t K0 =
-            math::integer_divide_ceil(K, K1 * K0PerBlock * KBatch) * K0PerBlock;
-        const index_t KPad = KBatch * K0 * K1;
-        return KPad;
-    }
-
-    using AGridDesc_K0_M_K1 = decltype(MakeAGridDescriptor_KBatch_K0_M_K1(1, 1, 1, 1, 1));
-    using BGridDesc_K0_N_K1 = decltype(MakeBGridDescriptor_KBatch_K0_N_K1(1, 1, 1, 1, 1));
-    using CGridDesc_M_N     = decltype(MakeCGridDescriptor_M_N(1, 1, 1));
-
     // GridwiseGemm
     using GridwiseGemm = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2<
         BlockSize,
         ADataType, // TODO: distinguish A/B datatype
         AccDataType,
         CDataType,
-        InMemoryDataOperationEnum::Set,
-        AGridDesc_K0_M_K1,
-        BGridDesc_K0_N_K1,
-        CGridDesc_M_N,
+        ALayout,
+        BLayout,
+        CLayout,
         AElementwiseOperation,
         BElementwiseOperation,
         CElementwiseOperation,
+        GemmSpec,
         MPerBlock,
         NPerBlock,
         K0PerBlock,
@@ -253,236 +114,64 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
         CBlockTransferScalarPerVector_NWaveNPerXDL,
         CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>;

+    using Argument = typename GridwiseGemm::Argument;
+
-    using GridwiseGemmAtomicAdd = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2<
-        BlockSize,
-        ADataType, // TODO: distinguish A/B datatype
-        AccDataType,
-        CDataType,
-        InMemoryDataOperationEnum::AtomicAdd,
-        AGridDesc_K0_M_K1,
-        BGridDesc_K0_N_K1,
-        CGridDesc_M_N,
-        AElementwiseOperation,
-        BElementwiseOperation,
-        CElementwiseOperation,
-        MPerBlock,
-        NPerBlock,
-        K0PerBlock,
-        MPerXDL,
-        NPerXDL,
-        K1,
-        MXdlPerWave,
-        NXdlPerWave,
-        ABlockTransferThreadClusterLengths_K0_M_K1,
-        ABlockTransferThreadClusterArrangeOrder,
-        ABlockTransferSrcAccessOrder,
-        ABlockTransferSrcVectorDim,
-        ABlockTransferSrcScalarPerVector,
-        ABlockTransferDstScalarPerVector_K1,
-        false, // AThreadTransferSrcResetCoordinateAfterRun,
-        ABlockLdsAddExtraM,
-        BBlockTransferThreadClusterLengths_K0_N_K1,
-        BBlockTransferThreadClusterArrangeOrder,
-        BBlockTransferSrcAccessOrder,
-        BBlockTransferSrcVectorDim,
-        BBlockTransferSrcScalarPerVector,
-        BBlockTransferDstScalarPerVector_K1,
-        false, // BThreadTransferSrcResetCoordinateAfterRun,
-        BBlockLdsAddExtraN,
-        CShuffleMRepeatPerShuffle,
-        CShuffleNRepeatPerShuffle,
-        CBlockTransferScalarPerVector_NWaveNPerXDL,
-        CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>;
-
-    using CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
-        decltype(GridwiseGemm::MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(CGridDesc_M_N{}));
-    using Block2CTileMap = typename GridwiseGemm::CBlockClusterAdaptor;
-
-    // Argument
-    struct Argument : public BaseArgument
-    {
-        Argument(const ADataType* p_a_grid,
-                 const BDataType* p_b_grid,
-                 CDataType* p_c_grid,
-                 index_t M,
-                 index_t N,
-                 index_t K,
-                 index_t StrideA,
-                 index_t StrideB,
-                 index_t StrideC,
-                 index_t M01,
-                 index_t N01,
-                 AElementwiseOperation a_element_op,
-                 BElementwiseOperation b_element_op,
-                 CElementwiseOperation c_element_op,
-                 index_t k_batch)
-            : p_a_grid_{p_a_grid},
-              p_b_grid_{p_b_grid},
-              p_c_grid_{p_c_grid},
-              a_grid_desc_kbatch_k0_m_k1_{},
-              b_grid_desc_kbatch_k0_n_k1_{},
-              c_grid_desc_m_n_{},
-              c_grid_desc_mblock_mperblock_nblock_nperblock_{},
-              block_2_ctile_map_{},
-              M01_{M01},
-              N01_{N01},
-              a_element_op_{a_element_op},
-              b_element_op_{b_element_op},
-              c_element_op_{c_element_op},
-              k_batch_{k_batch}
-        {
-            int KPad = DeviceGemmXdlSplitKCShuffle::GetKPad(K, k_batch_);
-
-            a_grid_desc_kbatch_k0_m_k1_ =
-                DeviceGemmXdlSplitKCShuffle::MakeAGridDescriptor_KBatch_K0_M_K1(
-                    M, K, StrideA, k_batch_, KPad);
-            b_grid_desc_kbatch_k0_n_k1_ =
-                DeviceGemmXdlSplitKCShuffle::MakeBGridDescriptor_KBatch_K0_N_K1(
-                    K, N, StrideB, k_batch_, KPad);
-            c_grid_desc_m_n_ = DeviceGemmXdlSplitKCShuffle::MakeCGridDescriptor_M_N(M, N, StrideC);
-
-            block_2_ctile_map_ =
-                GridwiseGemm::MakeCBlockClusterAdaptor(c_grid_desc_m_n_, M01, N01, k_batch_);
-
-            if(GridwiseGemm::CheckValidity(a_grid_desc_kbatch_k0_m_k1_,
-                                           b_grid_desc_kbatch_k0_n_k1_,
-                                           c_grid_desc_m_n_,
-                                           block_2_ctile_map_))
-            {
-                c_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemm::MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(
-                        c_grid_desc_m_n_);
-            }
-        }
-
-        //  private:
-        const ADataType* p_a_grid_;
-        const BDataType* p_b_grid_;
-        CDataType* p_c_grid_;
-        AGridDesc_K0_M_K1 a_grid_desc_kbatch_k0_m_k1_;
-        BGridDesc_K0_N_K1 b_grid_desc_kbatch_k0_n_k1_;
-        CGridDesc_M_N c_grid_desc_m_n_;
-        CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock_;
-        Block2CTileMap block_2_ctile_map_;
-        index_t M01_;
-        index_t N01_;
-        AElementwiseOperation a_element_op_;
-        BElementwiseOperation b_element_op_;
-        CElementwiseOperation c_element_op_;
-        index_t k_batch_;
-    };

     // Invoker
     struct Invoker : public BaseInvoker
     {
+        using Argument = DeviceGemmXdlSplitKCShuffle::Argument;
+
-        void Print(const Argument& arg)
-        {
-            std::cout << "arg.a_grid_desc_kbatch_k0_m_k1_{"
-                      << arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I0) << ", "
-                      << arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I1) << ", "
-                      << arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I2) << ", "
-                      << arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I3) << "}" << std::endl;
-
-            std::cout << "arg.b_grid_desc_kbatch_k0_n_k1_{"
-                      << arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I0) << ", "
-                      << arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I1) << ", "
-                      << arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I2) << ", "
-                      << arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I3) << "}" << std::endl;
-
-            std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", "
-                      << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
-        }
+        void Print(const Argument& karg) { karg.Print(); }

-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        float Run(const Argument& karg, const StreamConfig& stream_config = StreamConfig{})
         {
             if(stream_config.log_level_ > 0)
             {
-                Print(arg);
+                Print(karg);
             }

-            const auto kbatch = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I0);
-
-            if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
-                                            arg.b_grid_desc_kbatch_k0_n_k1_,
-                                            arg.c_grid_desc_m_n_,
-                                            arg.block_2_ctile_map_))
+            const auto kbatch = karg.k_batch;
+
+            if(!GridwiseGemm::CheckValidity(karg))
             {
                 throw std::runtime_error(
-                    "wrong! GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 has invalid setting");
+                    "wrong! GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 has invalid "
+                    "setting");
             }

-            const index_t grid_size =
-                arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_);
-
-            const auto K0 = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I1);
+            index_t gdx, gdy, gdz;
+            std::tie(gdx, gdy, gdz) = GridwiseGemm::CalculateGridSize(karg);
+
+            const auto K0 = karg.K0;

             const bool has_main_k0_block_loop = GridwiseGemm::CalculateHasMainK0BlockLoop(K0);

             float ave_time = 0;

             const auto Run = [&](const auto& kernel) {
-                hipGetErrorString(hipMemset(
-                    arg.p_c_grid_,
-                    0,
-                    arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() *
-                        sizeof(CDataType)));
-
-                ave_time = launch_and_time_kernel(stream_config,
-                                                  kernel,
-                                                  dim3(grid_size),
-                                                  dim3(BlockSize),
-                                                  0,
-                                                  arg.p_a_grid_,
-                                                  arg.p_b_grid_,
-                                                  arg.p_c_grid_,
-                                                  arg.a_grid_desc_kbatch_k0_m_k1_,
-                                                  arg.b_grid_desc_kbatch_k0_n_k1_,
-                                                  arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                                  arg.a_element_op_,
-                                                  arg.b_element_op_,
-                                                  arg.c_element_op_,
-                                                  arg.block_2_ctile_map_);
+                if(kbatch > 1)
+                    hipGetErrorString(
+                        hipMemset(karg.p_c_grid, 0, karg.M * karg.N * sizeof(CDataType)));
+
+                ave_time = launch_and_time_kernel(
+                    stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, karg);
             };

             if(has_main_k0_block_loop)
             {
                 if(kbatch == 1)
                 {
-                    const auto kernel = kernel_gemm_xdlops_v2r4r2<
-                        GridwiseGemm,
-                        ADataType, // TODO: distiguish A/B datatype
-                        CDataType,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::AGridDesc_K0_M_K1>,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::BGridDesc_K0_N_K1>,
-                        remove_reference_t<
-                            DeviceGemmXdlSplitKCShuffle::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
-                        AElementwiseOperation,
-                        BElementwiseOperation,
-                        CElementwiseOperation,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::Block2CTileMap>,
-                        true>;
+                    const auto kernel =
+                        kernel_gemm_xdlops_v2r4r2_simplified<GridwiseGemm,
+                                                             true,
+                                                             InMemoryDataOperationEnum::Set>;

                     Run(kernel);
                 }
                 else
                 {
-                    const auto kernel = kernel_gemm_xdlops_v2r4r2<
-                        GridwiseGemmAtomicAdd,
-                        ADataType, // TODO: distiguish A/B datatype
-                        CDataType,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::AGridDesc_K0_M_K1>,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::BGridDesc_K0_N_K1>,
-                        remove_reference_t<
-                            DeviceGemmXdlSplitKCShuffle::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
-                        AElementwiseOperation,
-                        BElementwiseOperation,
-                        CElementwiseOperation,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::Block2CTileMap>,
-                        true>;
+                    const auto kernel =
+                        kernel_gemm_xdlops_v2r4r2_simplified<GridwiseGemm,
+                                                             true,
+                                                             InMemoryDataOperationEnum::AtomicAdd>;

                     Run(kernel);
                 }
             }
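Note on the split-K path above: when kbatch > 1 the grid gains a z dimension and each K-slice accumulates its partial result into C with InMemoryDataOperationEnum::AtomicAdd, which is why the Run lambda now zeroes C (hipMemset) only in that case. A minimal CPU sketch of the idea (illustrative, not the CK kernel):

    #include <vector>

    // Split-K GEMM sketch: each of the kbatch slices computes a partial sum
    // over its K range and adds it into a pre-zeroed C (AtomicAdd on the GPU).
    void gemm_splitk(const std::vector<float>& A, // M x K, row-major
                     const std::vector<float>& B, // K x N, row-major
                     std::vector<float>& C,       // M x N, pre-zeroed
                     int M, int N, int K, int kbatch)
    {
        const int k_per_batch = K / kbatch; // assume K divides evenly for brevity
        for(int kb = 0; kb < kbatch; ++kb)  // maps to the gdz grid dimension
            for(int m = 0; m < M; ++m)
                for(int n = 0; n < N; ++n)
                {
                    float partial = 0.0f;
                    for(int k = kb * k_per_batch; k < (kb + 1) * k_per_batch; ++k)
                        partial += A[m * K + k] * B[k * N + n];
                    C[m * N + n] += partial; // AtomicAdd on device
                }
    }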
@@ -491,37 +180,19 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
             {
                 if(kbatch == 1)
                 {
-                    const auto kernel = kernel_gemm_xdlops_v2r4r2<
-                        GridwiseGemm,
-                        ADataType, // TODO: distiguish A/B datatype
-                        CDataType,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::AGridDesc_K0_M_K1>,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::BGridDesc_K0_N_K1>,
-                        remove_reference_t<
-                            DeviceGemmXdlSplitKCShuffle::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
-                        AElementwiseOperation,
-                        BElementwiseOperation,
-                        CElementwiseOperation,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::Block2CTileMap>,
-                        false>;
+                    const auto kernel =
+                        kernel_gemm_xdlops_v2r4r2_simplified<GridwiseGemm,
+                                                             false,
+                                                             InMemoryDataOperationEnum::Set>;

                     Run(kernel);
                 }
                 else
                 {
-                    const auto kernel = kernel_gemm_xdlops_v2r4r2<
-                        GridwiseGemmAtomicAdd,
-                        ADataType, // TODO: distiguish A/B datatype
-                        CDataType,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::AGridDesc_K0_M_K1>,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::BGridDesc_K0_N_K1>,
-                        remove_reference_t<
-                            DeviceGemmXdlSplitKCShuffle::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
-                        AElementwiseOperation,
-                        BElementwiseOperation,
-                        CElementwiseOperation,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::Block2CTileMap>,
-                        false>;
+                    const auto kernel =
+                        kernel_gemm_xdlops_v2r4r2_simplified<GridwiseGemm,
+                                                             false,
+                                                             InMemoryDataOperationEnum::AtomicAdd>;

                     Run(kernel);
                 }
             }
@@ -544,12 +215,9 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
         return true;
     }

-    static bool IsSupportedArgument(const Argument& arg)
+    static bool IsSupportedArgument(const Argument& karg)
     {
-        return GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
-                                           arg.b_grid_desc_kbatch_k0_n_k1_,
-                                           arg.c_grid_desc_m_n_,
-                                           arg.block_2_ctile_map_);
+        return GridwiseGemm::CheckValidity(karg);
     }

     // polymorphic
@@ -567,9 +235,9 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                                  index_t StrideA,
                                  index_t StrideB,
                                  index_t StrideC,
-                                 AElementwiseOperation a_element_op,
-                                 BElementwiseOperation b_element_op,
-                                 CElementwiseOperation c_element_op,
+                                 AElementwiseOperation,
+                                 BElementwiseOperation,
+                                 CElementwiseOperation,
                                  index_t KBatch)
     {
         return Argument{p_a,

@@ -581,11 +249,10 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                         StrideA,
                         StrideB,
                         StrideC,
-                        1,
-                        1,
-                        a_element_op,
-                        b_element_op,
-                        c_element_op,
+                        GridwiseGemm::CalculateMPadded(M),
+                        GridwiseGemm::CalculateNPadded(N),
+                        GridwiseGemm::CalculateKPadded(K),
+                        GridwiseGemm::CalculateK0(K, KBatch),
                         KBatch};
     }
@@ -601,9 +268,9 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                         index_t StrideA,
                         index_t StrideB,
                         index_t StrideC,
-                        AElementwiseOperation a_element_op,
-                        BElementwiseOperation b_element_op,
-                        CElementwiseOperation c_element_op,
+                        AElementwiseOperation,
+                        BElementwiseOperation,
+                        CElementwiseOperation,
                         ck::index_t KBatch = 1) override
     {
         return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),

@@ -615,11 +282,10 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                                           StrideA,
                                           StrideB,
                                           StrideC,
-                                          1,
-                                          1,
-                                          a_element_op,
-                                          b_element_op,
-                                          c_element_op,
+                                          GridwiseGemm::CalculateMPadded(M),
+                                          GridwiseGemm::CalculateNPadded(N),
+                                          GridwiseGemm::CalculateKPadded(K),
+                                          GridwiseGemm::CalculateK0(K, KBatch),
                                           KBatch);
     }
@@ -630,31 +296,7 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
     }

     // polymorphic
     std::string GetTypeString() const override
-    {
-        auto str = std::stringstream();
-
-        // clang-format off
-        str << "DeviceGemmXdlSplitKCShuffle"
-            << "<"
-            << BlockSize << ", "
-            << MPerBlock << ", "
-            << NPerBlock << ", "
-            << K0PerBlock << ", "
-            << K1 << ", "
-            << MPerXDL << ", "
-            << NPerXDL << ", "
-            << MXdlPerWave << ", "
-            << NXdlPerWave << ", "
-            << ABlockTransferSrcScalarPerVector << ", "
-            << ABlockTransferDstScalarPerVector_K1 << ", "
-            << BBlockTransferSrcScalarPerVector << ", "
-            << BBlockTransferDstScalarPerVector_K1
-            << ">";
-        // clang-format on
-
-        return str.str();
-    }
+    {
+        return GridwiseGemm::GetTypeString();
+    }
 };

 } // namespace device
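Note: after this refactor the grid descriptors live inside GridwiseGemm's own Argument ("karg"), and padding and K0 are derived internally via GridwiseGemm::Calculate*(). A hedged usage sketch of the device op (template arguments are elided placeholders; MakeInvoker follows the usual CK device-op pattern and is not itself shown in this diff):

    // Usage sketch under stated assumptions -- not verbatim from the repository.
    using DeviceOp = DeviceGemmXdlSplitKCShuffle</* ...template parameters... */>;

    float run_splitk_gemm(const ADataType* p_a, const BDataType* p_b, CDataType* p_c,
                          index_t M, index_t N, index_t K,
                          index_t StrideA, index_t StrideB, index_t StrideC,
                          index_t KBatch)
    {
        auto op = DeviceOp{};
        // The elementwise-op parameters are now unnamed (ignored) in MakeArgument.
        auto arg = op.MakeArgument(p_a, p_b, p_c, M, N, K, StrideA, StrideB, StrideC,
                                   AElementwiseOperation{}, BElementwiseOperation{},
                                   CElementwiseOperation{}, KBatch);
        auto invoker = op.MakeInvoker();
        return invoker.Run(arg, StreamConfig{}); // returns averaged kernel time
    }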
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp

@@ -840,17 +840,8 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
             << KPerBlock << ", "
             << getConvForwardSpecializationString(ConvForwardSpecialization) << ", "
             << K1 << ", "
-            << MPerXDL << ", "
-            << NPerXDL << ", "
-            << MXdlPerWave << ", "
-            << NXdlPerWave << ", "
             << ABlockTransferSrcScalarPerVector << ", "
-            << ABlockTransferDstScalarPerVector_K1 << ", "
-            << BBlockTransferSrcScalarPerVector << ", "
-            << BBlockTransferDstScalarPerVector_K1 << ", "
-            << CShuffleMXdlPerWavePerShuffle << ", "
-            << CShuffleNXdlPerWavePerShuffle << ", "
-            << CBlockTransferScalarPerVector_NWaveNPerXdl
+            << BBlockTransferSrcScalarPerVector
             << ">";
         // clang-format on
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp

 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once