Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
1724f7c8
Unverified
Commit
1724f7c8
authored
Mar 16, 2023
by
zjing14
Committed by
GitHub
Mar 16, 2023
Browse files
Merge branch 'develop' into lwpck-586
parents
ddbeddb4
fa998675
Changes
61
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1749 additions
and
322 deletions
+1749
-322
example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_relu_perchannel_quantization_example.inc
..._conv2d_fwd_bias_relu_perchannel_quantization_example.inc
+5
-97
example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_relu_perlayer_quantization_example.inc
...un_conv2d_fwd_bias_relu_perlayer_quantization_example.inc
+5
-94
example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc
...zation/run_conv2d_fwd_perchannel_quantization_example.inc
+235
-0
example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc
...tization/run_conv2d_fwd_perlayer_quantization_example.inc
+4
-86
example/44_conv2d_fwd_quantization/CMakeLists.txt
example/44_conv2d_fwd_quantization/CMakeLists.txt
+0
-3
include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
.../device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
+7
-5
include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp
...r_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp
+2
-2
include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
...ion/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
+763
-0
include/ck/tensor_operation/gpu/element/quantization_operation.hpp
...k/tensor_operation/gpu/element/quantization_operation.hpp
+52
-9
include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_multiple_d.hpp
...tensor_operation/gpu/grid/gridwise_gemm_dl_multiple_d.hpp
+9
-6
include/ck/utility/inner_product.hpp
include/ck/utility/inner_product.hpp
+22
-0
library/include/ck/library/tensor_operation_instance/gpu/quantization/gemm_quantization.hpp
...operation_instance/gpu/quantization/gemm_quantization.hpp
+224
-0
library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp
...uped_convolution_bias_forward_perchannel_quantization.hpp
+41
-4
library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp
...rouped_convolution_bias_forward_perlayer_quantization.hpp
+41
-4
library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perchannel_quantization.hpp
...n/grouped_convolution_forward_perchannel_quantization.hpp
+40
-4
library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perlayer_quantization.hpp
...ion/grouped_convolution_forward_perlayer_quantization.hpp
+40
-4
library/src/tensor_operation_instance/gpu/quantization/CMakeLists.txt
...tensor_operation_instance/gpu/quantization/CMakeLists.txt
+36
-4
library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp
...pu/quantization/conv2d_fwd/conv2d_quantization_common.hpp
+59
-0
library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp
..._conv2d_dl_bias_perchannel_quantization_int8_instance.cpp
+82
-0
library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp
...ce_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp
+82
-0
No files found.
example/4
4
_conv2d_fwd_quantization/conv2d_fwd_
xdl_
bias_relu_perchannel_quantization_
int8.cpp
→
example/4
0
_conv2d_fwd_quantization/
run_
conv2d_fwd_bias_relu_perchannel_quantization_
example.inc
View file @
1724f7c8
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/ck.hpp"
#pragma once
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
using
InDataType
=
int8_t
;
using
WeiDataType
=
int8_t
;
using
BiasDataType
=
int32_t
;
using
RequantScaleDataType
=
float
;
using
AccDataType
=
int32_t
;
using
CShuffleDataType
=
int32_t
;
using
OutDataType
=
int8_t
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
InElementOp
=
PassThrough
;
using
WeiElementOp
=
PassThrough
;
using
ActivationOp
=
ck
::
tensor_operation
::
element_wise
::
Relu
;
using
OutElementOp
=
ck
::
tensor_operation
::
element_wise
::
Add_Activation_Mul2_Clamp
<
ActivationOp
>
;
static
constexpr
auto
ConvSpec
=
ck
::
tensor_operation
::
device
::
ConvolutionForwardSpecialization
::
Default
;
static
constexpr
auto
GemmSpec
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
MNKPadding
;
template
<
ck
::
index_t
NDimSpatial
,
typename
InLayout
,
typename
WeiLayout
,
typename
BiasLayout
,
typename
RequantScaleLayout
,
typename
OutLayout
>
using
DeviceGroupedConvNDFwdInstance
=
ck
::
tensor_operation
::
device
::
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
NDimSpatial
,
InLayout
,
WeiLayout
,
ck
::
Tuple
<
BiasLayout
,
RequantScaleLayout
>
,
OutLayout
,
InDataType
,
WeiDataType
,
AccDataType
,
CShuffleDataType
,
ck
::
Tuple
<
BiasDataType
,
RequantScaleDataType
>
,
OutDataType
,
InElementOp
,
WeiElementOp
,
OutElementOp
,
ConvSpec
,
// ConvForwardSpecialization
GemmSpec
,
// GemmSpecialization
1
,
//
256
,
// BlockSize
128
,
// MPerBlock
256
,
// NPerBlock
64
,
// KPerBlock
16
,
// AK1
16
,
// BK1
32
,
// MPerXdl
32
,
// NPerXdl
2
,
// MXdlPerWave
4
,
// NXdlPerWave
S
<
4
,
64
,
1
>
,
// ABlockTransferThreadClusterLengths_AK0_M_AK1
S
<
1
,
0
,
2
>
,
// ABlockTransferThreadClusterArrangeOrder
S
<
1
,
0
,
2
>
,
// ABlockTransferSrcAccessOrder
2
,
// ABlockTransferSrcVectorDim
16
,
// ABlockTransferSrcScalarPerVector
16
,
// ABlockTransferDstScalarPerVector_AK1
1
,
// ABlockLdsExtraM
S
<
4
,
64
,
1
>
,
// BBlockTransferThreadClusterLengths_BK0_N_BK1
S
<
1
,
0
,
2
>
,
// BBlockTransferThreadClusterArrangeOrder
S
<
1
,
0
,
2
>
,
// BBlockTransferSrcAccessOrder
2
,
// BBlockTransferSrcVectorDim
16
,
// BBlockTransferSrcScalarPerVector
16
,
// BBlockTransferDstScalarPerVector_BK1
1
,
// BBlockLdsExtraN
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
8
>
;
template
<
ck
::
index_t
NDimSpatial
,
template
<
ck
::
index_t
NDimSpatial
,
typename
InDataType
,
typename
InDataType
,
typename
WeiDataType
,
typename
WeiDataType
,
...
@@ -219,12 +129,12 @@ bool run_grouped_conv_fwd(bool do_verification,
...
@@ -219,12 +129,12 @@ bool run_grouped_conv_fwd(bool do_verification,
if
(
do_verification
)
if
(
do_verification
)
{
{
Tensor
<
CShuffle
DataType
>
c_host
(
out_g_n_k_wos_desc
);
Tensor
<
Acc
DataType
>
c_host
(
out_g_n_k_wos_desc
);
auto
ref_conv
=
ck
::
tensor_operation
::
host
::
ReferenceConvFwd
<
NDimSpatial
,
auto
ref_conv
=
ck
::
tensor_operation
::
host
::
ReferenceConvFwd
<
NDimSpatial
,
InDataType
,
InDataType
,
WeiDataType
,
WeiDataType
,
CShuffle
DataType
,
Acc
DataType
,
InElementOp
,
InElementOp
,
WeiElementOp
,
WeiElementOp
,
PassThrough
>
();
PassThrough
>
();
...
@@ -257,7 +167,7 @@ bool run_grouped_conv_fwd(bool do_verification,
...
@@ -257,7 +167,7 @@ bool run_grouped_conv_fwd(bool do_verification,
return
(
pass
?
0
:
1
);
return
(
pass
?
0
:
1
);
}
}
int
main
()
int
run_conv2d_fwd_bias_relu_perchannel_quantization_example
()
{
{
bool
do_verification
=
true
;
bool
do_verification
=
true
;
bool
time_kernel
=
true
;
bool
time_kernel
=
true
;
...
@@ -268,7 +178,7 @@ int main()
...
@@ -268,7 +178,7 @@ int main()
1
,
// group
1
,
// group
4
,
// batch
4
,
// batch
64
,
// output channels
64
,
// output channels
3
2
,
// input chanels
19
2
,
// input chanels
{
3
,
3
},
// weight HW
{
3
,
3
},
// weight HW
{
71
,
71
},
// x HW
{
71
,
71
},
// x HW
{
2
,
2
},
// strides
{
2
,
2
},
// strides
...
@@ -312,8 +222,6 @@ int main()
...
@@ -312,8 +222,6 @@ int main()
const
auto
out_g_n_k_wos_desc
=
const
auto
out_g_n_k_wos_desc
=
ck
::
utils
::
conv
::
make_output_host_tensor_descriptor_g_n_k_wos_packed
<
OutLayout
>
(
conv_param
);
ck
::
utils
::
conv
::
make_output_host_tensor_descriptor_g_n_k_wos_packed
<
OutLayout
>
(
conv_param
);
std
::
cout
<<
out_g_n_k_wos_desc
<<
std
::
endl
;
using
deviceOp
=
DeviceGroupedConvNDFwdInstance
<
ndim_spatial
,
using
deviceOp
=
DeviceGroupedConvNDFwdInstance
<
ndim_spatial
,
InLayout
,
InLayout
,
WeiLayout
,
WeiLayout
,
...
...
example/4
4
_conv2d_fwd_quantization/conv2d_fwd_
xdl_
bias_relu_perlayer_quantization_
int8.cpp
→
example/4
0
_conv2d_fwd_quantization/
run_
conv2d_fwd_bias_relu_perlayer_quantization_
example.inc
View file @
1724f7c8
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/ck.hpp"
#pragma once
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
using
InDataType
=
int8_t
;
using
WeiDataType
=
int8_t
;
using
BiasDataType
=
int32_t
;
using
AccDataType
=
int32_t
;
using
CShuffleDataType
=
int32_t
;
using
OutDataType
=
int8_t
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
InElementOp
=
PassThrough
;
using
WeiElementOp
=
PassThrough
;
using
ActivationOp
=
ck
::
tensor_operation
::
element_wise
::
Relu
;
using
OutElementOp
=
ck
::
tensor_operation
::
element_wise
::
Add_Activation_Mul_Clamp
<
ActivationOp
>
;
static
constexpr
auto
ConvSpec
=
ck
::
tensor_operation
::
device
::
ConvolutionForwardSpecialization
::
Default
;
static
constexpr
auto
GemmSpec
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
MNKPadding
;
template
<
ck
::
index_t
NDimSpatial
,
typename
InLayout
,
typename
WeiLayout
,
typename
BiasLayout
,
typename
OutLayout
>
using
DeviceGroupedConvNDFwdInstance
=
ck
::
tensor_operation
::
device
::
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
NDimSpatial
,
InLayout
,
WeiLayout
,
ck
::
Tuple
<
BiasLayout
>
,
OutLayout
,
InDataType
,
WeiDataType
,
AccDataType
,
CShuffleDataType
,
ck
::
Tuple
<
BiasDataType
>
,
OutDataType
,
InElementOp
,
WeiElementOp
,
OutElementOp
,
ConvSpec
,
// ConvForwardSpecialization
GemmSpec
,
// GemmSpecialization
1
,
//
256
,
// BlockSize
128
,
// MPerBlock
256
,
// NPerBlock
64
,
// KPerBlock
16
,
// AK1
16
,
// BK1
32
,
// MPerXdl
32
,
// NPerXdl
2
,
// MXdlPerWave
4
,
// NXdlPerWave
S
<
4
,
64
,
1
>
,
// ABlockTransferThreadClusterLengths_AK0_M_AK1
S
<
1
,
0
,
2
>
,
// ABlockTransferThreadClusterArrangeOrder
S
<
1
,
0
,
2
>
,
// ABlockTransferSrcAccessOrder
2
,
// ABlockTransferSrcVectorDim
16
,
// ABlockTransferSrcScalarPerVector
16
,
// ABlockTransferDstScalarPerVector_AK1
1
,
// ABlockLdsExtraM
S
<
4
,
64
,
1
>
,
// BBlockTransferThreadClusterLengths_BK0_N_BK1
S
<
1
,
0
,
2
>
,
// BBlockTransferThreadClusterArrangeOrder
S
<
1
,
0
,
2
>
,
// BBlockTransferSrcAccessOrder
2
,
// BBlockTransferSrcVectorDim
16
,
// BBlockTransferSrcScalarPerVector
16
,
// BBlockTransferDstScalarPerVector_BK1
1
,
// BBlockLdsExtraN
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
8
>
;
template
<
ck
::
index_t
NDimSpatial
,
template
<
ck
::
index_t
NDimSpatial
,
typename
InDataType
,
typename
InDataType
,
...
@@ -205,12 +118,12 @@ bool run_grouped_conv_fwd(bool do_verification,
...
@@ -205,12 +118,12 @@ bool run_grouped_conv_fwd(bool do_verification,
if
(
do_verification
)
if
(
do_verification
)
{
{
Tensor
<
CShuffle
DataType
>
c_host
(
out_g_n_k_wos_desc
);
Tensor
<
Acc
DataType
>
c_host
(
out_g_n_k_wos_desc
);
auto
ref_conv
=
ck
::
tensor_operation
::
host
::
ReferenceConvFwd
<
NDimSpatial
,
auto
ref_conv
=
ck
::
tensor_operation
::
host
::
ReferenceConvFwd
<
NDimSpatial
,
InDataType
,
InDataType
,
WeiDataType
,
WeiDataType
,
CShuffle
DataType
,
Acc
DataType
,
InElementOp
,
InElementOp
,
WeiElementOp
,
WeiElementOp
,
PassThrough
>
();
PassThrough
>
();
...
@@ -242,7 +155,7 @@ bool run_grouped_conv_fwd(bool do_verification,
...
@@ -242,7 +155,7 @@ bool run_grouped_conv_fwd(bool do_verification,
return
(
pass
?
0
:
1
);
return
(
pass
?
0
:
1
);
}
}
int
main
()
int
run_conv2d_fwd_bias_relu_perlayer_quantization_example
()
{
{
bool
do_verification
=
true
;
bool
do_verification
=
true
;
bool
time_kernel
=
true
;
bool
time_kernel
=
true
;
...
@@ -253,7 +166,7 @@ int main()
...
@@ -253,7 +166,7 @@ int main()
1
,
// group
1
,
// group
4
,
// batch
4
,
// batch
64
,
// output channels
64
,
// output channels
3
2
,
// input chanels
19
2
,
// input chanels
{
3
,
3
},
// weight HW
{
3
,
3
},
// weight HW
{
71
,
71
},
// x HW
{
71
,
71
},
// x HW
{
2
,
2
},
// strides
{
2
,
2
},
// strides
...
@@ -294,8 +207,6 @@ int main()
...
@@ -294,8 +207,6 @@ int main()
const
auto
out_g_n_k_wos_desc
=
const
auto
out_g_n_k_wos_desc
=
ck
::
utils
::
conv
::
make_output_host_tensor_descriptor_g_n_k_wos_packed
<
OutLayout
>
(
conv_param
);
ck
::
utils
::
conv
::
make_output_host_tensor_descriptor_g_n_k_wos_packed
<
OutLayout
>
(
conv_param
);
std
::
cout
<<
out_g_n_k_wos_desc
<<
std
::
endl
;
return
run_grouped_conv_fwd
<
return
run_grouped_conv_fwd
<
ndim_spatial
,
ndim_spatial
,
InDataType
,
InDataType
,
...
...
example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc
0 → 100644
View file @
1724f7c8
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
template
<
ck
::
index_t
NDimSpatial
,
typename
InDataType
,
typename
WeiDataType
,
typename
OutDataType
,
typename
InElementOp
,
typename
WeiElementOp
,
typename
OutElementOp
,
typename
DeviceConvNDFwdInstance
>
bool
run_grouped_conv_fwd
(
bool
do_verification
,
bool
time_kernel
,
const
ck
::
utils
::
conv
::
ConvParam
&
conv_param
,
const
HostTensorDescriptor
&
in_g_n_c_wis_desc
,
const
HostTensorDescriptor
&
wei_g_k_c_xs_desc
,
const
HostTensorDescriptor
&
requant_scale_g_k_desc
,
const
HostTensorDescriptor
&
out_g_n_k_wos_desc
,
const
InElementOp
&
in_element_op
,
const
WeiElementOp
&
wei_element_op
,
const
OutElementOp
&
out_element_op
)
{
Tensor
<
InDataType
>
in
(
in_g_n_c_wis_desc
);
Tensor
<
WeiDataType
>
wei
(
wei_g_k_c_xs_desc
);
Tensor
<
RequantScaleDataType
>
requant_scale
(
requant_scale_g_k_desc
);
Tensor
<
OutDataType
>
out_host
(
out_g_n_k_wos_desc
);
Tensor
<
OutDataType
>
out_device
(
out_g_n_k_wos_desc
);
std
::
cout
<<
"in: "
<<
in
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"wei: "
<<
wei
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"requant_scale: "
<<
requant_scale
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"out: "
<<
out_host
.
mDesc
<<
std
::
endl
;
in
.
GenerateTensorValue
(
GeneratorTensor_2
<
InDataType
>
{
-
128
,
127
});
wei
.
GenerateTensorValue
(
GeneratorTensor_2
<
WeiDataType
>
{
-
128
,
127
});
requant_scale
.
GenerateTensorValue
(
GeneratorTensor_2
<
RequantScaleDataType
>
{
0
,
1
});
DeviceMem
in_device_buf
(
sizeof
(
InDataType
)
*
in
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
wei_device_buf
(
sizeof
(
WeiDataType
)
*
wei
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
requant_scale_device_buf
(
sizeof
(
RequantScaleDataType
)
*
requant_scale
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
out_device_buf
(
sizeof
(
OutDataType
)
*
out_device
.
mDesc
.
GetElementSpaceSize
());
in_device_buf
.
ToDevice
(
in
.
mData
.
data
());
wei_device_buf
.
ToDevice
(
wei
.
mData
.
data
());
requant_scale_device_buf
.
ToDevice
(
requant_scale
.
mData
.
data
());
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
a_g_n_c_wis_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
a_g_n_c_wis_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
d0_g_n_k_wos_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
d0_g_n_k_wos_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
e_g_n_k_wos_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
e_g_n_k_wos_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
conv_filter_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
conv_filter_dilations
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_left_pads
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_right_pads
{};
auto
copy
=
[](
const
auto
&
x
,
auto
&
y
)
{
ck
::
ranges
::
copy
(
x
,
y
.
begin
());
};
copy
(
in_g_n_c_wis_desc
.
GetLengths
(),
a_g_n_c_wis_lengths
);
copy
(
in_g_n_c_wis_desc
.
GetStrides
(),
a_g_n_c_wis_strides
);
copy
(
wei_g_k_c_xs_desc
.
GetLengths
(),
b_g_k_c_xs_lengths
);
copy
(
wei_g_k_c_xs_desc
.
GetStrides
(),
b_g_k_c_xs_strides
);
copy
(
requant_scale_g_k_desc
.
GetLengths
(),
d0_g_n_k_wos_lengths
);
copy
(
requant_scale_g_k_desc
.
GetStrides
(),
d0_g_n_k_wos_strides
);
copy
(
out_g_n_k_wos_desc
.
GetLengths
(),
e_g_n_k_wos_lengths
);
copy
(
out_g_n_k_wos_desc
.
GetStrides
(),
e_g_n_k_wos_strides
);
copy
(
conv_param
.
conv_filter_strides_
,
conv_filter_strides
);
copy
(
conv_param
.
conv_filter_dilations_
,
conv_filter_dilations
);
copy
(
conv_param
.
input_left_pads_
,
input_left_pads
);
copy
(
conv_param
.
input_right_pads_
,
input_right_pads
);
// do Conv
auto
conv
=
DeviceConvNDFwdInstance
{};
auto
invoker
=
conv
.
MakeInvoker
();
auto
argument
=
conv
.
MakeArgument
(
in_device_buf
.
GetDeviceBuffer
(),
wei_device_buf
.
GetDeviceBuffer
(),
{
requant_scale_device_buf
.
GetDeviceBuffer
()},
out_device_buf
.
GetDeviceBuffer
(),
a_g_n_c_wis_lengths
,
a_g_n_c_wis_strides
,
b_g_k_c_xs_lengths
,
b_g_k_c_xs_strides
,
{
d0_g_n_k_wos_lengths
},
{
d0_g_n_k_wos_strides
},
e_g_n_k_wos_lengths
,
e_g_n_k_wos_strides
,
conv_filter_strides
,
conv_filter_dilations
,
input_left_pads
,
input_right_pads
,
in_element_op
,
wei_element_op
,
out_element_op
);
if
(
!
conv
.
IsSupportedArgument
(
argument
))
{
throw
std
::
runtime_error
(
"wrong! device_conv with the specified compilation parameters does "
"not support this Conv problem"
);
}
float
avg_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
time_kernel
});
std
::
size_t
flop
=
conv_param
.
GetFlops
();
std
::
size_t
num_btype
=
conv_param
.
GetByte
<
InDataType
,
WeiDataType
,
OutDataType
>
();
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
avg_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
avg_time
;
std
::
cout
<<
"Perf: "
<<
avg_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
conv
.
GetTypeString
()
<<
std
::
endl
;
bool
pass
=
true
;
if
(
do_verification
)
{
Tensor
<
AccDataType
>
c_host
(
out_g_n_k_wos_desc
);
auto
ref_conv
=
ck
::
tensor_operation
::
host
::
ReferenceConvFwd
<
NDimSpatial
,
InDataType
,
WeiDataType
,
AccDataType
,
InElementOp
,
WeiElementOp
,
PassThrough
>
();
auto
ref_invoker
=
ref_conv
.
MakeInvoker
();
auto
ref_argument
=
ref_conv
.
MakeArgument
(
in
,
wei
,
c_host
,
conv_param
.
conv_filter_strides_
,
conv_param
.
conv_filter_dilations_
,
conv_param
.
input_left_pads_
,
conv_param
.
input_right_pads_
,
in_element_op
,
wei_element_op
,
PassThrough
{});
ref_invoker
.
Run
(
ref_argument
);
// TODO: implement elementwise operation for host
out_host
.
ForEach
([
&
](
auto
&
,
auto
idx
)
{
out_element_op
(
out_host
(
idx
),
c_host
(
idx
),
requant_scale
(
idx
));
});
out_device_buf
.
FromDevice
(
out_device
.
mData
.
data
());
pass
&=
ck
::
utils
::
check_err
(
out_device
,
out_host
,
"Error: incorrect results!"
,
1
e
-
5
f
,
1
e
-
4
f
);
}
return
(
pass
?
0
:
1
);
}
int
run_conv2d_fwd_perchannel_quantization_example
()
{
bool
do_verification
=
true
;
bool
time_kernel
=
true
;
const
ck
::
index_t
ndim_spatial
=
2
;
ck
::
utils
::
conv
::
ConvParam
conv_param
{
ndim_spatial
,
// n_dim
1
,
// group
4
,
// batch
64
,
// output channels
192
,
// input chanels
{
3
,
3
},
// weight HW
{
71
,
71
},
// x HW
{
2
,
2
},
// strides
{
1
,
1
},
// dilations
{
1
,
1
},
// left_pads
{
1
,
1
}
// right_pads
};
const
auto
in_element_op
=
InElementOp
{};
const
auto
wei_element_op
=
WeiElementOp
{};
const
auto
out_element_op
=
OutElementOp
{
ActivationOp
{}};
using
InLayout
=
ck
::
tensor_layout
::
convolution
::
GNHWC
;
using
WeiLayout
=
ck
::
tensor_layout
::
convolution
::
GKYXC
;
using
RequantScaleLayout
=
ck
::
tensor_layout
::
convolution
::
G_K
;
using
OutLayout
=
ck
::
tensor_layout
::
convolution
::
GNHWK
;
const
auto
in_g_n_c_wis_desc
=
ck
::
utils
::
conv
::
make_input_host_tensor_descriptor_g_n_c_wis_packed
<
InLayout
>
(
conv_param
);
const
auto
wei_g_k_c_xs_desc
=
ck
::
utils
::
conv
::
make_weight_host_tensor_descriptor_g_k_c_xs_packed
<
WeiLayout
>
(
conv_param
);
const
auto
requant_scale_g_k_desc
=
HostTensorDescriptor
({
conv_param
.
G_
,
conv_param
.
N_
,
conv_param
.
K_
,
conv_param
.
output_spatial_lengths_
[
0
],
conv_param
.
output_spatial_lengths_
[
1
]},
{
conv_param
.
K_
,
// g
0
,
// n
1
,
// k
0
,
// ho
0
// wo
});
const
auto
out_g_n_k_wos_desc
=
ck
::
utils
::
conv
::
make_output_host_tensor_descriptor_g_n_k_wos_packed
<
OutLayout
>
(
conv_param
);
using
deviceOp
=
DeviceGroupedConvNDFwdInstance
<
ndim_spatial
,
InLayout
,
WeiLayout
,
RequantScaleLayout
,
OutLayout
>
;
return
run_grouped_conv_fwd
<
ndim_spatial
,
InDataType
,
WeiDataType
,
OutDataType
,
InElementOp
,
WeiElementOp
,
OutElementOp
,
deviceOp
>
(
do_verification
,
time_kernel
,
conv_param
,
in_g_n_c_wis_desc
,
wei_g_k_c_xs_desc
,
requant_scale_g_k_desc
,
out_g_n_k_wos_desc
,
in_element_op
,
wei_element_op
,
out_element_op
);
}
example/4
4
_conv2d_fwd_quantization/conv2d_fwd_
xdl_
perlayer_quantization_
int8.cpp
→
example/4
0
_conv2d_fwd_quantization/
run_
conv2d_fwd_perlayer_quantization_
example.inc
View file @
1724f7c8
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/ck.hpp"
#pragma once
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
using
InDataType
=
int8_t
;
using
WeiDataType
=
int8_t
;
using
AccDataType
=
int32_t
;
using
CShuffleDataType
=
int32_t
;
using
OutDataType
=
int8_t
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
InElementOp
=
PassThrough
;
using
WeiElementOp
=
PassThrough
;
using
ActivationOp
=
PassThrough
;
using
OutElementOp
=
ck
::
tensor_operation
::
element_wise
::
Activation_Mul_Clamp
<
ActivationOp
>
;
static
constexpr
auto
ConvSpec
=
ck
::
tensor_operation
::
device
::
ConvolutionForwardSpecialization
::
Default
;
static
constexpr
auto
GemmSpec
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
MNKPadding
;
template
<
ck
::
index_t
NDimSpatial
,
typename
InLayout
,
typename
WeiLayout
,
typename
OutLayout
>
using
DeviceGroupedConvNDFwdInstance
=
ck
::
tensor_operation
::
device
::
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
NDimSpatial
,
InLayout
,
WeiLayout
,
ck
::
Tuple
<>
,
OutLayout
,
InDataType
,
WeiDataType
,
AccDataType
,
CShuffleDataType
,
ck
::
Tuple
<>
,
OutDataType
,
InElementOp
,
WeiElementOp
,
OutElementOp
,
ConvSpec
,
// ConvForwardSpecialization
GemmSpec
,
// GemmSpecialization
1
,
//
256
,
// BlockSize
128
,
// MPerBlock
256
,
// NPerBlock
64
,
// KPerBlock
16
,
// AK1
16
,
// BK1
32
,
// MPerXdl
32
,
// NPerXdl
2
,
// MXdlPerWave
4
,
// NXdlPerWave
S
<
4
,
64
,
1
>
,
// ABlockTransferThreadClusterLengths_AK0_M_AK1
S
<
1
,
0
,
2
>
,
// ABlockTransferThreadClusterArrangeOrder
S
<
1
,
0
,
2
>
,
// ABlockTransferSrcAccessOrder
2
,
// ABlockTransferSrcVectorDim
16
,
// ABlockTransferSrcScalarPerVector
16
,
// ABlockTransferDstScalarPerVector_AK1
1
,
// ABlockLdsExtraM
S
<
4
,
64
,
1
>
,
// BBlockTransferThreadClusterLengths_BK0_N_BK1
S
<
1
,
0
,
2
>
,
// BBlockTransferThreadClusterArrangeOrder
S
<
1
,
0
,
2
>
,
// BBlockTransferSrcAccessOrder
2
,
// BBlockTransferSrcVectorDim
16
,
// BBlockTransferSrcScalarPerVector
16
,
// BBlockTransferDstScalarPerVector_BK1
1
,
// BBlockLdsExtraN
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
16
>
;
template
<
ck
::
index_t
NDimSpatial
,
template
<
ck
::
index_t
NDimSpatial
,
typename
InDataType
,
typename
InDataType
,
...
@@ -221,10 +139,10 @@ bool run_grouped_conv_fwd(bool do_verification,
...
@@ -221,10 +139,10 @@ bool run_grouped_conv_fwd(bool do_verification,
return
(
pass
?
0
:
1
);
return
(
pass
?
0
:
1
);
}
}
int
main
()
int
run_conv2d_fwd_perlayer_quantization_example
()
{
{
bool
do_verification
=
true
;
bool
do_verification
=
true
;
bool
time_kernel
=
tru
e
;
bool
time_kernel
=
fals
e
;
const
ck
::
index_t
ndim_spatial
=
2
;
const
ck
::
index_t
ndim_spatial
=
2
;
ck
::
utils
::
conv
::
ConvParam
conv_param
{
ck
::
utils
::
conv
::
ConvParam
conv_param
{
...
@@ -232,7 +150,7 @@ int main()
...
@@ -232,7 +150,7 @@ int main()
1
,
// group
1
,
// group
4
,
// batch
4
,
// batch
64
,
// output channels
64
,
// output channels
3
2
,
// input chanels
19
2
,
// input chanels
{
3
,
3
},
// weight HW
{
3
,
3
},
// weight HW
{
71
,
71
},
// x HW
{
71
,
71
},
// x HW
{
2
,
2
},
// strides
{
2
,
2
},
// strides
...
...
example/44_conv2d_fwd_quantization/CMakeLists.txt
deleted
100644 → 0
View file @
ddbeddb4
add_example_executable
(
example_conv2d_fwd_xdl_perchannel_quantization_int8 conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp
)
add_example_executable
(
example_conv2d_fwd_xdl_perlayer_quantization_int8 conv2d_fwd_xdl_perlayer_quantization_int8.cpp
)
add_example_executable
(
example_conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8 conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp
)
include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
View file @
1724f7c8
...
@@ -134,7 +134,8 @@ __global__ void
...
@@ -134,7 +134,8 @@ __global__ void
const
Block2CTileMap
block_2_ctile_map
,
const
Block2CTileMap
block_2_ctile_map
,
const
ComputePtrOffsetOfBatch
compute_ptr_offset_of_batch
)
const
ComputePtrOffsetOfBatch
compute_ptr_offset_of_batch
)
{
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx1030__))
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx1030__) || \
defined(__gfx90a__) || defined(__gfx908__))
// offset base pointer for each work-group
// offset base pointer for each work-group
const
index_t
num_blocks_per_batch
=
const
index_t
num_blocks_per_batch
=
__builtin_amdgcn_readfirstlane
(
get_grid_size
()
/
batch_count
);
__builtin_amdgcn_readfirstlane
(
get_grid_size
()
/
batch_count
);
...
@@ -314,9 +315,8 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
...
@@ -314,9 +315,8 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
const
auto
in_gemmm_gemmk_desc
=
const
auto
in_gemmm_gemmk_desc
=
matrix_padder
.
PadADescriptor_M_K
(
in_gemmmraw_gemmkraw_desc
);
matrix_padder
.
PadADescriptor_M_K
(
in_gemmmraw_gemmkraw_desc
);
const
auto
M
=
in_gemmm_gemmk_desc
.
GetLength
(
I0
);
const
auto
M
=
in_gemmm_gemmk_desc
.
GetLength
(
I0
);
const
auto
K
=
in_gemmm_gemmk_desc
.
GetLength
(
I1
);
const
auto
K
=
in_gemmm_gemmk_desc
.
GetLength
(
I1
);
const
auto
AK0
=
K
/
K1
;
const
auto
AK0
=
K
/
K1
;
return
transform_tensor_descriptor
(
return
transform_tensor_descriptor
(
...
@@ -709,7 +709,8 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
...
@@ -709,7 +709,8 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
namespace
ctc
=
tensor_layout
::
convolution
;
namespace
ctc
=
tensor_layout
::
convolution
;
// check device
// check device
if
(
!
(
ck
::
get_device_name
()
==
"gfx906"
||
ck
::
get_device_name
()
==
"gfx1030"
))
if
(
!
(
ck
::
get_device_name
()
==
"gfx906"
||
ck
::
get_device_name
()
==
"gfx1030"
||
ck
::
get_device_name
()
==
"gfx90a"
||
ck
::
get_device_name
()
==
"gfx908"
))
{
{
return
false
;
return
false
;
}
}
...
@@ -834,6 +835,7 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
...
@@ -834,6 +835,7 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
{
{
return
false
;
return
false
;
}
}
// check Gridwise GEMM
// check Gridwise GEMM
return
GridwiseGemm
::
CheckValidity
(
return
GridwiseGemm
::
CheckValidity
(
arg
.
a_grid_desc_ak0_m_ak1_
,
arg
.
b_grid_desc_bk0_n_bk1_
,
arg
.
e_grid_desc_m_n_
);
arg
.
a_grid_desc_ak0_m_ak1_
,
arg
.
b_grid_desc_bk0_n_bk1_
,
arg
.
e_grid_desc_m_n_
);
...
...
include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp
View file @
1724f7c8
...
@@ -51,7 +51,7 @@ __global__ void
...
@@ -51,7 +51,7 @@ __global__ void
const
Block2CTileMap
block_2_ctile_map
)
const
Block2CTileMap
block_2_ctile_map
)
{
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \
defined(__gfx1030__))
defined(__gfx90a__) ||
defined(__gfx1030__))
constexpr
index_t
shared_block_size
=
constexpr
index_t
shared_block_size
=
GridwiseGemm
::
GetSharedMemoryNumberOfByte
()
/
sizeof
(
ABDataType
);
GridwiseGemm
::
GetSharedMemoryNumberOfByte
()
/
sizeof
(
ABDataType
);
...
@@ -552,7 +552,7 @@ struct DeviceGemmMultipleD_Dl : public DeviceGemmMultipleD<ALayout,
...
@@ -552,7 +552,7 @@ struct DeviceGemmMultipleD_Dl : public DeviceGemmMultipleD<ALayout,
static
bool
IsSupportedArgument
(
const
Argument
&
arg
)
static
bool
IsSupportedArgument
(
const
Argument
&
arg
)
{
{
if
(
ck
::
get_device_name
()
==
"gfx906"
||
ck
::
get_device_name
()
==
"gfx908"
||
if
(
ck
::
get_device_name
()
==
"gfx906"
||
ck
::
get_device_name
()
==
"gfx908"
||
ck
::
get_device_name
()
==
"gfx1030"
)
ck
::
get_device_name
()
==
"gfx90a"
||
ck
::
get_device_name
()
==
"gfx1030"
)
{
{
return
GridwiseGemm
::
CheckValidity
(
return
GridwiseGemm
::
CheckValidity
(
arg
.
a_grid_desc_k0_m_k1_
,
arg
.
b_grid_desc_k0_n_k1_
,
arg
.
e_grid_desc_m_n_
);
arg
.
a_grid_desc_k0_m_k1_
,
arg
.
b_grid_desc_k0_n_k1_
,
arg
.
e_grid_desc_m_n_
);
...
...
include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
0 → 100644
View file @
1724f7c8
This diff is collapsed.
Click to expand it.
include/ck/tensor_operation/gpu/element/quantization_operation.hpp
View file @
1724f7c8
#pragma once
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/data_type.hpp"
// #include "ck/utility/get_id.hpp"
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
...
@@ -17,18 +18,27 @@ struct Activation_Mul_Clamp
...
@@ -17,18 +18,27 @@ struct Activation_Mul_Clamp
__host__
__device__
constexpr
void
operator
()(
int8_t
&
y
,
const
int32_t
&
x
)
const
__host__
__device__
constexpr
void
operator
()(
int8_t
&
y
,
const
int32_t
&
x
)
const
{
{
float
x
_fp32
=
ck
::
type_convert
<
float
>
(
x
);
float
y
_fp32
=
ck
::
type_convert
<
float
>
(
x
);
activationOp_
(
x
_fp32
,
x
_fp32
);
activationOp_
(
y
_fp32
,
y
_fp32
);
float
y_fp32
=
math
::
clamp
(
requantScale_
*
x
_fp32
,
-
128.
f
,
127.
f
);
y_fp32
=
math
::
clamp
(
requantScale_
*
y
_fp32
,
-
128.
f
,
127.
f
);
y
=
ck
::
type_convert
<
int8_t
>
(
y_fp32
);
y
=
ck
::
type_convert
<
int8_t
>
(
y_fp32
);
}
}
__host__
__device__
constexpr
void
operator
()(
floa
t
&
y
,
const
int32_t
&
x
)
const
__device__
constexpr
void
operator
()(
int32_
t
&
y
,
const
int32_t
&
x
)
const
{
{
// We might type_convert to int8 after lambda in someplace
// CAUSION - We might type_convert to int8 in threadwise copy
float
x_fp32
=
ck
::
type_convert
<
float
>
(
x
);
// eg. GridwiseGemmDlMultipleD_km_kn_mn
activationOp_
(
x_fp32
,
x_fp32
);
float
y_fp32
=
ck
::
type_convert
<
float
>
(
x
);
y
=
math
::
clamp
(
requantScale_
*
x_fp32
,
-
128.
f
,
127.
f
);
activationOp_
(
y_fp32
,
y_fp32
);
y_fp32
=
math
::
clamp
(
requantScale_
*
y_fp32
,
-
128.
f
,
127.
f
);
y
=
ck
::
type_convert
<
int32_t
>
(
y_fp32
);
}
__host__
constexpr
void
operator
()(
float
&
y
,
const
float
&
x
)
const
{
// CAUSION - We might float in & float out in reference code
activationOp_
(
y
,
x
);
y
=
math
::
clamp
(
requantScale_
*
y
,
-
128.
f
,
127.
f
);
}
}
float
requantScale_
;
float
requantScale_
;
...
@@ -51,6 +61,17 @@ struct Activation_Mul2_Clamp
...
@@ -51,6 +61,17 @@ struct Activation_Mul2_Clamp
y
=
ck
::
type_convert
<
int8_t
>
(
y_fp32
);
y
=
ck
::
type_convert
<
int8_t
>
(
y_fp32
);
}
}
__device__
constexpr
void
operator
()(
int32_t
&
y
,
const
int32_t
&
x
,
const
float
&
requantScale
)
const
{
// CAUSION - We might type_convert to int8 in threadwise copy
// eg. GridwiseGemmDlMultipleD_km_kn_mn
float
y_fp32
=
ck
::
type_convert
<
float
>
(
x
);
activationOp_
(
y_fp32
,
y_fp32
);
y_fp32
=
math
::
clamp
(
requantScale
*
y_fp32
,
-
128.
f
,
127.
f
);
y
=
ck
::
type_convert
<
int32_t
>
(
y_fp32
);
}
Activation
activationOp_
;
Activation
activationOp_
;
};
};
...
@@ -72,6 +93,17 @@ struct Add_Activation_Mul_Clamp
...
@@ -72,6 +93,17 @@ struct Add_Activation_Mul_Clamp
y
=
ck
::
type_convert
<
int8_t
>
(
y_fp32
);
y
=
ck
::
type_convert
<
int8_t
>
(
y_fp32
);
}
}
__host__
__device__
constexpr
void
operator
()(
int32_t
&
y
,
const
int32_t
&
x
,
const
int32_t
&
bias
)
const
{
// CAUSION - We might type_convert to int8 in threadwise copy
// eg. GridwiseGemmDlMultipleD_km_kn_mn
float
y_fp32
=
ck
::
type_convert
<
float
>
(
x
+
bias
);
activationOp_
(
y_fp32
,
y_fp32
);
y_fp32
=
math
::
clamp
(
requantScale_
*
y_fp32
,
-
128.
f
,
127.
f
);
y
=
ck
::
type_convert
<
int32_t
>
(
y_fp32
);
}
float
requantScale_
;
float
requantScale_
;
Activation
activationOp_
;
Activation
activationOp_
;
};
};
...
@@ -92,6 +124,17 @@ struct Add_Activation_Mul2_Clamp
...
@@ -92,6 +124,17 @@ struct Add_Activation_Mul2_Clamp
y
=
ck
::
type_convert
<
int8_t
>
(
y_fp32
);
y
=
ck
::
type_convert
<
int8_t
>
(
y_fp32
);
}
}
__host__
__device__
constexpr
void
operator
()(
int32_t
&
y
,
const
int32_t
&
x
,
const
int32_t
&
bias
,
const
float
&
requantScale
)
const
{
// CAUSION - We might type_convert to int8 in threadwise copy
// eg. GridwiseGemmDlMultipleD_km_kn_mn
float
y_fp32
=
ck
::
type_convert
<
float
>
(
x
+
bias
);
activationOp_
(
y_fp32
,
y_fp32
);
y_fp32
=
math
::
clamp
(
requantScale
*
y_fp32
,
-
128.
f
,
127.
f
);
y
=
ck
::
type_convert
<
int32_t
>
(
y_fp32
);
}
Activation
activationOp_
;
Activation
activationOp_
;
};
};
...
...
include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_multiple_d.hpp
View file @
1724f7c8
...
@@ -185,8 +185,10 @@ struct GridwiseGemmDlMultipleD_km_kn_mn
...
@@ -185,8 +185,10 @@ struct GridwiseGemmDlMultipleD_km_kn_mn
return
b_grid_desc_k0_n0_n1_k1
;
return
b_grid_desc_k0_n0_n1_k1
;
}
}
// E desc for destination in blockwise copy
template
<
typename
CGridDesc_M_N_
>
__host__
__device__
static
constexpr
auto
__host__
__device__
static
constexpr
auto
MakeCGridDescriptor_M0_M10_M11_N0_N10_N11
(
const
CGridDesc_M_N
&
c_grid_desc_m_n
)
MakeCGridDescriptor_M0_M10_M11_N0_N10_N11
(
const
CGridDesc_M_N
_
&
c_grid_desc_m_n
)
{
{
const
auto
M
=
c_grid_desc_m_n
.
GetLength
(
I0
);
const
auto
M
=
c_grid_desc_m_n
.
GetLength
(
I0
);
const
auto
N
=
c_grid_desc_m_n
.
GetLength
(
I1
);
const
auto
N
=
c_grid_desc_m_n
.
GetLength
(
I1
);
...
@@ -238,19 +240,19 @@ struct GridwiseGemmDlMultipleD_km_kn_mn
...
@@ -238,19 +240,19 @@ struct GridwiseGemmDlMultipleD_km_kn_mn
using
BGridDesc_K0_N0_N1_K1
=
decltype
(
MakeBGridDescriptor_K0_N0_N1_K1
(
BGridDesc_K0_N_K1
{}));
using
BGridDesc_K0_N0_N1_K1
=
decltype
(
MakeBGridDescriptor_K0_N0_N1_K1
(
BGridDesc_K0_N_K1
{}));
using
CGridDesc_M0_M10_M11_N0_N10_N11
=
using
CGridDesc_M0_M10_M11_N0_N10_N11
=
decltype
(
MakeCGridDescriptor_M0_M10_M11_N0_N10_N11
(
CGridDesc_M_N
{}));
decltype
(
MakeCGridDescriptor_M0_M10_M11_N0_N10_N11
(
CGridDesc_M_N
{}));
using
Block2CTileMap
=
decltype
(
MakeDefaultBlock2CTileMap
(
CGridDesc_M_N
{}));
using
DsGridPointer
=
decltype
(
MakeDsGridPointer
());
using
DsGridPointer
=
decltype
(
MakeDsGridPointer
());
template
<
typename
DsGridDesc_M0_M10_M11_N0_N10_N11
,
template
<
typename
DsGridDesc_M0_M10_M11_N0_N10_N11
,
bool
HasMainKBlockLoop
,
bool
HasMainKBlockLoop
,
bool
HasDoubleTailKBlockLoop
>
bool
HasDoubleTailKBlockLoop
,
typename
Block2CTileMap
>
__device__
static
void
__device__
static
void
Run
(
const
FloatAB
*
__restrict__
p_a_grid
,
Run
(
const
FloatAB
*
__restrict__
p_a_grid
,
const
FloatAB
*
__restrict__
p_b_grid
,
const
FloatAB
*
__restrict__
p_b_grid
,
DsGridPointer
p_ds_grid
,
DsGridPointer
p_ds_grid
,
FloatC
*
__restrict__
p_c_grid
,
FloatC
*
__restrict__
p_c_grid
,
FloatAB
*
__restrict__
p_shared_block
,
void
*
__restrict__
p_shared_block
,
const
AElementwiseOperation
&
,
const
AElementwiseOperation
&
,
const
BElementwiseOperation
&
,
const
BElementwiseOperation
&
,
const
CDEElementwiseOperation
&
cde_element_op
,
const
CDEElementwiseOperation
&
cde_element_op
,
...
@@ -399,8 +401,9 @@ struct GridwiseGemmDlMultipleD_km_kn_mn
...
@@ -399,8 +401,9 @@ struct GridwiseGemmDlMultipleD_km_kn_mn
constexpr
auto
b_block_aligned_space_size
=
math
::
integer_least_multiple
(
constexpr
auto
b_block_aligned_space_size
=
math
::
integer_least_multiple
(
b_block_desc_k0_n0_n1_k1
.
GetElementSpaceSize
(),
max_lds_align
);
b_block_desc_k0_n0_n1_k1
.
GetElementSpaceSize
(),
max_lds_align
);
FloatAB
*
p_a_block_double
=
p_shared_block
;
FloatAB
*
p_a_block_double
=
static_cast
<
FloatAB
*>
(
p_shared_block
);
FloatAB
*
p_b_block_double
=
p_shared_block
+
2
*
a_block_aligned_space_size
;
FloatAB
*
p_b_block_double
=
static_cast
<
FloatAB
*>
(
p_shared_block
)
+
2
*
a_block_aligned_space_size
;
// register allocation for output
// register allocation for output
auto
c_thread_buf
=
make_static_buffer
<
AddressSpaceEnum
::
Vgpr
,
FloatAcc
>
(
auto
c_thread_buf
=
make_static_buffer
<
AddressSpaceEnum
::
Vgpr
,
FloatAcc
>
(
...
...
include/ck/utility/inner_product.hpp
View file @
1724f7c8
...
@@ -135,6 +135,28 @@ __device__ void inner_product<half8_t, half8_t, float>(const half8_t& a, const h
...
@@ -135,6 +135,28 @@ __device__ void inner_product<half8_t, half8_t, float>(const half8_t& a, const h
c
);
c
);
}
}
template
<
>
__device__
void
inner_product
<
int8_t
,
int8_t
,
int32_t
>
(
const
int8_t
&
a
,
const
int8_t
&
b
,
int32_t
&
c
)
{
c
+=
type_convert
<
int32_t
>
(
a
)
*
type_convert
<
int32_t
>
(
b
);
}
template
<
>
__device__
void
inner_product
<
int8x2_t
,
int8x2_t
,
int32_t
>
(
const
int8x2_t
&
a
,
const
int8x2_t
&
b
,
int32_t
&
c
)
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
inner_product
(
vector_type
<
int8_t
,
2
>
{
a
}.
AsType
<
int8_t
>
()[
I0
],
vector_type
<
int8_t
,
2
>
{
b
}.
AsType
<
int8_t
>
()[
I0
],
c
);
inner_product
(
vector_type
<
int8_t
,
2
>
{
a
}.
AsType
<
int8_t
>
()[
I1
],
vector_type
<
int8_t
,
2
>
{
b
}.
AsType
<
int8_t
>
()[
I1
],
c
);
}
template
<
>
template
<
>
__device__
void
__device__
void
inner_product
<
int8x4_t
,
int8x4_t
,
int32_t
>
(
const
int8x4_t
&
a
,
const
int8x4_t
&
b
,
int32_t
&
c
)
inner_product
<
int8x4_t
,
int8x4_t
,
int32_t
>
(
const
int8x4_t
&
a
,
const
int8x4_t
&
b
,
int32_t
&
c
)
...
...
library/include/ck/library/tensor_operation_instance/gpu/quantization/gemm_quantization.hpp
0 → 100644
View file @
1724f7c8
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
// Layout(A, B, C) = [Col, Row, Row]
void
add_device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_kn_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemmMultipleD
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
Activation_Mul_Clamp
<
PassThrough
>>>>&
instances
);
// Layout(A, B, C) = [Col, Col, Row]
void
add_device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_nk_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemmMultipleD
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
Activation_Mul_Clamp
<
PassThrough
>>>>&
instances
);
// Layout(A, B, C) = [Row, Row, Row]
void
add_device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_kn_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemmMultipleD
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
Activation_Mul_Clamp
<
PassThrough
>>>>&
instances
);
// Layout(A, B, C) = [Row, Col, Row]
void
add_device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemmMultipleD
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
Activation_Mul_Clamp
<
PassThrough
>>>>&
instances
);
// Layout(A, B, C) = [Col, Row, Row]
void
add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemmMultipleD
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
Activation_Mul_Clamp
<
PassThrough
>>>>&
instances
);
// Layout(A, B, C) = [Col, Col, Row]
void
add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemmMultipleD
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
Activation_Mul_Clamp
<
PassThrough
>>>>&
instances
);
// Layout(A, B, C) = [Row, Row, Row]
void
add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemmMultipleD
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
Activation_Mul_Clamp
<
PassThrough
>>>>&
instances
);
// Layout(A, B, C) = [Row, Col, Row]
void
add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemmMultipleD
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
Activation_Mul_Clamp
<
PassThrough
>>>>&
instances
);
template
<
typename
ALayout
,
typename
BLayout
,
typename
ELayout
,
typename
ADataType
,
typename
BDataType
,
typename
EDataType
,
typename
Activation
>
struct
DeviceOperationInstanceFactory
<
ck
::
tensor_operation
::
device
::
DeviceGemmMultipleD
<
ALayout
,
BLayout
,
Empty_Tuple
,
ELayout
,
ADataType
,
BDataType
,
Empty_Tuple
,
EDataType
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
Activation_Mul_Clamp
<
Activation
>>>
{
using
DeviceOp
=
DeviceGemmMultipleD
<
ALayout
,
BLayout
,
Empty_Tuple
,
ELayout
,
ADataType
,
BDataType
,
Empty_Tuple
,
EDataType
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
Activation_Mul_Clamp
<
Activation
>>
;
static
auto
GetInstances
()
{
std
::
vector
<
std
::
unique_ptr
<
DeviceOp
>>
op_ptrs
;
if
constexpr
(
is_same_v
<
ADataType
,
int8_t
>
&&
is_same_v
<
BDataType
,
int8_t
>
&&
is_same_v
<
EDataType
,
int8_t
>
)
{
if
constexpr
(
is_same_v
<
ALayout
,
Row
>
&&
is_same_v
<
BLayout
,
Row
>
&&
is_same_v
<
ELayout
,
Row
>
)
{
if
constexpr
(
is_same_v
<
Activation
,
PassThrough
>
)
{
add_device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_kn_mn_instances
(
op_ptrs
);
add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances
(
op_ptrs
);
}
}
else
if
constexpr
(
is_same_v
<
ALayout
,
Row
>
&&
is_same_v
<
BLayout
,
Col
>
&&
is_same_v
<
ELayout
,
Row
>
)
{
if
constexpr
(
is_same_v
<
Activation
,
PassThrough
>
)
{
add_device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instances
(
op_ptrs
);
add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances
(
op_ptrs
);
}
}
else
if
constexpr
(
is_same_v
<
ALayout
,
Col
>
&&
is_same_v
<
BLayout
,
Row
>
&&
is_same_v
<
ELayout
,
Row
>
)
{
if
constexpr
(
is_same_v
<
Activation
,
PassThrough
>
)
{
add_device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_kn_mn_instances
(
op_ptrs
);
add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances
(
op_ptrs
);
}
}
else
if
constexpr
(
is_same_v
<
ALayout
,
Col
>
&&
is_same_v
<
BLayout
,
Col
>
&&
is_same_v
<
ELayout
,
Row
>
)
{
if
constexpr
(
is_same_v
<
Activation
,
PassThrough
>
)
{
add_device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_nk_mn_instances
(
op_ptrs
);
add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances
(
op_ptrs
);
}
}
return
op_ptrs
;
}
}
};
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp
View file @
1724f7c8
...
@@ -18,7 +18,7 @@ namespace device {
...
@@ -18,7 +18,7 @@ namespace device {
namespace
instance
{
namespace
instance
{
// grouped conv2d forward, GNHWC/GKYXC/GNHWK
// grouped conv2d forward, GNHWC/GKYXC/GNHWK
void
add_device_conv2d_bias_perchannel_quantization_int8_instances
(
void
add_device_conv2d_
dl_
bias_perchannel_quantization_int8_instances
(
std
::
vector
<
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
GNHWC
,
GNHWC
,
...
@@ -34,7 +34,38 @@ void add_device_conv2d_bias_perchannel_quantization_int8_instances(
...
@@ -34,7 +34,38 @@ void add_device_conv2d_bias_perchannel_quantization_int8_instances(
Add_Activation_Mul2_Clamp
<
PassThrough
>>>>&
Add_Activation_Mul2_Clamp
<
PassThrough
>>>>&
instances
);
instances
);
void
add_device_conv2d_bias_relu_perchannel_quantization_int8_instances
(
void
add_device_conv2d_dl_bias_relu_perchannel_quantization_int8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
GNHWC
,
GKYXC
,
GK_GK_Tuple
,
GNHWK
,
int8_t
,
int8_t
,
I32_F32_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
Add_Activation_Mul2_Clamp
<
Relu
>>>>&
instances
);
void
add_device_conv2d_xdl_bias_perchannel_quantization_int8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
GNHWC
,
GKYXC
,
GK_GK_Tuple
,
GNHWK
,
int8_t
,
int8_t
,
I32_F32_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
Add_Activation_Mul2_Clamp
<
PassThrough
>>>>&
instances
);
void
add_device_conv2d_xdl_bias_relu_perchannel_quantization_int8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
GNHWC
,
GNHWC
,
GKYXC
,
GKYXC
,
...
@@ -98,9 +129,15 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
...
@@ -98,9 +129,15 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
is_same_v
<
DsDataType
,
I32_F32_Tuple
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
is_same_v
<
DsDataType
,
I32_F32_Tuple
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
{
if
constexpr
(
is_same_v
<
Activation
,
PassThrough
>
)
if
constexpr
(
is_same_v
<
Activation
,
PassThrough
>
)
add_device_conv2d_bias_perchannel_quantization_int8_instances
(
op_ptrs
);
{
add_device_conv2d_dl_bias_perchannel_quantization_int8_instances
(
op_ptrs
);
add_device_conv2d_xdl_bias_perchannel_quantization_int8_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
Activation
,
Relu
>
)
else
if
constexpr
(
is_same_v
<
Activation
,
Relu
>
)
add_device_conv2d_bias_relu_perchannel_quantization_int8_instances
(
op_ptrs
);
{
add_device_conv2d_dl_bias_relu_perchannel_quantization_int8_instances
(
op_ptrs
);
add_device_conv2d_xdl_bias_relu_perchannel_quantization_int8_instances
(
op_ptrs
);
}
}
}
}
}
...
...
library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp
View file @
1724f7c8
...
@@ -18,7 +18,7 @@ namespace device {
...
@@ -18,7 +18,7 @@ namespace device {
namespace
instance
{
namespace
instance
{
// grouped conv2d forward, GNHWC/GKYXC/GNHWK
// grouped conv2d forward, GNHWC/GKYXC/GNHWK
void
add_device_conv2d_bias_perlayer_quantization_int8_instances
(
void
add_device_conv2d_
dl_
bias_perlayer_quantization_int8_instances
(
std
::
vector
<
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
GNHWC
,
GNHWC
,
...
@@ -34,7 +34,38 @@ void add_device_conv2d_bias_perlayer_quantization_int8_instances(
...
@@ -34,7 +34,38 @@ void add_device_conv2d_bias_perlayer_quantization_int8_instances(
Add_Activation_Mul_Clamp
<
PassThrough
>>>>&
Add_Activation_Mul_Clamp
<
PassThrough
>>>>&
instances
);
instances
);
void
add_device_conv2d_bias_relu_perlayer_quantization_int8_instances
(
void
add_device_conv2d_dl_bias_relu_perlayer_quantization_int8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
GNHWC
,
GKYXC
,
GK_Tuple
,
GNHWK
,
int8_t
,
int8_t
,
I32_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
Add_Activation_Mul_Clamp
<
Relu
>>>>&
instances
);
void
add_device_conv2d_xdl_bias_perlayer_quantization_int8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
GNHWC
,
GKYXC
,
GK_Tuple
,
GNHWK
,
int8_t
,
int8_t
,
I32_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
Add_Activation_Mul_Clamp
<
PassThrough
>>>>&
instances
);
void
add_device_conv2d_xdl_bias_relu_perlayer_quantization_int8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
GNHWC
,
GNHWC
,
GKYXC
,
GKYXC
,
...
@@ -98,9 +129,15 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
...
@@ -98,9 +129,15 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
is_same_v
<
DsDataType
,
I32_Tuple
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
is_same_v
<
DsDataType
,
I32_Tuple
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
{
if
constexpr
(
is_same_v
<
Activation
,
PassThrough
>
)
if
constexpr
(
is_same_v
<
Activation
,
PassThrough
>
)
add_device_conv2d_bias_perlayer_quantization_int8_instances
(
op_ptrs
);
{
add_device_conv2d_dl_bias_perlayer_quantization_int8_instances
(
op_ptrs
);
add_device_conv2d_xdl_bias_perlayer_quantization_int8_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
Activation
,
Relu
>
)
else
if
constexpr
(
is_same_v
<
Activation
,
Relu
>
)
add_device_conv2d_bias_relu_perlayer_quantization_int8_instances
(
op_ptrs
);
{
add_device_conv2d_dl_bias_relu_perlayer_quantization_int8_instances
(
op_ptrs
);
add_device_conv2d_xdl_bias_relu_perlayer_quantization_int8_instances
(
op_ptrs
);
}
}
}
}
}
...
...
library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perchannel_quantization.hpp
View file @
1724f7c8
...
@@ -18,7 +18,7 @@ namespace device {
...
@@ -18,7 +18,7 @@ namespace device {
namespace
instance
{
namespace
instance
{
// grouped conv2d forward, GNHWC/GKYXC/GNHWK
// grouped conv2d forward, GNHWC/GKYXC/GNHWK
void
add_device_conv2d_perchannel_quantization_int8_instances
(
void
add_device_conv2d_
dl_
perchannel_quantization_int8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
GNHWC
,
GNHWC
,
GKYXC
,
GKYXC
,
...
@@ -33,7 +33,37 @@ void add_device_conv2d_perchannel_quantization_int8_instances(
...
@@ -33,7 +33,37 @@ void add_device_conv2d_perchannel_quantization_int8_instances(
Activation_Mul2_Clamp
<
PassThrough
>>>>&
Activation_Mul2_Clamp
<
PassThrough
>>>>&
instances
);
instances
);
void
add_device_conv2d_relu_perchannel_quantization_int8_instances
(
void
add_device_conv2d_dl_relu_perchannel_quantization_int8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
GNHWC
,
GKYXC
,
GK_Tuple
,
GNHWK
,
int8_t
,
int8_t
,
F32_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
Activation_Mul2_Clamp
<
Relu
>>>>&
instances
);
void
add_device_conv2d_xdl_perchannel_quantization_int8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
GNHWC
,
GKYXC
,
GK_Tuple
,
GNHWK
,
int8_t
,
int8_t
,
F32_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
Activation_Mul2_Clamp
<
PassThrough
>>>>&
instances
);
void
add_device_conv2d_xdl_relu_perchannel_quantization_int8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
GNHWC
,
GNHWC
,
GKYXC
,
GKYXC
,
...
@@ -97,9 +127,15 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
...
@@ -97,9 +127,15 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
is_same_v
<
OutDataType
,
int8_t
>
)
is_same_v
<
OutDataType
,
int8_t
>
)
{
{
if
constexpr
(
is_same_v
<
Activation
,
PassThrough
>
)
if
constexpr
(
is_same_v
<
Activation
,
PassThrough
>
)
add_device_conv2d_perchannel_quantization_int8_instances
(
op_ptrs
);
{
add_device_conv2d_dl_perchannel_quantization_int8_instances
(
op_ptrs
);
add_device_conv2d_xdl_perchannel_quantization_int8_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
Activation
,
Relu
>
)
else
if
constexpr
(
is_same_v
<
Activation
,
Relu
>
)
add_device_conv2d_relu_perchannel_quantization_int8_instances
(
op_ptrs
);
{
add_device_conv2d_dl_relu_perchannel_quantization_int8_instances
(
op_ptrs
);
add_device_conv2d_xdl_relu_perchannel_quantization_int8_instances
(
op_ptrs
);
}
}
}
}
}
...
...
library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perlayer_quantization.hpp
View file @
1724f7c8
...
@@ -18,7 +18,7 @@ namespace device {
...
@@ -18,7 +18,7 @@ namespace device {
namespace
instance
{
namespace
instance
{
// grouped conv2d forward, GNHWC/GKYXC/GNHWK
// grouped conv2d forward, GNHWC/GKYXC/GNHWK
void
add_device_conv2d_perlayer_quantization_int8_instances
(
void
add_device_conv2d_
dl_
perlayer_quantization_int8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
GNHWC
,
GNHWC
,
GKYXC
,
GKYXC
,
...
@@ -33,7 +33,37 @@ void add_device_conv2d_perlayer_quantization_int8_instances(
...
@@ -33,7 +33,37 @@ void add_device_conv2d_perlayer_quantization_int8_instances(
Activation_Mul_Clamp
<
PassThrough
>>>>&
Activation_Mul_Clamp
<
PassThrough
>>>>&
instances
);
instances
);
void
add_device_conv2d_relu_perlayer_quantization_int8_instances
(
void
add_device_conv2d_dl_relu_perlayer_quantization_int8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
GNHWC
,
GKYXC
,
Empty_Tuple
,
GNHWK
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
Activation_Mul_Clamp
<
Relu
>>>>&
instances
);
void
add_device_conv2d_xdl_perlayer_quantization_int8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
GNHWC
,
GKYXC
,
Empty_Tuple
,
GNHWK
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
Activation_Mul_Clamp
<
PassThrough
>>>>&
instances
);
void
add_device_conv2d_xdl_relu_perlayer_quantization_int8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
GNHWC
,
GNHWC
,
GKYXC
,
GKYXC
,
...
@@ -94,9 +124,15 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
...
@@ -94,9 +124,15 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
is_same_v
<
OutDataType
,
int8_t
>
)
is_same_v
<
OutDataType
,
int8_t
>
)
{
{
if
constexpr
(
is_same_v
<
Activation
,
PassThrough
>
)
if
constexpr
(
is_same_v
<
Activation
,
PassThrough
>
)
add_device_conv2d_perlayer_quantization_int8_instances
(
op_ptrs
);
{
add_device_conv2d_dl_perlayer_quantization_int8_instances
(
op_ptrs
);
add_device_conv2d_xdl_perlayer_quantization_int8_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
Activation
,
Relu
>
)
else
if
constexpr
(
is_same_v
<
Activation
,
Relu
>
)
add_device_conv2d_relu_perlayer_quantization_int8_instances
(
op_ptrs
);
{
add_device_conv2d_dl_relu_perlayer_quantization_int8_instances
(
op_ptrs
);
add_device_conv2d_xdl_relu_perlayer_quantization_int8_instances
(
op_ptrs
);
}
}
}
}
}
...
...
library/src/tensor_operation_instance/gpu/quantization/CMakeLists.txt
View file @
1724f7c8
set
(
CONV2D_PERLAYER_QUANT_SRC
conv2d_fwd/device_conv2d_dl_perlayer_quantization_int8_instance.cpp
conv2d_fwd/device_conv2d_xdl_perlayer_quantization_int8_instance.cpp
)
set
(
CONV2D_PERCHANNEL_QUANT_SRC
conv2d_fwd/device_conv2d_dl_perchannel_quantization_int8_instance.cpp
conv2d_fwd/device_conv2d_xdl_perchannel_quantization_int8_instance.cpp
)
set
(
CONV2D_BIAS_PERLAYER_QUANT_SRC
conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp
conv2d_fwd/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp
)
set
(
CONV2D_BIAS_PERCHANNEL_QUANT_SRC
conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp
conv2d_fwd/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp
)
set
(
GEMM_QUANT_SRC
gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
)
add_instance_library
(
device_quantization_instance
add_instance_library
(
device_quantization_instance
device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp
${
CONV2D_PERLAYER_QUANT_SRC
}
device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp
${
CONV2D_PERCHANNEL_QUANT_SRC
}
device_conv2d_xdl_perchannel_quantization_int8_instance.cpp
${
CONV2D_BIAS_PERLAYER_QUANT_SRC
}
device_conv2d_xdl_perlayer_quantization_int8_instance.cpp
${
CONV2D_BIAS_PERCHANNEL_QUANT_SRC
}
${
GEMM_QUANT_SRC
}
)
)
library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp
0 → 100644
View file @
1724f7c8
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
Empty_Tuple
=
ck
::
Tuple
<>
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
GNHWC
=
ck
::
tensor_layout
::
convolution
::
GNHWC
;
using
GKYXC
=
ck
::
tensor_layout
::
convolution
::
GKYXC
;
using
GNHWK
=
ck
::
tensor_layout
::
convolution
::
GNHWK
;
using
GK
=
ck
::
tensor_layout
::
convolution
::
G_K
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
Relu
=
ck
::
tensor_operation
::
element_wise
::
Relu
;
using
GK_Tuple
=
ck
::
Tuple
<
GK
>
;
using
GK_GK_Tuple
=
ck
::
Tuple
<
GK
,
GK
>
;
using
I32_Tuple
=
ck
::
Tuple
<
int32_t
>
;
using
F32_Tuple
=
ck
::
Tuple
<
float
>
;
using
I32_F32_Tuple
=
ck
::
Tuple
<
int32_t
,
float
>
;
using
Mul_Clamp
=
ck
::
tensor_operation
::
element_wise
::
Activation_Mul_Clamp
<
PassThrough
>
;
using
Relu_Mul_Clamp
=
ck
::
tensor_operation
::
element_wise
::
Activation_Mul_Clamp
<
Relu
>
;
using
Add_Mul_Clamp
=
ck
::
tensor_operation
::
element_wise
::
Add_Activation_Mul_Clamp
<
PassThrough
>
;
using
Add_Relu_Mul_Clamp
=
ck
::
tensor_operation
::
element_wise
::
Add_Activation_Mul_Clamp
<
Relu
>
;
using
Mul2_Clamp
=
ck
::
tensor_operation
::
element_wise
::
Activation_Mul2_Clamp
<
PassThrough
>
;
using
Relu_Mul2_Clamp
=
ck
::
tensor_operation
::
element_wise
::
Activation_Mul2_Clamp
<
Relu
>
;
using
Add_Mul2_Clamp
=
ck
::
tensor_operation
::
element_wise
::
Add_Activation_Mul2_Clamp
<
PassThrough
>
;
using
Add_Relu_Mul2_Clamp
=
ck
::
tensor_operation
::
element_wise
::
Add_Activation_Mul2_Clamp
<
Relu
>
;
static
constexpr
ck
::
index_t
NDimSpatial
=
2
;
static
constexpr
auto
GemmSpec
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
MNKPadding
;
static
constexpr
auto
ConvFwdDefault
=
ck
::
tensor_operation
::
device
::
ConvolutionForwardSpecialization
::
Default
;
static
constexpr
auto
ConvFwd1x1P0
=
ck
::
tensor_operation
::
device
::
ConvolutionForwardSpecialization
::
Filter1x1Pad0
;
static
constexpr
auto
ConvFwd1x1S1P0
=
ck
::
tensor_operation
::
device
::
ConvolutionForwardSpecialization
::
Filter1x1Stride1Pad0
;
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp
0 → 100644
View file @
1724f7c8
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "device_conv2d_dl_int8_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
void
add_device_conv2d_dl_bias_perchannel_quantization_int8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
NDimSpatial
,
GNHWC
,
GKYXC
,
GK_GK_Tuple
,
GNHWK
,
int8_t
,
int8_t
,
I32_F32_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
Add_Mul2_Clamp
>>>&
instances
)
{
// dl
add_device_operation_instances
(
instances
,
device_grouped_conv2d_dl_int8_instances
<
GK_GK_Tuple
,
I32_F32_Tuple
,
Add_Mul2_Clamp
,
ConvFwdDefault
,
4
>
{});
add_device_operation_instances
(
instances
,
device_grouped_conv2d_dl_int8_instances
<
GK_GK_Tuple
,
I32_F32_Tuple
,
Add_Mul2_Clamp
,
ConvFwd1x1P0
,
4
>
{});
add_device_operation_instances
(
instances
,
device_grouped_conv2d_dl_int8_instances
<
GK_GK_Tuple
,
I32_F32_Tuple
,
Add_Mul2_Clamp
,
ConvFwd1x1S1P0
,
4
>
{});
}
void
add_device_conv2d_dl_bias_relu_perchannel_quantization_int8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
NDimSpatial
,
GNHWC
,
GKYXC
,
GK_GK_Tuple
,
GNHWK
,
int8_t
,
int8_t
,
I32_F32_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
Add_Relu_Mul2_Clamp
>>>&
instances
)
{
// dl
add_device_operation_instances
(
instances
,
device_grouped_conv2d_dl_int8_instances
<
GK_GK_Tuple
,
I32_F32_Tuple
,
Add_Relu_Mul2_Clamp
,
ConvFwdDefault
,
4
>
{});
add_device_operation_instances
(
instances
,
device_grouped_conv2d_dl_int8_instances
<
GK_GK_Tuple
,
I32_F32_Tuple
,
Add_Relu_Mul2_Clamp
,
ConvFwd1x1P0
,
4
>
{});
add_device_operation_instances
(
instances
,
device_grouped_conv2d_dl_int8_instances
<
GK_GK_Tuple
,
I32_F32_Tuple
,
Add_Relu_Mul2_Clamp
,
ConvFwd1x1S1P0
,
4
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp
0 → 100644
View file @
1724f7c8
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "device_conv2d_dl_int8_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
void
add_device_conv2d_dl_bias_perlayer_quantization_int8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
NDimSpatial
,
GNHWC
,
GKYXC
,
GK_Tuple
,
GNHWK
,
int8_t
,
int8_t
,
I32_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
Add_Mul_Clamp
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_grouped_conv2d_dl_int8_instances
<
GK_Tuple
,
I32_Tuple
,
Add_Mul_Clamp
,
ConvFwdDefault
,
4
>
{});
add_device_operation_instances
(
instances
,
device_grouped_conv2d_dl_int8_instances
<
GK_Tuple
,
I32_Tuple
,
Add_Mul_Clamp
,
ConvFwd1x1P0
,
4
>
{});
add_device_operation_instances
(
instances
,
device_grouped_conv2d_dl_int8_instances
<
GK_Tuple
,
I32_Tuple
,
Add_Mul_Clamp
,
ConvFwd1x1S1P0
,
4
>
{});
}
void
add_device_conv2d_dl_bias_relu_perlayer_quantization_int8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
NDimSpatial
,
GNHWC
,
GKYXC
,
GK_Tuple
,
GNHWK
,
int8_t
,
int8_t
,
I32_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
Add_Relu_Mul_Clamp
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_grouped_conv2d_dl_int8_instances
<
GK_Tuple
,
I32_Tuple
,
Add_Relu_Mul_Clamp
,
ConvFwdDefault
,
4
>
{});
add_device_operation_instances
(
instances
,
device_grouped_conv2d_dl_int8_instances
<
GK_Tuple
,
I32_Tuple
,
Add_Relu_Mul_Clamp
,
ConvFwd1x1P0
,
4
>
{});
add_device_operation_instances
(
instances
,
device_grouped_conv2d_dl_int8_instances
<
GK_Tuple
,
I32_Tuple
,
Add_Relu_Mul_Clamp
,
ConvFwd1x1S1P0
,
4
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment