gaoqiong / composable_kernel · Commits · dd10cb14

Commit dd10cb14, authored Mar 22, 2023 by ltqin

    Merge branch 'develop' into lib_gemm_softmax_gemm_type

Parents: 77404e83 and 8a659a2e
Changed files: 59
Showing 20 changed files with 1113 additions and 158 deletions (+1113 / -158).
Changed files shown on this page:

  example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc            +235  -0
  example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc              +4    -86
  example/44_conv2d_fwd_quantization/CMakeLists.txt                                                +0    -3
  include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp  +7    -5
  include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp                        +2    -2
  include/ck/tensor_operation/gpu/element/quantization_operation.hpp                               +52   -9
  include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_multiple_d.hpp                             +3    -1
  include/ck/utility/data_type.hpp                                                                 +30   -32
  include/ck/utility/inner_product.hpp                                                             +22   -0
  library/include/ck/library/tensor_operation_instance/gpu/quantization/gemm_quantization.hpp      +224  -0
  library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp  +41  -4
  library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp    +41  -4
  library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perchannel_quantization.hpp       +40  -4
  library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perlayer_quantization.hpp         +40  -4
  library/src/tensor_operation_instance/gpu/quantization/CMakeLists.txt                            +36   -4
  library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp +59   -0
  library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp   +82  -0
  library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp     +82  -0
  library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_int8_instance.hpp                                +33  -0
  library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perchannel_quantization_int8_instance.cpp        +80  -0
example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

template <ck::index_t NDimSpatial,
          typename InDataType,
          typename WeiDataType,
          typename OutDataType,
          typename InElementOp,
          typename WeiElementOp,
          typename OutElementOp,
          typename DeviceConvNDFwdInstance>
bool run_grouped_conv_fwd(bool do_verification,
                          bool time_kernel,
                          const ck::utils::conv::ConvParam& conv_param,
                          const HostTensorDescriptor& in_g_n_c_wis_desc,
                          const HostTensorDescriptor& wei_g_k_c_xs_desc,
                          const HostTensorDescriptor& requant_scale_g_k_desc,
                          const HostTensorDescriptor& out_g_n_k_wos_desc,
                          const InElementOp& in_element_op,
                          const WeiElementOp& wei_element_op,
                          const OutElementOp& out_element_op)
{
    Tensor<InDataType> in(in_g_n_c_wis_desc);
    Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
    Tensor<RequantScaleDataType> requant_scale(requant_scale_g_k_desc);
    Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
    Tensor<OutDataType> out_device(out_g_n_k_wos_desc);

    std::cout << "in: " << in.mDesc << std::endl;
    std::cout << "wei: " << wei.mDesc << std::endl;
    std::cout << "requant_scale: " << requant_scale.mDesc << std::endl;
    std::cout << "out: " << out_host.mDesc << std::endl;

    in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-128, 127});
    wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-128, 127});
    requant_scale.GenerateTensorValue(GeneratorTensor_2<RequantScaleDataType>{0, 1});

    DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
    DeviceMem requant_scale_device_buf(sizeof(RequantScaleDataType) *
                                       requant_scale.mDesc.GetElementSpaceSize());
    DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());

    in_device_buf.ToDevice(in.mData.data());
    wei_device_buf.ToDevice(wei.mData.data());
    requant_scale_device_buf.ToDevice(requant_scale.mData.data());

    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
    std::array<ck::index_t, NDimSpatial + 3> d0_g_n_k_wos_lengths{};
    std::array<ck::index_t, NDimSpatial + 3> d0_g_n_k_wos_strides{};
    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
    std::array<ck::index_t, NDimSpatial> input_left_pads{};
    std::array<ck::index_t, NDimSpatial> input_right_pads{};

    auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };

    copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
    copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
    copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
    copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
    copy(requant_scale_g_k_desc.GetLengths(), d0_g_n_k_wos_lengths);
    copy(requant_scale_g_k_desc.GetStrides(), d0_g_n_k_wos_strides);
    copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
    copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
    copy(conv_param.conv_filter_strides_, conv_filter_strides);
    copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
    copy(conv_param.input_left_pads_, input_left_pads);
    copy(conv_param.input_right_pads_, input_right_pads);

    // do Conv
    auto conv     = DeviceConvNDFwdInstance{};
    auto invoker  = conv.MakeInvoker();
    auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
                                      wei_device_buf.GetDeviceBuffer(),
                                      {requant_scale_device_buf.GetDeviceBuffer()},
                                      out_device_buf.GetDeviceBuffer(),
                                      a_g_n_c_wis_lengths,
                                      a_g_n_c_wis_strides,
                                      b_g_k_c_xs_lengths,
                                      b_g_k_c_xs_strides,
                                      {d0_g_n_k_wos_lengths},
                                      {d0_g_n_k_wos_strides},
                                      e_g_n_k_wos_lengths,
                                      e_g_n_k_wos_strides,
                                      conv_filter_strides,
                                      conv_filter_dilations,
                                      input_left_pads,
                                      input_right_pads,
                                      in_element_op,
                                      wei_element_op,
                                      out_element_op);

    if(!conv.IsSupportedArgument(argument))
    {
        throw std::runtime_error(
            "wrong! device_conv with the specified compilation parameters does "
            "not support this Conv problem");
    }

    float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});

    std::size_t flop      = conv_param.GetFlops();
    std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();

    float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
    float gb_per_sec = num_btype / 1.E6 / avg_time;

    std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec
              << " GB/s, " << conv.GetTypeString() << std::endl;

    bool pass = true;

    if(do_verification)
    {
        Tensor<AccDataType> c_host(out_g_n_k_wos_desc);

        auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
                                                                     InDataType,
                                                                     WeiDataType,
                                                                     AccDataType,
                                                                     InElementOp,
                                                                     WeiElementOp,
                                                                     PassThrough>();

        auto ref_invoker  = ref_conv.MakeInvoker();
        auto ref_argument = ref_conv.MakeArgument(in,
                                                  wei,
                                                  c_host,
                                                  conv_param.conv_filter_strides_,
                                                  conv_param.conv_filter_dilations_,
                                                  conv_param.input_left_pads_,
                                                  conv_param.input_right_pads_,
                                                  in_element_op,
                                                  wei_element_op,
                                                  PassThrough{});

        ref_invoker.Run(ref_argument);

        // TODO: implement elementwise operation for host
        out_host.ForEach([&](auto&, auto idx) {
            out_element_op(out_host(idx), c_host(idx), requant_scale(idx));
        });

        out_device_buf.FromDevice(out_device.mData.data());

        pass &= ck::utils::check_err(
            out_device, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
    }

    return (pass ? 0 : 1);
}

int run_conv2d_fwd_perchannel_quantization_example()
{
    bool do_verification = true;
    bool time_kernel     = true;

    const ck::index_t ndim_spatial = 2;

    ck::utils::conv::ConvParam conv_param{ndim_spatial, // n_dim
                                          1,            // group
                                          4,            // batch
                                          64,           // output channels
                                          192,          // input channels
                                          {3, 3},       // weight HW
                                          {71, 71},     // x HW
                                          {2, 2},       // strides
                                          {1, 1},       // dilations
                                          {1, 1},       // left_pads
                                          {1, 1}};      // right_pads

    const auto in_element_op  = InElementOp{};
    const auto wei_element_op = WeiElementOp{};
    const auto out_element_op = OutElementOp{ActivationOp{}};

    using InLayout           = ck::tensor_layout::convolution::GNHWC;
    using WeiLayout          = ck::tensor_layout::convolution::GKYXC;
    using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
    using OutLayout          = ck::tensor_layout::convolution::GNHWK;

    const auto in_g_n_c_wis_desc =
        ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);

    const auto wei_g_k_c_xs_desc =
        ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);

    const auto requant_scale_g_k_desc =
        HostTensorDescriptor({conv_param.G_,
                              conv_param.N_,
                              conv_param.K_,
                              conv_param.output_spatial_lengths_[0],
                              conv_param.output_spatial_lengths_[1]},
                             {conv_param.K_, // g
                              0,             // n
                              1,             // k
                              0,             // ho
                              0});           // wo

    const auto out_g_n_k_wos_desc =
        ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);

    using deviceOp = DeviceGroupedConvNDFwdInstance<ndim_spatial,
                                                    InLayout,
                                                    WeiLayout,
                                                    RequantScaleLayout,
                                                    OutLayout>;

    return run_grouped_conv_fwd<ndim_spatial,
                                InDataType,
                                WeiDataType,
                                OutDataType,
                                InElementOp,
                                WeiElementOp,
                                OutElementOp,
                                deviceOp>(do_verification,
                                          time_kernel,
                                          conv_param,
                                          in_g_n_c_wis_desc,
                                          wei_g_k_c_xs_desc,
                                          requant_scale_g_k_desc,
                                          out_g_n_k_wos_desc,
                                          in_element_op,
                                          wei_element_op,
                                          out_element_op);
}

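Note on the per-channel flow above: the host-side ForEach loop applies out_element_op to the int32 accumulator and the per-channel requant scale, and the broadcast strides {K_, 0, 1, 0, 0} in requant_scale_g_k_desc make one scale per (g, k) apply to every (n, ho, wo) position. The following is a small, self-contained sketch of that arithmetic in plain standard C++ (illustrative only, not CK code; the channel count and values are made up):

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

int main()
{
    const int K = 4;                                       // output channels (hypothetical)
    std::vector<float> scale   = {0.02f, 0.5f, 0.9f, 0.1f}; // one requant scale per channel k
    std::vector<int32_t> acc   = {1000, -300, 77, 2000};    // int32 conv accumulators, one per channel

    std::vector<int8_t> out(K);
    for(int k = 0; k < K; ++k)
    {
        float y = scale[k] * static_cast<float>(acc[k]);  // requantize with the channel's scale
        y       = std::min(std::max(y, -128.f), 127.f);   // clamp to the int8 range
        out[k]  = static_cast<int8_t>(y);                 // narrow to int8 (CK uses type_convert here)
    }

    for(int k = 0; k < K; ++k)
        std::cout << static_cast<int>(out[k]) << (k + 1 < K ? ' ' : '\n'); // prints: 20 -128 69 127
}
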
example/44_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp → example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc (renamed)

 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

-#include "ck/ck.hpp"
+#pragma once
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "ck/library/utility/algorithm.hpp"
-#include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/device_memory.hpp"
-#include "ck/library/utility/host_tensor.hpp"
-#include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/utility/literals.hpp"
-#include "ck/library/utility/convolution_parameter.hpp"
-#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
-
-using InDataType       = int8_t;
-using WeiDataType      = int8_t;
-using AccDataType      = int32_t;
-using CShuffleDataType = int32_t;
-using OutDataType      = int8_t;
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using PassThrough  = ck::tensor_operation::element_wise::PassThrough;
-using InElementOp  = PassThrough;
-using WeiElementOp = PassThrough;
-using ActivationOp = PassThrough;
-using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<ActivationOp>;
-
-static constexpr auto ConvSpec =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
-
-static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-
-template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
-using DeviceGroupedConvNDFwdInstance =
-    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
-        NDimSpatial, InLayout, WeiLayout, ck::Tuple<>, OutLayout,
-        InDataType, WeiDataType, AccDataType, CShuffleDataType, ck::Tuple<>, OutDataType,
-        InElementOp, WeiElementOp, OutElementOp,
-        ConvSpec,    // ConvForwardSpecialization
-        GemmSpec,    // GemmSpecialization
-        1,           //
-        256,         // BlockSize
-        128,         // MPerBlock
-        256,         // NPerBlock
-        64,          // KPerBlock
-        16,          // AK1
-        16,          // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
-        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
-        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
-        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
-        2,           // ABlockTransferSrcVectorDim
-        16,          // ABlockTransferSrcScalarPerVector
-        16,          // ABlockTransferDstScalarPerVector_AK1
-        1,           // ABlockLdsExtraM
-        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
-        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
-        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
-        2,           // BBlockTransferSrcVectorDim
-        16,          // BBlockTransferSrcScalarPerVector
-        16,          // BBlockTransferDstScalarPerVector_BK1
-        1,           // BBlockLdsExtraN
-        1, 1, S<1, 64, 1, 4>, 16>;
-
 template <ck::index_t NDimSpatial,
           typename InDataType,
 ...
@@ -221,10 +139,10 @@ bool run_grouped_conv_fwd(bool do_verification,
     return (pass ? 0 : 1);
 }

-int main()
+int run_conv2d_fwd_perlayer_quantization_example()
 {
     bool do_verification = true;
-    bool time_kernel     = true;
+    bool time_kernel     = false;

     const ck::index_t ndim_spatial = 2;

     ck::utils::conv::ConvParam conv_param{
 ...
@@ -232,7 +150,7 @@ int main()
         1,        // group
         4,        // batch
         64,       // output channels
-        32,       // input channels
+        192,      // input channels
         {3, 3},   // weight HW
         {71, 71}, // x HW
         {2, 2},   // strides
 ...

example/44_conv2d_fwd_quantization/CMakeLists.txt (deleted, mode 100644 → 0)

-add_example_executable(example_conv2d_fwd_xdl_perchannel_quantization_int8 conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp)
-add_example_executable(example_conv2d_fwd_xdl_perlayer_quantization_int8 conv2d_fwd_xdl_perlayer_quantization_int8.cpp)
-add_example_executable(example_conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8 conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp)

include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp

@@ -134,7 +134,8 @@ __global__ void
     const Block2CTileMap block_2_ctile_map,
     const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx1030__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx1030__) || \
+    defined(__gfx90a__) || defined(__gfx908__))
     // offset base pointer for each work-group
     const index_t num_blocks_per_batch =
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
 ...
@@ -314,9 +315,8 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
         const auto in_gemmm_gemmk_desc =
             matrix_padder.PadADescriptor_M_K(in_gemmmraw_gemmkraw_desc);
         const auto M = in_gemmm_gemmk_desc.GetLength(I0);
         const auto K = in_gemmm_gemmk_desc.GetLength(I1);
         const auto AK0 = K / K1;

         return transform_tensor_descriptor(
 ...
@@ -709,7 +709,8 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
         namespace ctc = tensor_layout::convolution;

         // check device
-        if(!(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx1030"))
+        if(!(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx1030" ||
+             ck::get_device_name() == "gfx90a" || ck::get_device_name() == "gfx908"))
         {
             return false;
         }
 ...
@@ -834,6 +835,7 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
         {
             return false;
         }

         // check Gridwise GEMM
         return GridwiseGemm::CheckValidity(
             arg.a_grid_desc_ak0_m_ak1_, arg.b_grid_desc_bk0_n_bk1_, arg.e_grid_desc_m_n_);
 ...

include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp

@@ -51,7 +51,7 @@ __global__ void
     const Block2CTileMap block_2_ctile_map)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \
-    defined(__gfx1030__))
+    defined(__gfx90a__) || defined(__gfx1030__))
     constexpr index_t shared_block_size =
         GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(ABDataType);
 ...
@@ -552,7 +552,7 @@ struct DeviceGemmMultipleD_Dl : public DeviceGemmMultipleD<ALayout,
     static bool IsSupportedArgument(const Argument& arg)
     {
         if(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx908" ||
-           ck::get_device_name() == "gfx1030")
+           ck::get_device_name() == "gfx90a" || ck::get_device_name() == "gfx1030")
         {
             return GridwiseGemm::CheckValidity(
                 arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, arg.e_grid_desc_m_n_);
 ...

include/ck/tensor_operation/gpu/element/quantization_operation.hpp

 #pragma once

 #include "ck/utility/data_type.hpp"
+// #include "ck/utility/get_id.hpp"

 namespace ck {
 namespace tensor_operation {
 ...
@@ -17,18 +18,27 @@ struct Activation_Mul_Clamp
     __host__ __device__ constexpr void operator()(int8_t& y, const int32_t& x) const
     {
-        float x_fp32 = ck::type_convert<float>(x);
-        activationOp_(x_fp32, x_fp32);
-        float y_fp32 = math::clamp(requantScale_ * x_fp32, -128.f, 127.f);
+        float y_fp32 = ck::type_convert<float>(x);
+        activationOp_(y_fp32, y_fp32);
+        y_fp32 = math::clamp(requantScale_ * y_fp32, -128.f, 127.f);
         y = ck::type_convert<int8_t>(y_fp32);
     }

-    __host__ __device__ constexpr void operator()(float& y, const int32_t& x) const
+    __device__ constexpr void operator()(int32_t& y, const int32_t& x) const
     {
-        // We might type_convert to int8 after lambda in someplace
-        float x_fp32 = ck::type_convert<float>(x);
-        activationOp_(x_fp32, x_fp32);
-        y = math::clamp(requantScale_ * x_fp32, -128.f, 127.f);
+        // CAUSION - We might type_convert to int8 in threadwise copy
+        // eg. GridwiseGemmDlMultipleD_km_kn_mn
+        float y_fp32 = ck::type_convert<float>(x);
+        activationOp_(y_fp32, y_fp32);
+        y_fp32 = math::clamp(requantScale_ * y_fp32, -128.f, 127.f);
+        y = ck::type_convert<int32_t>(y_fp32);
+    }
+
+    __host__ constexpr void operator()(float& y, const float& x) const
+    {
+        // CAUSION - We might float in & float out in reference code
+        activationOp_(y, x);
+        y = math::clamp(requantScale_ * y, -128.f, 127.f);
     }

     float requantScale_;
 ...
@@ -51,6 +61,17 @@ struct Activation_Mul2_Clamp
         y = ck::type_convert<int8_t>(y_fp32);
     }

+    __device__ constexpr void
+    operator()(int32_t& y, const int32_t& x, const float& requantScale) const
+    {
+        // CAUSION - We might type_convert to int8 in threadwise copy
+        // eg. GridwiseGemmDlMultipleD_km_kn_mn
+        float y_fp32 = ck::type_convert<float>(x);
+        activationOp_(y_fp32, y_fp32);
+        y_fp32 = math::clamp(requantScale * y_fp32, -128.f, 127.f);
+        y = ck::type_convert<int32_t>(y_fp32);
+    }
+
     Activation activationOp_;
 };
 ...
@@ -72,6 +93,17 @@ struct Add_Activation_Mul_Clamp
         y = ck::type_convert<int8_t>(y_fp32);
     }

+    __host__ __device__ constexpr void
+    operator()(int32_t& y, const int32_t& x, const int32_t& bias) const
+    {
+        // CAUSION - We might type_convert to int8 in threadwise copy
+        // eg. GridwiseGemmDlMultipleD_km_kn_mn
+        float y_fp32 = ck::type_convert<float>(x + bias);
+        activationOp_(y_fp32, y_fp32);
+        y_fp32 = math::clamp(requantScale_ * y_fp32, -128.f, 127.f);
+        y = ck::type_convert<int32_t>(y_fp32);
+    }
+
     float requantScale_;
     Activation activationOp_;
 };
 ...
@@ -92,6 +124,17 @@ struct Add_Activation_Mul2_Clamp
         y = ck::type_convert<int8_t>(y_fp32);
     }

+    __host__ __device__ constexpr void operator()(int32_t& y,
+                                                  const int32_t& x,
+                                                  const int32_t& bias,
+                                                  const float& requantScale) const
+    {
+        // CAUSION - We might type_convert to int8 in threadwise copy
+        // eg. GridwiseGemmDlMultipleD_km_kn_mn
+        float y_fp32 = ck::type_convert<float>(x + bias);
+        activationOp_(y_fp32, y_fp32);
+        y_fp32 = math::clamp(requantScale * y_fp32, -128.f, 127.f);
+        y = ck::type_convert<int32_t>(y_fp32);
+    }
+
     Activation activationOp_;
 };
 ...

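For readers unfamiliar with these requantization functors: every overload follows the same pattern of activation, multiply by a requant scale, clamp to the int8 range, and narrow. Below is a minimal stand-alone sketch of that math in plain C++ (not the CK implementation; a Relu stands in for the templated activationOp_ and the numbers are made up):

#include <algorithm>
#include <cstdint>
#include <iostream>

// Illustrative version of the new int32-in / int32-out overload: the result stays in
// int32 so a later threadwise copy can narrow it to int8.
int32_t activation_mul_clamp_i32(int32_t x, float requant_scale)
{
    float y = static_cast<float>(x);                                // type_convert<float>(x)
    y       = std::max(y, 0.f);                                     // activationOp_ (Relu assumed here)
    y       = std::min(std::max(requant_scale * y, -128.f), 127.f); // math::clamp(..., -128.f, 127.f)
    return static_cast<int32_t>(y);                                 // type_convert<int32_t>(y)
}

int main()
{
    std::cout << activation_mul_clamp_i32(5000, 0.05f) << '\n'; // 250 -> clamped to 127
    std::cout << activation_mul_clamp_i32(-42, 0.05f) << '\n';  // relu -> 0
    std::cout << activation_mul_clamp_i32(900, 0.05f) << '\n';  // 45
}
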
include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_multiple_d.hpp

@@ -185,8 +185,10 @@ struct GridwiseGemmDlMultipleD_km_kn_mn
         return b_grid_desc_k0_n0_n1_k1;
     }

+    // E desc for destination in blockwise copy
+    template <typename CGridDesc_M_N_>
     __host__ __device__ static constexpr auto
-    MakeCGridDescriptor_M0_M10_M11_N0_N10_N11(const CGridDesc_M_N& c_grid_desc_m_n)
+    MakeCGridDescriptor_M0_M10_M11_N0_N10_N11(const CGridDesc_M_N_& c_grid_desc_m_n)
     {
         const auto M = c_grid_desc_m_n.GetLength(I0);
         const auto N = c_grid_desc_m_n.GetLength(I1);
 ...

include/ck/utility/data_type.hpp

@@ -974,38 +974,36 @@ inline __host__ __device__ constexpr bhalf_t type_convert<bhalf_t, float>(float
         uint32_t int32;
     } u = {x};

-    if(~u.int32 & 0x7f800000)
-    {
-        // When the exponent bits are not all 1s, then the value is zero, normal,
-        // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus
-        // 1 if the least significant bit of the bfloat16 mantissa is 1 (odd).
-        // This causes the bfloat16's mantissa to be incremented by 1 if the 16
-        // least significant bits of the float mantissa are greater than 0x8000,
-        // or if they are equal to 0x8000 and the least significant bit of the
-        // bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when
-        // the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already
-        // has the value 0x7f, then incrementing it causes it to become 0x00 and
-        // the exponent is incremented by one, which is the next higher FP value
-        // to the unrounded bfloat16 value. When the bfloat16 value is subnormal
-        // with an exponent of 0x00 and a mantissa of 0x7f, it may be rounded up
-        // to a normal value with an exponent of 0x01 and a mantissa of 0x00.
-        // When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F,
-        // incrementing it causes it to become an exponent of 0xFF and a mantissa
-        // of 0x00, which is Inf, the next higher value to the unrounded value.
-        u.int32 += 0x7fff + ((u.int32 >> 16) & 1); // Round to nearest, round to even
-    }
-    else if(u.int32 & 0xffff)
-    {
-        // When all of the exponent bits are 1, the value is Inf or NaN.
-        // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero
-        // mantissa bit. Quiet NaN is indicated by the most significant mantissa
-        // bit being 1. Signaling NaN is indicated by the most significant
-        // mantissa bit being 0 but some other bit(s) being 1. If any of the
-        // lower 16 bits of the mantissa are 1, we set the least significant bit
-        // of the bfloat16 mantissa, in order to preserve signaling NaN in case
-        // the bfloat16's mantissa bits are all 0.
-        u.int32 |= 0x10000; // Preserve signaling NaN
-    }
+    // When the exponent bits are not all 1s, then the value is zero, normal,
+    // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus
+    // 1 if the least significant bit of the bfloat16 mantissa is 1 (odd).
+    // This causes the bfloat16's mantissa to be incremented by 1 if the 16
+    // least significant bits of the float mantissa are greater than 0x8000,
+    // or if they are equal to 0x8000 and the least significant bit of the
+    // bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when
+    // the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already
+    // has the value 0x7f, then incrementing it causes it to become 0x00 and
+    // the exponent is incremented by one, which is the next higher FP value
+    // to the unrounded bfloat16 value. When the bfloat16 value is subnormal
+    // with an exponent of 0x00 and a mantissa of 0x7F, it may be rounded up
+    // to a normal value with an exponent of 0x01 and a mantissa of 0x00.
+    // When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F,
+    // incrementing it causes it to become an exponent of 0xFF and a mantissa
+    // of 0x00, which is Inf, the next higher value to the unrounded value.
+    bool flag0 = ~u.int32 & 0x7f800000;
+
+    // When all of the exponent bits are 1, the value is Inf or NaN.
+    // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero
+    // mantissa bit. Quiet NaN is indicated by the most significant mantissa
+    // bit being 1. Signaling NaN is indicated by the most significant
+    // mantissa bit being 0 but some other bit(s) being 1. If any of the
+    // lower 16 bits of the mantissa are 1, we set the least significant bit
+    // of the bfloat16 mantissa, in order to preserve signaling NaN in case
+    // the bloat16's mantissa bits are all 0.
+    bool flag1 = !flag0 && (u.int32 & 0xffff);
+
+    u.int32 += flag0 ? 0x7fff + ((u.int32 >> 16) & 1) : 0; // Round to nearest, round to even
+    u.int32 |= flag1 ? 0x10000 : 0x0;                      // Preserve signaling NaN

     return uint16_t(u.int32 >> 16);
 }
 ...

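The change above keeps the same float-to-bfloat16 rounding behaviour (round to nearest, ties to even, with signaling-NaN preservation) but expresses it branchlessly with two flags instead of an if/else chain. A minimal stand-alone sketch of that arithmetic, in plain C++ with std::memcpy in place of the union used in ck (illustrative only, not the library code):

#include <cstdint>
#include <cstdio>
#include <cstring>

uint16_t float_to_bf16_rne(float x)
{
    uint32_t u;
    std::memcpy(&u, &x, sizeof(u));

    bool not_inf_nan    = ~u & 0x7f800000;          // exponent bits are not all 1s
    bool lower_nan_bits = !not_inf_nan && (u & 0xffff); // NaN payload lives only in the low 16 bits

    u += not_inf_nan ? 0x7fff + ((u >> 16) & 1) : 0; // round to nearest, ties to even
    u |= lower_nan_bits ? 0x10000 : 0x0;             // keep the NaN signaling after truncation

    return static_cast<uint16_t>(u >> 16);
}

int main()
{
    std::printf("%04x\n", float_to_bf16_rne(1.0f));        // 3f80
    std::printf("%04x\n", float_to_bf16_rne(1.00390625f)); // exact tie: rounds to the even value 3f80
}
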
include/ck/utility/inner_product.hpp

@@ -135,6 +135,28 @@ __device__ void inner_product<half8_t, half8_t, float>(const half8_t& a, const h
                   c);
 }

+template <>
+__device__ void
+inner_product<int8_t, int8_t, int32_t>(const int8_t& a, const int8_t& b, int32_t& c)
+{
+    c += type_convert<int32_t>(a) * type_convert<int32_t>(b);
+}
+
+template <>
+__device__ void
+inner_product<int8x2_t, int8x2_t, int32_t>(const int8x2_t& a, const int8x2_t& b, int32_t& c)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+
+    inner_product(vector_type<int8_t, 2>{a}.AsType<int8_t>()[I0],
+                  vector_type<int8_t, 2>{b}.AsType<int8_t>()[I0],
+                  c);
+
+    inner_product(vector_type<int8_t, 2>{a}.AsType<int8_t>()[I1],
+                  vector_type<int8_t, 2>{b}.AsType<int8_t>()[I1],
+                  c);
+}
+
 template <>
 __device__ void
 inner_product<int8x4_t, int8x4_t, int32_t>(const int8x4_t& a, const int8x4_t& b, int32_t& c)
 ...

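The new specializations widen each int8 operand to int32 and accumulate, and the int8x2_t version simply unrolls into two scalar calls. A stand-alone host-side sketch of the same accumulation (plain C++, not the __device__ code above; std::array stands in for ck's vector_type):

#include <array>
#include <cstdint>
#include <iostream>

void inner_product_i8(int8_t a, int8_t b, int32_t& c) { c += int32_t{a} * int32_t{b}; }

void inner_product_i8x2(const std::array<int8_t, 2>& a, const std::array<int8_t, 2>& b, int32_t& c)
{
    // unrolled exactly like the int8x2_t specialization: two scalar inner products into c
    inner_product_i8(a[0], b[0], c);
    inner_product_i8(a[1], b[1], c);
}

int main()
{
    int32_t c = 0;
    inner_product_i8x2({-128, 127}, {127, 127}, c);
    std::cout << c << '\n'; // -128*127 + 127*127 = -127
}
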
library/include/ck/library/tensor_operation_instance/gpu/quantization/gemm_quantization.hpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Layout(A, B, C) = [Col, Row, Row]
void add_device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_kn_mn_instances(
    std::vector<std::unique_ptr<
        DeviceGemmMultipleD<Col, Row, Empty_Tuple, Row, int8_t, int8_t, Empty_Tuple, int8_t,
                            PassThrough, PassThrough, Activation_Mul_Clamp<PassThrough>>>>& instances);

// Layout(A, B, C) = [Col, Col, Row]
void add_device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_nk_mn_instances(
    std::vector<std::unique_ptr<
        DeviceGemmMultipleD<Col, Col, Empty_Tuple, Row, int8_t, int8_t, Empty_Tuple, int8_t,
                            PassThrough, PassThrough, Activation_Mul_Clamp<PassThrough>>>>& instances);

// Layout(A, B, C) = [Row, Row, Row]
void add_device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_kn_mn_instances(
    std::vector<std::unique_ptr<
        DeviceGemmMultipleD<Row, Row, Empty_Tuple, Row, int8_t, int8_t, Empty_Tuple, int8_t,
                            PassThrough, PassThrough, Activation_Mul_Clamp<PassThrough>>>>& instances);

// Layout(A, B, C) = [Row, Col, Row]
void add_device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instances(
    std::vector<std::unique_ptr<
        DeviceGemmMultipleD<Row, Col, Empty_Tuple, Row, int8_t, int8_t, Empty_Tuple, int8_t,
                            PassThrough, PassThrough, Activation_Mul_Clamp<PassThrough>>>>& instances);

// Layout(A, B, C) = [Col, Row, Row]
void add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances(
    std::vector<std::unique_ptr<
        DeviceGemmMultipleD<Col, Row, Empty_Tuple, Row, int8_t, int8_t, Empty_Tuple, int8_t,
                            PassThrough, PassThrough, Activation_Mul_Clamp<PassThrough>>>>& instances);

// Layout(A, B, C) = [Col, Col, Row]
void add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances(
    std::vector<std::unique_ptr<
        DeviceGemmMultipleD<Col, Col, Empty_Tuple, Row, int8_t, int8_t, Empty_Tuple, int8_t,
                            PassThrough, PassThrough, Activation_Mul_Clamp<PassThrough>>>>& instances);

// Layout(A, B, C) = [Row, Row, Row]
void add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances(
    std::vector<std::unique_ptr<
        DeviceGemmMultipleD<Row, Row, Empty_Tuple, Row, int8_t, int8_t, Empty_Tuple, int8_t,
                            PassThrough, PassThrough, Activation_Mul_Clamp<PassThrough>>>>& instances);

// Layout(A, B, C) = [Row, Col, Row]
void add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances(
    std::vector<std::unique_ptr<
        DeviceGemmMultipleD<Row, Col, Empty_Tuple, Row, int8_t, int8_t, Empty_Tuple, int8_t,
                            PassThrough, PassThrough, Activation_Mul_Clamp<PassThrough>>>>& instances);

template <typename ALayout,
          typename BLayout,
          typename ELayout,
          typename ADataType,
          typename BDataType,
          typename EDataType,
          typename Activation>
struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMultipleD<
    ALayout,
    BLayout,
    Empty_Tuple,
    ELayout,
    ADataType,
    BDataType,
    Empty_Tuple,
    EDataType,
    ck::tensor_operation::element_wise::PassThrough,
    ck::tensor_operation::element_wise::PassThrough,
    Activation_Mul_Clamp<Activation>>>
{
    using DeviceOp = DeviceGemmMultipleD<ALayout,
                                         BLayout,
                                         Empty_Tuple,
                                         ELayout,
                                         ADataType,
                                         BDataType,
                                         Empty_Tuple,
                                         EDataType,
                                         ck::tensor_operation::element_wise::PassThrough,
                                         ck::tensor_operation::element_wise::PassThrough,
                                         Activation_Mul_Clamp<Activation>>;

    static auto GetInstances()
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;

        if constexpr(is_same_v<ADataType, int8_t> && is_same_v<BDataType, int8_t> &&
                     is_same_v<EDataType, int8_t>)
        {
            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
                         is_same_v<ELayout, Row>)
            {
                if constexpr(is_same_v<Activation, PassThrough>)
                {
                    add_device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_kn_mn_instances(op_ptrs);
                    add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances(op_ptrs);
                }
            }
            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
                              is_same_v<ELayout, Row>)
            {
                if constexpr(is_same_v<Activation, PassThrough>)
                {
                    add_device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instances(op_ptrs);
                    add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances(op_ptrs);
                }
            }
            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
                              is_same_v<ELayout, Row>)
            {
                if constexpr(is_same_v<Activation, PassThrough>)
                {
                    add_device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_kn_mn_instances(op_ptrs);
                    add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances(op_ptrs);
                }
            }
            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
                              is_same_v<ELayout, Row>)
            {
                if constexpr(is_same_v<Activation, PassThrough>)
                {
                    add_device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_nk_mn_instances(op_ptrs);
                    add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances(op_ptrs);
                }
            }

            return op_ptrs;
        }
    }
};

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

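A usage sketch of this factory, under the assumption that the CK library headers and the device_quantization_instance target are available to link against (it is not part of the commit and has not been compiled here). It enumerates the registered int8 row-by-column GEMM-quantization instances and prints each instance's type string:

#include <iostream>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/quantization/gemm_quantization.hpp"

int main()
{
    using ck::tensor_operation::device::DeviceGemmMultipleD;
    using Row          = ck::tensor_layout::gemm::RowMajor;
    using Col          = ck::tensor_layout::gemm::ColumnMajor;
    using PassThrough  = ck::tensor_operation::element_wise::PassThrough;
    using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<PassThrough>;

    // DeviceOp matches the factory specialization declared in gemm_quantization.hpp
    using DeviceOp = DeviceGemmMultipleD<Row, Col, ck::Tuple<>, Row,
                                         int8_t, int8_t, ck::Tuple<>, int8_t,
                                         PassThrough, PassThrough, OutElementOp>;

    const auto op_ptrs = ck::tensor_operation::device::instance::
        DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

    std::cout << "found " << op_ptrs.size() << " instances\n";
    for(const auto& op : op_ptrs)
        std::cout << op->GetTypeString() << '\n';
}
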
library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp

@@ -18,7 +18,7 @@ namespace device {
 namespace instance {

 // grouped conv2d forward, GNHWC/GKYXC/GNHWK
-void add_device_conv2d_bias_perchannel_quantization_int8_instances(
+void add_device_conv2d_dl_bias_perchannel_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
                                                               GNHWC,
 ...
@@ -34,7 +34,38 @@ void add_device_conv2d_bias_perchannel_quantization_int8_instances(
                                                               Add_Activation_Mul2_Clamp<PassThrough>>>>&
         instances);

-void add_device_conv2d_bias_relu_perchannel_quantization_int8_instances(
+void add_device_conv2d_dl_bias_relu_perchannel_quantization_int8_instances(
+    std::vector<std::unique_ptr<
+        DeviceGroupedConvFwdMultipleD<2, GNHWC, GKYXC, GK_GK_Tuple, GNHWK, int8_t, int8_t,
+                                      I32_F32_Tuple, int8_t, PassThrough, PassThrough,
+                                      Add_Activation_Mul2_Clamp<Relu>>>>& instances);
+
+void add_device_conv2d_xdl_bias_perchannel_quantization_int8_instances(
+    std::vector<std::unique_ptr<
+        DeviceGroupedConvFwdMultipleD<2, GNHWC, GKYXC, GK_GK_Tuple, GNHWK, int8_t, int8_t,
+                                      I32_F32_Tuple, int8_t, PassThrough, PassThrough,
+                                      Add_Activation_Mul2_Clamp<PassThrough>>>>& instances);
+
+void add_device_conv2d_xdl_bias_relu_perchannel_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
                                                               GNHWC,
                                                               GKYXC,
 ...
@@ -98,9 +129,15 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                      is_same_v<DsDataType, I32_F32_Tuple> && is_same_v<OutDataType, int8_t>)
         {
             if constexpr(is_same_v<Activation, PassThrough>)
-                add_device_conv2d_bias_perchannel_quantization_int8_instances(op_ptrs);
+            {
+                add_device_conv2d_dl_bias_perchannel_quantization_int8_instances(op_ptrs);
+                add_device_conv2d_xdl_bias_perchannel_quantization_int8_instances(op_ptrs);
+            }
             else if constexpr(is_same_v<Activation, Relu>)
-                add_device_conv2d_bias_relu_perchannel_quantization_int8_instances(op_ptrs);
+            {
+                add_device_conv2d_dl_bias_relu_perchannel_quantization_int8_instances(op_ptrs);
+                add_device_conv2d_xdl_bias_relu_perchannel_quantization_int8_instances(op_ptrs);
+            }
         }
     }
 ...

library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp

@@ -18,7 +18,7 @@ namespace device {
 namespace instance {

 // grouped conv2d forward, GNHWC/GKYXC/GNHWK
-void add_device_conv2d_bias_perlayer_quantization_int8_instances(
+void add_device_conv2d_dl_bias_perlayer_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
                                                               GNHWC,
 ...
@@ -34,7 +34,38 @@ void add_device_conv2d_bias_perlayer_quantization_int8_instances(
                                                               Add_Activation_Mul_Clamp<PassThrough>>>>&
         instances);

-void add_device_conv2d_bias_relu_perlayer_quantization_int8_instances(
+void add_device_conv2d_dl_bias_relu_perlayer_quantization_int8_instances(
+    std::vector<std::unique_ptr<
+        DeviceGroupedConvFwdMultipleD<2, GNHWC, GKYXC, GK_Tuple, GNHWK, int8_t, int8_t,
+                                      I32_Tuple, int8_t, PassThrough, PassThrough,
+                                      Add_Activation_Mul_Clamp<Relu>>>>& instances);
+
+void add_device_conv2d_xdl_bias_perlayer_quantization_int8_instances(
+    std::vector<std::unique_ptr<
+        DeviceGroupedConvFwdMultipleD<2, GNHWC, GKYXC, GK_Tuple, GNHWK, int8_t, int8_t,
+                                      I32_Tuple, int8_t, PassThrough, PassThrough,
+                                      Add_Activation_Mul_Clamp<PassThrough>>>>& instances);
+
+void add_device_conv2d_xdl_bias_relu_perlayer_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
                                                               GNHWC,
                                                               GKYXC,
 ...
@@ -98,9 +129,15 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                      is_same_v<DsDataType, I32_Tuple> && is_same_v<OutDataType, int8_t>)
         {
             if constexpr(is_same_v<Activation, PassThrough>)
-                add_device_conv2d_bias_perlayer_quantization_int8_instances(op_ptrs);
+            {
+                add_device_conv2d_dl_bias_perlayer_quantization_int8_instances(op_ptrs);
+                add_device_conv2d_xdl_bias_perlayer_quantization_int8_instances(op_ptrs);
+            }
             else if constexpr(is_same_v<Activation, Relu>)
-                add_device_conv2d_bias_relu_perlayer_quantization_int8_instances(op_ptrs);
+            {
+                add_device_conv2d_dl_bias_relu_perlayer_quantization_int8_instances(op_ptrs);
+                add_device_conv2d_xdl_bias_relu_perlayer_quantization_int8_instances(op_ptrs);
+            }
         }
     }
 ...

library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perchannel_quantization.hpp

@@ -18,7 +18,7 @@ namespace device {
 namespace instance {

 // grouped conv2d forward, GNHWC/GKYXC/GNHWK
-void add_device_conv2d_perchannel_quantization_int8_instances(
+void add_device_conv2d_dl_perchannel_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
                                                               GNHWC,
                                                               GKYXC,
 ...
@@ -33,7 +33,37 @@ void add_device_conv2d_perchannel_quantization_int8_instances(
                                                               Activation_Mul2_Clamp<PassThrough>>>>&
         instances);

-void add_device_conv2d_relu_perchannel_quantization_int8_instances(
+void add_device_conv2d_dl_relu_perchannel_quantization_int8_instances(
+    std::vector<std::unique_ptr<
+        DeviceGroupedConvFwdMultipleD<2, GNHWC, GKYXC, GK_Tuple, GNHWK, int8_t, int8_t,
+                                      F32_Tuple, int8_t, PassThrough, PassThrough,
+                                      Activation_Mul2_Clamp<Relu>>>>& instances);
+
+void add_device_conv2d_xdl_perchannel_quantization_int8_instances(
+    std::vector<std::unique_ptr<
+        DeviceGroupedConvFwdMultipleD<2, GNHWC, GKYXC, GK_Tuple, GNHWK, int8_t, int8_t,
+                                      F32_Tuple, int8_t, PassThrough, PassThrough,
+                                      Activation_Mul2_Clamp<PassThrough>>>>& instances);
+
+void add_device_conv2d_xdl_relu_perchannel_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
                                                               GNHWC,
                                                               GKYXC,
 ...
@@ -97,9 +127,15 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                      is_same_v<OutDataType, int8_t>)
         {
             if constexpr(is_same_v<Activation, PassThrough>)
-                add_device_conv2d_perchannel_quantization_int8_instances(op_ptrs);
+            {
+                add_device_conv2d_dl_perchannel_quantization_int8_instances(op_ptrs);
+                add_device_conv2d_xdl_perchannel_quantization_int8_instances(op_ptrs);
+            }
             else if constexpr(is_same_v<Activation, Relu>)
-                add_device_conv2d_relu_perchannel_quantization_int8_instances(op_ptrs);
+            {
+                add_device_conv2d_dl_relu_perchannel_quantization_int8_instances(op_ptrs);
+                add_device_conv2d_xdl_relu_perchannel_quantization_int8_instances(op_ptrs);
+            }
         }
     }
 ...

library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perlayer_quantization.hpp

@@ -18,7 +18,7 @@ namespace device {
 namespace instance {

 // grouped conv2d forward, GNHWC/GKYXC/GNHWK
-void add_device_conv2d_perlayer_quantization_int8_instances(
+void add_device_conv2d_dl_perlayer_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
                                                               GNHWC,
                                                               GKYXC,
 ...
@@ -33,7 +33,37 @@ void add_device_conv2d_perlayer_quantization_int8_instances(
                                                               Activation_Mul_Clamp<PassThrough>>>>&
         instances);

-void add_device_conv2d_relu_perlayer_quantization_int8_instances(
+void add_device_conv2d_dl_relu_perlayer_quantization_int8_instances(
+    std::vector<std::unique_ptr<
+        DeviceGroupedConvFwdMultipleD<2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t,
+                                      Empty_Tuple, int8_t, PassThrough, PassThrough,
+                                      Activation_Mul_Clamp<Relu>>>>& instances);
+
+void add_device_conv2d_xdl_perlayer_quantization_int8_instances(
+    std::vector<std::unique_ptr<
+        DeviceGroupedConvFwdMultipleD<2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t,
+                                      Empty_Tuple, int8_t, PassThrough, PassThrough,
+                                      Activation_Mul_Clamp<PassThrough>>>>& instances);
+
+void add_device_conv2d_xdl_relu_perlayer_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
                                                               GNHWC,
                                                               GKYXC,
 ...
@@ -94,9 +124,15 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                      is_same_v<OutDataType, int8_t>)
         {
             if constexpr(is_same_v<Activation, PassThrough>)
-                add_device_conv2d_perlayer_quantization_int8_instances(op_ptrs);
+            {
+                add_device_conv2d_dl_perlayer_quantization_int8_instances(op_ptrs);
+                add_device_conv2d_xdl_perlayer_quantization_int8_instances(op_ptrs);
+            }
             else if constexpr(is_same_v<Activation, Relu>)
-                add_device_conv2d_relu_perlayer_quantization_int8_instances(op_ptrs);
+            {
+                add_device_conv2d_dl_relu_perlayer_quantization_int8_instances(op_ptrs);
+                add_device_conv2d_xdl_relu_perlayer_quantization_int8_instances(op_ptrs);
+            }
         }
     }
 ...

library/src/tensor_operation_instance/gpu/quantization/CMakeLists.txt

+set(CONV2D_PERLAYER_QUANT_SRC
+    conv2d_fwd/device_conv2d_dl_perlayer_quantization_int8_instance.cpp
+    conv2d_fwd/device_conv2d_xdl_perlayer_quantization_int8_instance.cpp
+)
+
+set(CONV2D_PERCHANNEL_QUANT_SRC
+    conv2d_fwd/device_conv2d_dl_perchannel_quantization_int8_instance.cpp
+    conv2d_fwd/device_conv2d_xdl_perchannel_quantization_int8_instance.cpp
+)
+
+set(CONV2D_BIAS_PERLAYER_QUANT_SRC
+    conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp
+    conv2d_fwd/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp
+)
+
+set(CONV2D_BIAS_PERCHANNEL_QUANT_SRC
+    conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp
+    conv2d_fwd/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp
+)
+
+set(GEMM_QUANT_SRC
+    gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
+    gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
+    gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
+    gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
+    gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
+    gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
+    gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
+    gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
+)
+
 add_instance_library(device_quantization_instance
-    device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp
-    device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp
-    device_conv2d_xdl_perchannel_quantization_int8_instance.cpp
-    device_conv2d_xdl_perlayer_quantization_int8_instance.cpp
+    ${CONV2D_PERLAYER_QUANT_SRC}
+    ${CONV2D_PERCHANNEL_QUANT_SRC}
+    ${CONV2D_BIAS_PERLAYER_QUANT_SRC}
+    ${CONV2D_BIAS_PERCHANNEL_QUANT_SRC}
+    ${GEMM_QUANT_SRC}
 )

library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using Empty_Tuple = ck::Tuple<>;

template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using GNHWC = ck::tensor_layout::convolution::GNHWC;
using GKYXC = ck::tensor_layout::convolution::GKYXC;
using GNHWK = ck::tensor_layout::convolution::GNHWK;
using GK    = ck::tensor_layout::convolution::G_K;

using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Relu        = ck::tensor_operation::element_wise::Relu;

using GK_Tuple      = ck::Tuple<GK>;
using GK_GK_Tuple   = ck::Tuple<GK, GK>;
using I32_Tuple     = ck::Tuple<int32_t>;
using F32_Tuple     = ck::Tuple<float>;
using I32_F32_Tuple = ck::Tuple<int32_t, float>;

using Mul_Clamp      = ck::tensor_operation::element_wise::Activation_Mul_Clamp<PassThrough>;
using Relu_Mul_Clamp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<Relu>;

using Add_Mul_Clamp      = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<PassThrough>;
using Add_Relu_Mul_Clamp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<Relu>;

using Mul2_Clamp      = ck::tensor_operation::element_wise::Activation_Mul2_Clamp<PassThrough>;
using Relu_Mul2_Clamp = ck::tensor_operation::element_wise::Activation_Mul2_Clamp<Relu>;

using Add_Mul2_Clamp      = ck::tensor_operation::element_wise::Add_Activation_Mul2_Clamp<PassThrough>;
using Add_Relu_Mul2_Clamp = ck::tensor_operation::element_wise::Add_Activation_Mul2_Clamp<Relu>;

static constexpr ck::index_t NDimSpatial = 2;

static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;

static constexpr auto ConvFwdDefault =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;

static constexpr auto ConvFwd1x1P0 =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0;

static constexpr auto ConvFwd1x1S1P0 =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0;

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include "device_conv2d_dl_int8_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

void add_device_conv2d_dl_bias_perchannel_quantization_int8_instances(
    std::vector<std::unique_ptr<
        DeviceGroupedConvFwdMultipleD<NDimSpatial, GNHWC, GKYXC, GK_GK_Tuple, GNHWK, int8_t, int8_t,
                                      I32_F32_Tuple, int8_t, PassThrough, PassThrough,
                                      Add_Mul2_Clamp>>>& instances)
{
    // dl
    add_device_operation_instances(
        instances,
        device_grouped_conv2d_dl_int8_instances<GK_GK_Tuple, I32_F32_Tuple, Add_Mul2_Clamp, ConvFwdDefault, 4>{});
    add_device_operation_instances(
        instances,
        device_grouped_conv2d_dl_int8_instances<GK_GK_Tuple, I32_F32_Tuple, Add_Mul2_Clamp, ConvFwd1x1P0, 4>{});
    add_device_operation_instances(
        instances,
        device_grouped_conv2d_dl_int8_instances<GK_GK_Tuple, I32_F32_Tuple, Add_Mul2_Clamp, ConvFwd1x1S1P0, 4>{});
}

void add_device_conv2d_dl_bias_relu_perchannel_quantization_int8_instances(
    std::vector<std::unique_ptr<
        DeviceGroupedConvFwdMultipleD<NDimSpatial, GNHWC, GKYXC, GK_GK_Tuple, GNHWK, int8_t, int8_t,
                                      I32_F32_Tuple, int8_t, PassThrough, PassThrough,
                                      Add_Relu_Mul2_Clamp>>>& instances)
{
    // dl
    add_device_operation_instances(
        instances,
        device_grouped_conv2d_dl_int8_instances<GK_GK_Tuple, I32_F32_Tuple, Add_Relu_Mul2_Clamp, ConvFwdDefault, 4>{});
    add_device_operation_instances(
        instances,
        device_grouped_conv2d_dl_int8_instances<GK_GK_Tuple, I32_F32_Tuple, Add_Relu_Mul2_Clamp, ConvFwd1x1P0, 4>{});
    add_device_operation_instances(
        instances,
        device_grouped_conv2d_dl_int8_instances<GK_GK_Tuple, I32_F32_Tuple, Add_Relu_Mul2_Clamp, ConvFwd1x1S1P0, 4>{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include "device_conv2d_dl_int8_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

void add_device_conv2d_dl_bias_perlayer_quantization_int8_instances(
    std::vector<std::unique_ptr<
        DeviceGroupedConvFwdMultipleD<NDimSpatial, GNHWC, GKYXC, GK_Tuple, GNHWK, int8_t, int8_t,
                                      I32_Tuple, int8_t, PassThrough, PassThrough,
                                      Add_Mul_Clamp>>>& instances)
{
    add_device_operation_instances(
        instances,
        device_grouped_conv2d_dl_int8_instances<GK_Tuple, I32_Tuple, Add_Mul_Clamp, ConvFwdDefault, 4>{});
    add_device_operation_instances(
        instances,
        device_grouped_conv2d_dl_int8_instances<GK_Tuple, I32_Tuple, Add_Mul_Clamp, ConvFwd1x1P0, 4>{});
    add_device_operation_instances(
        instances,
        device_grouped_conv2d_dl_int8_instances<GK_Tuple, I32_Tuple, Add_Mul_Clamp, ConvFwd1x1S1P0, 4>{});
}

void add_device_conv2d_dl_bias_relu_perlayer_quantization_int8_instances(
    std::vector<std::unique_ptr<
        DeviceGroupedConvFwdMultipleD<NDimSpatial, GNHWC, GKYXC, GK_Tuple, GNHWK, int8_t, int8_t,
                                      I32_Tuple, int8_t, PassThrough, PassThrough,
                                      Add_Relu_Mul_Clamp>>>& instances)
{
    add_device_operation_instances(
        instances,
        device_grouped_conv2d_dl_int8_instances<GK_Tuple, I32_Tuple, Add_Relu_Mul_Clamp, ConvFwdDefault, 4>{});
    add_device_operation_instances(
        instances,
        device_grouped_conv2d_dl_int8_instances<GK_Tuple, I32_Tuple, Add_Relu_Mul_Clamp, ConvFwd1x1P0, 4>{});
    add_device_operation_instances(
        instances,
        device_grouped_conv2d_dl_int8_instances<GK_Tuple, I32_Tuple, Add_Relu_Mul_Clamp, ConvFwd1x1S1P0, 4>{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_int8_instance.hpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "conv2d_quantization_common.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// clang-format off
template <typename DsLayout,
          typename DsDatatype,
          typename OutElementOp,
          ConvolutionForwardSpecialization ConvSpec,
          index_t DstScalarPerVector>
using device_grouped_conv2d_dl_int8_instances = std::tuple<
    // ###########################################| NDim| InData| WeiData| MultpleD| OutData| AccData| InLayout| WeiLayout| MultipleD| OutLayout| In| Wei| Out| Convolution| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer|
    // ###########################################| Spatial| Type| Type| Type| Type| Type| | | Layout| | Elementwise| Elementwise| Elementwise| Forward| Spacialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
    // ###########################################| | | | | | | | | | | Operation| Operation| Operation| Specialization| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | |
    DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<
        NDimSpatial, int8_t, int8_t, DsDatatype, int8_t, int32_t,
        GNHWC, GKYXC, DsLayout, GNHWK,
        PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec,
        256, 128, 128, 16, 4, 4, 4, 1, S<8, 2>, S<8, 2>,
        S<8, 1, 1, 4>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 4>, S<1, 2, 0, 3>, S<1, 1, 1, 4>,
        S<8, 1, 1, 4>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 4>, S<1, 2, 0, 3>, S<1, 1, 1, 4>,
        S<0, 1, 2, 3, 4, 5>, 5, DstScalarPerVector>
    >;
// clang-format on

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perchannel_quantization_int8_instance.cpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include "device_conv2d_dl_int8_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

void add_device_conv2d_dl_perchannel_quantization_int8_instances(
    std::vector<std::unique_ptr<
        DeviceGroupedConvFwdMultipleD<NDimSpatial, GNHWC, GKYXC, GK_Tuple, GNHWK, int8_t, int8_t,
                                      F32_Tuple, int8_t, PassThrough, PassThrough,
                                      Mul2_Clamp>>>& instances)
{
    add_device_operation_instances(
        instances,
        device_grouped_conv2d_dl_int8_instances<GK_Tuple, F32_Tuple, Mul2_Clamp, ConvFwdDefault, 4>{});
    add_device_operation_instances(
        instances,
        device_grouped_conv2d_dl_int8_instances<GK_Tuple, F32_Tuple, Mul2_Clamp, ConvFwd1x1P0, 4>{});
    add_device_operation_instances(
        instances,
        device_grouped_conv2d_dl_int8_instances<GK_Tuple, F32_Tuple, Mul2_Clamp, ConvFwd1x1S1P0, 4>{});
}

void add_device_conv2d_dl_relu_perchannel_quantization_int8_instances(
    std::vector<std::unique_ptr<
        DeviceGroupedConvFwdMultipleD<NDimSpatial, GNHWC, GKYXC, GK_Tuple, GNHWK, int8_t, int8_t,
                                      F32_Tuple, int8_t, PassThrough, PassThrough,
                                      Relu_Mul2_Clamp>>>& instances)
{
    add_device_operation_instances(
        instances,
        device_grouped_conv2d_dl_int8_instances<GK_Tuple, F32_Tuple, Relu_Mul2_Clamp, ConvFwdDefault, 4>{});
    add_device_operation_instances(
        instances,
        device_grouped_conv2d_dl_int8_instances<GK_Tuple, F32_Tuple, Relu_Mul2_Clamp, ConvFwd1x1P0, 4>{});
    add_device_operation_instances(
        instances,
        device_grouped_conv2d_dl_int8_instances<GK_Tuple, F32_Tuple, Relu_Mul2_Clamp, ConvFwd1x1S1P0, 4>{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck