gaoqiong / composable_kernel · Commits

Commit c5335964, authored Nov 02, 2023 by root

    added pk_cvt

Parents: cc9d2a84, f27ea94e

Changes: 65 files in total; this view shows 20 changed files with 1305 additions and 205 deletions (+1305, -205).
Changed files shown:

example/62_conv_fwd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16.cpp  (+265, -0)
include/ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp  (+10, -7)
include/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp  (+64, -38)
include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp  (+2, -1)
include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp  (+58, -33)
include/ck/tensor_operation/gpu/element/element_wise_operation.hpp  (+65, -0)
include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp  (+2, -2)
include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp  (+32, -11)
include/ck/utility/type_convert.hpp  (+29, -0)
library/include/ck/library/reference_tensor_operation/cpu/reference_column_to_image.hpp  (+21, -20)
library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp  (+72, -6)
library/include/ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp  (+19, -18)
library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp  (+279, -54)
library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_instance.hpp  (+131, -0)
library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scaleadd_scaleadd_relu.hpp  (+176, -0)
library/src/tensor_operation_instance/gpu/column_to_image/CMakeLists.txt  (+6, -3)
library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_gndhwc_3d_instance.cpp  (+4, -4)
library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_gnhwc_2d_instance.cpp  (+4, -4)
library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_gnwc_1d_instance.cpp  (+4, -4)
library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_ndhwgc_3d_instance.cpp  (+62, -0)
File: example/62_conv_fwd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16.cpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.

#include <cstdlib>
#include <iostream>
#include <numeric>
#include <type_traits>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"

#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"

constexpr ck::index_t NDimSpatial = 3;

using InDataType       = ck::half_t;
using WeiDataType      = ck::half_t;
using AccDataType      = float;
using CShuffleDataType = ck::half_t;
using OutDataType      = ck::half_t;

template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using InLayout  = ck::tensor_layout::convolution::GNDHWC;
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
using OutLayout = ck::tensor_layout::convolution::GNDHWK;

using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::ScaleAddScaleAddRelu;

static constexpr auto ConvSpec =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;

static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;

template <typename OutElementOp>
using DeviceGroupedConvNDFwdInstance =
    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
        NDimSpatial,
        InLayout,
        WeiLayout,
        ck::Tuple<OutLayout, OutLayout>,
        OutLayout,
        InDataType,
        WeiDataType,
        AccDataType,
        CShuffleDataType,
        ck::Tuple<OutDataType, OutDataType>,
        OutDataType,
        InElementOp,
        WeiElementOp,
        OutElementOp,
        ConvSpec,    // ConvForwardSpecialization
        GemmSpec,    // GemmSpecialization
        1,           //
        256,         // BlockSize
        128,         // MPerBlock
        256,         // NPerBlock
        32,          // KPerBlock
        8,           // AK1
        8,           // BK1
        32,          // MPerXdl
        32,          // NPerXdl
        2,           // MXdlPerWave
        4,           // NXdlPerWave
        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
        2,           // ABlockTransferSrcVectorDim
        8,           // ABlockTransferSrcScalarPerVector
        8,           // ABlockTransferDstScalarPerVector_AK1
        1,           // ABlockLdsExtraM
        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
        2,           // BBlockTransferSrcVectorDim
        8,           // BBlockTransferSrcScalarPerVector
        8,           // BBlockTransferDstScalarPerVector_BK1
        1,           // BBlockLdsExtraN
        1,
        1,
        S<1, 32, 1, 8>,
        8>;

using DeviceGroupedConvNDFwdActivInstance = DeviceGroupedConvNDFwdInstance<OutElementOp>;

namespace {

// Use custom implementation to pass two more tensors for post op
template <ck::index_t NDimSpatial,
          typename InDataType,
          typename WeiDataType,
          typename OutDataType,
          typename InElementOp,
          typename WeiElementOp,
          typename OutElementOp,
          typename DeviceConvNDFwdInstance>
bool run_grouped_conv_fwd(bool do_verification,
                          int init_method,
                          bool time_kernel,
                          const ck::utils::conv::ConvParam& conv_param,
                          const HostTensorDescriptor& in_g_n_c_wis_desc,
                          const HostTensorDescriptor& wei_g_k_c_xs_desc,
                          const HostTensorDescriptor& out_g_n_k_wos_desc,
                          const InElementOp& in_element_op,
                          const WeiElementOp& wei_element_op,
                          const OutElementOp& out_element_op)
{
    constexpr ck::index_t NumDs = 2;

    Tensor<InDataType> in(in_g_n_c_wis_desc);
    Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
    Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
    Tensor<OutDataType> out_device(out_g_n_k_wos_desc);
    std::array<Tensor<OutDataType>, NumDs> d_tensors = {
        Tensor<OutDataType>(out_g_n_k_wos_desc), Tensor<OutDataType>(out_g_n_k_wos_desc)};

    std::cout << "in: " << in.mDesc << std::endl;
    std::cout << "wei: " << wei.mDesc << std::endl;
    std::cout << "out: " << out_host.mDesc << std::endl;

    switch(init_method)
    {
    case 0: break;
    case 1:
        in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-2, 2});
        wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-2, 2});
        d_tensors[0].GenerateTensorValue(GeneratorTensor_2<OutDataType>{-2, 2});
        d_tensors[1].GenerateTensorValue(GeneratorTensor_2<OutDataType>{-2, 2});
        break;
    default:
        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-1.0, 1.0});
        wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.05, 0.05});
        d_tensors[0].GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.05, 0.05});
        d_tensors[1].GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.05, 0.05});
    }

    DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
    DeviceMem d0_buf(sizeof(OutDataType) * d_tensors[0].mDesc.GetElementSpaceSize());
    DeviceMem d1_buf(sizeof(OutDataType) * d_tensors[1].mDesc.GetElementSpaceSize());
    DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());

    in_device_buf.ToDevice(in.mData.data());
    wei_device_buf.ToDevice(wei.mData.data());
    d0_buf.ToDevice(d_tensors[0].mData.data());
    d1_buf.ToDevice(d_tensors[1].mData.data());

    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
    std::array<ck::index_t, NDimSpatial> input_left_pads{};
    std::array<ck::index_t, NDimSpatial> input_right_pads{};

    auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };

    copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
    copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
    copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
    copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
    copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
    copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
    copy(conv_param.conv_filter_strides_, conv_filter_strides);
    copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
    copy(conv_param.input_left_pads_, input_left_pads);
    copy(conv_param.input_right_pads_, input_right_pads);

    const std::array<const void*, NumDs> ds = {d0_buf.GetDeviceBuffer(),
                                               d1_buf.GetDeviceBuffer()};

    auto conv     = DeviceConvNDFwdInstance{};
    auto invoker  = conv.MakeInvoker();
    auto argument =
        conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
                          wei_device_buf.GetDeviceBuffer(),
                          ds,
                          out_device_buf.GetDeviceBuffer(),
                          a_g_n_c_wis_lengths,
                          a_g_n_c_wis_strides,
                          b_g_k_c_xs_lengths,
                          b_g_k_c_xs_strides,
                          std::array<std::array<ck::index_t, NDimSpatial + 3>, NumDs>{
                              e_g_n_k_wos_lengths, e_g_n_k_wos_lengths},
                          std::array<std::array<ck::index_t, NDimSpatial + 3>, NumDs>{
                              e_g_n_k_wos_strides, e_g_n_k_wos_strides},
                          e_g_n_k_wos_lengths,
                          e_g_n_k_wos_strides,
                          conv_filter_strides,
                          conv_filter_dilations,
                          input_left_pads,
                          input_right_pads,
                          in_element_op,
                          wei_element_op,
                          out_element_op);

    if(!conv.IsSupportedArgument(argument))
    {
        throw std::runtime_error(
            "The device op with the specified compilation parameters does "
            "not support this convolution problem.");
    }

    float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});

    std::size_t flop = conv_param.GetFlops() +
                       2 * conv_param.GetOutputByte<OutDataType>() / sizeof(OutDataType);
    std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>() +
                            2 * conv_param.GetOutputByte<OutDataType>();

    float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
    float gb_per_sec = num_btype / 1.E6 / avg_time;

    std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec
              << " GB/s, " << conv.GetTypeString() << std::endl;

    if(do_verification)
    {
        auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
                                                                     InDataType,
                                                                     WeiDataType,
                                                                     OutDataType,
                                                                     InElementOp,
                                                                     WeiElementOp,
                                                                     OutElementOp,
                                                                     NumDs>();

        auto ref_invoker  = ref_conv.MakeInvoker();
        auto ref_argument = ref_conv.MakeArgument(in,
                                                  wei,
                                                  out_host,
                                                  conv_param.conv_filter_strides_,
                                                  conv_param.conv_filter_dilations_,
                                                  conv_param.input_left_pads_,
                                                  conv_param.input_right_pads_,
                                                  in_element_op,
                                                  wei_element_op,
                                                  out_element_op,
                                                  d_tensors);

        ref_invoker.Run(ref_argument);

        out_device_buf.FromDevice(out_device.mData.data());

        return ck::utils::check_err(out_device, out_host, "Error: incorrect results!");
    }

    return true;
}

} // namespace

#include "run_convnd_fwd_activ_example.inc"

int main(int argc, char* argv[]) { return !run_convnd_fwd_example(argc, argv); }
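The host descriptors passed into run_grouped_conv_fwd encode the output spatial lengths implied by conv_param. For reference, those lengths follow the standard convolution size formula; the sketch below is illustrative only, and its function name is not part of the commit.

#include <cstdint>

// Output length of one spatial dimension (Do/Ho/Wo) given the input length
// (Di/Hi/Wi), filter length (Z/Y/X), stride, dilation, and pads.
std::int64_t conv_out_length(std::int64_t in_len,
                             std::int64_t filter_len,
                             std::int64_t stride,
                             std::int64_t dilation,
                             std::int64_t left_pad,
                             std::int64_t right_pad)
{
    const std::int64_t eff_filter = (filter_len - 1) * dilation + 1; // dilated filter extent
    return (in_len + left_pad + right_pad - eff_filter) / stride + 1;
}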
File: include/ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp (diff)

@@ -14,11 +14,12 @@ namespace device {
 /**
  * \brief Convolution Tensor Rearrange.
  *
- * This Device operator supports conversion image ([G, N, Di, Hi, Wi, C]) to
- * the gemm problem([N * Do * Ho * Wo, Z * Y * X * C]) (Image to Column) and
- * conversion gemm form to the image (Column to Image).
+ * This Device operator supports converting an image to
+ * the GEMM representation (Image to Column) and
+ * converting a GEMM form to the image (Column to Image).
  *
- * Note that G must be equal to 1.
+ * Supported layouts:
+ * [G, N, Di, Hi, Wi, C] <-> [G, N * Do * Ho * Wo, Z * Y * X * C]
+ * [N, Di, Hi, Wi, G, C] <-> [N * Do * Ho * Wo, G, Z * Y * X * C]
  *
  * \tparam NDimSpatial Number of spatial dimensions.
  * \tparam ImageLayout Input Layout.
@@ -39,13 +40,14 @@ struct DeviceConvTensorRearrange : public BaseOperator
  *
  * \param p_in A pointer to the device memory of the input image.
  * \param p_out A pointer to the device memory of the output.
+ * \param G Convolution number of groups.
  * \param N Convolution batch size.
  * \param C Convolution number of channels.
  * \param input_spatial_lengths Input spatial lengths.
  * \param filter_spatial_lengths Filter spatial lengths.
  * \param output_spatial_lengths Output spatial lengths.
  * \param image_g_n_c_wis_strides Image strides in order [G, N, C, D, H, W].
- * \param gemm_m_k_strides Gemm form strides.
+ * \param gemm_g_m_k_strides Gemm form strides.
  * \param conv_filter_strides Convolution filter strides.
  * \param conv_filter_dilations Convolution filter dilations.
  * \param input_left_pads Convolution left pads.
@@ -55,13 +57,14 @@ struct DeviceConvTensorRearrange : public BaseOperator
     virtual std::unique_ptr<BaseArgument>
     MakeArgumentPointer(const void* p_in,
                         void* p_out,
+                        const ck::index_t G,
                         const ck::index_t N,
                         const ck::index_t C,
                         const std::array<index_t, NDimSpatial>& input_spatial_lengths,
                         const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
                         const std::array<index_t, NDimSpatial>& output_spatial_lengths,
                         const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
-                        const std::array<index_t, 2>& gemm_m_k_strides,
+                        const std::array<index_t, 3>& gemm_g_m_k_strides,
                         const std::array<index_t, NDimSpatial>& conv_filter_strides,
                         const std::array<index_t, NDimSpatial>& conv_filter_dilations,
                         const std::array<index_t, NDimSpatial>& input_left_pads,
...
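The widened gemm_g_m_k_strides parameter carries an explicit group stride. As a minimal sketch (not from the commit; the function name is hypothetical, ck::index_t comes from the CK headers), for a densely packed [G, M, K] GEMM form with M = N * Do * Ho * Wo and K = Z * Y * X * C the three strides could be filled like this:

#include <array>

std::array<ck::index_t, 3> make_packed_g_m_k_strides(ck::index_t M, ck::index_t K)
{
    return {M * K, // stride between groups (G)
            K,     // stride between GEMM rows (M)
            1};    // stride along the K dimension
}

For the [M, G, K] variant ([N * Do * Ho * Wo, G, Z * Y * X * C]) the group stride would instead be K and the row stride G * K.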
File: include/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp (diff)

@@ -17,15 +17,18 @@
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp"
 #include "ck/host_utility/io.hpp"
 
 namespace ck {
 namespace tensor_operation {
 namespace device {
 
-// Image to column for input layout NDHWC:
-// input : image converted to the gemm problem [N * Do * Ho * Wo, Z * Y * X * C]
-// output : image [N, Di, Hi, Wi, C]
+// Column to Image:
+// input : gemm form [G, N * Do * Ho * Wo, Z * Y * X * C]
+// output : input image [G, N, Di, Hi, Wi, C]
+// input : gemm form [N * Do * Ho * Wo, G, Z * Y * X * C]
+// output : input image [N, Di, Hi, Wi, G, C]
 template <index_t NDimSpatial,
           typename ImageLayout,
           typename InputDataType,
@@ -43,6 +46,14 @@ struct DeviceColumnToImageImpl
           OutputDataType,
           conv_tensor_rearrange_op::ColumnToImage>
 {
+    static constexpr bool is_NSpatialGC =
+        std::is_same_v<ImageLayout, tensor_layout::convolution::NWGC> ||
+        std::is_same_v<ImageLayout, tensor_layout::convolution::NHWGC> ||
+        std::is_same_v<ImageLayout, tensor_layout::convolution::NDHWGC>;
+    static constexpr bool is_GNSpatialC =
+        std::is_same_v<ImageLayout, tensor_layout::convolution::GNWC> ||
+        std::is_same_v<ImageLayout, tensor_layout::convolution::GNHWC> ||
+        std::is_same_v<ImageLayout, tensor_layout::convolution::GNDHWC>;
+
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
@@ -90,7 +101,7 @@ struct DeviceColumnToImageImpl
         const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
         const std::array<index_t, NDimSpatial>& output_spatial_lengths,
         const std::array<index_t, NDimSpatial>& conv_filter_strides,
-        const std::array<index_t, 2>& gemm_m_k_strides,
+        const std::array<index_t, 3>& gemm_g_m_k_strides,
         const std::array<index_t, NDimSpatial>& independent_filters,
         const std::array<index_t, NDimSpatial>& effs)
     {
@@ -100,23 +111,23 @@ struct DeviceColumnToImageImpl
             C * ck::accumulate_n<index_t>(
                     filter_spatial_lengths.begin(), NDimSpatial, 1, std::multiplies<>());
-        const index_t NStride = DoHoWo * gemm_m_k_strides[I0] * gemm_m_k_strides[I1];
+        const index_t NStride = DoHoWo * gemm_g_m_k_strides[I1] * gemm_g_m_k_strides[I2];
         // Calculate the appropriate stride for each set of independent filters
         // in each dimension
         const index_t WStride =
-            math::integer_divide_ceil(effs[XIdx], conv_filter_strides[XIdx]) *
-            gemm_m_k_strides[I0];
+            math::integer_divide_ceil(effs[XIdx], conv_filter_strides[XIdx]) *
+            gemm_g_m_k_strides[I1];
         const index_t HStride = math::integer_divide_ceil(effs[YIdx], conv_filter_strides[YIdx]) *
-                                output_spatial_lengths[XIdx] * gemm_m_k_strides[I0];
+                                output_spatial_lengths[XIdx] * gemm_g_m_k_strides[I1];
         const index_t DStride = math::integer_divide_ceil(effs[ZIdx], conv_filter_strides[ZIdx]) *
                                 output_spatial_lengths[YIdx] * output_spatial_lengths[XIdx] *
-                                gemm_m_k_strides[I0];
+                                gemm_g_m_k_strides[I1];
         // Create descriptor for independent filters in each dimension and
         // then merge them into column form
         if constexpr(NDimSpatial == 1)
         {
             const auto desc_gemm_form =
                 make_naive_tensor_descriptor(make_tuple(N, independent_filters[XIdx], CZYX),
-                                             make_tuple(NStride, WStride, gemm_m_k_strides[I1]));
+                                             make_tuple(NStride, WStride, gemm_g_m_k_strides[I2]));
             const auto desc_gemm_form_merged_filters = transform_tensor_descriptor(
                 desc_gemm_form,
                 make_tuple(make_merge_transform(make_tuple(N, independent_filters[XIdx])),
@@ -130,7 +141,7 @@ struct DeviceColumnToImageImpl
         {
             const auto desc_gemm_form = make_naive_tensor_descriptor(
                 make_tuple(N, independent_filters[YIdx], independent_filters[XIdx], CZYX),
-                make_tuple(NStride, HStride, WStride, gemm_m_k_strides[I1]));
+                make_tuple(NStride, HStride, WStride, gemm_g_m_k_strides[I2]));
             const auto desc_gemm_form_merged_filters = transform_tensor_descriptor(
                 desc_gemm_form,
                 make_tuple(make_merge_transform(
@@ -149,7 +160,7 @@ struct DeviceColumnToImageImpl
                            independent_filters[YIdx],
                            independent_filters[XIdx],
                            CZYX),
-                make_tuple(NStride, DStride, HStride, WStride, gemm_m_k_strides[I1]));
+                make_tuple(NStride, DStride, HStride, WStride, gemm_g_m_k_strides[I2]));
             const auto desc_gemm_form_merged_filters = transform_tensor_descriptor(
                 desc_gemm_form,
                 make_tuple(make_merge_transform(make_tuple(N,
@@ -252,34 +263,38 @@ struct DeviceColumnToImageImpl
         decltype(BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, InputGridDesc>(
             InputGridDesc{}))>;
 
-    using GridwiseTensorRearrangeKernel = GridwiseTensorRearrange<InputGridDesc,
-                                                                  InputDataType,
-                                                                  OutputGridDesc,
-                                                                  OutputDataType,
-                                                                  BlockSize,
-                                                                  MPerBlock,
-                                                                  KPerBlock,
-                                                                  ThreadClusterLengths,
-                                                                  ScalarPerVector,
-                                                                  InMemoryDataOperationEnum::Add,
-                                                                  Block2ETileMap>;
+    using GridwiseTensorRearrangeKernel =
+        GridwiseTensorRearrange<InputGridDesc,
+                                InputDataType,
+                                OutputGridDesc,
+                                OutputDataType,
+                                BlockSize,
+                                MPerBlock,
+                                KPerBlock,
+                                ThreadClusterLengths,
+                                ScalarPerVector,
+                                InMemoryDataOperationEnum::Add,
+                                Block2ETileMap,
+                                ComputePtrOffsetOfStridedBatch<I0>>;
 
     struct Argument : public BaseArgument
     {
         Argument(const void* p_in, // input image
                  void* p_out,      // output image
+                 const ck::index_t G,
                  const ck::index_t N,
                  const ck::index_t C,
                  const std::array<index_t, NDimSpatial>& input_spatial_lengths,
                  const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
                  const std::array<index_t, NDimSpatial>& output_spatial_lengths,
                  const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
-                 const std::array<index_t, 2>& gemm_m_k_strides,
+                 const std::array<index_t, 3>& gemm_g_m_k_strides,
                  const std::array<index_t, NDimSpatial>& conv_filter_strides,
                  const std::array<index_t, NDimSpatial>& conv_filter_dilations,
                  const std::array<index_t, NDimSpatial>& input_left_pads,
                  const std::array<index_t, NDimSpatial>& input_right_pads)
-            : C_(C),
+            : G_(G),
+              C_(C),
               X_(filter_spatial_lengths[NDimSpatial - I1]),
               p_in_{static_cast<const InputDataType*>(p_in)},
               p_out_{static_cast<OutputDataType*>(p_out)},
@@ -289,6 +304,9 @@ struct DeviceColumnToImageImpl
               input_left_pads_{input_left_pads},
               input_right_pads_{input_right_pads}
         {
+            compute_ptr_offset_of_batch_.BatchStrideA_ = gemm_g_m_k_strides[I0];
+            compute_ptr_offset_of_batch_.BatchStrideC_ = image_g_n_c_wis_strides[I0];
+
             const index_t x_eff =
                 (filter_spatial_lengths[XIdx] - 1) * conv_filter_dilations[XIdx] + 1;
             const index_t y_eff =
@@ -354,7 +372,7 @@ struct DeviceColumnToImageImpl
                     filter_spatial_lengths,
                     output_spatial_lengths,
                     conv_filter_strides,
-                    gemm_m_k_strides,
+                    gemm_g_m_k_strides,
                     independent_filters,
                     effs);
                 const auto out_grid_desc_m_k =
@@ -387,10 +405,9 @@ struct DeviceColumnToImageImpl
                 // Memory offsets to next set of independent filters,
                 // move to independent filters in each dimension
                 const index_t in_offset =
-                    x_idx * gemm_m_k_strides[0] +
-                    y_idx * gemm_m_k_strides[0] * output_spatial_lengths[XIdx] +
-                    z_idx * gemm_m_k_strides[0] * output_spatial_lengths[YIdx] *
-                        output_spatial_lengths[XIdx];
+                    (x_idx + y_idx * output_spatial_lengths[XIdx] +
+                     z_idx * output_spatial_lengths[YIdx] * output_spatial_lengths[XIdx]) *
+                    gemm_g_m_k_strides[I1];
                 // Move to independent filters in appropriate dimensions
                 const index_t out_offset =
                     x_offset_with_pad * image_g_n_c_wis_strides[spatial_offset + XIdx] +
@@ -417,6 +434,7 @@ struct DeviceColumnToImageImpl
             }
         }
 
+        const ck::index_t G_;
         const ck::index_t C_;
         const ck::index_t X_;
@@ -434,6 +452,8 @@ struct DeviceColumnToImageImpl
         std::vector<const InputDataType*> p_in_container_;
         std::vector<OutputDataType*> p_out_container_;
+
+        ComputePtrOffsetOfStridedBatch<I0> compute_ptr_offset_of_batch_;
     };
 
     struct Invoker : public BaseInvoker
@@ -451,6 +471,7 @@ struct DeviceColumnToImageImpl
                                                         OutputGridDesc,
                                                         OutputDataType,
                                                         Block2ETileMap,
+                                                        ComputePtrOffsetOfStridedBatch<I0>,
                                                         GridwiseTensorRearrangeKernel>;
 
             // Execute each set of independent filters
@@ -460,7 +481,7 @@ struct DeviceColumnToImageImpl
                     BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, InputGridDesc>(
                         arg.out_grid_desc_m_k_container_[i]);
                 const index_t grid_size =
-                    block_2_tile_map.CalculateGridSize(arg.in_grid_desc_m_k_container_[i]);
+                    block_2_tile_map.CalculateGridSize(arg.in_grid_desc_m_k_container_[i]) * arg.G_;
                 elapsed_time += launch_and_time_kernel(stream_config,
                                                        kernel,
                                                        dim3(grid_size),
@@ -470,7 +491,9 @@ struct DeviceColumnToImageImpl
                     arg.p_in_container_[i],
                     arg.out_grid_desc_m_k_container_[i],
                     arg.p_out_container_[i],
-                    block_2_tile_map);
+                    arg.G_,
+                    block_2_tile_map,
+                    arg.compute_ptr_offset_of_batch_);
             }
             return elapsed_time;
         }
@@ -485,8 +508,7 @@ struct DeviceColumnToImageImpl
     bool IsSupportedArgument(const Argument& arg)
     {
         using namespace tensor_layout::convolution;
-        if constexpr(!(std::is_same_v<ImageLayout, GNWC> || std::is_same_v<ImageLayout, GNHWC> ||
-                       std::is_same_v<ImageLayout, GNDHWC>))
+        if constexpr(!(is_NSpatialGC || is_GNSpatialC))
        {
             return false;
         }
@@ -534,13 +556,14 @@ struct DeviceColumnToImageImpl
     static auto MakeArgument(const void* p_in, // input image
                              void* p_out,      // output image
+                             const ck::index_t G,
                              const ck::index_t N,
                              const ck::index_t C,
                              const std::array<index_t, NDimSpatial>& input_spatial_lengths,
                              const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
                              const std::array<index_t, NDimSpatial>& output_spatial_lengths,
                              const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
-                             const std::array<index_t, 2>& gemm_m_k_strides,
+                             const std::array<index_t, 3>& gemm_g_m_k_strides,
                              const std::array<index_t, NDimSpatial>& conv_filter_strides,
                              const std::array<index_t, NDimSpatial>& conv_filter_dilations,
                              const std::array<index_t, NDimSpatial>& input_left_pads,
@@ -548,13 +571,14 @@ struct DeviceColumnToImageImpl
     {
         return Argument{static_cast<const InputDataType*>(p_in),
                         static_cast<OutputDataType*>(p_out),
+                        G,
                         N,
                         C,
                         input_spatial_lengths,
                         filter_spatial_lengths,
                         output_spatial_lengths,
                         image_g_n_c_wis_strides,
-                        gemm_m_k_strides,
+                        gemm_g_m_k_strides,
                         conv_filter_strides,
                         conv_filter_dilations,
                         input_left_pads,
@@ -566,13 +590,14 @@ struct DeviceColumnToImageImpl
     std::unique_ptr<BaseArgument>
     MakeArgumentPointer(const void* p_in, // input image
                         void* p_out,      // output image
+                        const ck::index_t G,
                        const ck::index_t N,
                         const ck::index_t C,
                         const std::array<index_t, NDimSpatial>& input_spatial_lengths,
                         const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
                         const std::array<index_t, NDimSpatial>& output_spatial_lengths,
                         const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
-                        const std::array<index_t, 2>& gemm_m_k_strides,
+                        const std::array<index_t, 3>& gemm_g_m_k_strides,
                         const std::array<index_t, NDimSpatial>& conv_filter_strides,
                         const std::array<index_t, NDimSpatial>& conv_filter_dilations,
                         const std::array<index_t, NDimSpatial>& input_left_pads,
@@ -580,13 +605,14 @@ struct DeviceColumnToImageImpl
     {
         return std::make_unique<Argument>(static_cast<const InputDataType*>(p_in),
                                           static_cast<OutputDataType*>(p_out),
+                                          G,
                                           N,
                                           C,
                                           input_spatial_lengths,
                                           filter_spatial_lengths,
                                           output_spatial_lengths,
                                           image_g_n_c_wis_strides,
-                                          gemm_m_k_strides,
+                                          gemm_g_m_k_strides,
                                           conv_filter_strides,
                                           conv_filter_dilations,
                                           input_left_pads,
...
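The recurring pattern in these hunks is that the G (group) dimension is handled as a strided batch: the stride over G in each tensor becomes a per-batch pointer offset, and the grid is enlarged by a factor of G. A minimal standalone sketch of that idea (assumed names, not the CK struct itself):

// Hypothetical mirror of how ComputePtrOffsetOfStridedBatch is used above:
// BatchStrideA_/BatchStrideC_ are set to the G strides of the two tensors,
// and each workgroup adds g_idx times that stride to its base pointers.
struct PtrOffsetOfStridedBatch
{
    long batch_stride_a; // e.g. gemm_g_m_k_strides[I0]
    long batch_stride_c; // e.g. image_g_n_c_wis_strides[I0]

    long a_offset(long g_idx) const { return g_idx * batch_stride_a; }
    long c_offset(long g_idx) const { return g_idx * batch_stride_c; }
};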
File: include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp (diff)

@@ -184,7 +184,8 @@ struct DeviceGemmXdl : public DeviceGemm<ALayout,
                 return false;
             }
         }
-        else if(ck::get_device_name() == "gfx90a" || ck::get_device_name() == "gfx940")
+        else if(ck::get_device_name() == "gfx90a" || ck::get_device_name() == "gfx940" ||
+                ck::get_device_name() == "gfx941" || ck::get_device_name() == "gfx942")
         {
             if constexpr(!(is_same_v<AccDataType, float> || is_same_v<AccDataType, float> ||
                            is_same_v<AccDataType, int32_t> || is_same_v<AccDataType, double>))
...
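This hunk only extends a runtime allow-list: gfx941 and gfx942 now take the same AccDataType checks as gfx90a and gfx940. A sketch of the resulting predicate, with a hypothetical helper name:

#include <string>

bool accepts_wide_acc_types(const std::string& device_name)
{
    return device_name == "gfx90a" || device_name == "gfx940" ||
           device_name == "gfx941" || device_name == "gfx942";
}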
File: include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp (diff)

@@ -15,15 +15,18 @@
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp"
 #include "ck/host_utility/io.hpp"
 
 namespace ck {
 namespace tensor_operation {
 namespace device {
 
-// Image to column for input layout NDHWC:
-// input : input image [N, Di, Hi, Wi, C]
-// output : gemm form [N * Do * Ho * Wo, Z * Y * X * C]
+// Image to column:
+// input : input image [G, N, Di, Hi, Wi, C]
+// output : gemm form [G * N * Do * Ho * Wo, Z * Y * X * C]
+// input : input image [N, Di, Hi, Wi, G, C]
+// output : gemm form [N * Do * Ho * Wo * G, Z * Y * X * C]
 template <index_t NDimSpatial,
           typename ImageLayout,
           typename InputDataType,
@@ -41,6 +44,14 @@ struct DeviceImageToColumnImpl
           OutputDataType,
           conv_tensor_rearrange_op::ImageToColumn>
 {
+    static constexpr bool is_NSpatialGC =
+        std::is_same_v<ImageLayout, tensor_layout::convolution::NWGC> ||
+        std::is_same_v<ImageLayout, tensor_layout::convolution::NHWGC> ||
+        std::is_same_v<ImageLayout, tensor_layout::convolution::NDHWGC>;
+    static constexpr bool is_GNSpatialC =
+        std::is_same_v<ImageLayout, tensor_layout::convolution::GNWC> ||
+        std::is_same_v<ImageLayout, tensor_layout::convolution::GNHWC> ||
+        std::is_same_v<ImageLayout, tensor_layout::convolution::GNDHWC>;
+
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
@@ -109,7 +120,7 @@ struct DeviceImageToColumnImpl
         const ck::index_t C,
         const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
         const std::array<index_t, NDimSpatial>& output_spatial_lengths,
-        const std::array<index_t, 2>& gemm_m_k_strides)
+        const std::array<index_t, 3>& gemm_g_m_k_strides)
     {
         const index_t NDoHoWo =
             N * ck::accumulate_n<index_t>(
@@ -117,11 +128,10 @@ struct DeviceImageToColumnImpl
         const index_t CZYX =
             C * ck::accumulate_n<index_t>(
                     filter_spatial_lengths.begin(), NDimSpatial, 1, std::multiplies<>());
-        const auto desc_mraw_kraw = make_naive_tensor_descriptor(
-            make_tuple(NDoHoWo, CZYX), make_tuple(gemm_m_k_strides[I0], gemm_m_k_strides[I1]));
-        const auto desc_m_k = matrix_padder.PadADescriptor_M_K(desc_mraw_kraw);
-        return desc_m_k;
+        const auto desc_mraw_kraw = make_naive_tensor_descriptor(
+            make_tuple(NDoHoWo, CZYX), make_tuple(gemm_g_m_k_strides[I1], gemm_g_m_k_strides[I2]));
+        return matrix_padder.PadADescriptor_M_K(desc_mraw_kraw);
     }
 
     using InputGridDesc =
@@ -132,34 +142,38 @@ struct DeviceImageToColumnImpl
         decltype(BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, OutputGridDesc>(
             OutputGridDesc{}))>;
 
-    using GridwiseTensorRearrangeKernel = GridwiseTensorRearrange<InputGridDesc,
-                                                                  InputDataType,
-                                                                  OutputGridDesc,
-                                                                  OutputDataType,
-                                                                  BlockSize,
-                                                                  MPerBlock,
-                                                                  KPerBlock,
-                                                                  ThreadClusterLengths,
-                                                                  ScalarPerVector,
-                                                                  InMemoryDataOperationEnum::Set,
-                                                                  Block2ETileMap>;
+    using GridwiseTensorRearrangeKernel =
+        GridwiseTensorRearrange<InputGridDesc,
+                                InputDataType,
+                                OutputGridDesc,
+                                OutputDataType,
+                                BlockSize,
+                                MPerBlock,
+                                KPerBlock,
+                                ThreadClusterLengths,
+                                ScalarPerVector,
+                                InMemoryDataOperationEnum::Set,
+                                Block2ETileMap,
+                                ComputePtrOffsetOfStridedBatch<I0>>;
 
     struct Argument : public BaseArgument
     {
         Argument(const void* p_in, // input image
                  void* p_out,      // gemm form
+                 const ck::index_t G,
                  const ck::index_t N,
                  const ck::index_t C,
                  const std::array<index_t, NDimSpatial>& input_spatial_lengths,
                  const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
                  const std::array<index_t, NDimSpatial>& output_spatial_lengths,
                  const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
-                 const std::array<index_t, 2>& gemm_m_k_strides,
+                 const std::array<index_t, 3>& gemm_g_m_k_strides,
                  const std::array<index_t, NDimSpatial>& conv_filter_strides,
                  const std::array<index_t, NDimSpatial>& conv_filter_dilations,
                  const std::array<index_t, NDimSpatial>& input_left_pads,
                  const std::array<index_t, NDimSpatial>& input_right_pads)
-            : C_(C),
+            : G_(G),
+              C_(C),
               X_(filter_spatial_lengths[NDimSpatial - I1]),
               p_in_{static_cast<const InputDataType*>(p_in)},
               p_out_{static_cast<OutputDataType*>(p_out)},
@@ -176,14 +190,16 @@ struct DeviceImageToColumnImpl
                 filter_spatial_lengths,
                 output_spatial_lengths,
                 image_g_n_c_wis_strides,
                 conv_filter_strides,
                 conv_filter_dilations,
                 input_left_pads,
                 input_right_pads);
-            out_grid_desc_m_k_ = MakeOutDescriptor_M_K(
-                N, C, filter_spatial_lengths, output_spatial_lengths, gemm_m_k_strides);
+            out_grid_desc_m_k_ = MakeOutDescriptor_M_K(
+                N, C, filter_spatial_lengths, output_spatial_lengths, gemm_g_m_k_strides);
+            compute_ptr_offset_of_batch_.BatchStrideA_ = image_g_n_c_wis_strides[I0];
+            compute_ptr_offset_of_batch_.BatchStrideC_ = gemm_g_m_k_strides[I0];
         }
 
         void Print() const
@@ -192,6 +208,7 @@ struct DeviceImageToColumnImpl
             std::cout << out_grid_desc_m_k_ << std::endl;
         }
 
+        const ck::index_t G_;
         const ck::index_t C_;
         const ck::index_t X_;
@@ -206,6 +223,8 @@ struct DeviceImageToColumnImpl
         InputGridDesc in_grid_desc_m_k_;
         OutputGridDesc out_grid_desc_m_k_;
+
+        ComputePtrOffsetOfStridedBatch<I0> compute_ptr_offset_of_batch_;
     };
 
     struct Invoker : public BaseInvoker
@@ -220,12 +239,14 @@ struct DeviceImageToColumnImpl
             const auto block_2_tile_map =
                 BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, OutputGridDesc>(
                     arg.out_grid_desc_m_k_);
-            const index_t grid_size = block_2_tile_map.CalculateGridSize(arg.out_grid_desc_m_k_);
+            const index_t grid_size =
+                block_2_tile_map.CalculateGridSize(arg.out_grid_desc_m_k_) * arg.G_;
             const auto kernel = kernel_tensor_rearrange<InputGridDesc,
                                                         InputDataType,
                                                         OutputGridDesc,
                                                         OutputDataType,
                                                         Block2ETileMap,
+                                                        ComputePtrOffsetOfStridedBatch<I0>,
                                                         GridwiseTensorRearrangeKernel>;
             float elapsed_time = launch_and_time_kernel(stream_config,
@@ -237,7 +258,9 @@ struct DeviceImageToColumnImpl
                 arg.p_in_,
                 arg.out_grid_desc_m_k_,
                 arg.p_out_,
-                block_2_tile_map);
+                arg.G_,
+                block_2_tile_map,
+                arg.compute_ptr_offset_of_batch_);
             return elapsed_time;
         }
@@ -250,9 +273,7 @@ struct DeviceImageToColumnImpl
     bool IsSupportedArgument(const Argument& arg)
     {
-        using namespace tensor_layout::convolution;
-        if constexpr(!(std::is_same_v<ImageLayout, GNWC> || std::is_same_v<ImageLayout, GNHWC> ||
-                       std::is_same_v<ImageLayout, GNDHWC>))
+        if constexpr(!(is_NSpatialGC || is_GNSpatialC))
        {
             return false;
         }
@@ -295,13 +316,14 @@ struct DeviceImageToColumnImpl
     static auto MakeArgument(const void* p_in, // input image
                              void* p_out,      // gemm form
+                             const ck::index_t G,
                              const ck::index_t N,
                              const ck::index_t C,
                              const std::array<index_t, NDimSpatial>& input_spatial_lengths,
                              const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
                              const std::array<index_t, NDimSpatial>& output_spatial_lengths,
                              const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
-                             const std::array<index_t, 2>& gemm_m_k_strides,
+                             const std::array<index_t, 3>& gemm_g_m_k_strides,
                              const std::array<index_t, NDimSpatial>& conv_filter_strides,
                              const std::array<index_t, NDimSpatial>& conv_filter_dilations,
                              const std::array<index_t, NDimSpatial>& input_left_pads,
@@ -309,13 +331,14 @@ struct DeviceImageToColumnImpl
     {
         return Argument{static_cast<const InputDataType*>(p_in),
                         static_cast<OutputDataType*>(p_out),
+                        G,
                         N,
                         C,
                         input_spatial_lengths,
                         filter_spatial_lengths,
                         output_spatial_lengths,
                         image_g_n_c_wis_strides,
-                        gemm_m_k_strides,
+                        gemm_g_m_k_strides,
                         conv_filter_strides,
                         conv_filter_dilations,
                         input_left_pads,
@@ -327,13 +350,14 @@ struct DeviceImageToColumnImpl
     std::unique_ptr<BaseArgument>
     MakeArgumentPointer(const void* p_in, // input image
                         void* p_out,      // gemm form
+                        const ck::index_t G,
                        const ck::index_t N,
                         const ck::index_t C,
                         const std::array<index_t, NDimSpatial>& input_spatial_lengths,
                         const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
                         const std::array<index_t, NDimSpatial>& output_spatial_lengths,
                         const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
-                        const std::array<index_t, 2>& gemm_m_k_strides,
+                        const std::array<index_t, 3>& gemm_g_m_k_strides,
                         const std::array<index_t, NDimSpatial>& conv_filter_strides,
                         const std::array<index_t, NDimSpatial>& conv_filter_dilations,
                         const std::array<index_t, NDimSpatial>& input_left_pads,
@@ -341,13 +365,14 @@ struct DeviceImageToColumnImpl
     {
         return std::make_unique<Argument>(static_cast<const InputDataType*>(p_in),
                                           static_cast<OutputDataType*>(p_out),
+                                          G,
                                           N,
                                           C,
                                           input_spatial_lengths,
                                           filter_spatial_lengths,
                                           output_spatial_lengths,
                                           image_g_n_c_wis_strides,
-                                          gemm_m_k_strides,
+                                          gemm_g_m_k_strides,
                                           conv_filter_strides,
                                           conv_filter_dilations,
                                           input_left_pads,
...
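MakeOutDescriptor_M_K above fixes the per-group GEMM problem size that image-to-column produces. Restated as a small sketch (names assumed, not CK code):

#include <functional>
#include <numeric>
#include <vector>

struct GemmShape
{
    long M; // N * Do * Ho * Wo
    long K; // C * Z * Y * X
};

GemmShape im2col_gemm_shape(long N,
                            long C,
                            const std::vector<long>& filter_spatial_lengths,
                            const std::vector<long>& output_spatial_lengths)
{
    auto prod = [](const std::vector<long>& v) {
        return std::accumulate(v.begin(), v.end(), 1L, std::multiplies<long>());
    };
    return {N * prod(output_spatial_lengths), C * prod(filter_spatial_lengths)};
}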
File: include/ck/tensor_operation/gpu/element/element_wise_operation.hpp (diff)

@@ -311,6 +311,71 @@ struct AddAddFastGelu
     }
 };
 
+// E = Relu(alpha1 * C + alpha2 * D0 + D1)
+struct ScaleAddScaleAddRelu
+{
+    ScaleAddScaleAddRelu(const float alpha1 = 1.f, const float alpha2 = 1.f)
+        : alpha1_(alpha1), alpha2_(alpha2)
+    {
+    }
+
+    template <typename E, typename C, typename D0, typename D1>
+    __host__ __device__ constexpr void
+    operator()(E& e, const C& c, const D0& d0, const D1& d1) const;
+
+    template <>
+    __host__ __device__ constexpr void operator()<float, float, float, float>(
+        float& e, const float& c, const float& d0, const float& d1) const
+    {
+        const float x = c * alpha1_ + alpha2_ * d0 + d1;
+        Relu{}.template operator()<float>(e, x);
+    }
+
+    template <>
+    __host__ __device__ constexpr void operator()<half_t, half_t, half_t, half_t>(
+        half_t& e, const half_t& c, const half_t& d0, const half_t& d1) const
+    {
+        const float x = type_convert<float>(c) * alpha1_ +
+                        alpha2_ * type_convert<float>(d0) + type_convert<float>(d1);
+        float result = 0;
+        Relu{}.template operator()<float>(result, x);
+        e = type_convert<half_t>(result);
+    }
+
+    template <>
+    __host__ __device__ constexpr void operator()<bhalf_t, bhalf_t, bhalf_t, bhalf_t>(
+        bhalf_t& e, const bhalf_t& c, const bhalf_t& d0, const bhalf_t& d1) const
+    {
+        const float x = type_convert<float>(c) * alpha1_ +
+                        alpha2_ * type_convert<float>(d0) + type_convert<float>(d1);
+        float result = 0;
+        Relu{}.template operator()<float>(result, x);
+        e = type_convert<bhalf_t>(result);
+    }
+
+    template <>
+    __host__ __device__ constexpr void operator()<int8_t, int8_t, float, float>(
+        int8_t& e, const int8_t& c, const float& d0, const float& d1) const
+    {
+        const float x = type_convert<float>(c) * alpha1_ + alpha2_ * d0 + d1;
+        float result = 0;
+        Relu{}.template operator()<float>(result, x);
+        e = type_convert<int8_t>(result);
+    }
+
+    const float alpha1_;
+    const float alpha2_;
+};
+
 struct Normalize
 {
     // FIXME: is double absolutely necessary?
...
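For clarity, the per-element math of the new operator restated as plain host code (a sketch, not CK's implementation; CK's version dispatches per type and reuses its Relu functor):

float scale_add_scale_add_relu_ref(float c, float d0, float d1, float alpha1, float alpha2)
{
    const float x = alpha1 * c + alpha2 * d0 + d1; // E = Relu(alpha1 * C + alpha2 * D0 + D1)
    return x > 0.f ? x : 0.f;                      // Relu
}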
File: include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp (diff)

@@ -29,8 +29,8 @@ struct PassThrough
     __host__ __device__ constexpr void operator()(ck::half2_t& y, const ck::f8x2_t& x) const
     {
-        int32_t t = ck::bit_cast<uint16_t>(x);
-        y         = ck::bit_cast<ck::half2_t>(t);
+        auto t = type_convert<float2_t>(x);
+        y      = type_convert<half2_t>(t);
     }
 
     __host__ __device__ constexpr void operator()(ck::half2_t& y, const ck::half2_t& x) const
...
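This is the fix the commit message ("added pk_cvt") refers to: the old code reinterpreted the packed fp8 pair's bits as a half2_t, which preserves the bit pattern but not the numeric values, whereas the new code value-converts through float2_t (presumably using the packed converts added to include/ck/utility/type_convert.hpp in this commit). A generic standard-C++ illustration of the difference (C++20 for std::bit_cast):

#include <bit>
#include <cstdint>
#include <iostream>

int main()
{
    const float f = 1.0f;
    // Reinterpretation keeps the bits: 0x3f800000, i.e. 1065353216.
    const std::uint32_t reinterpreted = std::bit_cast<std::uint32_t>(f);
    // Value conversion keeps the value: 1.
    const std::uint32_t converted = static_cast<std::uint32_t>(f);
    std::cout << reinterpreted << " vs " << converted << '\n';
}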
File: include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp (diff)

@@ -21,6 +21,7 @@ template <typename InputGridDesc,
           typename OutputGridDesc,
           typename OutputDataType,
           typename Block2ETileMap,
+          typename ComputePtrOffsetOfStridedBatch,
           typename GridwiseTensorRearrangeKernel>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
@@ -30,13 +31,20 @@ __global__ void
                         const InputDataType* __restrict__ p_in_global,
                         const OutputGridDesc out_grid_desc,
                         OutputDataType* __restrict__ p_out_global,
-                        const Block2ETileMap block_2_tile_map)
+                        const index_t batch_count,
+                        const Block2ETileMap block_2_tile_map,
+                        const ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \
     defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx1030__) || defined(__gfx1100__) || \
     defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx941__) || defined(__gfx942__))
-    GridwiseTensorRearrangeKernel::Run(
-        in_grid_desc, p_in_global, out_grid_desc, p_out_global, block_2_tile_map);
+    GridwiseTensorRearrangeKernel::Run(in_grid_desc,
+                                       p_in_global,
+                                       out_grid_desc,
+                                       p_out_global,
+                                       batch_count,
+                                       block_2_tile_map,
+                                       compute_ptr_offset_of_batch);
 #else
     ignore = in_grid_desc;
     ignore = p_in_global;
...
@@ -56,7 +64,8 @@ template <typename InputGridDesc,
...
@@ -56,7 +64,8 @@ template <typename InputGridDesc,
typename
ThreadClusterLengths
,
typename
ThreadClusterLengths
,
index_t
ScalarPerVector
,
index_t
ScalarPerVector
,
InMemoryDataOperationEnum
DstInMemOp
,
InMemoryDataOperationEnum
DstInMemOp
,
typename
Block2ETileMap
>
typename
Block2ETileMap
,
typename
ComputePtrOffsetOfStridedBatch
>
struct
GridwiseTensorRearrange
struct
GridwiseTensorRearrange
{
{
...
@@ -69,7 +78,9 @@ struct GridwiseTensorRearrange
...
@@ -69,7 +78,9 @@ struct GridwiseTensorRearrange
const
InputDataType
*
__restrict__
p_in_global
,
const
InputDataType
*
__restrict__
p_in_global
,
const
OutputGridDesc
&
out_grid_desc
,
const
OutputGridDesc
&
out_grid_desc
,
OutputDataType
*
__restrict__
p_out_global
,
OutputDataType
*
__restrict__
p_out_global
,
const
Block2ETileMap
&
block_2_tile_map
)
const
index_t
batch_count
,
const
Block2ETileMap
&
block_2_tile_map
,
const
ComputePtrOffsetOfStridedBatch
&
compute_ptr_offset_of_batch
)
{
{
const
auto
block_work_idx
=
const
auto
block_work_idx
=
block_2_tile_map
.
CalculateBottomIndex
(
make_multi_index
(
get_block_1d_id
()));
block_2_tile_map
.
CalculateBottomIndex
(
make_multi_index
(
get_block_1d_id
()));
...
@@ -80,12 +91,6 @@ struct GridwiseTensorRearrange
...
@@ -80,12 +91,6 @@ struct GridwiseTensorRearrange
const
index_t
k_block_data_idx_on_grid
=
const
index_t
k_block_data_idx_on_grid
=
__builtin_amdgcn_readfirstlane
(
block_work_idx
[
I1
]
*
KPerBlock
);
__builtin_amdgcn_readfirstlane
(
block_work_idx
[
I1
]
*
KPerBlock
);
// Global Memory
const
auto
in_global_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
p_in_global
,
in_grid_desc
.
GetElementSpaceSize
());
auto
out_global_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
p_out_global
,
out_grid_desc
.
GetElementSpaceSize
());
auto
copy_global_to_global
=
auto
copy_global_to_global
=
ThreadGroupTensorSliceTransfer_v7
<
ThisThreadBlock
,
ThreadGroupTensorSliceTransfer_v7
<
ThisThreadBlock
,
Tuple
<
InputDataType
>
,
Tuple
<
InputDataType
>
,
...
@@ -108,6 +113,22 @@ struct GridwiseTensorRearrange
...
@@ -108,6 +113,22 @@ struct GridwiseTensorRearrange
make_tuple
(
make_multi_index
(
m_block_data_idx_on_grid
,
k_block_data_idx_on_grid
)),
make_tuple
(
make_multi_index
(
m_block_data_idx_on_grid
,
k_block_data_idx_on_grid
)),
tensor_operation
::
element_wise
::
PassThrough
{}};
tensor_operation
::
element_wise
::
PassThrough
{}};
const
index_t
num_blocks_per_batch
=
__builtin_amdgcn_readfirstlane
(
get_grid_size
()
/
batch_count
);
const
index_t
g_idx
=
__builtin_amdgcn_readfirstlane
(
get_block_1d_id
()
/
num_blocks_per_batch
);
// Global Memory
const
index_t
a_batch_offset
=
__builtin_amdgcn_readfirstlane
(
compute_ptr_offset_of_batch
.
GetAPtrOffset
(
g_idx
));
const
index_t
c_batch_offset
=
__builtin_amdgcn_readfirstlane
(
compute_ptr_offset_of_batch
.
GetCPtrOffset
(
g_idx
));
const
auto
in_global_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
p_in_global
+
a_batch_offset
,
in_grid_desc
.
GetElementSpaceSize
());
auto
out_global_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
p_out_global
+
c_batch_offset
,
out_grid_desc
.
GetElementSpaceSize
());
copy_global_to_global
.
Run
(
copy_global_to_global
.
Run
(
tie
(
in_grid_desc
),
tie
(
in_global_buf
),
tie
(
out_grid_desc
),
tie
(
out_global_buf
));
tie
(
in_grid_desc
),
tie
(
in_global_buf
),
tie
(
out_grid_desc
),
tie
(
out_global_buf
));
}
}
...
...
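
The batch handling added above maps each workgroup to a group index by integer division; a tiny host-side sketch (illustrative, mirroring the readfirstlane arithmetic without the GPU intrinsics):

#include <cassert>

// Hypothetical check of the block -> batch mapping used in GridwiseTensorRearrange::Run.
int batch_index(int block_id, int grid_size, int batch_count)
{
    const int num_blocks_per_batch = grid_size / batch_count; // assumes grid_size % batch_count == 0
    return block_id / num_blocks_per_batch;                   // g_idx
}

int main()
{
    // 12 blocks over 3 batches -> 4 blocks per batch; block 7 belongs to batch 1.
    assert(batch_index(7, 12, 3) == 1);
    return 0;
}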
include/ck/utility/type_convert.hpp
View file @ c5335964
...
@@ -100,6 +100,8 @@ template <>
 inline __host__ __device__ f8_t type_convert<f8_t, float>(float x)
 {
 #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+    float max_fp8 = 240.0f;
+    x             = x > max_fp8 ? max_fp8 : (x < -max_fp8 ? -max_fp8 : x);
     union
     {
         float fval;
...
@@ -138,6 +140,33 @@ inline __host__ __device__ float type_convert<float, f8_t>(f8_t x)
 #endif
 }
 
+// convert packed fp8x2 to packed fp32x2
+template <>
+inline __host__ __device__ float2_t type_convert<float2_t, f8x2_t>(f8x2_t x)
+{
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+    const auto i16val = bit_cast<uint16_t>(x);
+    return __builtin_amdgcn_cvt_pk_f32_fp8(i16val, 0);
+#else
+    constexpr bool negative_zero_nan = true;
+    const auto f8x2_v                = vector_type<f8_t, 2>(x);
+    vector_type<float, 2> f32x2_v;
+    f32x2_v.template AsType<float>()(Number<0>{}) =
+        utils::cast_from_f8<f8_t, float, negative_zero_nan>(
+            f8x2_v.template AsType<f8_t>()[Number<0>{}]);
+    f32x2_v.template AsType<float>()(Number<1>{}) =
+        utils::cast_from_f8<f8_t, float, negative_zero_nan>(
+            f8x2_v.template AsType<f8_t>()[Number<1>{}]);
+    return f32x2_v.template AsType<float2_t>()[Number<0>{}];
+#endif
+}
+
+// convert packed fp32x2 to packed fp16x2 (round-to-zero pack)
+template <>
+inline __host__ __device__ half2_t type_convert<half2_t, float2_t>(float2_t x)
+{
+    const vector_type<float, 2> f32x2_v(x);
+    const auto y = __builtin_amdgcn_cvt_pkrtz(f32x2_v.template AsType<float>()[Number<0>{}],
+                                              f32x2_v.template AsType<float>()[Number<1>{}]);
+    return bit_cast<half2_t>(y);
+}
+
 // convert fp16 to fp8
 template <>
 inline __host__ __device__ f8_t type_convert<f8_t, half_t>(half_t x)
...
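
The two new packed conversions compose; a hedged device-side sketch (illustrative helper, not part of the commit) of the f8x2 -> half2 path that the updated PassThrough relies on:

// Hypothetical helper: unpack two fp8 values to fp32, then repack to fp16x2.
__host__ __device__ inline ck::half2_t f8x2_to_half2(ck::f8x2_t x)
{
    const auto f32x2 = ck::type_convert<ck::float2_t>(x); // new f8x2_t -> float2_t convert
    return ck::type_convert<ck::half2_t>(f32x2);          // new float2_t -> half2_t convert
}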
library/include/ck/library/reference_tensor_operation/cpu/reference_column_to_image.hpp
View file @ c5335964
...
@@ -19,9 +19,7 @@ namespace host {
 * \brief Reference implementation for column to image.
 *
 * Input tensor descriptor has [N * Do * Ho * Wo, Z * Y * X * C] data layout.
-* Memory layout is the same.
 * Output tensor descriptor has [G, N, C, Di, Hi, Wi] data layout.
-* G must be equal to 1. Memory layout is [G, N, Di, Hi, Wi, C].
 *
 * \tparam NDimSpatial Number of spatial dimensions.
 * \tparam ImageLayout Image Layout.
...
@@ -95,18 +93,19 @@ struct ReferenceColumnToImage : public device::BaseOperator
         float Run(const Argument& arg)
         {
             if(!(arg.output_.GetNumOfDimension() == NDimSpatial + 3 &&
-                 arg.input_.GetNumOfDimension() == 2))
+                 arg.input_.GetNumOfDimension() == 3))
             {
                 throw std::runtime_error("wrong! inconsistent dimension");
             }
 
+            const index_t G = arg.output_.GetLengths()[0];
             const index_t N = arg.output_.GetLengths()[1];
             const index_t C = arg.output_.GetLengths()[2];
 
             if constexpr(NDimSpatial == 1)
             {
                 const index_t Wo = arg.output_spatial_lengths_[0];
 
-                auto func = [&](auto n) {
+                auto func = [&](auto g, auto n) {
                     for(index_t wo = 0; wo < Wo; ++wo)
                     {
                         index_t row = n * Wo + wo;
...
@@ -123,9 +122,10 @@ struct ReferenceColumnToImage : public device::BaseOperator
                             if(wi >= 0 &&
                                ck::type_convert<std::size_t>(wi) < arg.output_.GetLengths()[3])
                             {
-                                float v_in  = ck::type_convert<float>(arg.input_(row, column));
-                                float v_out = ck::type_convert<float>(arg.output_(0, n, c, wi));
-                                arg.output_(0, n, c, wi) =
+                                float v_in =
+                                    ck::type_convert<float>(arg.input_(g, row, column));
+                                float v_out = ck::type_convert<float>(arg.output_(g, n, c, wi));
+                                arg.output_(g, n, c, wi) =
                                     ck::type_convert<OutDataType>(v_in + v_out);
                             }
                             column++;
...
@@ -134,7 +134,7 @@ struct ReferenceColumnToImage : public device::BaseOperator
                     }
                 };
-                make_ParallelTensorFunctor(func, N)(std::thread::hardware_concurrency());
+                make_ParallelTensorFunctor(func, G, N)(std::thread::hardware_concurrency());
                 return 0;
             }
...
@@ -143,7 +143,7 @@ struct ReferenceColumnToImage : public device::BaseOperator
                 const index_t Ho = arg.output_spatial_lengths_[0];
                 const index_t Wo = arg.output_spatial_lengths_[1];
 
-                auto func = [&](auto n) {
+                auto func = [&](auto g, auto n) {
                     for(index_t ho = 0; ho < Ho; ++ho)
                     {
                         for(index_t wo = 0; wo < Wo; ++wo)
...
@@ -176,10 +176,10 @@ struct ReferenceColumnToImage : public device::BaseOperator
                                    arg.output_.GetLengths()[4])
                                 {
                                     float v_in =
-                                        ck::type_convert<float>(arg.input_(row, column));
-                                    float v_out = ck::type_convert<float>(
-                                        arg.output_(0, n, c, hi, wi));
-                                    arg.output_(0, n, c, hi, wi) =
+                                        ck::type_convert<float>(arg.input_(g, row, column));
+                                    float v_out = ck::type_convert<float>(
+                                        arg.output_(g, n, c, hi, wi));
+                                    arg.output_(g, n, c, hi, wi) =
                                         ck::type_convert<OutDataType>(v_in + v_out);
                                 }
                                 column++;
...
@@ -190,7 +190,7 @@ struct ReferenceColumnToImage : public device::BaseOperator
                     }
                 };
-                make_ParallelTensorFunctor(func, N)(std::thread::hardware_concurrency());
+                make_ParallelTensorFunctor(func, G, N)(std::thread::hardware_concurrency());
                 return 0;
             }
...
@@ -200,7 +200,7 @@ struct ReferenceColumnToImage : public device::BaseOperator
                 const index_t Ho = arg.output_spatial_lengths_[1];
                 const index_t Wo = arg.output_spatial_lengths_[2];
 
-                auto func = [&](auto n) {
+                auto func = [&](auto g, auto n) {
                     for(index_t d_o = 0; d_o < Do; ++d_o)
                     {
                         for(index_t ho = 0; ho < Ho; ++ho)
...
@@ -245,10 +245,10 @@ struct ReferenceColumnToImage : public device::BaseOperator
                                        arg.output_.GetLengths()[5])
                                     {
                                         float v_in = ck::type_convert<float>(
-                                            arg.input_(row, column));
+                                            arg.input_(g, row, column));
                                         float v_out = ck::type_convert<float>(
-                                            arg.output_(0, n, c, di, hi, wi));
-                                        arg.output_(0, n, c, di, hi, wi) =
+                                            arg.output_(g, n, c, di, hi, wi));
+                                        arg.output_(g, n, c, di, hi, wi) =
                                             ck::type_convert<OutDataType>(v_in + v_out);
                                     }
                                     column++;
...
@@ -261,7 +261,7 @@ struct ReferenceColumnToImage : public device::BaseOperator
                     }
                 };
-                make_ParallelTensorFunctor(func, N)(std::thread::hardware_concurrency());
+                make_ParallelTensorFunctor(func, G, N)(std::thread::hardware_concurrency());
                 return 0;
             }
...
@@ -303,8 +303,9 @@ struct ReferenceColumnToImage : public device::BaseOperator
                 C * ck::accumulate_n<index_t>(
                     arg.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
 
-            if(!(arg.input_.GetLengths()[0] == static_cast<std::size_t>(NDoHoWo) &&
-                 arg.input_.GetLengths()[1] == static_cast<std::size_t>(CZYX)))
+            if(!(arg.input_.GetLengths()[0] == static_cast<std::size_t>(G) &&
+                 arg.input_.GetLengths()[1] == static_cast<std::size_t>(NDoHoWo) &&
+                 arg.input_.GetLengths()[2] == static_cast<std::size_t>(CZYX)))
             {
                 return false;
             }
...
library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp
View file @ c5335964
...
@@ -42,6 +42,7 @@ template <ck::index_t NDimSpatial,
           typename InElementwiseOperation,
           typename WeiElementwiseOperation,
           typename OutElementwiseOperation,
+          ck::index_t NumDTensor = 0,
           typename std::enable_if<NDimSpatial >= 1 && NDimSpatial <= 3, bool>::type = false>
 struct ReferenceConvFwd : public device::BaseOperator
 {
...
@@ -57,10 +58,12 @@ struct ReferenceConvFwd : public device::BaseOperator
                  std::vector<ck::index_t> input_right_pads,
                  InElementwiseOperation in_element_op,
                  WeiElementwiseOperation wei_element_op,
-                 OutElementwiseOperation out_element_op)
+                 OutElementwiseOperation out_element_op,
+                 const std::array<Tensor<OutDataType>, NumDTensor>& d_tensors)
             : input_{input},
               weight_{weight},
               output_{output},
+              d_tensors_{d_tensors},
               conv_strides_{conv_filter_strides},
               conv_dilations_{conv_filter_dilations},
               in_left_pads_{input_left_pads},
...
@@ -75,6 +78,8 @@ struct ReferenceConvFwd : public device::BaseOperator
         const Tensor<WeiDataType>& weight_;
         Tensor<OutDataType>& output_;
+        const std::array<Tensor<OutDataType>, NumDTensor>& d_tensors_;
 
         std::vector<index_t> conv_strides_;
         std::vector<index_t> conv_dilations_;
         std::vector<index_t> in_left_pads_;
...
@@ -129,7 +134,26 @@ struct ReferenceConvFwd : public device::BaseOperator
                 }
 
                 OutDataType v_out;
-                arg.out_element_op_(v_out, ck::type_convert<OutDataType>(v_acc));
+                OutDataType v_acc_converted = ck::type_convert<OutDataType>(v_acc);
+                if constexpr(NumDTensor == 0)
+                {
+                    arg.out_element_op_(v_out, v_acc_converted);
+                }
+                else if constexpr(NumDTensor == 1)
+                {
+                    arg.out_element_op_(v_out, v_acc_converted, arg.d_tensors_[0](g, n, k, wo));
+                }
+                else if constexpr(NumDTensor == 2)
+                {
+                    arg.out_element_op_(v_out,
+                                        v_acc_converted,
+                                        arg.d_tensors_[0](g, n, k, wo),
+                                        arg.d_tensors_[1](g, n, k, wo));
+                }
+                else
+                {
+                    throw std::runtime_error("Output ElementOp not supported in reference.");
+                }
                 arg.output_(g, n, k, wo) = v_out;
             };
...
@@ -183,7 +207,27 @@ struct ReferenceConvFwd : public device::BaseOperator
                 }
 
                 OutDataType v_out;
-                arg.out_element_op_(v_out, ck::type_convert<OutDataType>(v_acc));
+                OutDataType v_acc_converted = ck::type_convert<OutDataType>(v_acc);
+                if constexpr(NumDTensor == 0)
+                {
+                    arg.out_element_op_(v_out, v_acc_converted);
+                }
+                else if constexpr(NumDTensor == 1)
+                {
+                    arg.out_element_op_(
+                        v_out, v_acc_converted, arg.d_tensors_[0](g, n, k, ho, wo));
+                }
+                else if constexpr(NumDTensor == 2)
+                {
+                    arg.out_element_op_(v_out,
+                                        v_acc_converted,
+                                        arg.d_tensors_[0](g, n, k, ho, wo),
+                                        arg.d_tensors_[1](g, n, k, ho, wo));
+                }
+                else
+                {
+                    throw std::runtime_error("Output ElementOp not supported in reference.");
+                }
                 arg.output_(g, n, k, ho, wo) = v_out;
             };
...
@@ -250,7 +294,27 @@ struct ReferenceConvFwd : public device::BaseOperator
                 }
 
                 OutDataType v_out;
-                arg.out_element_op_(v_out, ck::type_convert<OutDataType>(v_acc));
+                OutDataType v_acc_converted = ck::type_convert<OutDataType>(v_acc);
+                if constexpr(NumDTensor == 0)
+                {
+                    arg.out_element_op_(v_out, v_acc_converted);
+                }
+                else if constexpr(NumDTensor == 1)
+                {
+                    arg.out_element_op_(
+                        v_out, v_acc_converted, arg.d_tensors_[0](g, n, k, d_o, ho, wo));
+                }
+                else if constexpr(NumDTensor == 2)
+                {
+                    arg.out_element_op_(v_out,
+                                        v_acc_converted,
+                                        arg.d_tensors_[0](g, n, k, d_o, ho, wo),
+                                        arg.d_tensors_[1](g, n, k, d_o, ho, wo));
+                }
+                else
+                {
+                    throw std::runtime_error("Output ElementOp not supported in reference.");
+                }
                 arg.output_(g, n, k, d_o, ho, wo) = v_out;
             };
...
@@ -294,7 +358,8 @@ struct ReferenceConvFwd : public device::BaseOperator
                       std::vector<ck::index_t> input_right_pads,
                       InElementwiseOperation in_element_op,
                       WeiElementwiseOperation wei_element_op,
-                      OutElementwiseOperation out_element_op)
+                      OutElementwiseOperation out_element_op,
+                      const std::array<Tensor<OutDataType>, NumDTensor>& d_tensors = {})
     {
         return Argument{input,
                         weight,
...
@@ -305,7 +370,8 @@ struct ReferenceConvFwd : public device::BaseOperator
                         input_right_pads,
                         in_element_op,
                         wei_element_op,
-                        out_element_op};
+                        out_element_op,
+                        d_tensors};
     }
 
     static auto MakeInvoker() { return Invoker{}; }
...
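
A hedged sketch of how a test might drive the NumDTensor == 2 path above (tensor names and data-type aliases are illustrative placeholders, not from this commit):

// Hypothetical wiring of the reference conv with two auxiliary D tensors.
using OutOp = ck::tensor_operation::element_wise::ScaleAddScaleAddRelu;

auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<2,
                                                             InDataType,
                                                             WeiDataType,
                                                             OutDataType,
                                                             PassThrough,
                                                             PassThrough,
                                                             OutOp,
                                                             /*NumDTensor=*/2>{};

std::array<Tensor<OutDataType>, 2> d_tensors = {d0, d1}; // same shape as the output
auto ref_arg = ref_conv.MakeArgument(in, wei, out,
                                     conv_strides, conv_dilations,
                                     in_left_pads, in_right_pads,
                                     PassThrough{}, PassThrough{},
                                     OutOp{1.f, 1.f}, d_tensors);
ref_conv.MakeInvoker().Run(ref_arg);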
library/include/ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp
View file @ c5335964
...
@@ -19,9 +19,7 @@ namespace host {
 * \brief Reference implementation for image to column.
 *
 * Input tensor descriptor has [G, N, C, Di, Hi, Wi] data layout.
-* G must be equal to 1. Memory layout is [G, N, Di, Hi, Wi, C].
-* Output tensor descriptor has [N * Do * Ho * Wo, Z * Y * X * C] data layout.
-* Memory layout is the same.
+* Output tensor descriptor has [G * N * Do * Ho * Wo, Z * Y * X * C] data layout.
 *
 * \tparam NDimSpatial Number of spatial dimensions.
 * \tparam ImageLayout Image Layout.
...
@@ -95,18 +93,19 @@ struct ReferenceImageToColumn : public device::BaseOperator
         float Run(const Argument& arg)
         {
             if(!(arg.input_.GetNumOfDimension() == NDimSpatial + 3 &&
-                 arg.output_.GetNumOfDimension() == 2))
+                 arg.output_.GetNumOfDimension() == 3))
             {
                 throw std::runtime_error("wrong! inconsistent dimension");
             }
 
+            const index_t G = arg.input_.GetLengths()[0];
             const index_t N = arg.input_.GetLengths()[1];
             const index_t C = arg.input_.GetLengths()[2];
 
             if constexpr(NDimSpatial == 1)
             {
                 const index_t Wo = arg.output_spatial_lengths_[0];
 
-                auto func = [&](auto n, auto wo) {
+                auto func = [&](auto g, auto n, auto wo) {
                     index_t row    = n * Wo + wo;
                     index_t column = 0;
...
@@ -121,15 +120,15 @@ struct ReferenceImageToColumn : public device::BaseOperator
                         if(wi >= 0 &&
                            ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[3])
                         {
-                            InDataType v_in          = arg.input_(0, n, c, wi);
-                            arg.output_(row, column) = ck::type_convert<OutDataType>(v_in);
+                            InDataType v_in             = arg.input_(g, n, c, wi);
+                            arg.output_(g, row, column) = ck::type_convert<OutDataType>(v_in);
                         }
                         column++;
                     }
                 };
-                make_ParallelTensorFunctor(func, N, Wo)(std::thread::hardware_concurrency());
+                make_ParallelTensorFunctor(func, G, N, Wo)(std::thread::hardware_concurrency());
                 return 0;
             }
...
@@ -138,7 +137,7 @@ struct ReferenceImageToColumn : public device::BaseOperator
                 const index_t Ho = arg.output_spatial_lengths_[0];
                 const index_t Wo = arg.output_spatial_lengths_[1];
 
-                auto func = [&](auto n, auto ho, auto wo) {
+                auto func = [&](auto g, auto n, auto ho, auto wo) {
                     index_t row    = n * Ho * Wo + ho * Wo + wo;
                     index_t column = 0;
...
@@ -162,8 +161,9 @@ struct ReferenceImageToColumn : public device::BaseOperator
                                wi >= 0 &&
                                ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[4])
                             {
-                                InDataType v_in          = arg.input_(0, n, c, hi, wi);
-                                arg.output_(row, column) = ck::type_convert<OutDataType>(v_in);
+                                InDataType v_in = arg.input_(g, n, c, hi, wi);
+                                arg.output_(g, row, column) =
+                                    ck::type_convert<OutDataType>(v_in);
                             }
                             column++;
                         }
...
@@ -171,7 +171,7 @@ struct ReferenceImageToColumn : public device::BaseOperator
                 };
-                make_ParallelTensorFunctor(func, N, Ho, Wo)(std::thread::hardware_concurrency());
+                make_ParallelTensorFunctor(func, G, N, Ho, Wo)(std::thread::hardware_concurrency());
                 return 0;
             }
...
@@ -181,7 +181,7 @@ struct ReferenceImageToColumn : public device::BaseOperator
                 const index_t Ho = arg.output_spatial_lengths_[1];
                 const index_t Wo = arg.output_spatial_lengths_[2];
 
-                auto func = [&](auto n, auto d_o, auto ho, auto wo) {
+                auto func = [&](auto g, auto n, auto d_o, auto ho, auto wo) {
                     index_t row    = n * Do * Ho * Wo + d_o * Ho * Wo + ho * Wo + wo;
                     index_t column = 0;
...
@@ -213,8 +213,8 @@ struct ReferenceImageToColumn : public device::BaseOperator
                                ck::type_convert<std::size_t>(wi) <
                                    arg.input_.GetLengths()[5])
                             {
-                                InDataType v_in          = arg.input_(0, n, c, di, hi, wi);
-                                arg.output_(row, column) =
+                                InDataType v_in = arg.input_(g, n, c, di, hi, wi);
+                                arg.output_(g, row, column) =
                                     ck::type_convert<OutDataType>(v_in);
                             }
                             column++;
...
@@ -224,7 +224,7 @@ struct ReferenceImageToColumn : public device::BaseOperator
                 };
-                make_ParallelTensorFunctor(func, N, Do, Ho, Wo)(
+                make_ParallelTensorFunctor(func, G, N, Do, Ho, Wo)(
                     std::thread::hardware_concurrency());
                 return 0;
...
@@ -267,8 +267,9 @@ struct ReferenceImageToColumn : public device::BaseOperator
             C * ck::accumulate_n<index_t>(
                 arg.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
 
-            if(!(arg.output_.GetLengths()[0] == static_cast<std::size_t>(NDoHoWo) &&
-                 arg.output_.GetLengths()[1] == static_cast<std::size_t>(CZYX)))
+            if(!(arg.output_.GetLengths()[0] == static_cast<std::size_t>(G) &&
+                 arg.output_.GetLengths()[1] == static_cast<std::size_t>(NDoHoWo) &&
+                 arg.output_.GetLengths()[2] == static_cast<std::size_t>(CZYX)))
             {
                 return false;
             }
...
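
The flattened row index used by both reference rearrange ops is a plain row-major collapse of (n, d_o, ho, wo); a small standalone sketch (illustrative) for the 3D case:

#include <cassert>

// Hypothetical restatement of the 3D row computation in ReferenceImageToColumn.
int flatten_row(int n, int d_o, int ho, int wo, int Do, int Ho, int Wo)
{
    return n * Do * Ho * Wo + d_o * Ho * Wo + ho * Wo + wo;
}

int main()
{
    // With Do=2, Ho=3, Wo=4: (n=1, d_o=0, ho=2, wo=1) -> 24 + 0 + 8 + 1 = 33.
    assert(flatten_row(1, 0, 2, 1, 2, 3, 4) == 33);
    return 0;
}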
library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp
View file @ c5335964
This diff is collapsed.

library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_instance.hpp
0 → 100644
View file @ c5335964
This diff is collapsed.

library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scaleadd_scaleadd_relu.hpp
0 → 100644
View file @ c5335964
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <vector>
#include <memory>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using PassThrough          = ck::tensor_operation::element_wise::PassThrough;
using ScaleAddScaleAddRelu = ck::tensor_operation::element_wise::ScaleAddScaleAddRelu;

#ifdef CK_ENABLE_BF16
// grouped conv3d forward, NDHWGC/GKZYXC/NDHWGK
void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
    std::vector<std::unique_ptr<
        DeviceGroupedConvFwdMultipleD<3, NDHWGC, GKZYXC, ck::Tuple<NDHWGK, NDHWGK>, NDHWGK,
                                      BF16, BF16, ck::Tuple<BF16, BF16>, BF16,
                                      PassThrough, PassThrough, ScaleAddScaleAddRelu>>>&
        instances);
#endif
#ifdef CK_ENABLE_FP16
void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f16_instances(
    std::vector<std::unique_ptr<
        DeviceGroupedConvFwdMultipleD<3, NDHWGC, GKZYXC, ck::Tuple<NDHWGK, NDHWGK>, NDHWGK,
                                      F16, F16, ck::Tuple<F16, F16>, F16,
                                      PassThrough, PassThrough, ScaleAddScaleAddRelu>>>&
        instances);
#endif
#ifdef CK_ENABLE_FP32
void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f32_instances(
    std::vector<std::unique_ptr<
        DeviceGroupedConvFwdMultipleD<3, NDHWGC, GKZYXC, ck::Tuple<NDHWGK, NDHWGK>, NDHWGK,
                                      F32, F32, ck::Tuple<F32, F32>, F32,
                                      PassThrough, PassThrough, ScaleAddScaleAddRelu>>>&
        instances);
#endif
#ifdef CK_ENABLE_INT8
void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_int8_instances(
    std::vector<std::unique_ptr<
        DeviceGroupedConvFwdMultipleD<3, NDHWGC, GKZYXC, ck::Tuple<NDHWGK, NDHWGK>, NDHWGK,
                                      int8_t, int8_t, ck::Tuple<F32, F32>, int8_t,
                                      PassThrough, PassThrough, ScaleAddScaleAddRelu>>>&
        instances);
#endif

template <ck::index_t NumDimSpatial,
          typename InLayout,
          typename WeiLayout,
          typename DLayouts,
          typename OutLayout,
          typename InDataType,
          typename WeiDataType,
          typename DDataTypes,
          typename OutDataType,
          typename ComputeType>
struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
    NumDimSpatial,
    InLayout,
    WeiLayout,
    DLayouts,
    OutLayout,
    InDataType,
    WeiDataType,
    DDataTypes,
    OutDataType,
    ck::tensor_operation::element_wise::PassThrough,
    ck::tensor_operation::element_wise::PassThrough,
    ck::tensor_operation::element_wise::ScaleAddScaleAddRelu,
    ComputeType>>
{
    using DeviceOp =
        DeviceGroupedConvFwdMultipleD<NumDimSpatial,
                                      InLayout,
                                      WeiLayout,
                                      DLayouts,
                                      OutLayout,
                                      InDataType,
                                      WeiDataType,
                                      DDataTypes,
                                      OutDataType,
                                      ck::tensor_operation::element_wise::PassThrough,
                                      ck::tensor_operation::element_wise::PassThrough,
                                      ck::tensor_operation::element_wise::ScaleAddScaleAddRelu,
                                      ComputeType>;

    static auto GetInstances()
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;

        if constexpr(NumDimSpatial == 3 && is_same_v<InLayout, NDHWGC> &&
                     is_same_v<WeiLayout, GKZYXC> && is_same_v<OutLayout, NDHWGK>)
        {
#ifdef CK_ENABLE_FP32
            if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
                         is_same_v<OutDataType, float>)
            {
                add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f32_instances(
                    op_ptrs);
            }
#endif
#ifdef CK_ENABLE_FP16
            if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
                         is_same_v<OutDataType, half_t> && is_same_v<ComputeType, half_t>)
            {
                add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f16_instances(
                    op_ptrs);
            }
#endif
#ifdef CK_ENABLE_BF16
            if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
                         is_same_v<WeiDataType, ck::bhalf_t> &&
                         is_same_v<OutDataType, ck::bhalf_t>)
            {
                add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
                    op_ptrs);
            }
#endif
#ifdef CK_ENABLE_INT8
            if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
                         is_same_v<OutDataType, int8_t>)
            {
                add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_int8_instances(
                    op_ptrs);
            }
#endif
        }

        return op_ptrs;
    }
};

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
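
For reference, a hedged sketch of the usual consumption pattern for this factory (it mirrors other CK instance factories; the fp16 template arguments below are illustrative):

#include <iostream>

// Hypothetical enumeration of the registered 3D NDHWGC fp16 instances.
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
    3, NDHWGC, GKZYXC, ck::Tuple<NDHWGK, NDHWGK>, NDHWGK,
    F16, F16, ck::Tuple<F16, F16>, F16,
    PassThrough, PassThrough, ScaleAddScaleAddRelu, F16>;

const auto op_ptrs = ck::tensor_operation::device::instance::
    DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

for(const auto& op : op_ptrs)
{
    std::cout << op->GetTypeString() << '\n'; // candidates to filter with IsSupportedArgument()
}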
library/src/tensor_operation_instance/gpu/column_to_image/CMakeLists.txt
View file @ c5335964
add_instance_library(device_column_to_image_instance
-    device_column_to_image_nhwc_1d_instance.cpp
+    device_column_to_image_gnwc_1d_instance.cpp
-    device_column_to_image_nhwc_2d_instance.cpp
+    device_column_to_image_gnhwc_2d_instance.cpp
-    device_column_to_image_nhwc_3d_instance.cpp
+    device_column_to_image_gndhwc_3d_instance.cpp
+    device_column_to_image_nwgc_1d_instance.cpp
+    device_column_to_image_nhwgc_2d_instance.cpp
+    device_column_to_image_ndhwgc_3d_instance.cpp
)
library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_3d_instance.cpp
→ library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_gndhwc_3d_instance.cpp
View file @ c5335964
...
@@ -11,7 +11,7 @@ namespace instance {
 using namespace ck::conv_tensor_rearrange_op;
 
-void add_device_column_to_image_ndhwc_3d_bf16_instances(
+void add_device_column_to_image_gndhwc_3d_bf16_instances(
     std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, BF16, BF16, ColumnToImage>>>&
         instances)
 {
...
@@ -22,7 +22,7 @@ void add_device_column_to_image_ndhwc_3d_bf16_instances(
 #endif
 }
 
-void add_device_column_to_image_ndhwc_3d_f16_instances(
+void add_device_column_to_image_gndhwc_3d_f16_instances(
     std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F16, F16, ColumnToImage>>>&
         instances)
 {
...
@@ -33,7 +33,7 @@ void add_device_column_to_image_ndhwc_3d_f16_instances(
 #endif
 }
 
-void add_device_column_to_image_ndhwc_3d_f32_instances(
+void add_device_column_to_image_gndhwc_3d_f32_instances(
     std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F32, F32, ColumnToImage>>>&
         instances)
 {
...
@@ -44,7 +44,7 @@ void add_device_column_to_image_ndhwc_3d_f32_instances(
 #endif
 }
 
-void add_device_column_to_image_ndhwc_3d_i8_instances(
+void add_device_column_to_image_gndhwc_3d_i8_instances(
     std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, int8_t, int8_t, ColumnToImage>>>&
         instances)
...
library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_2d_instance.cpp
→ library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_gnhwc_2d_instance.cpp
View file @ c5335964
...
@@ -11,7 +11,7 @@ namespace instance {
 using namespace ck::conv_tensor_rearrange_op;
 
-void add_device_column_to_image_nhwc_2d_bf16_instances(
+void add_device_column_to_image_gnhwc_2d_bf16_instances(
     std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, BF16, BF16, ColumnToImage>>>&
         instances)
 {
...
@@ -22,7 +22,7 @@ void add_device_column_to_image_nhwc_2d_bf16_instances(
 #endif
 }
 
-void add_device_column_to_image_nhwc_2d_f16_instances(
+void add_device_column_to_image_gnhwc_2d_f16_instances(
     std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F16, F16, ColumnToImage>>>&
         instances)
 {
...
@@ -33,7 +33,7 @@ void add_device_column_to_image_nhwc_2d_f16_instances(
 #endif
 }
 
-void add_device_column_to_image_nhwc_2d_f32_instances(
+void add_device_column_to_image_gnhwc_2d_f32_instances(
     std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F32, F32, ColumnToImage>>>&
         instances)
 {
...
@@ -44,7 +44,7 @@ void add_device_column_to_image_nhwc_2d_f32_instances(
 #endif
 }
 
-void add_device_column_to_image_nhwc_2d_i8_instances(
+void add_device_column_to_image_gnhwc_2d_i8_instances(
     std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, int8_t, int8_t, ColumnToImage>>>&
         instances)
...
library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_1d_instance.cpp
→ library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_gnwc_1d_instance.cpp
View file @ c5335964
...
@@ -11,7 +11,7 @@ namespace instance {
 using namespace ck::conv_tensor_rearrange_op;
 
-void add_device_column_to_image_nwc_1d_bf16_instances(
+void add_device_column_to_image_gnwc_1d_bf16_instances(
     std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, BF16, BF16, ColumnToImage>>>&
         instances)
 {
...
@@ -22,7 +22,7 @@ void add_device_column_to_image_nwc_1d_bf16_instances(
 #endif
 }
 
-void add_device_column_to_image_nwc_1d_f16_instances(
+void add_device_column_to_image_gnwc_1d_f16_instances(
     std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F16, F16, ColumnToImage>>>&
         instances)
 {
...
@@ -33,7 +33,7 @@ void add_device_column_to_image_nwc_1d_f16_instances(
 #endif
 }
 
-void add_device_column_to_image_nwc_1d_f32_instances(
+void add_device_column_to_image_gnwc_1d_f32_instances(
     std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F32, F32, ColumnToImage>>>&
         instances)
 {
...
@@ -44,7 +44,7 @@ void add_device_column_to_image_nwc_1d_f32_instances(
 #endif
 }
 
-void add_device_column_to_image_nwc_1d_i8_instances(
+void add_device_column_to_image_gnwc_1d_i8_instances(
     std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, int8_t, int8_t, ColumnToImage>>>&
         instances)
 {
...
library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_ndhwgc_3d_instance.cpp
0 → 100644
View file @ c5335964
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.

#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using namespace ck::conv_tensor_rearrange_op;

void add_device_column_to_image_ndhwgc_3d_bf16_instances(
    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, BF16, BF16, ColumnToImage>>>&
        instances)
{
#ifdef CK_ENABLE_BF16
    add_device_operation_instances(instances, device_column_to_image_bf16_instances<3, NDHWGC>{});
#else
    ignore = instances;
#endif
}

void add_device_column_to_image_ndhwgc_3d_f16_instances(
    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, F16, F16, ColumnToImage>>>&
        instances)
{
#ifdef CK_ENABLE_FP16
    add_device_operation_instances(instances, device_column_to_image_f16_instances<3, NDHWGC>{});
#else
    ignore = instances;
#endif
}

void add_device_column_to_image_ndhwgc_3d_f32_instances(
    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, F32, F32, ColumnToImage>>>&
        instances)
{
#ifdef CK_ENABLE_FP32
    add_device_operation_instances(instances, device_column_to_image_f32_instances<3, NDHWGC>{});
#else
    ignore = instances;
#endif
}

void add_device_column_to_image_ndhwgc_3d_i8_instances(
    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, int8_t, int8_t, ColumnToImage>>>&
        instances)
{
#ifdef CK_ENABLE_INT8
    add_device_operation_instances(instances, device_column_to_image_i8_instances<3, NDHWGC>{});
#else
    ignore = instances;
#endif
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck