Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
68fda7c2
Commit
68fda7c2
authored
Oct 25, 2023
by
Bartlomiej Kocot
Browse files
Add support for groups in Img2Col/Col2Img
parent
bec84efb
Changes
29
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
784 additions
and
194 deletions
+784
-194
client_example/22_im2col_col2im/column_to_image.cpp
client_example/22_im2col_col2im/column_to_image.cpp
+6
-4
client_example/22_im2col_col2im/image_to_column.cpp
client_example/22_im2col_col2im/image_to_column.cpp
+5
-3
example/52_im2col_col2im/column_to_image_f32.cpp
example/52_im2col_col2im/column_to_image_f32.cpp
+8
-6
example/52_im2col_col2im/image_to_column_f32.cpp
example/52_im2col_col2im/image_to_column_f32.cpp
+8
-6
include/ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp
...sor_operation/gpu/device/device_conv_tensor_rearrange.hpp
+7
-4
include/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp
...operation/gpu/device/impl/device_column_to_image_impl.hpp
+76
-30
include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp
...operation/gpu/device/impl/device_image_to_column_impl.hpp
+74
-27
include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp
...k/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp
+32
-13
library/include/ck/library/reference_tensor_operation/cpu/reference_column_to_image.hpp
...erence_tensor_operation/cpu/reference_column_to_image.hpp
+42
-16
library/include/ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp
...erence_tensor_operation/cpu/reference_image_to_column.hpp
+40
-15
library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp
...y/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp
+273
-48
library/src/tensor_operation_instance/gpu/column_to_image/CMakeLists.txt
...sor_operation_instance/gpu/column_to_image/CMakeLists.txt
+6
-3
library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_gndhwc_3d_instance.cpp
...mn_to_image/device_column_to_image_gndhwc_3d_instance.cpp
+4
-4
library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_gnhwc_2d_instance.cpp
...umn_to_image/device_column_to_image_gnhwc_2d_instance.cpp
+4
-4
library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_gnwc_1d_instance.cpp
...lumn_to_image/device_column_to_image_gnwc_1d_instance.cpp
+4
-4
library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_ndhwgc_3d_instance.cpp
...mn_to_image/device_column_to_image_ndhwgc_3d_instance.cpp
+62
-0
library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwgc_2d_instance.cpp
...umn_to_image/device_column_to_image_nhwgc_2d_instance.cpp
+62
-0
library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nwgc_1d_instance.cpp
...lumn_to_image/device_column_to_image_nwgc_1d_instance.cpp
+61
-0
library/src/tensor_operation_instance/gpu/image_to_column/CMakeLists.txt
...sor_operation_instance/gpu/image_to_column/CMakeLists.txt
+6
-3
library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_gndhwc_3d_instance.cpp
...e_to_column/device_image_to_column_gndhwc_3d_instance.cpp
+4
-4
No files found.
client_example/22_im2col_col2im/column_to_image.cpp
View file @
68fda7c2
...
...
@@ -19,7 +19,7 @@ using OutDataType = ck::half_t;
using
ImageLayout
=
ck
::
tensor_layout
::
convolution
::
GNHWC
;
static
constexpr
ck
::
index_t
NumDimSpatial
=
2
;
static
constexpr
ck
::
index_t
G
=
1
;
static
constexpr
ck
::
index_t
G
=
2
;
static
constexpr
ck
::
index_t
N
=
32
;
// batch size
static
constexpr
ck
::
index_t
C
=
32
;
// input channel (per group)
static
constexpr
ck
::
index_t
Y
=
3
;
// filter H
...
...
@@ -52,7 +52,7 @@ int main()
std
::
array
<
ck
::
index_t
,
2
>
wei_spatial_lengths
{
Y
,
X
};
std
::
array
<
ck
::
index_t
,
2
>
out_spatial_lengths
{
Ho
,
Wo
};
// We have NHWGC in memory space
(G is dummy)
// We have NHWGC in memory space
// However, CK's API only accept length and stride with order of GNCHW
// Hence, we need to adjust the order of stride
std
::
array
<
ck
::
index_t
,
5
>
image_strides
{
C
,
Hi
*
Wi
*
G
*
C
,
1
,
Wi
*
G
*
C
,
G
*
C
};
...
...
@@ -63,7 +63,7 @@ int main()
std
::
array
<
ck
::
index_t
,
NumDimSpatial
>
input_left_pads
{
1
,
1
};
std
::
array
<
ck
::
index_t
,
NumDimSpatial
>
input_right_pads
{
1
,
1
};
SimpleDeviceMem
in
(
sizeof
(
InDataType
)
*
N
*
Ho
*
Wo
*
Y
*
X
*
C
);
SimpleDeviceMem
in
(
sizeof
(
InDataType
)
*
G
*
N
*
Ho
*
Wo
*
Y
*
X
*
C
);
SimpleDeviceMem
out
(
sizeof
(
OutDataType
)
*
N
*
Hi
*
Wi
*
G
*
C
);
using
namespace
ck
::
conv_tensor_rearrange_op
;
...
...
@@ -93,6 +93,7 @@ int main()
auto
&
op_ptr
=
op_ptrs
[
i
];
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
in
.
GetDeviceBuffer
(),
out
.
GetDeviceBuffer
(),
G
,
N
,
C
,
in_spatial_lengths
,
...
...
@@ -112,7 +113,7 @@ int main()
float
avg_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
true
});
std
::
size_t
num_bytes
=
sizeof
(
InDataType
)
*
N
*
Hi
*
Wi
*
G
*
C
+
sizeof
(
OutDataType
)
*
N
*
Ho
*
Wo
*
Y
*
X
*
C
;
sizeof
(
OutDataType
)
*
G
*
N
*
Ho
*
Wo
*
Y
*
X
*
C
;
float
gb_per_sec
=
num_bytes
/
1.E6
/
avg_time
;
...
...
@@ -149,6 +150,7 @@ int main()
<<
std
::
endl
;
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
in
.
GetDeviceBuffer
(),
out
.
GetDeviceBuffer
(),
G
,
N
,
C
,
in_spatial_lengths
,
...
...
client_example/22_im2col_col2im/image_to_column.cpp
View file @
68fda7c2
...
...
@@ -19,7 +19,7 @@ using OutDataType = ck::half_t;
using
ImageLayout
=
ck
::
tensor_layout
::
convolution
::
GNHWC
;
static
constexpr
ck
::
index_t
NumDimSpatial
=
2
;
static
constexpr
ck
::
index_t
G
=
1
;
static
constexpr
ck
::
index_t
G
=
2
;
static
constexpr
ck
::
index_t
N
=
32
;
// batch size
static
constexpr
ck
::
index_t
C
=
32
;
// input channel (per group)
static
constexpr
ck
::
index_t
Y
=
3
;
// filter H
...
...
@@ -64,7 +64,7 @@ int main()
std
::
array
<
ck
::
index_t
,
NumDimSpatial
>
input_right_pads
{
1
,
1
};
SimpleDeviceMem
in
(
sizeof
(
InDataType
)
*
N
*
Hi
*
Wi
*
G
*
C
);
SimpleDeviceMem
out
(
sizeof
(
OutDataType
)
*
N
*
Ho
*
Wo
*
Y
*
X
*
C
);
SimpleDeviceMem
out
(
sizeof
(
OutDataType
)
*
G
*
N
*
Ho
*
Wo
*
Y
*
X
*
C
);
using
namespace
ck
::
conv_tensor_rearrange_op
;
...
...
@@ -93,6 +93,7 @@ int main()
auto
&
op_ptr
=
op_ptrs
[
i
];
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
in
.
GetDeviceBuffer
(),
out
.
GetDeviceBuffer
(),
G
,
N
,
C
,
in_spatial_lengths
,
...
...
@@ -112,7 +113,7 @@ int main()
float
avg_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
true
});
std
::
size_t
num_bytes
=
sizeof
(
InDataType
)
*
N
*
Hi
*
Wi
*
G
*
C
+
sizeof
(
OutDataType
)
*
N
*
Ho
*
Wo
*
Y
*
X
*
C
;
sizeof
(
OutDataType
)
*
G
*
N
*
Ho
*
Wo
*
Y
*
X
*
C
;
float
gb_per_sec
=
num_bytes
/
1.E6
/
avg_time
;
...
...
@@ -149,6 +150,7 @@ int main()
<<
std
::
endl
;
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
in
.
GetDeviceBuffer
(),
out
.
GetDeviceBuffer
(),
G
,
N
,
C
,
in_spatial_lengths
,
...
...
example/52_im2col_col2im/column_to_image_f32.cpp
View file @
68fda7c2
...
...
@@ -20,18 +20,19 @@ using DeviceColToImgInstance = ck::tensor_operation::device::DeviceColumnToImage
bool
RunColumnToImage
(
const
ExecutionConfig
&
config
,
const
ck
::
utils
::
conv
::
ConvParam
&
conv_params
)
{
const
auto
G
=
conv_params
.
G_
;
const
auto
N
=
conv_params
.
N_
;
const
auto
C
=
conv_params
.
C_
;
const
ck
::
index_t
NDoHoWo
=
N
*
ck
::
accumulate_n
<
ck
::
index_t
>
(
const
ck
::
index_t
GNDoHoWo
=
G
*
N
*
ck
::
accumulate_n
<
ck
::
index_t
>
(
conv_params
.
output_spatial_lengths_
.
begin
(),
NDimSpatial
,
1
,
std
::
multiplies
<>
());
const
ck
::
index_t
CZYX
=
C
*
ck
::
accumulate_n
<
ck
::
index_t
>
(
conv_params
.
filter_spatial_lengths_
.
begin
(),
NDimSpatial
,
1
,
std
::
multiplies
<>
());
const
auto
in_desc
=
HostTensorDescriptor
({
NDoHoWo
,
CZYX
});
const
auto
in_desc
=
HostTensorDescriptor
({
G
NDoHoWo
,
CZYX
});
const
auto
out_desc
=
ck
::
utils
::
conv
::
make_input_host_tensor_descriptor_g_n_c_wis_packed
<
ImLayout
>
(
conv_params
);
...
...
@@ -86,6 +87,7 @@ bool RunColumnToImage(const ExecutionConfig& config, const ck::utils::conv::Conv
auto
invoker
=
col2img
.
MakeInvoker
();
auto
argument
=
col2img
.
MakeArgument
(
in_device_buf
.
GetDeviceBuffer
(),
out_device_buf
.
GetDeviceBuffer
(),
G
,
N
,
C
,
input_spatial_lengths
,
...
...
@@ -108,7 +110,7 @@ bool RunColumnToImage(const ExecutionConfig& config, const ck::utils::conv::Conv
}
float
ave_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
config
.
time_kernel
});
std
::
size_t
num_btype
=
NDoHoWo
*
CZYX
*
(
sizeof
(
OutDataType
)
+
sizeof
(
InDataType
));
std
::
size_t
num_btype
=
G
NDoHoWo
*
CZYX
*
(
sizeof
(
OutDataType
)
+
sizeof
(
InDataType
));
float
gb_per_sec
=
num_btype
/
1.E6
/
ave_time
;
std
::
cout
<<
"Perf: "
<<
ave_time
<<
" ms, "
<<
gb_per_sec
<<
" GB/s"
<<
std
::
endl
;
...
...
example/52_im2col_col2im/image_to_column_f32.cpp
View file @
68fda7c2
...
...
@@ -20,12 +20,13 @@ using DeviceImgToColInstance = ck::tensor_operation::device::DeviceImageToColumn
bool
RunImageToColumn
(
const
ExecutionConfig
&
config
,
const
ck
::
utils
::
conv
::
ConvParam
&
conv_params
)
{
const
auto
G
=
conv_params
.
G_
;
const
auto
N
=
conv_params
.
N_
;
const
auto
C
=
conv_params
.
C_
;
const
ck
::
index_t
NDoHoWo
=
N
*
ck
::
accumulate_n
<
ck
::
index_t
>
(
const
ck
::
index_t
GNDoHoWo
=
G
*
N
*
ck
::
accumulate_n
<
ck
::
index_t
>
(
conv_params
.
output_spatial_lengths_
.
begin
(),
NDimSpatial
,
1
,
std
::
multiplies
<>
());
const
ck
::
index_t
CZYX
=
C
*
ck
::
accumulate_n
<
ck
::
index_t
>
(
...
...
@@ -33,7 +34,7 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv
const
auto
in_desc
=
ck
::
utils
::
conv
::
make_input_host_tensor_descriptor_g_n_c_wis_packed
<
ImLayout
>
(
conv_params
);
const
auto
out_desc
=
HostTensorDescriptor
({
NDoHoWo
,
CZYX
});
const
auto
out_desc
=
HostTensorDescriptor
({
G
NDoHoWo
,
CZYX
});
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_spatial_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
filter_spatial_lengths
{};
...
...
@@ -86,6 +87,7 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv
auto
invoker
=
img2col
.
MakeInvoker
();
auto
argument
=
img2col
.
MakeArgument
(
in_device_buf
.
GetDeviceBuffer
(),
out_device_buf
.
GetDeviceBuffer
(),
G
,
N
,
C
,
input_spatial_lengths
,
...
...
@@ -108,7 +110,7 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv
}
float
ave_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
config
.
time_kernel
});
std
::
size_t
num_btype
=
NDoHoWo
*
CZYX
*
(
sizeof
(
OutDataType
)
+
sizeof
(
InDataType
));
std
::
size_t
num_btype
=
G
NDoHoWo
*
CZYX
*
(
sizeof
(
OutDataType
)
+
sizeof
(
InDataType
));
float
gb_per_sec
=
num_btype
/
1.E6
/
ave_time
;
std
::
cout
<<
"Perf: "
<<
ave_time
<<
" ms, "
<<
gb_per_sec
<<
" GB/s"
<<
std
::
endl
;
...
...
include/ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp
View file @
68fda7c2
...
...
@@ -14,11 +14,12 @@ namespace device {
/**
* \brief Convolution Tensor Rearrange.
*
* This Device operator supports conversion image
([G, N, Di, Hi, Wi, C])
to
* the gemm problem
([N * Do * Ho * Wo, Z * Y * X * C])
(Image to Column) and
* This Device operator supports conversion image to
* the gemm problem (Image to Column) and
* conversion gemm form to the image (Column to Image).
*
* Note that G must be equal to 1.
* Supported layouts:
* [G, N, Di, Hi, Wi, C] <-> [G * N * Do * Ho * Wo, Z * Y * X * C]
* [N, Di, Hi, Wi, G, C] <-> [N * Do * Ho * Wo * G, Z * Y * X * C]
*
* \tparam NDimSpatial Number of spatial dimensions.
* \tparam ImageLayout Input Layout.
...
...
@@ -39,6 +40,7 @@ struct DeviceConvTensorRearrange : public BaseOperator
*
* \param p_in A pointer to the device memory of the input image.
* \param p_out A pointer to the device memory of the output.
* \param G Convolution number of groups.
* \param N Convolution batch size.
* \param C Convolution number of channels.
* \param input_spatial_lengths Input spatial lengths.
...
...
@@ -55,6 +57,7 @@ struct DeviceConvTensorRearrange : public BaseOperator
virtual
std
::
unique_ptr
<
BaseArgument
>
MakeArgumentPointer
(
const
void
*
p_in
,
void
*
p_out
,
const
ck
::
index_t
G
,
const
ck
::
index_t
N
,
const
ck
::
index_t
C
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_spatial_lengths
,
...
...
include/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp
View file @
68fda7c2
...
...
@@ -17,15 +17,18 @@
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp"
#include "ck/host_utility/io.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
// Image to column for input layout NDHWC:
// input : image converted to the gemm problem [N * Do * Ho * Wo, Z * Y * X * C]
// output : image [N, Di, Hi, Wi, C]
// Column to Image:
// input : gemm form [G * N * Do * Ho * Wo, Z * Y * X * C]
// output : input image [G, N, Di, Hi, Wi, C]
// input : gemm form [N * Do * Ho * Wo * G, Z * Y * X * C]
// output : input image [N, Di, Hi, Wi, G, C]
template
<
index_t
NDimSpatial
,
typename
ImageLayout
,
typename
InputDataType
,
...
...
@@ -43,6 +46,14 @@ struct DeviceColumnToImageImpl
OutputDataType
,
conv_tensor_rearrange_op
::
ColumnToImage
>
{
static
constexpr
bool
is_NSpatialGC
=
std
::
is_same_v
<
ImageLayout
,
tensor_layout
::
convolution
::
NWGC
>
||
std
::
is_same_v
<
ImageLayout
,
tensor_layout
::
convolution
::
NHWGC
>
||
std
::
is_same_v
<
ImageLayout
,
tensor_layout
::
convolution
::
NDHWGC
>
;
static
constexpr
bool
is_GNSpatialC
=
std
::
is_same_v
<
ImageLayout
,
tensor_layout
::
convolution
::
GNWC
>
||
std
::
is_same_v
<
ImageLayout
,
tensor_layout
::
convolution
::
GNHWC
>
||
std
::
is_same_v
<
ImageLayout
,
tensor_layout
::
convolution
::
GNDHWC
>
;
static
constexpr
auto
I0
=
Number
<
0
>
{};
static
constexpr
auto
I1
=
Number
<
1
>
{};
...
...
@@ -85,7 +96,8 @@ struct DeviceColumnToImageImpl
// Make column form descriptor
static
auto
MakeInputDescriptor_M_K
(
const
ck
::
index_t
N
,
MakeInputDescriptor_M_K
(
const
ck
::
index_t
G
,
const
ck
::
index_t
N
,
const
ck
::
index_t
C
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
filter_spatial_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
output_spatial_lengths
,
...
...
@@ -100,16 +112,19 @@ struct DeviceColumnToImageImpl
C
*
ck
::
accumulate_n
<
index_t
>
(
filter_spatial_lengths
.
begin
(),
NDimSpatial
,
1
,
std
::
multiplies
<>
());
const
index_t
NStride
=
DoHoWo
*
gemm_m_k_strides
[
I0
]
*
gemm_m_k_strides
[
I1
];
const
index_t
AdditionalGroupStride
=
is_NSpatialGC
?
G
:
1
;
const
index_t
NStride
=
DoHoWo
*
gemm_m_k_strides
[
I0
]
*
gemm_m_k_strides
[
I1
]
*
AdditionalGroupStride
;
// Calculate the appropriate stride for each set of independent filters
// in each dimension
const
index_t
WStride
=
math
::
integer_divide_ceil
(
effs
[
XIdx
],
conv_filter_strides
[
XIdx
])
*
gemm_m_k_strides
[
I0
]
;
const
index_t
WStride
=
math
::
integer_divide_ceil
(
effs
[
XIdx
],
conv_filter_strides
[
XIdx
])
*
gemm_m_k_strides
[
I0
]
*
AdditionalGroupStride
;
const
index_t
HStride
=
math
::
integer_divide_ceil
(
effs
[
YIdx
],
conv_filter_strides
[
YIdx
])
*
output_spatial_lengths
[
XIdx
]
*
gemm_m_k_strides
[
I0
];
output_spatial_lengths
[
XIdx
]
*
gemm_m_k_strides
[
I0
]
*
AdditionalGroupStride
;
const
index_t
DStride
=
math
::
integer_divide_ceil
(
effs
[
ZIdx
],
conv_filter_strides
[
ZIdx
])
*
output_spatial_lengths
[
YIdx
]
*
output_spatial_lengths
[
XIdx
]
*
gemm_m_k_strides
[
I0
];
gemm_m_k_strides
[
I0
]
*
AdditionalGroupStride
;
// Create descriptor for independent filters in each dimension and
// then merge them into column form
if
constexpr
(
NDimSpatial
==
1
)
...
...
@@ -244,7 +259,7 @@ struct DeviceColumnToImageImpl
}
using
InputGridDesc
=
remove_cvref_t
<
decltype
(
MakeInputDescriptor_M_K
(
1
,
1
,
{},
{},
{},
{},
{},
{}))
>
;
remove_cvref_t
<
decltype
(
MakeInputDescriptor_M_K
(
1
,
1
,
1
,
{},
{},
{},
{},
{},
{}))
>
;
using
OutputGridDesc
=
remove_cvref_t
<
decltype
(
MakeOutDescriptor_M_K
(
1
,
1
,
{},
{},
{},
{},
{},
{},
{},
{},
{},
{}))
>
;
...
...
@@ -252,7 +267,8 @@ struct DeviceColumnToImageImpl
decltype
(
BlockToCTileMap_M00_N0_M01Adapt
<
MPerBlock
,
KPerBlock
,
InputGridDesc
>
(
InputGridDesc
{}))
>
;
using
GridwiseTensorRearrangeKernel
=
GridwiseTensorRearrange
<
InputGridDesc
,
using
GridwiseTensorRearrangeKernel
=
GridwiseTensorRearrange
<
InputGridDesc
,
InputDataType
,
OutputGridDesc
,
OutputDataType
,
...
...
@@ -262,12 +278,14 @@ struct DeviceColumnToImageImpl
ThreadClusterLengths
,
ScalarPerVector
,
InMemoryDataOperationEnum
::
Add
,
Block2ETileMap
>
;
Block2ETileMap
,
ComputePtrOffsetOfStridedBatch
<
I0
>>
;
struct
Argument
:
public
BaseArgument
{
Argument
(
const
void
*
p_in
,
// input image
void
*
p_out
,
// output image
const
ck
::
index_t
G
,
const
ck
::
index_t
N
,
const
ck
::
index_t
C
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_spatial_lengths
,
...
...
@@ -279,7 +297,8 @@ struct DeviceColumnToImageImpl
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_dilations
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_left_pads
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_right_pads
)
:
C_
(
C
),
:
G_
(
G
),
C_
(
C
),
X_
(
filter_spatial_lengths
[
NDimSpatial
-
I1
]),
p_in_
{
static_cast
<
const
InputDataType
*>
(
p_in
)},
p_out_
{
static_cast
<
OutputDataType
*>
(
p_out
)},
...
...
@@ -289,6 +308,23 @@ struct DeviceColumnToImageImpl
input_left_pads_
{
input_left_pads
},
input_right_pads_
{
input_right_pads
}
{
using
namespace
tensor_layout
::
convolution
;
if
constexpr
(
is_NSpatialGC
)
{
compute_ptr_offset_of_batch_
.
BatchStrideA_
=
gemm_m_k_strides
[
I0
]
*
gemm_m_k_strides
[
I1
];
}
else
if
constexpr
(
is_GNSpatialC
)
{
const
index_t
NDoHoWo
=
N
*
ck
::
accumulate_n
<
index_t
>
(
output_spatial_lengths
.
begin
(),
NDimSpatial
,
1
,
std
::
multiplies
<>
());
compute_ptr_offset_of_batch_
.
BatchStrideA_
=
NDoHoWo
*
gemm_m_k_strides
[
I0
]
*
gemm_m_k_strides
[
I1
];
}
compute_ptr_offset_of_batch_
.
BatchStrideC_
=
image_g_n_c_wis_strides
[
I0
];
const
index_t
x_eff
=
(
filter_spatial_lengths
[
XIdx
]
-
1
)
*
conv_filter_dilations
[
XIdx
]
+
1
;
const
index_t
y_eff
=
...
...
@@ -349,7 +385,8 @@ struct DeviceColumnToImageImpl
continue
;
const
auto
in_grid_desc_m_k
=
MakeInputDescriptor_M_K
(
N
,
MakeInputDescriptor_M_K
(
G
,
N
,
C
,
filter_spatial_lengths
,
output_spatial_lengths
,
...
...
@@ -384,13 +421,13 @@ struct DeviceColumnToImageImpl
const
index_t
z_offset_with_pad
=
math
::
max
(
0
,
z_img_offset
-
input_left_pads
[
ZIdx
]);
const
index_t
AdditionalGroupStride
=
is_NSpatialGC
?
G
:
1
;
// Memory offsets to next set of independent filters,
// move to independent filters in each dimension
const
index_t
in_offset
=
x_idx
*
gemm_m_k_strides
[
0
]
+
y_idx
*
gemm_m_k_strides
[
0
]
*
output_spatial_lengths
[
XIdx
]
+
z_idx
*
gemm_m_k_strides
[
0
]
*
output_spatial_lengths
[
YIdx
]
*
output_spatial_lengths
[
XIdx
];
(
x_idx
+
y_idx
*
output_spatial_lengths
[
XIdx
]
+
z_idx
*
output_spatial_lengths
[
YIdx
]
*
output_spatial_lengths
[
XIdx
])
*
gemm_m_k_strides
[
0
]
*
AdditionalGroupStride
;
// Move to independent filters in appropriate dimensions
const
index_t
out_offset
=
x_offset_with_pad
*
image_g_n_c_wis_strides
[
spatial_offset
+
XIdx
]
+
...
...
@@ -417,6 +454,7 @@ struct DeviceColumnToImageImpl
}
}
const
ck
::
index_t
G_
;
const
ck
::
index_t
C_
;
const
ck
::
index_t
X_
;
...
...
@@ -434,6 +472,8 @@ struct DeviceColumnToImageImpl
std
::
vector
<
const
InputDataType
*>
p_in_container_
;
std
::
vector
<
OutputDataType
*>
p_out_container_
;
ComputePtrOffsetOfStridedBatch
<
I0
>
compute_ptr_offset_of_batch_
;
};
struct
Invoker
:
public
BaseInvoker
...
...
@@ -451,6 +491,7 @@ struct DeviceColumnToImageImpl
OutputGridDesc
,
OutputDataType
,
Block2ETileMap
,
ComputePtrOffsetOfStridedBatch
<
I0
>
,
GridwiseTensorRearrangeKernel
>
;
// Execute each set of independent filters
...
...
@@ -470,7 +511,9 @@ struct DeviceColumnToImageImpl
arg
.
p_in_container_
[
i
],
arg
.
out_grid_desc_m_k_container_
[
i
],
arg
.
p_out_container_
[
i
],
block_2_tile_map
);
arg
.
G_
,
block_2_tile_map
,
arg
.
compute_ptr_offset_of_batch_
);
}
return
elapsed_time
;
}
...
...
@@ -485,8 +528,7 @@ struct DeviceColumnToImageImpl
bool
IsSupportedArgument
(
const
Argument
&
arg
)
{
using
namespace
tensor_layout
::
convolution
;
if
constexpr
(
!
(
std
::
is_same_v
<
ImageLayout
,
GNWC
>
||
std
::
is_same_v
<
ImageLayout
,
GNHWC
>
||
std
::
is_same_v
<
ImageLayout
,
GNDHWC
>
))
if
constexpr
(
!
(
is_NSpatialGC
||
is_GNSpatialC
))
{
return
false
;
}
...
...
@@ -534,6 +576,7 @@ struct DeviceColumnToImageImpl
static
auto
MakeArgument
(
const
void
*
p_in
,
// input image
void
*
p_out
,
// output image
const
ck
::
index_t
G
,
const
ck
::
index_t
N
,
const
ck
::
index_t
C
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_spatial_lengths
,
...
...
@@ -548,6 +591,7 @@ struct DeviceColumnToImageImpl
{
return
Argument
{
static_cast
<
const
InputDataType
*>
(
p_in
),
static_cast
<
OutputDataType
*>
(
p_out
),
G
,
N
,
C
,
input_spatial_lengths
,
...
...
@@ -566,6 +610,7 @@ struct DeviceColumnToImageImpl
std
::
unique_ptr
<
BaseArgument
>
MakeArgumentPointer
(
const
void
*
p_in
,
// input image
void
*
p_out
,
// output image
const
ck
::
index_t
G
,
const
ck
::
index_t
N
,
const
ck
::
index_t
C
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_spatial_lengths
,
...
...
@@ -580,6 +625,7 @@ struct DeviceColumnToImageImpl
{
return
std
::
make_unique
<
Argument
>
(
static_cast
<
const
InputDataType
*>
(
p_in
),
static_cast
<
OutputDataType
*>
(
p_out
),
G
,
N
,
C
,
input_spatial_lengths
,
...
...
include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp
View file @
68fda7c2
...
...
@@ -15,15 +15,18 @@
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp"
#include "ck/host_utility/io.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
// Image to column for input layout NDHWC:
// input : input image [N, Di, Hi, Wi, C]
// output : gemm form [N * Do * Ho * Wo, Z * Y * X * C]
// Image to column:
// input : input image [G, N, Di, Hi, Wi, C]
// output : gemm form [G * N * Do * Ho * Wo, Z * Y * X * C]
// input : input image [N, Di, Hi, Wi, G, C]
// output : gemm form [N * Do * Ho * Wo * G, Z * Y * X * C]
template
<
index_t
NDimSpatial
,
typename
ImageLayout
,
typename
InputDataType
,
...
...
@@ -41,6 +44,14 @@ struct DeviceImageToColumnImpl
OutputDataType
,
conv_tensor_rearrange_op
::
ImageToColumn
>
{
static
constexpr
bool
is_NSpatialGC
=
std
::
is_same_v
<
ImageLayout
,
tensor_layout
::
convolution
::
NWGC
>
||
std
::
is_same_v
<
ImageLayout
,
tensor_layout
::
convolution
::
NHWGC
>
||
std
::
is_same_v
<
ImageLayout
,
tensor_layout
::
convolution
::
NDHWGC
>
;
static
constexpr
bool
is_GNSpatialC
=
std
::
is_same_v
<
ImageLayout
,
tensor_layout
::
convolution
::
GNWC
>
||
std
::
is_same_v
<
ImageLayout
,
tensor_layout
::
convolution
::
GNHWC
>
||
std
::
is_same_v
<
ImageLayout
,
tensor_layout
::
convolution
::
GNDHWC
>
;
static
constexpr
auto
I0
=
Number
<
0
>
{};
static
constexpr
auto
I1
=
Number
<
1
>
{};
...
...
@@ -105,7 +116,8 @@ struct DeviceImageToColumnImpl
}
static
auto
MakeOutDescriptor_M_K
(
const
ck
::
index_t
N
,
MakeOutDescriptor_M_K
(
const
ck
::
index_t
G
,
const
ck
::
index_t
N
,
const
ck
::
index_t
C
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
filter_spatial_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
output_spatial_lengths
,
...
...
@@ -117,22 +129,32 @@ struct DeviceImageToColumnImpl
const
index_t
CZYX
=
C
*
ck
::
accumulate_n
<
index_t
>
(
filter_spatial_lengths
.
begin
(),
NDimSpatial
,
1
,
std
::
multiplies
<>
());
if
constexpr
(
is_NSpatialGC
)
{
const
auto
desc_mraw_kraw
=
make_naive_tensor_descriptor
(
make_tuple
(
NDoHoWo
,
CZYX
),
make_tuple
(
gemm_m_k_strides
[
I0
]
*
G
,
gemm_m_k_strides
[
I1
]));
return
matrix_padder
.
PadADescriptor_M_K
(
desc_mraw_kraw
);
}
else
if
constexpr
(
is_GNSpatialC
)
{
const
auto
desc_mraw_kraw
=
make_naive_tensor_descriptor
(
make_tuple
(
NDoHoWo
,
CZYX
),
make_tuple
(
gemm_m_k_strides
[
I0
],
gemm_m_k_strides
[
I1
]));
const
auto
desc_m_k
=
matrix_padder
.
PadADescriptor_M_K
(
desc_mraw_kraw
);
return
desc_m_k
;
return
matrix_padder
.
PadADescriptor_M_K
(
desc_mraw_kraw
);
}
}
using
InputGridDesc
=
remove_cvref_t
<
decltype
(
MakeInputDescriptor_M_K
(
1
,
1
,
{},
{},
{},
{},
{},
{},
{},
{}))
>
;
using
OutputGridDesc
=
remove_cvref_t
<
decltype
(
MakeOutDescriptor_M_K
(
1
,
1
,
{},
{},
{}))
>
;
using
OutputGridDesc
=
remove_cvref_t
<
decltype
(
MakeOutDescriptor_M_K
(
1
,
1
,
1
,
{},
{},
{}))
>
;
using
Block2ETileMap
=
remove_cvref_t
<
decltype
(
BlockToCTileMap_M00_N0_M01Adapt
<
MPerBlock
,
KPerBlock
,
OutputGridDesc
>
(
OutputGridDesc
{}))
>
;
using
GridwiseTensorRearrangeKernel
=
GridwiseTensorRearrange
<
InputGridDesc
,
using
GridwiseTensorRearrangeKernel
=
GridwiseTensorRearrange
<
InputGridDesc
,
InputDataType
,
OutputGridDesc
,
OutputDataType
,
...
...
@@ -142,12 +164,14 @@ struct DeviceImageToColumnImpl
ThreadClusterLengths
,
ScalarPerVector
,
InMemoryDataOperationEnum
::
Set
,
Block2ETileMap
>
;
Block2ETileMap
,
ComputePtrOffsetOfStridedBatch
<
I0
>>
;
struct
Argument
:
public
BaseArgument
{
Argument
(
const
void
*
p_in
,
// input image
void
*
p_out
,
// gemm form
const
ck
::
index_t
G
,
const
ck
::
index_t
N
,
const
ck
::
index_t
C
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_spatial_lengths
,
...
...
@@ -159,7 +183,8 @@ struct DeviceImageToColumnImpl
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_dilations
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_left_pads
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_right_pads
)
:
C_
(
C
),
:
G_
(
G
),
C_
(
C
),
X_
(
filter_spatial_lengths
[
NDimSpatial
-
I1
]),
p_in_
{
static_cast
<
const
InputDataType
*>
(
p_in
)},
p_out_
{
static_cast
<
OutputDataType
*>
(
p_out
)},
...
...
@@ -176,14 +201,28 @@ struct DeviceImageToColumnImpl
filter_spatial_lengths
,
output_spatial_lengths
,
image_g_n_c_wis_strides
,
conv_filter_strides
,
conv_filter_dilations
,
input_left_pads
,
input_right_pads
);
out_grid_desc_m_k_
=
MakeOutDescriptor_M_K
(
N
,
C
,
filter_spatial_lengths
,
output_spatial_lengths
,
gemm_m_k_strides
);
G
,
N
,
C
,
filter_spatial_lengths
,
output_spatial_lengths
,
gemm_m_k_strides
);
compute_ptr_offset_of_batch_
.
BatchStrideA_
=
image_g_n_c_wis_strides
[
I0
];
if
constexpr
(
is_NSpatialGC
)
{
compute_ptr_offset_of_batch_
.
BatchStrideC_
=
gemm_m_k_strides
[
I0
]
*
gemm_m_k_strides
[
I1
];
}
else
if
constexpr
(
is_GNSpatialC
)
{
const
index_t
NDoHoWo
=
N
*
ck
::
accumulate_n
<
index_t
>
(
output_spatial_lengths
.
begin
(),
NDimSpatial
,
1
,
std
::
multiplies
<>
());
compute_ptr_offset_of_batch_
.
BatchStrideC_
=
NDoHoWo
*
gemm_m_k_strides
[
I0
]
*
gemm_m_k_strides
[
I1
];
}
}
void
Print
()
const
...
...
@@ -192,6 +231,7 @@ struct DeviceImageToColumnImpl
std
::
cout
<<
out_grid_desc_m_k_
<<
std
::
endl
;
}
const
ck
::
index_t
G_
;
const
ck
::
index_t
C_
;
const
ck
::
index_t
X_
;
...
...
@@ -206,6 +246,8 @@ struct DeviceImageToColumnImpl
InputGridDesc
in_grid_desc_m_k_
;
OutputGridDesc
out_grid_desc_m_k_
;
ComputePtrOffsetOfStridedBatch
<
I0
>
compute_ptr_offset_of_batch_
;
};
struct
Invoker
:
public
BaseInvoker
...
...
@@ -226,6 +268,7 @@ struct DeviceImageToColumnImpl
OutputGridDesc
,
OutputDataType
,
Block2ETileMap
,
ComputePtrOffsetOfStridedBatch
<
I0
>
,
GridwiseTensorRearrangeKernel
>
;
float
elapsed_time
=
launch_and_time_kernel
(
stream_config
,
...
...
@@ -237,7 +280,9 @@ struct DeviceImageToColumnImpl
arg
.
p_in_
,
arg
.
out_grid_desc_m_k_
,
arg
.
p_out_
,
block_2_tile_map
);
arg
.
G_
,
block_2_tile_map
,
arg
.
compute_ptr_offset_of_batch_
);
return
elapsed_time
;
}
...
...
@@ -250,9 +295,7 @@ struct DeviceImageToColumnImpl
bool
IsSupportedArgument
(
const
Argument
&
arg
)
{
using
namespace
tensor_layout
::
convolution
;
if
constexpr
(
!
(
std
::
is_same_v
<
ImageLayout
,
GNWC
>
||
std
::
is_same_v
<
ImageLayout
,
GNHWC
>
||
std
::
is_same_v
<
ImageLayout
,
GNDHWC
>
))
if
constexpr
(
!
(
is_NSpatialGC
||
is_GNSpatialC
))
{
return
false
;
}
...
...
@@ -295,6 +338,7 @@ struct DeviceImageToColumnImpl
static
auto
MakeArgument
(
const
void
*
p_in
,
// input image
void
*
p_out
,
// gemm form
const
ck
::
index_t
G
,
const
ck
::
index_t
N
,
const
ck
::
index_t
C
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_spatial_lengths
,
...
...
@@ -309,6 +353,7 @@ struct DeviceImageToColumnImpl
{
return
Argument
{
static_cast
<
const
InputDataType
*>
(
p_in
),
static_cast
<
OutputDataType
*>
(
p_out
),
G
,
N
,
C
,
input_spatial_lengths
,
...
...
@@ -327,6 +372,7 @@ struct DeviceImageToColumnImpl
std
::
unique_ptr
<
BaseArgument
>
MakeArgumentPointer
(
const
void
*
p_in
,
// input image
void
*
p_out
,
// gemm form
const
ck
::
index_t
G
,
const
ck
::
index_t
N
,
const
ck
::
index_t
C
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_spatial_lengths
,
...
...
@@ -341,6 +387,7 @@ struct DeviceImageToColumnImpl
{
return
std
::
make_unique
<
Argument
>
(
static_cast
<
const
InputDataType
*>
(
p_in
),
static_cast
<
OutputDataType
*>
(
p_out
),
G
,
N
,
C
,
input_spatial_lengths
,
...
...
include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp
View file @
68fda7c2
...
...
@@ -21,6 +21,7 @@ template <typename InputGridDesc,
typename
OutputGridDesc
,
typename
OutputDataType
,
typename
Block2ETileMap
,
typename
ComputePtrOffsetOfStridedBatch
,
typename
GridwiseTensorRearrangeKernel
>
__global__
void
#if CK_USE_LAUNCH_BOUNDS
...
...
@@ -30,13 +31,20 @@ __global__ void
const
InputDataType
*
__restrict__
p_in_global
,
const
OutputGridDesc
out_grid_desc
,
OutputDataType
*
__restrict__
p_out_global
,
const
Block2ETileMap
block_2_tile_map
)
const
index_t
batch_count
,
const
Block2ETileMap
block_2_tile_map
,
const
ComputePtrOffsetOfStridedBatch
compute_ptr_offset_of_batch
)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \
defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx1030__) || defined(__gfx1100__) || \
defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx941__) || defined(__gfx942__))
GridwiseTensorRearrangeKernel
::
Run
(
in_grid_desc
,
p_in_global
,
out_grid_desc
,
p_out_global
,
block_2_tile_map
);
GridwiseTensorRearrangeKernel
::
Run
(
in_grid_desc
,
p_in_global
,
out_grid_desc
,
p_out_global
,
batch_count
,
block_2_tile_map
,
compute_ptr_offset_of_batch
);
#else
ignore
=
in_grid_desc
;
ignore
=
p_in_global
;
...
...
@@ -56,7 +64,8 @@ template <typename InputGridDesc,
typename
ThreadClusterLengths
,
index_t
ScalarPerVector
,
InMemoryDataOperationEnum
DstInMemOp
,
typename
Block2ETileMap
>
typename
Block2ETileMap
,
typename
ComputePtrOffsetOfStridedBatch
>
struct
GridwiseTensorRearrange
{
...
...
@@ -69,7 +78,9 @@ struct GridwiseTensorRearrange
const
InputDataType
*
__restrict__
p_in_global
,
const
OutputGridDesc
&
out_grid_desc
,
OutputDataType
*
__restrict__
p_out_global
,
const
Block2ETileMap
&
block_2_tile_map
)
const
index_t
batch_count
,
const
Block2ETileMap
&
block_2_tile_map
,
const
ComputePtrOffsetOfStridedBatch
&
compute_ptr_offset_of_batch
)
{
const
auto
block_work_idx
=
block_2_tile_map
.
CalculateBottomIndex
(
make_multi_index
(
get_block_1d_id
()));
...
...
@@ -80,12 +91,6 @@ struct GridwiseTensorRearrange
const
index_t
k_block_data_idx_on_grid
=
__builtin_amdgcn_readfirstlane
(
block_work_idx
[
I1
]
*
KPerBlock
);
// Global Memory
const
auto
in_global_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
p_in_global
,
in_grid_desc
.
GetElementSpaceSize
());
auto
out_global_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
p_out_global
,
out_grid_desc
.
GetElementSpaceSize
());
auto
copy_global_to_global
=
ThreadGroupTensorSliceTransfer_v7
<
ThisThreadBlock
,
Tuple
<
InputDataType
>
,
...
...
@@ -108,9 +113,23 @@ struct GridwiseTensorRearrange
make_tuple
(
make_multi_index
(
m_block_data_idx_on_grid
,
k_block_data_idx_on_grid
)),
tensor_operation
::
element_wise
::
PassThrough
{}};
for
(
index_t
idx
=
0
;
idx
<
batch_count
;
idx
++
)
{
// Global Memory
const
index_t
a_batch_offset
=
__builtin_amdgcn_readfirstlane
(
compute_ptr_offset_of_batch
.
GetAPtrOffset
(
idx
));
const
index_t
c_batch_offset
=
__builtin_amdgcn_readfirstlane
(
compute_ptr_offset_of_batch
.
GetCPtrOffset
(
idx
));
const
auto
in_global_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
p_in_global
+
a_batch_offset
,
in_grid_desc
.
GetElementSpaceSize
());
auto
out_global_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
p_out_global
+
c_batch_offset
,
out_grid_desc
.
GetElementSpaceSize
());
copy_global_to_global
.
Run
(
tie
(
in_grid_desc
),
tie
(
in_global_buf
),
tie
(
out_grid_desc
),
tie
(
out_global_buf
));
}
}
__host__
static
constexpr
bool
CheckValidity
(
const
InputGridDesc
&
in_grid_desc
,
const
OutputGridDesc
&
out_grid_desc
)
...
...
library/include/ck/library/reference_tensor_operation/cpu/reference_column_to_image.hpp
View file @
68fda7c2
...
...
@@ -19,9 +19,7 @@ namespace host {
* \brief Reference implementation for column to image.
*
* Input tensor descriptor has [N * Do * Ho * Wo, Z * Y * X * C] data layout.
* Memory layout is the same.
* Output tensor descriptor has [G, N, C, Di, Hi, Wi] data layout.
* G must be equal to 1. Memory layout is [G, N, Di, Hi, Wi, C].
*
* \tparam NDimSpatial Number of spatial dimensions.
* \tparam ImageLayout Image Layout.
...
...
@@ -94,23 +92,33 @@ struct ReferenceColumnToImage : public device::BaseOperator
float
Run
(
const
Argument
&
arg
)
{
using
namespace
ck
::
tensor_layout
::
convolution
;
if
(
!
(
arg
.
output_
.
GetNumOfDimension
()
==
NDimSpatial
+
3
&&
arg
.
input_
.
GetNumOfDimension
()
==
2
))
{
throw
std
::
runtime_error
(
"wrong! inconsistent dimension"
);
}
const
index_t
G
=
arg
.
output_
.
GetLengths
()[
0
];
const
index_t
N
=
arg
.
output_
.
GetLengths
()[
1
];
const
index_t
C
=
arg
.
output_
.
GetLengths
()[
2
];
if
constexpr
(
NDimSpatial
==
1
)
{
const
index_t
Wo
=
arg
.
output_spatial_lengths_
[
0
];
auto
func
=
[
&
](
auto
n
)
{
auto
func
=
[
&
](
auto
g
,
auto
n
)
{
for
(
index_t
wo
=
0
;
wo
<
Wo
;
++
wo
)
{
index_t
row
=
n
*
Wo
+
wo
;
index_t
column
=
0
;
if
constexpr
(
std
::
is_same_v
<
ImageLayout
,
GNWC
>
)
{
row
=
g
*
N
*
Wo
+
n
*
Wo
+
wo
;
}
else
if
constexpr
(
std
::
is_same_v
<
ImageLayout
,
NWGC
>
)
{
row
=
n
*
Wo
*
G
+
wo
*
G
+
g
;
}
for
(
index_t
x
=
0
;
x
<
arg
.
filter_spatial_lengths_
[
0
];
++
x
)
{
...
...
@@ -124,8 +132,8 @@ struct ReferenceColumnToImage : public device::BaseOperator
ck
::
type_convert
<
std
::
size_t
>
(
wi
)
<
arg
.
output_
.
GetLengths
()[
3
])
{
float
v_in
=
ck
::
type_convert
<
float
>
(
arg
.
input_
(
row
,
column
));
float
v_out
=
ck
::
type_convert
<
float
>
(
arg
.
output_
(
0
,
n
,
c
,
wi
));
arg
.
output_
(
0
,
n
,
c
,
wi
)
=
float
v_out
=
ck
::
type_convert
<
float
>
(
arg
.
output_
(
g
,
n
,
c
,
wi
));
arg
.
output_
(
g
,
n
,
c
,
wi
)
=
ck
::
type_convert
<
OutDataType
>
(
v_in
+
v_out
);
}
column
++
;
...
...
@@ -134,7 +142,7 @@ struct ReferenceColumnToImage : public device::BaseOperator
}
};
make_ParallelTensorFunctor
(
func
,
N
)(
std
::
thread
::
hardware_concurrency
());
make_ParallelTensorFunctor
(
func
,
G
,
N
)(
std
::
thread
::
hardware_concurrency
());
return
0
;
}
...
...
@@ -143,13 +151,21 @@ struct ReferenceColumnToImage : public device::BaseOperator
const
index_t
Ho
=
arg
.
output_spatial_lengths_
[
0
];
const
index_t
Wo
=
arg
.
output_spatial_lengths_
[
1
];
auto
func
=
[
&
](
auto
n
)
{
auto
func
=
[
&
](
auto
g
,
auto
n
)
{
for
(
index_t
ho
=
0
;
ho
<
Ho
;
++
ho
)
{
for
(
index_t
wo
=
0
;
wo
<
Wo
;
++
wo
)
{
index_t
row
=
n
*
Ho
*
Wo
+
ho
*
Wo
+
wo
;
index_t
row
=
0
;
index_t
column
=
0
;
if
constexpr
(
std
::
is_same_v
<
ImageLayout
,
GNHWC
>
)
{
row
=
g
*
N
*
Ho
*
Wo
+
n
*
Ho
*
Wo
+
ho
*
Wo
+
wo
;
}
else
if
constexpr
(
std
::
is_same_v
<
ImageLayout
,
NHWGC
>
)
{
row
=
n
*
Ho
*
Wo
*
G
+
ho
*
Wo
*
G
+
wo
*
G
+
g
;
}
for
(
index_t
y
=
0
;
y
<
arg
.
filter_spatial_lengths_
[
0
];
++
y
)
{
...
...
@@ -178,8 +194,8 @@ struct ReferenceColumnToImage : public device::BaseOperator
float
v_in
=
ck
::
type_convert
<
float
>
(
arg
.
input_
(
row
,
column
));
float
v_out
=
ck
::
type_convert
<
float
>
(
arg
.
output_
(
0
,
n
,
c
,
hi
,
wi
));
arg
.
output_
(
0
,
n
,
c
,
hi
,
wi
)
=
arg
.
output_
(
g
,
n
,
c
,
hi
,
wi
));
arg
.
output_
(
g
,
n
,
c
,
hi
,
wi
)
=
ck
::
type_convert
<
OutDataType
>
(
v_in
+
v_out
);
}
column
++
;
...
...
@@ -190,7 +206,7 @@ struct ReferenceColumnToImage : public device::BaseOperator
}
};
make_ParallelTensorFunctor
(
func
,
N
)(
std
::
thread
::
hardware_concurrency
());
make_ParallelTensorFunctor
(
func
,
G
,
N
)(
std
::
thread
::
hardware_concurrency
());
return
0
;
}
...
...
@@ -200,15 +216,25 @@ struct ReferenceColumnToImage : public device::BaseOperator
const
index_t
Ho
=
arg
.
output_spatial_lengths_
[
1
];
const
index_t
Wo
=
arg
.
output_spatial_lengths_
[
2
];
auto
func
=
[
&
](
auto
n
)
{
auto
func
=
[
&
](
auto
g
,
auto
n
)
{
for
(
index_t
d_o
=
0
;
d_o
<
Do
;
++
d_o
)
{
for
(
index_t
ho
=
0
;
ho
<
Ho
;
++
ho
)
{
for
(
index_t
wo
=
0
;
wo
<
Wo
;
++
wo
)
{
index_t
row
=
n
*
Do
*
Ho
*
Wo
+
d_o
*
Ho
*
Wo
+
ho
*
Wo
+
wo
;
index_t
row
=
0
;
index_t
column
=
0
;
if
constexpr
(
std
::
is_same_v
<
ImageLayout
,
GNDHWC
>
)
{
row
=
g
*
N
*
Do
*
Ho
*
Wo
+
n
*
Do
*
Ho
*
Wo
+
d_o
*
Ho
*
Wo
+
ho
*
Wo
+
wo
;
}
else
if
constexpr
(
std
::
is_same_v
<
ImageLayout
,
NDHWGC
>
)
{
row
=
n
*
Do
*
Ho
*
Wo
*
G
+
d_o
*
Ho
*
Wo
*
G
+
ho
*
Wo
*
G
+
wo
*
G
+
g
;
}
for
(
index_t
z
=
0
;
z
<
arg
.
filter_spatial_lengths_
[
0
];
++
z
)
{
...
...
@@ -247,8 +273,8 @@ struct ReferenceColumnToImage : public device::BaseOperator
float
v_in
=
ck
::
type_convert
<
float
>
(
arg
.
input_
(
row
,
column
));
float
v_out
=
ck
::
type_convert
<
float
>
(
arg
.
output_
(
0
,
n
,
c
,
di
,
hi
,
wi
));
arg
.
output_
(
0
,
n
,
c
,
di
,
hi
,
wi
)
=
arg
.
output_
(
g
,
n
,
c
,
di
,
hi
,
wi
));
arg
.
output_
(
g
,
n
,
c
,
di
,
hi
,
wi
)
=
ck
::
type_convert
<
OutDataType
>
(
v_in
+
v_out
);
}
column
++
;
...
...
@@ -261,7 +287,7 @@ struct ReferenceColumnToImage : public device::BaseOperator
}
};
make_ParallelTensorFunctor
(
func
,
N
)(
std
::
thread
::
hardware_concurrency
());
make_ParallelTensorFunctor
(
func
,
G
,
N
)(
std
::
thread
::
hardware_concurrency
());
return
0
;
}
...
...
library/include/ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp
View file @
68fda7c2
...
...
@@ -19,9 +19,7 @@ namespace host {
* \brief Reference implementation for image to column.
*
* Input tensor descriptor has [G, N, C, Di, Hi, Wi] data layout.
* G must be equal to 1. Memory layout is [G, N, Di, Hi, Wi, C].
* Output tensor descriptor has [N * Do * Ho * Wo, Z * Y * X * C] data layout.
* Memory layout is the same.
* Output tensor descriptor has [G * N * Do * Ho * Wo, Z * Y * X * C] data layout.
*
* \tparam NDimSpatial Number of spatial dimensions.
* \tparam ImageLayout Image Layout.
...
...
@@ -94,21 +92,31 @@ struct ReferenceImageToColumn : public device::BaseOperator
float
Run
(
const
Argument
&
arg
)
{
using
namespace
ck
::
tensor_layout
::
convolution
;
if
(
!
(
arg
.
input_
.
GetNumOfDimension
()
==
NDimSpatial
+
3
&&
arg
.
output_
.
GetNumOfDimension
()
==
2
))
{
throw
std
::
runtime_error
(
"wrong! inconsistent dimension"
);
}
const
index_t
G
=
arg
.
input_
.
GetLengths
()[
0
];
const
index_t
N
=
arg
.
input_
.
GetLengths
()[
1
];
const
index_t
C
=
arg
.
input_
.
GetLengths
()[
2
];
if
constexpr
(
NDimSpatial
==
1
)
{
const
index_t
Wo
=
arg
.
output_spatial_lengths_
[
0
];
auto
func
=
[
&
](
auto
n
,
auto
wo
)
{
index_t
row
=
n
*
Wo
+
wo
;
auto
func
=
[
&
](
auto
g
,
auto
n
,
auto
wo
)
{
index_t
row
=
0
;
index_t
column
=
0
;
if
constexpr
(
std
::
is_same_v
<
ImageLayout
,
GNWC
>
)
{
row
=
g
*
N
*
Wo
+
n
*
Wo
+
wo
;
}
else
if
constexpr
(
std
::
is_same_v
<
ImageLayout
,
NWGC
>
)
{
row
=
n
*
Wo
*
G
+
wo
*
G
+
g
;
}
for
(
index_t
x
=
0
;
x
<
arg
.
filter_spatial_lengths_
[
0
];
++
x
)
{
...
...
@@ -121,7 +129,7 @@ struct ReferenceImageToColumn : public device::BaseOperator
if
(
wi
>=
0
&&
ck
::
type_convert
<
std
::
size_t
>
(
wi
)
<
arg
.
input_
.
GetLengths
()[
3
])
{
InDataType
v_in
=
arg
.
input_
(
0
,
n
,
c
,
wi
);
InDataType
v_in
=
arg
.
input_
(
g
,
n
,
c
,
wi
);
arg
.
output_
(
row
,
column
)
=
ck
::
type_convert
<
OutDataType
>
(
v_in
);
}
column
++
;
...
...
@@ -129,7 +137,7 @@ struct ReferenceImageToColumn : public device::BaseOperator
}
};
make_ParallelTensorFunctor
(
func
,
N
,
Wo
)(
std
::
thread
::
hardware_concurrency
());
make_ParallelTensorFunctor
(
func
,
G
,
N
,
Wo
)(
std
::
thread
::
hardware_concurrency
());
return
0
;
}
...
...
@@ -138,9 +146,17 @@ struct ReferenceImageToColumn : public device::BaseOperator
const
index_t
Ho
=
arg
.
output_spatial_lengths_
[
0
];
const
index_t
Wo
=
arg
.
output_spatial_lengths_
[
1
];
auto
func
=
[
&
](
auto
n
,
auto
ho
,
auto
wo
)
{
index_t
row
=
n
*
Ho
*
Wo
+
ho
*
Wo
+
wo
;
auto
func
=
[
&
](
auto
g
,
auto
n
,
auto
ho
,
auto
wo
)
{
index_t
row
=
0
;
index_t
column
=
0
;
if
constexpr
(
std
::
is_same_v
<
ImageLayout
,
GNHWC
>
)
{
row
=
g
*
N
*
Ho
*
Wo
+
n
*
Ho
*
Wo
+
ho
*
Wo
+
wo
;
}
else
if
constexpr
(
std
::
is_same_v
<
ImageLayout
,
NHWGC
>
)
{
row
=
n
*
Ho
*
Wo
*
G
+
ho
*
Wo
*
G
+
wo
*
G
+
g
;
}
for
(
index_t
y
=
0
;
y
<
arg
.
filter_spatial_lengths_
[
0
];
++
y
)
{
...
...
@@ -162,7 +178,7 @@ struct ReferenceImageToColumn : public device::BaseOperator
wi
>=
0
&&
ck
::
type_convert
<
std
::
size_t
>
(
wi
)
<
arg
.
input_
.
GetLengths
()[
4
])
{
InDataType
v_in
=
arg
.
input_
(
0
,
n
,
c
,
hi
,
wi
);
InDataType
v_in
=
arg
.
input_
(
g
,
n
,
c
,
hi
,
wi
);
arg
.
output_
(
row
,
column
)
=
ck
::
type_convert
<
OutDataType
>
(
v_in
);
}
column
++
;
...
...
@@ -171,7 +187,7 @@ struct ReferenceImageToColumn : public device::BaseOperator
}
};
make_ParallelTensorFunctor
(
func
,
N
,
Ho
,
Wo
)(
std
::
thread
::
hardware_concurrency
());
make_ParallelTensorFunctor
(
func
,
G
,
N
,
Ho
,
Wo
)(
std
::
thread
::
hardware_concurrency
());
return
0
;
}
...
...
@@ -181,9 +197,18 @@ struct ReferenceImageToColumn : public device::BaseOperator
const
index_t
Ho
=
arg
.
output_spatial_lengths_
[
1
];
const
index_t
Wo
=
arg
.
output_spatial_lengths_
[
2
];
auto
func
=
[
&
](
auto
n
,
auto
d_o
,
auto
ho
,
auto
wo
)
{
index_t
row
=
n
*
Do
*
Ho
*
Wo
+
d_o
*
Ho
*
Wo
+
ho
*
Wo
+
wo
;
auto
func
=
[
&
](
auto
g
,
auto
n
,
auto
d_o
,
auto
ho
,
auto
wo
)
{
index_t
row
=
0
;
index_t
column
=
0
;
if
constexpr
(
std
::
is_same_v
<
ImageLayout
,
GNDHWC
>
)
{
row
=
g
*
N
*
Do
*
Ho
*
Wo
+
n
*
Do
*
Ho
*
Wo
+
d_o
*
Ho
*
Wo
+
ho
*
Wo
+
wo
;
}
else
if
constexpr
(
std
::
is_same_v
<
ImageLayout
,
NDHWGC
>
)
{
row
=
n
*
Do
*
Ho
*
Wo
*
G
+
d_o
*
Ho
*
Wo
*
G
+
ho
*
Wo
*
G
+
wo
*
G
+
g
;
}
for
(
index_t
z
=
0
;
z
<
arg
.
filter_spatial_lengths_
[
0
];
++
z
)
{
...
...
@@ -213,7 +238,7 @@ struct ReferenceImageToColumn : public device::BaseOperator
ck
::
type_convert
<
std
::
size_t
>
(
wi
)
<
arg
.
input_
.
GetLengths
()[
5
])
{
InDataType
v_in
=
arg
.
input_
(
0
,
n
,
c
,
di
,
hi
,
wi
);
InDataType
v_in
=
arg
.
input_
(
g
,
n
,
c
,
di
,
hi
,
wi
);
arg
.
output_
(
row
,
column
)
=
ck
::
type_convert
<
OutDataType
>
(
v_in
);
}
...
...
@@ -224,7 +249,7 @@ struct ReferenceImageToColumn : public device::BaseOperator
}
};
make_ParallelTensorFunctor
(
func
,
N
,
Do
,
Ho
,
Wo
)(
make_ParallelTensorFunctor
(
func
,
G
,
N
,
Do
,
Ho
,
Wo
)(
std
::
thread
::
hardware_concurrency
());
return
0
;
...
...
library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp
View file @
68fda7c2
This diff is collapsed.
Click to expand it.
library/src/tensor_operation_instance/gpu/column_to_image/CMakeLists.txt
View file @
68fda7c2
add_instance_library
(
device_column_to_image_instance
device_column_to_image_nhwc_1d_instance.cpp
device_column_to_image_nhwc_2d_instance.cpp
device_column_to_image_nhwc_3d_instance.cpp
device_column_to_image_gnwc_1d_instance.cpp
device_column_to_image_gnhwc_2d_instance.cpp
device_column_to_image_gndhwc_3d_instance.cpp
device_column_to_image_nwgc_1d_instance.cpp
device_column_to_image_nhwgc_2d_instance.cpp
device_column_to_image_ndhwgc_3d_instance.cpp
)
library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_
n
hwc_3d_instance.cpp
→
library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_
gnd
hwc_3d_instance.cpp
View file @
68fda7c2
...
...
@@ -11,7 +11,7 @@ namespace instance {
using
namespace
ck
::
conv_tensor_rearrange_op
;
void
add_device_column_to_image_ndhwc_3d_bf16_instances
(
void
add_device_column_to_image_
g
ndhwc_3d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
GNDHWC
,
BF16
,
BF16
,
ColumnToImage
>>>&
instances
)
{
...
...
@@ -22,7 +22,7 @@ void add_device_column_to_image_ndhwc_3d_bf16_instances(
#endif
}
void
add_device_column_to_image_ndhwc_3d_f16_instances
(
void
add_device_column_to_image_
g
ndhwc_3d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
GNDHWC
,
F16
,
F16
,
ColumnToImage
>>>&
instances
)
{
...
...
@@ -33,7 +33,7 @@ void add_device_column_to_image_ndhwc_3d_f16_instances(
#endif
}
void
add_device_column_to_image_ndhwc_3d_f32_instances
(
void
add_device_column_to_image_
g
ndhwc_3d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
GNDHWC
,
F32
,
F32
,
ColumnToImage
>>>&
instances
)
{
...
...
@@ -44,7 +44,7 @@ void add_device_column_to_image_ndhwc_3d_f32_instances(
#endif
}
void
add_device_column_to_image_ndhwc_3d_i8_instances
(
void
add_device_column_to_image_
g
ndhwc_3d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
GNDHWC
,
int8_t
,
int8_t
,
ColumnToImage
>>>&
instances
)
...
...
library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_2d_instance.cpp
→
library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_
g
nhwc_2d_instance.cpp
View file @
68fda7c2
...
...
@@ -11,7 +11,7 @@ namespace instance {
using
namespace
ck
::
conv_tensor_rearrange_op
;
void
add_device_column_to_image_nhwc_2d_bf16_instances
(
void
add_device_column_to_image_
g
nhwc_2d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
GNHWC
,
BF16
,
BF16
,
ColumnToImage
>>>&
instances
)
{
...
...
@@ -22,7 +22,7 @@ void add_device_column_to_image_nhwc_2d_bf16_instances(
#endif
}
void
add_device_column_to_image_nhwc_2d_f16_instances
(
void
add_device_column_to_image_
g
nhwc_2d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
GNHWC
,
F16
,
F16
,
ColumnToImage
>>>&
instances
)
{
...
...
@@ -33,7 +33,7 @@ void add_device_column_to_image_nhwc_2d_f16_instances(
#endif
}
void
add_device_column_to_image_nhwc_2d_f32_instances
(
void
add_device_column_to_image_
g
nhwc_2d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
GNHWC
,
F32
,
F32
,
ColumnToImage
>>>&
instances
)
{
...
...
@@ -44,7 +44,7 @@ void add_device_column_to_image_nhwc_2d_f32_instances(
#endif
}
void
add_device_column_to_image_nhwc_2d_i8_instances
(
void
add_device_column_to_image_
g
nhwc_2d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
GNHWC
,
int8_t
,
int8_t
,
ColumnToImage
>>>&
instances
)
...
...
library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_n
h
wc_1d_instance.cpp
→
library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_
g
nwc_1d_instance.cpp
View file @
68fda7c2
...
...
@@ -11,7 +11,7 @@ namespace instance {
using
namespace
ck
::
conv_tensor_rearrange_op
;
void
add_device_column_to_image_nwc_1d_bf16_instances
(
void
add_device_column_to_image_
g
nwc_1d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
GNWC
,
BF16
,
BF16
,
ColumnToImage
>>>&
instances
)
{
...
...
@@ -22,7 +22,7 @@ void add_device_column_to_image_nwc_1d_bf16_instances(
#endif
}
void
add_device_column_to_image_nwc_1d_f16_instances
(
void
add_device_column_to_image_
g
nwc_1d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
GNWC
,
F16
,
F16
,
ColumnToImage
>>>&
instances
)
{
...
...
@@ -33,7 +33,7 @@ void add_device_column_to_image_nwc_1d_f16_instances(
#endif
}
void
add_device_column_to_image_nwc_1d_f32_instances
(
void
add_device_column_to_image_
g
nwc_1d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
GNWC
,
F32
,
F32
,
ColumnToImage
>>>&
instances
)
{
...
...
@@ -44,7 +44,7 @@ void add_device_column_to_image_nwc_1d_f32_instances(
#endif
}
void
add_device_column_to_image_nwc_1d_i8_instances
(
void
add_device_column_to_image_
g
nwc_1d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
GNWC
,
int8_t
,
int8_t
,
ColumnToImage
>>>&
instances
)
{
...
...
library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_ndhwgc_3d_instance.cpp
0 → 100644
View file @
68fda7c2
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
namespace
ck
::
conv_tensor_rearrange_op
;
void
add_device_column_to_image_ndhwgc_3d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
NDHWGC
,
BF16
,
BF16
,
ColumnToImage
>>>&
instances
)
{
#ifdef CK_ENABLE_BF16
add_device_operation_instances
(
instances
,
device_column_to_image_bf16_instances
<
3
,
NDHWGC
>
{});
#else
ignore
=
instances
;
#endif
}
void
add_device_column_to_image_ndhwgc_3d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
NDHWGC
,
F16
,
F16
,
ColumnToImage
>>>&
instances
)
{
#ifdef CK_ENABLE_FP16
add_device_operation_instances
(
instances
,
device_column_to_image_f16_instances
<
3
,
NDHWGC
>
{});
#else
ignore
=
instances
;
#endif
}
void
add_device_column_to_image_ndhwgc_3d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
NDHWGC
,
F32
,
F32
,
ColumnToImage
>>>&
instances
)
{
#ifdef CK_ENABLE_FP32
add_device_operation_instances
(
instances
,
device_column_to_image_f32_instances
<
3
,
NDHWGC
>
{});
#else
ignore
=
instances
;
#endif
}
void
add_device_column_to_image_ndhwgc_3d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
NDHWGC
,
int8_t
,
int8_t
,
ColumnToImage
>>>&
instances
)
{
#ifdef CK_ENABLE_INT8
add_device_operation_instances
(
instances
,
device_column_to_image_i8_instances
<
3
,
NDHWGC
>
{});
#else
ignore
=
instances
;
#endif
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwgc_2d_instance.cpp
0 → 100644
View file @
68fda7c2
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
namespace
ck
::
conv_tensor_rearrange_op
;
void
add_device_column_to_image_nhwgc_2d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
NHWGC
,
BF16
,
BF16
,
ColumnToImage
>>>&
instances
)
{
#ifdef CK_ENABLE_BF16
add_device_operation_instances
(
instances
,
device_column_to_image_bf16_instances
<
2
,
NHWGC
>
{});
#else
ignore
=
instances
;
#endif
}
void
add_device_column_to_image_nhwgc_2d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
NHWGC
,
F16
,
F16
,
ColumnToImage
>>>&
instances
)
{
#ifdef CK_ENABLE_FP16
add_device_operation_instances
(
instances
,
device_column_to_image_f16_instances
<
2
,
NHWGC
>
{});
#else
ignore
=
instances
;
#endif
}
void
add_device_column_to_image_nhwgc_2d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
NHWGC
,
F32
,
F32
,
ColumnToImage
>>>&
instances
)
{
#ifdef CK_ENABLE_FP32
add_device_operation_instances
(
instances
,
device_column_to_image_f32_instances
<
2
,
NHWGC
>
{});
#else
ignore
=
instances
;
#endif
}
void
add_device_column_to_image_nhwgc_2d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
NHWGC
,
int8_t
,
int8_t
,
ColumnToImage
>>>&
instances
)
{
#ifdef CK_ENABLE_INT8
add_device_operation_instances
(
instances
,
device_column_to_image_i8_instances
<
2
,
NHWGC
>
{});
#else
ignore
=
instances
;
#endif
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nwgc_1d_instance.cpp
0 → 100644
View file @
68fda7c2
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
namespace
ck
::
conv_tensor_rearrange_op
;
void
add_device_column_to_image_nwgc_1d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
NWGC
,
BF16
,
BF16
,
ColumnToImage
>>>&
instances
)
{
#ifdef CK_ENABLE_BF16
add_device_operation_instances
(
instances
,
device_column_to_image_bf16_instances
<
1
,
NWGC
>
{});
#else
ignore
=
instances
;
#endif
}
void
add_device_column_to_image_nwgc_1d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
NWGC
,
F16
,
F16
,
ColumnToImage
>>>&
instances
)
{
#ifdef CK_ENABLE_FP16
add_device_operation_instances
(
instances
,
device_column_to_image_f16_instances
<
1
,
NWGC
>
{});
#else
ignore
=
instances
;
#endif
}
void
add_device_column_to_image_nwgc_1d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
NWGC
,
F32
,
F32
,
ColumnToImage
>>>&
instances
)
{
#ifdef CK_ENABLE_FP32
add_device_operation_instances
(
instances
,
device_column_to_image_f32_instances
<
1
,
NWGC
>
{});
#else
ignore
=
instances
;
#endif
}
void
add_device_column_to_image_nwgc_1d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
NWGC
,
int8_t
,
int8_t
,
ColumnToImage
>>>&
instances
)
{
#ifdef CK_ENABLE_INT8
add_device_operation_instances
(
instances
,
device_column_to_image_i8_instances
<
1
,
NWGC
>
{});
#else
ignore
=
instances
;
#endif
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/image_to_column/CMakeLists.txt
View file @
68fda7c2
add_instance_library
(
device_image_to_column_instance
device_image_to_column_nhwc_1d_instance.cpp
device_image_to_column_nhwc_2d_instance.cpp
device_image_to_column_nhwc_3d_instance.cpp
device_image_to_column_gnwc_1d_instance.cpp
device_image_to_column_gnhwc_2d_instance.cpp
device_image_to_column_gndhwc_3d_instance.cpp
device_image_to_column_nwgc_1d_instance.cpp
device_image_to_column_nhwgc_2d_instance.cpp
device_image_to_column_ndhwgc_3d_instance.cpp
)
library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_
n
hwc_3d_instance.cpp
→
library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_
gnd
hwc_3d_instance.cpp
View file @
68fda7c2
...
...
@@ -11,7 +11,7 @@ namespace instance {
using
namespace
ck
::
conv_tensor_rearrange_op
;
void
add_device_image_to_column_ndhwc_3d_bf16_instances
(
void
add_device_image_to_column_
g
ndhwc_3d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
GNDHWC
,
BF16
,
BF16
,
ImageToColumn
>>>&
instances
)
{
...
...
@@ -22,7 +22,7 @@ void add_device_image_to_column_ndhwc_3d_bf16_instances(
#endif
}
void
add_device_image_to_column_ndhwc_3d_f16_instances
(
void
add_device_image_to_column_
g
ndhwc_3d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
GNDHWC
,
F16
,
F16
,
ImageToColumn
>>>&
instances
)
{
...
...
@@ -33,7 +33,7 @@ void add_device_image_to_column_ndhwc_3d_f16_instances(
#endif
}
void
add_device_image_to_column_ndhwc_3d_f32_instances
(
void
add_device_image_to_column_
g
ndhwc_3d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
GNDHWC
,
F32
,
F32
,
ImageToColumn
>>>&
instances
)
{
...
...
@@ -44,7 +44,7 @@ void add_device_image_to_column_ndhwc_3d_f32_instances(
#endif
}
void
add_device_image_to_column_ndhwc_3d_i8_instances
(
void
add_device_image_to_column_
g
ndhwc_3d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
GNDHWC
,
int8_t
,
int8_t
,
ImageToColumn
>>>&
instances
)
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment