Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
ad24acb6
"example/91_tile_program/fmha_fwd.cpp" did not exist on "7337ec2565d82399d6b9587e8fec128a39a4d4c5"
Commit
ad24acb6
authored
Sep 20, 2023
by
Bartlomiej Kocot
Browse files
Add column to image kernel
parent
bba085d2
Changes
36
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
2049 additions
and
696 deletions
+2049
-696
client_example/22_im2col_col2im/CMakeLists.txt
client_example/22_im2col_col2im/CMakeLists.txt
+5
-0
client_example/22_im2col_col2im/column_to_image.cpp
client_example/22_im2col_col2im/column_to_image.cpp
+173
-0
client_example/22_im2col_col2im/image_to_column.cpp
client_example/22_im2col_col2im/image_to_column.cpp
+16
-10
example/52_im2col_col2im/CMakeLists.txt
example/52_im2col_col2im/CMakeLists.txt
+4
-2
example/52_im2col_col2im/column_to_image_f32.cpp
example/52_im2col_col2im/column_to_image_f32.cpp
+165
-0
example/52_im2col_col2im/common.hpp
example/52_im2col_col2im/common.hpp
+3
-1
example/52_im2col_col2im/image_to_column_f32.cpp
example/52_im2col_col2im/image_to_column_f32.cpp
+15
-14
include/ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp
.../tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp
+33
-0
include/ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp
...sor_operation/gpu/device/device_conv_tensor_rearrange.hpp
+14
-11
include/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp
...operation/gpu/device/impl/device_column_to_image_impl.hpp
+622
-0
include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp
...operation/gpu/device/impl/device_image_to_column_impl.hpp
+65
-83
include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp
...k/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp
+53
-22
include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp
...eration/operator_transform/transform_conv_fwd_to_gemm.hpp
+6
-339
include/ck/utility/dynamic_buffer.hpp
include/ck/utility/dynamic_buffer.hpp
+30
-4
library/include/ck/library/reference_tensor_operation/cpu/reference_column_to_image.hpp
...erence_tensor_operation/cpu/reference_column_to_image.hpp
+363
-0
library/include/ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp
...erence_tensor_operation/cpu/reference_image_to_column.hpp
+7
-5
library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp
...y/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp
+282
-0
library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp
...conv_tensor_rearrange/device_column_to_image_instance.hpp
+122
-0
library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_image_to_column_instance.hpp
...conv_tensor_rearrange/device_image_to_column_instance.hpp
+71
-70
library/include/ck/library/tensor_operation_instance/gpu/image_to_column.hpp
...library/tensor_operation_instance/gpu/image_to_column.hpp
+0
-135
No files found.
client_example/2
0
_im
age_to
_col
umn
/CMakeLists.txt
→
client_example/2
2
_im
2col
_col
2im
/CMakeLists.txt
View file @
ad24acb6
add_executable
(
client_image_to_column image_to_column.cpp
)
add_executable
(
client_image_to_column image_to_column.cpp
)
target_link_libraries
(
client_image_to_column PRIVATE composable_kernel::device_operations
)
target_link_libraries
(
client_image_to_column PRIVATE composable_kernel::device_operations
)
add_executable
(
client_column_to_image column_to_image.cpp
)
target_link_libraries
(
client_column_to_image PRIVATE composable_kernel::device_operations
)
client_example/22_im2col_col2im/column_to_image.cpp
0 → 100644
View file @
ad24acb6
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <numeric>
#include <vector>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp"
#include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
using
InDataType
=
ck
::
half_t
;
using
OutDataType
=
ck
::
half_t
;
using
ImageLayout
=
ck
::
tensor_layout
::
convolution
::
GNHWC
;
static
constexpr
ck
::
index_t
NumDimSpatial
=
2
;
static
constexpr
ck
::
index_t
G
=
1
;
static
constexpr
ck
::
index_t
N
=
32
;
// batch size
static
constexpr
ck
::
index_t
C
=
32
;
// input channel (per group)
static
constexpr
ck
::
index_t
Y
=
3
;
// filter H
static
constexpr
ck
::
index_t
X
=
3
;
// filter W
static
constexpr
ck
::
index_t
Hi
=
28
;
// input H
static
constexpr
ck
::
index_t
Wi
=
28
;
// input W
static
constexpr
ck
::
index_t
Ho
=
28
;
// output H
static
constexpr
ck
::
index_t
Wo
=
28
;
// output W
struct
SimpleDeviceMem
{
SimpleDeviceMem
()
=
delete
;
SimpleDeviceMem
(
std
::
size_t
mem_size
)
:
p_mem_
{}
{
(
void
)
hipMalloc
(
static_cast
<
void
**>
(
&
p_mem_
),
mem_size
);
}
void
*
GetDeviceBuffer
()
{
return
p_mem_
;
}
~
SimpleDeviceMem
()
{
(
void
)
hipFree
(
p_mem_
);
}
void
*
p_mem_
;
};
int
main
()
{
std
::
array
<
ck
::
index_t
,
2
>
in_spatial_lengths
{
Hi
,
Wi
};
std
::
array
<
ck
::
index_t
,
2
>
wei_spatial_lengths
{
Y
,
X
};
std
::
array
<
ck
::
index_t
,
2
>
out_spatial_lengths
{
Ho
,
Wo
};
// We have NHWGC in memory space (G is dummy)
// However, CK's API only accept length and stride with order of GNCHW
// Hence, we need to adjust the order of stride
std
::
array
<
ck
::
index_t
,
5
>
image_strides
{
C
,
Hi
*
Wi
*
G
*
C
,
1
,
Wi
*
G
*
C
,
G
*
C
};
std
::
array
<
ck
::
index_t
,
2
>
gemm_strides
{
Y
*
X
*
C
,
1
};
std
::
array
<
ck
::
index_t
,
NumDimSpatial
>
filter_strides
{
1
,
1
};
std
::
array
<
ck
::
index_t
,
NumDimSpatial
>
filter_dilations
{
1
,
1
};
std
::
array
<
ck
::
index_t
,
NumDimSpatial
>
input_left_pads
{
1
,
1
};
std
::
array
<
ck
::
index_t
,
NumDimSpatial
>
input_right_pads
{
1
,
1
};
SimpleDeviceMem
in
(
sizeof
(
InDataType
)
*
N
*
Ho
*
Wo
*
Y
*
X
*
C
);
SimpleDeviceMem
out
(
sizeof
(
OutDataType
)
*
N
*
Hi
*
Wi
*
G
*
C
);
using
namespace
ck
::
conv_tensor_rearrange_op
;
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceConvTensorRearrange
<
NumDimSpatial
,
ImageLayout
,
InDataType
,
OutDataType
,
ColumnToImageOp
>
;
// get device op instances
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
DeviceOp
>::
GetInstances
();
std
::
cout
<<
"found "
<<
op_ptrs
.
size
()
<<
" instances"
<<
std
::
endl
;
std
::
string
best_op_name
;
int
best_op_id
=
-
1
;
float
best_avg_time
=
std
::
numeric_limits
<
float
>::
max
();
float
best_gb_per_sec
=
0
;
// profile device operation instances
std
::
cout
<<
"Run all instances and do timing"
<<
std
::
endl
;
for
(
int
i
=
0
;
i
<
op_ptrs
.
size
();
++
i
)
{
auto
&
op_ptr
=
op_ptrs
[
i
];
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
in
.
GetDeviceBuffer
(),
out
.
GetDeviceBuffer
(),
N
,
C
,
in_spatial_lengths
,
out_spatial_lengths
,
wei_spatial_lengths
,
image_strides
,
gemm_strides
,
filter_strides
,
filter_dilations
,
input_left_pads
,
input_right_pads
);
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
std
::
string
op_name
=
op_ptr
->
GetTypeString
();
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
float
avg_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
true
});
std
::
size_t
num_bytes
=
sizeof
(
InDataType
)
*
N
*
Hi
*
Wi
*
G
*
C
+
sizeof
(
OutDataType
)
*
N
*
Ho
*
Wo
*
Y
*
X
*
C
;
float
gb_per_sec
=
num_bytes
/
1.E6
/
avg_time
;
std
::
cout
<<
"Perf: "
<<
std
::
setw
(
10
)
<<
avg_time
<<
" ms, "
<<
gb_per_sec
<<
" GB/s, "
<<
op_name
<<
std
::
endl
;
if
(
avg_time
<
best_avg_time
)
{
best_op_id
=
i
;
best_op_name
=
op_name
;
best_avg_time
=
avg_time
;
best_gb_per_sec
=
gb_per_sec
;
}
}
else
{
std
::
cerr
<<
op_name
<<
" does not support this problem"
<<
std
::
endl
;
}
}
if
(
best_op_id
<
0
)
{
std
::
cerr
<<
"no suitable instance"
<<
std
::
endl
;
return
EXIT_FAILURE
;
}
std
::
cout
<<
"Best Perf: "
<<
std
::
setw
(
10
)
<<
best_avg_time
<<
" ms, "
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_op_name
<<
std
::
endl
;
// run the best intance
{
auto
&
op_ptr
=
op_ptrs
[
best_op_id
];
std
::
cout
<<
"Run the best instance without timing: "
<<
op_ptr
->
GetTypeString
()
<<
std
::
endl
;
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
in
.
GetDeviceBuffer
(),
out
.
GetDeviceBuffer
(),
N
,
C
,
in_spatial_lengths
,
out_spatial_lengths
,
wei_spatial_lengths
,
image_strides
,
gemm_strides
,
filter_strides
,
filter_dilations
,
input_left_pads
,
input_right_pads
);
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
false
});
}
std
::
cout
<<
"Done"
<<
std
::
endl
;
}
}
client_example/2
0
_im
age_to
_col
umn
/image_to_column.cpp
→
client_example/2
2
_im
2col
_col
2im
/image_to_column.cpp
View file @
ad24acb6
...
@@ -9,13 +9,14 @@
...
@@ -9,13 +9,14 @@
#include <vector>
#include <vector>
#include "ck/ck.hpp"
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/image_to_column.hpp"
#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp"
#include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
using
InDataType
=
ck
::
half_t
;
using
InDataType
=
ck
::
half_t
;
using
OutDataType
=
ck
::
half_t
;
using
OutDataType
=
ck
::
half_t
;
using
I
n
Layout
=
ck
::
tensor_layout
::
convolution
::
GNHWC
;
using
I
mage
Layout
=
ck
::
tensor_layout
::
convolution
::
GNHWC
;
static
constexpr
ck
::
index_t
NumDimSpatial
=
2
;
static
constexpr
ck
::
index_t
NumDimSpatial
=
2
;
static
constexpr
ck
::
index_t
G
=
1
;
static
constexpr
ck
::
index_t
G
=
1
;
...
@@ -54,8 +55,8 @@ int main()
...
@@ -54,8 +55,8 @@ int main()
// We have NHWGC in memory space (G is dummy)
// We have NHWGC in memory space (G is dummy)
// However, CK's API only accept length and stride with order of GNCHW
// However, CK's API only accept length and stride with order of GNCHW
// Hence, we need to adjust the order of stride
// Hence, we need to adjust the order of stride
std
::
array
<
ck
::
index_t
,
5
>
i
n
_strides
{
C
,
Hi
*
Wi
*
G
*
C
,
1
,
Wi
*
G
*
C
,
G
*
C
};
std
::
array
<
ck
::
index_t
,
5
>
i
mage
_strides
{
C
,
Hi
*
Wi
*
G
*
C
,
1
,
Wi
*
G
*
C
,
G
*
C
};
std
::
array
<
ck
::
index_t
,
2
>
out
_strides
{
Y
*
X
*
C
,
1
};
std
::
array
<
ck
::
index_t
,
2
>
gemm
_strides
{
Y
*
X
*
C
,
1
};
std
::
array
<
ck
::
index_t
,
NumDimSpatial
>
filter_strides
{
1
,
1
};
std
::
array
<
ck
::
index_t
,
NumDimSpatial
>
filter_strides
{
1
,
1
};
std
::
array
<
ck
::
index_t
,
NumDimSpatial
>
filter_dilations
{
1
,
1
};
std
::
array
<
ck
::
index_t
,
NumDimSpatial
>
filter_dilations
{
1
,
1
};
...
@@ -65,8 +66,13 @@ int main()
...
@@ -65,8 +66,13 @@ int main()
SimpleDeviceMem
in
(
sizeof
(
InDataType
)
*
N
*
Hi
*
Wi
*
G
*
C
);
SimpleDeviceMem
in
(
sizeof
(
InDataType
)
*
N
*
Hi
*
Wi
*
G
*
C
);
SimpleDeviceMem
out
(
sizeof
(
OutDataType
)
*
N
*
Ho
*
Wo
*
Y
*
X
*
C
);
SimpleDeviceMem
out
(
sizeof
(
OutDataType
)
*
N
*
Ho
*
Wo
*
Y
*
X
*
C
);
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
using
ImageToColumnOp
=
ck
::
conv_tensor_rearrange_op
::
ImageToColumn
;
DeviceImageToColumn
<
NumDimSpatial
,
InLayout
,
InDataType
,
OutDataType
>
;
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceConvTensorRearrange
<
NumDimSpatial
,
ImageLayout
,
InDataType
,
OutDataType
,
ImageToColumnOp
>
;
// get device op instances
// get device op instances
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
...
@@ -92,8 +98,8 @@ int main()
...
@@ -92,8 +98,8 @@ int main()
in_spatial_lengths
,
in_spatial_lengths
,
out_spatial_lengths
,
out_spatial_lengths
,
wei_spatial_lengths
,
wei_spatial_lengths
,
i
n
_strides
,
i
mage
_strides
,
out
_strides
,
gemm
_strides
,
filter_strides
,
filter_strides
,
filter_dilations
,
filter_dilations
,
input_left_pads
,
input_left_pads
,
...
@@ -148,8 +154,8 @@ int main()
...
@@ -148,8 +154,8 @@ int main()
in_spatial_lengths
,
in_spatial_lengths
,
out_spatial_lengths
,
out_spatial_lengths
,
wei_spatial_lengths
,
wei_spatial_lengths
,
i
n
_strides
,
i
mage
_strides
,
out
_strides
,
gemm
_strides
,
filter_strides
,
filter_strides
,
filter_dilations
,
filter_dilations
,
input_left_pads
,
input_left_pads
,
...
...
example/52_im
age_to
_col
umn
/CMakeLists.txt
→
example/52_im
2col
_col
2im
/CMakeLists.txt
View file @
ad24acb6
...
@@ -2,9 +2,11 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
...
@@ -2,9 +2,11 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
set
(
target 0
)
set
(
target 0
)
foreach
(
gpu IN LISTS GPU_TARGETS
)
foreach
(
gpu IN LISTS GPU_TARGETS
)
if
(
gpu IN_LIST gpu_list AND target EQUAL 0
)
if
(
gpu IN_LIST gpu_list AND target EQUAL 0
)
add_custom_target
(
example_im
age_to
_col
umn
)
add_custom_target
(
example_im
2col
_col
2im
)
add_example_executable
(
example_image_to_column_f32 image_to_column_f32.cpp
)
add_example_executable
(
example_image_to_column_f32 image_to_column_f32.cpp
)
add_dependencies
(
example_image_to_column example_image_to_column_f32
)
add_dependencies
(
example_im2col_col2im example_image_to_column_f32
)
add_example_executable
(
example_column_to_image_f32 column_to_image_f32.cpp
)
add_dependencies
(
example_im2col_col2im example_column_to_image_f32
)
set
(
target 1
)
set
(
target 1
)
endif
()
endif
()
endforeach
()
endforeach
()
example/52_im2col_col2im/column_to_image_f32.cpp
0 → 100644
View file @
ad24acb6
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
using
InDataType
=
FP32
;
// ck::bhalf_t;//FP32;
using
OutDataType
=
FP32
;
// ck::bhalf_t;//FP32;
using
ImLayout
=
ck
::
tensor_layout
::
convolution
::
GNHWC
;
using
ColumnToImageOp
=
ck
::
conv_tensor_rearrange_op
::
ColumnToImage
;
// clang-format off
using
DeviceColToImgInstance
=
ck
::
tensor_operation
::
device
::
DeviceColumnToImageImpl
//#####################| Num| ImLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar| ConvTensor|
//#####################| Dim| | | | Size| Block| Block| Cluster| Per| RearrangeOp|
//#####################| Spatial| | | | | | | Lengths| Vector| |
//#####################| | | | | | | | | | |
<
NDimSpatial
,
ImLayout
,
InDataType
,
OutDataType
,
256
,
128
,
128
,
S
<
16
,
16
>
,
1
,
ColumnToImageOp
>
;
// clang-format on
bool
RunColumnToImage
(
const
ExecutionConfig
&
config
,
const
ck
::
utils
::
conv
::
ConvParam
&
conv_params
)
{
const
auto
N
=
conv_params
.
N_
;
const
auto
C
=
conv_params
.
C_
;
const
ck
::
index_t
NDoHoWo
=
N
*
ck
::
accumulate_n
<
ck
::
index_t
>
(
conv_params
.
output_spatial_lengths_
.
begin
(),
NDimSpatial
,
1
,
std
::
multiplies
<>
());
const
ck
::
index_t
CZYX
=
C
*
ck
::
accumulate_n
<
ck
::
index_t
>
(
conv_params
.
filter_spatial_lengths_
.
begin
(),
NDimSpatial
,
1
,
std
::
multiplies
<>
());
const
auto
in_desc
=
HostTensorDescriptor
({
NDoHoWo
,
CZYX
});
const
auto
out_desc
=
ck
::
utils
::
conv
::
make_input_host_tensor_descriptor_g_n_c_wis_packed
<
ImLayout
>
(
conv_params
);
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_spatial_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
filter_spatial_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
output_spatial_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
image_g_n_c_wis_strides
{};
std
::
array
<
ck
::
index_t
,
2
>
gemm_m_k_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
conv_filter_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
conv_filter_dilations
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_left_pads
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_right_pads
{};
auto
copy
=
[](
const
auto
&
x
,
auto
&
y
)
{
std
::
copy
(
x
.
begin
(),
x
.
end
(),
y
.
begin
());
};
copy
(
conv_params
.
input_spatial_lengths_
,
input_spatial_lengths
);
copy
(
conv_params
.
filter_spatial_lengths_
,
filter_spatial_lengths
);
copy
(
conv_params
.
output_spatial_lengths_
,
output_spatial_lengths
);
copy
(
in_desc
.
GetStrides
(),
gemm_m_k_strides
);
copy
(
out_desc
.
GetStrides
(),
image_g_n_c_wis_strides
);
copy
(
conv_params
.
conv_filter_strides_
,
conv_filter_strides
);
copy
(
conv_params
.
conv_filter_dilations_
,
conv_filter_dilations
);
copy
(
conv_params
.
input_left_pads_
,
input_left_pads
);
copy
(
conv_params
.
input_right_pads_
,
input_right_pads
);
Tensor
<
InDataType
>
in
(
in_desc
);
Tensor
<
OutDataType
>
out_device
(
out_desc
);
Tensor
<
OutDataType
>
out_host
(
out_desc
);
std
::
cout
<<
"in: "
<<
in
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"out: "
<<
out_device
.
mDesc
<<
std
::
endl
;
switch
(
config
.
init_method
)
{
case
0
:
break
;
case
1
:
in
.
GenerateTensorValue
(
GeneratorTensor_2
<
InDataType
>
{
1
,
2
});
break
;
default:
in
.
GenerateTensorValue
(
GeneratorTensor_3
<
InDataType
>
{
-
0.5
,
0.5
});
}
DeviceMem
in_device_buf
(
sizeof
(
InDataType
)
*
in
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
out_device_buf
(
sizeof
(
OutDataType
)
*
out_device
.
mDesc
.
GetElementSpaceSize
());
in_device_buf
.
ToDevice
(
in
.
mData
.
data
());
// reset input to zero
out_device_buf
.
SetZero
();
static_assert
(
std
::
is_default_constructible_v
<
DeviceColToImgInstance
>
);
// do conv
auto
col2img
=
DeviceColToImgInstance
{};
auto
invoker
=
col2img
.
MakeInvoker
();
auto
argument
=
col2img
.
MakeArgument
(
in_device_buf
.
GetDeviceBuffer
(),
out_device_buf
.
GetDeviceBuffer
(),
N
,
C
,
input_spatial_lengths
,
filter_spatial_lengths
,
output_spatial_lengths
,
image_g_n_c_wis_strides
,
gemm_m_k_strides
,
conv_filter_strides
,
conv_filter_dilations
,
input_left_pads
,
input_right_pads
);
if
(
!
col2img
.
IsSupportedArgument
(
argument
))
{
std
::
cerr
<<
"wrong! device_col2img with the specified compilation parameters does "
"not support this col2img problem"
<<
std
::
endl
;
return
false
;
}
float
ave_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
config
.
time_kernel
});
std
::
size_t
num_btype
=
NDoHoWo
*
CZYX
*
(
sizeof
(
OutDataType
)
+
sizeof
(
InDataType
));
float
gb_per_sec
=
num_btype
/
1.E6
/
ave_time
;
std
::
cout
<<
"Perf: "
<<
ave_time
<<
" ms, "
<<
gb_per_sec
<<
" GB/s"
<<
std
::
endl
;
if
(
config
.
do_verification
)
{
auto
ref_column_to_image
=
ck
::
tensor_operation
::
host
::
ReferenceColumnToImage
<
NDimSpatial
,
ImLayout
,
InDataType
,
OutDataType
>
();
auto
ref_invoker
=
ref_column_to_image
.
MakeInvoker
();
auto
ref_argument
=
ref_column_to_image
.
MakeArgument
(
in
,
out_host
,
conv_params
.
filter_spatial_lengths_
,
conv_params
.
conv_filter_strides_
,
conv_params
.
conv_filter_dilations_
,
conv_params
.
input_left_pads_
,
conv_params
.
input_right_pads_
);
if
(
!
ref_column_to_image
.
IsSupportedArgument
(
&
ref_argument
))
{
std
::
cerr
<<
"wrong! ref_col2img with the specified compilation parameters does "
"not support this col2img problem"
<<
std
::
endl
;
return
false
;
}
ref_invoker
.
Run
(
ref_argument
);
out_device_buf
.
FromDevice
(
out_device
.
mData
.
data
());
return
ck
::
utils
::
check_err
(
out_device
.
mData
,
out_host
.
mData
);
}
return
true
;
}
int
RunColumnToImageExample
(
int
argc
,
char
*
argv
[])
{
ExecutionConfig
config
;
ck
::
utils
::
conv
::
ConvParam
conv_params
=
DefaultConvParams
;
if
(
!
parse_cmd_args
(
argc
,
argv
,
config
,
conv_params
))
{
return
EXIT_FAILURE
;
}
if
(
conv_params
.
num_dim_spatial_
!=
NDimSpatial
)
{
std
::
cerr
<<
"unsupported # of spatial dimensions"
<<
std
::
endl
;
return
EXIT_FAILURE
;
}
return
!
RunColumnToImage
(
config
,
conv_params
);
}
int
main
(
int
argc
,
char
*
argv
[])
{
return
RunColumnToImageExample
(
argc
,
argv
);
}
example/52_im
age_to
_col
umn
/common.hpp
→
example/52_im
2col
_col
2im
/common.hpp
View file @
ad24acb6
...
@@ -10,6 +10,7 @@
...
@@ -10,6 +10,7 @@
#include "ck/ck.hpp"
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/algorithm.hpp"
...
@@ -20,6 +21,7 @@
...
@@ -20,6 +21,7 @@
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_column_to_image.hpp"
template
<
ck
::
index_t
...
Is
>
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
S
=
ck
::
Sequence
<
Is
...
>
;
...
@@ -32,7 +34,7 @@ struct ExecutionConfig final
...
@@ -32,7 +34,7 @@ struct ExecutionConfig final
{
{
bool
do_verification
=
true
;
bool
do_verification
=
true
;
int
init_method
=
1
;
int
init_method
=
1
;
bool
time_kernel
=
tru
e
;
bool
time_kernel
=
fals
e
;
};
};
#define DefaultConvParams \
#define DefaultConvParams \
...
...
example/52_im
age_to
_col
umn
/image_to_column_f32.cpp
→
example/52_im
2col
_col
2im
/image_to_column_f32.cpp
View file @
ad24acb6
...
@@ -6,15 +6,16 @@
...
@@ -6,15 +6,16 @@
using
InDataType
=
FP32
;
using
InDataType
=
FP32
;
using
OutDataType
=
FP32
;
using
OutDataType
=
FP32
;
using
InLayout
=
ck
::
tensor_layout
::
convolution
::
GNHWC
;
using
ImLayout
=
ck
::
tensor_layout
::
convolution
::
GNHWC
;
using
ImageToColumnOp
=
ck
::
conv_tensor_rearrange_op
::
ImageToColumn
;
// clang-format off
// clang-format off
using
DeviceImgToColInstance
=
ck
::
tensor_operation
::
device
::
DeviceImageToColumnImpl
using
DeviceImgToColInstance
=
ck
::
tensor_operation
::
device
::
DeviceImageToColumnImpl
//#####################| Num| I
n
Layout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar|
//#####################| Num| I
m
Layout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar|
ConvTensor|
//#####################| Dim| | | | Size| Block| Block| Cluster| Per|
//#####################| Dim| | | | Size| Block| Block| Cluster| Per|
RearrangeOp|
//#####################| Spatial| | | | | | | Lengths| Vector|
//#####################| Spatial| | | | | | | Lengths| Vector|
|
//#####################| | | | | | | | | |
//#####################| | | | | | | | | |
|
<
NDimSpatial
,
I
n
Layout
,
InDataType
,
OutDataType
,
256
,
128
,
128
,
S
<
16
,
16
>
,
1
>
;
<
NDimSpatial
,
I
m
Layout
,
InDataType
,
OutDataType
,
256
,
128
,
128
,
S
<
16
,
16
>
,
1
,
ImageToColumnOp
>
;
// clang-format on
// clang-format on
bool
RunImageToColumn
(
const
ExecutionConfig
&
config
,
const
ck
::
utils
::
conv
::
ConvParam
&
conv_params
)
bool
RunImageToColumn
(
const
ExecutionConfig
&
config
,
const
ck
::
utils
::
conv
::
ConvParam
&
conv_params
)
...
@@ -31,14 +32,14 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv
...
@@ -31,14 +32,14 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv
conv_params
.
filter_spatial_lengths_
.
begin
(),
NDimSpatial
,
1
,
std
::
multiplies
<>
());
conv_params
.
filter_spatial_lengths_
.
begin
(),
NDimSpatial
,
1
,
std
::
multiplies
<>
());
const
auto
in_desc
=
const
auto
in_desc
=
ck
::
utils
::
conv
::
make_input_host_tensor_descriptor_g_n_c_wis_packed
<
I
n
Layout
>
(
conv_params
);
ck
::
utils
::
conv
::
make_input_host_tensor_descriptor_g_n_c_wis_packed
<
I
m
Layout
>
(
conv_params
);
const
auto
out_desc
=
HostTensorDescriptor
({
NDoHoWo
,
CZYX
});
const
auto
out_desc
=
HostTensorDescriptor
({
NDoHoWo
,
CZYX
});
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_spatial_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_spatial_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
filter_spatial_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
filter_spatial_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
output_spatial_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
output_spatial_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
i
nput
_g_n_c_wis_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
i
mage
_g_n_c_wis_strides
{};
std
::
array
<
ck
::
index_t
,
2
>
output
_m_k_strides
{};
std
::
array
<
ck
::
index_t
,
2
>
gemm
_m_k_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
conv_filter_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
conv_filter_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
conv_filter_dilations
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
conv_filter_dilations
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_left_pads
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_left_pads
{};
...
@@ -49,8 +50,8 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv
...
@@ -49,8 +50,8 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv
copy
(
conv_params
.
input_spatial_lengths_
,
input_spatial_lengths
);
copy
(
conv_params
.
input_spatial_lengths_
,
input_spatial_lengths
);
copy
(
conv_params
.
filter_spatial_lengths_
,
filter_spatial_lengths
);
copy
(
conv_params
.
filter_spatial_lengths_
,
filter_spatial_lengths
);
copy
(
conv_params
.
output_spatial_lengths_
,
output_spatial_lengths
);
copy
(
conv_params
.
output_spatial_lengths_
,
output_spatial_lengths
);
copy
(
in_desc
.
GetStrides
(),
i
nput
_g_n_c_wis_strides
);
copy
(
in_desc
.
GetStrides
(),
i
mage
_g_n_c_wis_strides
);
copy
(
out_desc
.
GetStrides
(),
output
_m_k_strides
);
copy
(
out_desc
.
GetStrides
(),
gemm
_m_k_strides
);
copy
(
conv_params
.
conv_filter_strides_
,
conv_filter_strides
);
copy
(
conv_params
.
conv_filter_strides_
,
conv_filter_strides
);
copy
(
conv_params
.
conv_filter_dilations_
,
conv_filter_dilations
);
copy
(
conv_params
.
conv_filter_dilations_
,
conv_filter_dilations
);
copy
(
conv_params
.
input_left_pads_
,
input_left_pads
);
copy
(
conv_params
.
input_left_pads_
,
input_left_pads
);
...
@@ -90,8 +91,8 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv
...
@@ -90,8 +91,8 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv
input_spatial_lengths
,
input_spatial_lengths
,
filter_spatial_lengths
,
filter_spatial_lengths
,
output_spatial_lengths
,
output_spatial_lengths
,
i
nput
_g_n_c_wis_strides
,
i
mage
_g_n_c_wis_strides
,
output
_m_k_strides
,
gemm
_m_k_strides
,
conv_filter_strides
,
conv_filter_strides
,
conv_filter_dilations
,
conv_filter_dilations
,
input_left_pads
,
input_left_pads
,
...
@@ -114,7 +115,7 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv
...
@@ -114,7 +115,7 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv
if
(
config
.
do_verification
)
if
(
config
.
do_verification
)
{
{
auto
ref_image_to_column
=
ck
::
tensor_operation
::
host
::
auto
ref_image_to_column
=
ck
::
tensor_operation
::
host
::
ReferenceImageToColumn
<
NDimSpatial
,
I
n
Layout
,
InDataType
,
OutDataType
>
();
ReferenceImageToColumn
<
NDimSpatial
,
I
m
Layout
,
InDataType
,
OutDataType
>
();
auto
ref_invoker
=
ref_image_to_column
.
MakeInvoker
();
auto
ref_invoker
=
ref_image_to_column
.
MakeInvoker
();
...
...
include/ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp
0 → 100644
View file @
ad24acb6
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
namespace
ck
{
namespace
conv_tensor_rearrange_op
{
struct
BaseConvTensorRearrangeOp
{
};
struct
ImageToColumn
:
public
BaseConvTensorRearrangeOp
{
static
constexpr
const
char
*
name
=
"Image to Column"
;
};
struct
ColumnToImage
:
public
BaseConvTensorRearrangeOp
{
static
constexpr
const
char
*
name
=
"Column to Image"
;
};
template
<
typename
Op
,
typename
std
::
enable_if
<
std
::
is_base_of
<
BaseConvTensorRearrangeOp
,
Op
>
::
value
,
bool
>::
type
=
false
>
std
::
ostream
&
operator
<<
(
std
::
ostream
&
os
,
const
BaseConvTensorRearrangeOp
&
)
{
os
<<
Op
::
name
;
return
os
;
}
}
// namespace conv_tensor_rearrange_op
}
// namespace ck
include/ck/tensor_operation/gpu/device/device_
image_to_column
.hpp
→
include/ck/tensor_operation/gpu/device/device_
conv_tensor_rearrange
.hpp
View file @
ad24acb6
...
@@ -12,21 +12,24 @@ namespace tensor_operation {
...
@@ -12,21 +12,24 @@ namespace tensor_operation {
namespace
device
{
namespace
device
{
/**
/**
* \brief
Image to column
.
* \brief
Convolution Tensor Rearrange
.
*
*
* This Device operator converts image ([G, N, Di, Hi, Wi, C]) to the gemm
* This Device operator supports conversion image ([G, N, Di, Hi, Wi, C]) to
* problem([N * Do * Ho * Wo, Z * Y * X * C]). G must be equal to 1.
* the gemm problem([N * Do * Ho * Wo, Z * Y * X * C]) (Image to Column) and
* conversion gemm form to the image (Column to Image). G must be equal to 1.
*
*
* \tparam NDimSpatial Number of spatial dimensions.
* \tparam NDimSpatial Number of spatial dimensions.
* \tparam I
nput
Layout Input Layout.
* \tparam I
mage
Layout Input Layout.
* \tparam InputDataType Input Data Type.
* \tparam InputDataType Input Data Type.
* \tparam OutputDataType Output Data Type.
* \tparam OutputDataType Output Data Type.
* \tparam ConvTensorRearrangeOp Operation type: ImageToColumn, ColumnToImage.
*/
*/
template
<
index_t
NDimSpatial
,
template
<
index_t
NDimSpatial
,
typename
I
nput
Layout
,
typename
I
mage
Layout
,
typename
InputDataType
,
typename
InputDataType
,
typename
OutputDataType
>
typename
OutputDataType
,
struct
DeviceImageToColumn
:
public
BaseOperator
typename
ConvTensorRearrangeOp
>
struct
DeviceConvTensorRearrange
:
public
BaseOperator
{
{
/**
/**
...
@@ -39,8 +42,8 @@ struct DeviceImageToColumn : public BaseOperator
...
@@ -39,8 +42,8 @@ struct DeviceImageToColumn : public BaseOperator
* \param input_spatial_lengths Input spatial lengths.
* \param input_spatial_lengths Input spatial lengths.
* \param filter_spatial_lengths Filter spatial lengths.
* \param filter_spatial_lengths Filter spatial lengths.
* \param output_spatial_lengths Output spatial lengths.
* \param output_spatial_lengths Output spatial lengths.
* \param i
nput
_g_n_c_wis_strides I
nput
strides in order [G, N, C, D, H, W].
* \param i
mage
_g_n_c_wis_strides I
mage
strides in order [G, N, C, D, H, W].
* \param
output
_m_k_strides
Output
strides.
* \param
gemm
_m_k_strides
Gemm form
strides.
* \param conv_filter_strides Convolution filter strides.
* \param conv_filter_strides Convolution filter strides.
* \param conv_filter_dilations Convolution filter dilations.
* \param conv_filter_dilations Convolution filter dilations.
* \param input_left_pads Convolution left pads.
* \param input_left_pads Convolution left pads.
...
@@ -55,8 +58,8 @@ struct DeviceImageToColumn : public BaseOperator
...
@@ -55,8 +58,8 @@ struct DeviceImageToColumn : public BaseOperator
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_spatial_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_spatial_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
filter_spatial_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
filter_spatial_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
output_spatial_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
output_spatial_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
i
nput
_g_n_c_wis_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
i
mage
_g_n_c_wis_strides
,
const
std
::
array
<
index_t
,
2
>&
output
_m_k_strides
,
const
std
::
array
<
index_t
,
2
>&
gemm
_m_k_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_dilations
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_dilations
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_left_pads
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_left_pads
,
...
...
include/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp
0 → 100644
View file @
ad24acb6
This diff is collapsed.
Click to expand it.
include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp
View file @
ad24acb6
...
@@ -5,64 +5,40 @@
...
@@ -5,64 +5,40 @@
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/device_
image_to_column
.hpp"
#include "ck/tensor_operation/gpu/device/device_
conv_tensor_rearrange
.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_
image_to_column
.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_
tensor_rearrange
.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
#include "ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp"
#include "ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp"
#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp"
#include "ck/host_utility/io.hpp"
#include "ck/host_utility/io.hpp"
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
namespace
device
{
namespace
device
{
template
<
typename
InputGridDesc
,
typename
InputDataType
,
typename
OutputGridDesc
,
typename
OutputDataType
,
typename
Block2ETileMap
,
typename
GridwiseImageToColumnKernel
>
__global__
void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__
(
CK_MAX_THREAD_PER_BLOCK
,
CK_MIN_BLOCK_PER_CU
)
#endif
kernel_image_to_column
(
const
InputGridDesc
in_grid_desc
,
const
InputDataType
*
__restrict__
p_in_global
,
const
OutputGridDesc
out_grid_desc
,
OutputDataType
*
__restrict__
p_out_global
,
const
Block2ETileMap
block_2_tile_map
)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \
defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx1030__) || defined(__gfx1100__) || \
defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx941__) || defined(__gfx942__))
GridwiseImageToColumnKernel
::
Run
(
in_grid_desc
,
p_in_global
,
out_grid_desc
,
p_out_global
,
block_2_tile_map
);
#else
ignore
=
in_grid_desc
;
ignore
=
p_in_global
;
ignore
=
out_grid_desc
;
ignore
=
p_out_global
;
ignore
=
block_2_tile_map
;
#endif
}
// Image to column for input layout NDHWC:
// Image to column for input layout NDHWC:
// input : input image [N, Di, Hi, Wi, C]
,
// input : input image [N, Di, Hi, Wi, C]
// output :
output image
[N * Do * Ho * Wo, Z * Y * X * C]
// output :
gemm form
[N * Do * Ho * Wo, Z * Y * X * C]
template
<
index_t
NDimSpatial
,
template
<
index_t
NDimSpatial
,
typename
I
nput
Layout
,
typename
I
mage
Layout
,
typename
InputDataType
,
typename
InputDataType
,
typename
OutputDataType
,
typename
OutputDataType
,
index_t
BlockSize
,
index_t
BlockSize
,
index_t
MPerBlock
,
index_t
MPerBlock
,
index_t
KPerBlock
,
index_t
KPerBlock
,
typename
ThreadClusterLengths
,
typename
ThreadClusterLengths
,
index_t
ScalarPerVector
>
index_t
ScalarPerVector
,
struct
DeviceImageToColumnImpl
typename
ConvTensorRearrangeOp
>
:
public
DeviceImageToColumn
<
NDimSpatial
,
InputLayout
,
InputDataType
,
OutputDataType
>
struct
DeviceImageToColumnImpl
:
public
DeviceConvTensorRearrange
<
NDimSpatial
,
ImageLayout
,
InputDataType
,
OutputDataType
,
ConvTensorRearrangeOp
>
{
{
static
constexpr
auto
I0
=
Number
<
0
>
{};
static
constexpr
auto
I0
=
Number
<
0
>
{};
...
@@ -83,7 +59,7 @@ struct DeviceImageToColumnImpl
...
@@ -83,7 +59,7 @@ struct DeviceImageToColumnImpl
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_spatial_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_spatial_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
filter_spatial_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
filter_spatial_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
output_spatial_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
output_spatial_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
i
nput
_g_n_c_wis_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
i
mage
_g_n_c_wis_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_dilations
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_dilations
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_left_pads
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_left_pads
,
...
@@ -110,9 +86,9 @@ struct DeviceImageToColumnImpl
...
@@ -110,9 +86,9 @@ struct DeviceImageToColumnImpl
c_g_n_k_wos_lengths
[
I1
]
=
N
;
c_g_n_k_wos_lengths
[
I1
]
=
N
;
const
auto
in_gemmmraw_gemmkraw_desc
=
const
auto
in_gemmmraw_gemmkraw_desc
=
conv_to_gemm_transformer
.
template
MakeADescriptor_M_K
<
I
nput
Layout
>(
conv_to_gemm_transformer
.
template
MakeADescriptor_M_K
<
I
mage
Layout
>(
a_g_n_c_wis_lengths
,
a_g_n_c_wis_lengths
,
i
nput
_g_n_c_wis_strides
,
i
mage
_g_n_c_wis_strides
,
b_g_k_c_xs_lengths
,
b_g_k_c_xs_lengths
,
{},
// not needed for A Descriptor
{},
// not needed for A Descriptor
c_g_n_k_wos_lengths
,
c_g_n_k_wos_lengths
,
...
@@ -132,7 +108,7 @@ struct DeviceImageToColumnImpl
...
@@ -132,7 +108,7 @@ struct DeviceImageToColumnImpl
const
ck
::
index_t
C
,
const
ck
::
index_t
C
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
filter_spatial_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
filter_spatial_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
output_spatial_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
output_spatial_lengths
,
const
std
::
array
<
index_t
,
2
>&
output
_m_k_strides
)
const
std
::
array
<
index_t
,
2
>&
gemm
_m_k_strides
)
{
{
const
index_t
NDoHoWo
=
const
index_t
NDoHoWo
=
N
*
ck
::
accumulate_n
<
index_t
>
(
N
*
ck
::
accumulate_n
<
index_t
>
(
...
@@ -141,7 +117,7 @@ struct DeviceImageToColumnImpl
...
@@ -141,7 +117,7 @@ struct DeviceImageToColumnImpl
C
*
ck
::
accumulate_n
<
index_t
>
(
C
*
ck
::
accumulate_n
<
index_t
>
(
filter_spatial_lengths
.
begin
(),
NDimSpatial
,
1
,
std
::
multiplies
<>
());
filter_spatial_lengths
.
begin
(),
NDimSpatial
,
1
,
std
::
multiplies
<>
());
const
auto
desc_mraw_kraw
=
make_naive_tensor_descriptor
(
const
auto
desc_mraw_kraw
=
make_naive_tensor_descriptor
(
make_tuple
(
NDoHoWo
,
CZYX
),
make_tuple
(
output
_m_k_strides
[
I0
],
output
_m_k_strides
[
I1
]));
make_tuple
(
NDoHoWo
,
CZYX
),
make_tuple
(
gemm
_m_k_strides
[
I0
],
gemm
_m_k_strides
[
I1
]));
const
auto
desc_m_k
=
matrix_padder
.
PadADescriptor_M_K
(
desc_mraw_kraw
);
const
auto
desc_m_k
=
matrix_padder
.
PadADescriptor_M_K
(
desc_mraw_kraw
);
return
desc_m_k
;
return
desc_m_k
;
...
@@ -155,7 +131,7 @@ struct DeviceImageToColumnImpl
...
@@ -155,7 +131,7 @@ struct DeviceImageToColumnImpl
decltype
(
BlockToCTileMap_M00_N0_M01Adapt
<
MPerBlock
,
KPerBlock
,
OutputGridDesc
>
(
decltype
(
BlockToCTileMap_M00_N0_M01Adapt
<
MPerBlock
,
KPerBlock
,
OutputGridDesc
>
(
OutputGridDesc
{}))
>
;
OutputGridDesc
{}))
>
;
using
Gridwise
ImageToColumn
Kernel
=
Gridwise
ImageToColumn
<
InputGridDesc
,
using
Gridwise
TensorRearrange
Kernel
=
Gridwise
TensorRearrange
<
InputGridDesc
,
InputDataType
,
InputDataType
,
OutputGridDesc
,
OutputGridDesc
,
OutputDataType
,
OutputDataType
,
...
@@ -164,19 +140,20 @@ struct DeviceImageToColumnImpl
...
@@ -164,19 +140,20 @@ struct DeviceImageToColumnImpl
KPerBlock
,
KPerBlock
,
ThreadClusterLengths
,
ThreadClusterLengths
,
ScalarPerVector
,
ScalarPerVector
,
InMemoryDataOperationEnum
::
Set
,
Block2ETileMap
>
;
Block2ETileMap
>
;
struct
Argument
:
public
BaseArgument
struct
Argument
:
public
BaseArgument
{
{
Argument
(
const
void
*
p_in
,
// input image
Argument
(
const
void
*
p_in
,
// input image
void
*
p_out
,
//
output image
void
*
p_out
,
//
gemm form
const
ck
::
index_t
N
,
const
ck
::
index_t
N
,
const
ck
::
index_t
C
,
const
ck
::
index_t
C
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_spatial_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_spatial_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
filter_spatial_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
filter_spatial_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
output_spatial_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
output_spatial_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
i
nput
_g_n_c_wis_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
i
mage
_g_n_c_wis_strides
,
const
std
::
array
<
index_t
,
2
>&
output
_m_k_strides
,
const
std
::
array
<
index_t
,
2
>&
gemm
_m_k_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_dilations
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_dilations
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_left_pads
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_left_pads
,
...
@@ -185,7 +162,7 @@ struct DeviceImageToColumnImpl
...
@@ -185,7 +162,7 @@ struct DeviceImageToColumnImpl
X_
(
filter_spatial_lengths
[
NDimSpatial
-
I1
]),
X_
(
filter_spatial_lengths
[
NDimSpatial
-
I1
]),
p_in_
{
static_cast
<
const
InputDataType
*>
(
p_in
)},
p_in_
{
static_cast
<
const
InputDataType
*>
(
p_in
)},
p_out_
{
static_cast
<
OutputDataType
*>
(
p_out
)},
p_out_
{
static_cast
<
OutputDataType
*>
(
p_out
)},
i
nput
_g_n_c_wis_strides_
{
i
nput
_g_n_c_wis_strides
},
i
mage
_g_n_c_wis_strides_
{
i
mage
_g_n_c_wis_strides
},
conv_filter_strides_
{
conv_filter_strides
},
conv_filter_strides_
{
conv_filter_strides
},
conv_filter_dilations_
{
conv_filter_dilations
},
conv_filter_dilations_
{
conv_filter_dilations
},
input_left_pads_
{
input_left_pads
},
input_left_pads_
{
input_left_pads
},
...
@@ -197,7 +174,7 @@ struct DeviceImageToColumnImpl
...
@@ -197,7 +174,7 @@ struct DeviceImageToColumnImpl
input_spatial_lengths
,
input_spatial_lengths
,
filter_spatial_lengths
,
filter_spatial_lengths
,
output_spatial_lengths
,
output_spatial_lengths
,
i
nput
_g_n_c_wis_strides
,
i
mage
_g_n_c_wis_strides
,
conv_filter_strides
,
conv_filter_strides
,
conv_filter_dilations
,
conv_filter_dilations
,
...
@@ -205,7 +182,7 @@ struct DeviceImageToColumnImpl
...
@@ -205,7 +182,7 @@ struct DeviceImageToColumnImpl
input_right_pads
);
input_right_pads
);
out_grid_desc_m_k_
=
MakeOutDescriptor_M_K
(
out_grid_desc_m_k_
=
MakeOutDescriptor_M_K
(
N
,
C
,
filter_spatial_lengths
,
output_spatial_lengths
,
output
_m_k_strides
);
N
,
C
,
filter_spatial_lengths
,
output_spatial_lengths
,
gemm
_m_k_strides
);
}
}
void
Print
()
const
void
Print
()
const
...
@@ -220,7 +197,7 @@ struct DeviceImageToColumnImpl
...
@@ -220,7 +197,7 @@ struct DeviceImageToColumnImpl
const
InputDataType
*
p_in_
;
const
InputDataType
*
p_in_
;
OutputDataType
*
p_out_
;
OutputDataType
*
p_out_
;
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
i
nput
_g_n_c_wis_strides_
;
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
i
mage
_g_n_c_wis_strides_
;
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_strides_
;
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_strides_
;
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_dilations_
;
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_dilations_
;
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_left_pads_
;
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_left_pads_
;
...
@@ -243,12 +220,12 @@ struct DeviceImageToColumnImpl
...
@@ -243,12 +220,12 @@ struct DeviceImageToColumnImpl
BlockToCTileMap_M00_N0_M01Adapt
<
MPerBlock
,
KPerBlock
,
OutputGridDesc
>
(
BlockToCTileMap_M00_N0_M01Adapt
<
MPerBlock
,
KPerBlock
,
OutputGridDesc
>
(
arg
.
out_grid_desc_m_k_
);
arg
.
out_grid_desc_m_k_
);
const
index_t
grid_size
=
block_2_tile_map
.
CalculateGridSize
(
arg
.
out_grid_desc_m_k_
);
const
index_t
grid_size
=
block_2_tile_map
.
CalculateGridSize
(
arg
.
out_grid_desc_m_k_
);
const
auto
kernel
=
kernel_
image_to_column
<
InputGridDesc
,
const
auto
kernel
=
kernel_
tensor_rearrange
<
InputGridDesc
,
InputDataType
,
InputDataType
,
OutputGridDesc
,
OutputGridDesc
,
OutputDataType
,
OutputDataType
,
Block2ETileMap
,
Block2ETileMap
,
Gridwise
ImageToColumn
Kernel
>
;
Gridwise
TensorRearrange
Kernel
>
;
float
elapsed_time
=
launch_and_time_kernel
(
stream_config
,
float
elapsed_time
=
launch_and_time_kernel
(
stream_config
,
kernel
,
kernel
,
...
@@ -272,13 +249,18 @@ struct DeviceImageToColumnImpl
...
@@ -272,13 +249,18 @@ struct DeviceImageToColumnImpl
bool
IsSupportedArgument
(
const
Argument
&
arg
)
bool
IsSupportedArgument
(
const
Argument
&
arg
)
{
{
using
namespace
conv_tensor_rearrange_op
;
using
namespace
tensor_layout
::
convolution
;
using
namespace
tensor_layout
::
convolution
;
if
(
!
(
std
::
is_same_v
<
InputLayout
,
GNWC
>
||
std
::
is_same_v
<
InputLayout
,
GNHWC
>
||
if
constexpr
(
!
std
::
is_same_v
<
ConvTensorRearrangeOp
,
ImageToColumn
>
)
std
::
is_same_v
<
InputLayout
,
GNDHWC
>
))
{
return
false
;
}
if
constexpr
(
!
(
std
::
is_same_v
<
ImageLayout
,
GNWC
>
||
std
::
is_same_v
<
ImageLayout
,
GNHWC
>
||
std
::
is_same_v
<
ImageLayout
,
GNDHWC
>
))
{
{
return
false
;
return
false
;
}
}
if
(
!
(
NDimSpatial
>=
1
&&
NDimSpatial
<=
3
))
if
constexpr
(
!
(
NDimSpatial
>=
1
&&
NDimSpatial
<=
3
))
{
{
return
false
;
return
false
;
}
}
...
@@ -287,8 +269,8 @@ struct DeviceImageToColumnImpl
...
@@ -287,8 +269,8 @@ struct DeviceImageToColumnImpl
const
auto
w_pad_right
=
arg
.
input_right_pads_
[
NDimSpatial
-
I1
];
const
auto
w_pad_right
=
arg
.
input_right_pads_
[
NDimSpatial
-
I1
];
const
auto
dilation_x
=
arg
.
conv_filter_dilations_
[
NDimSpatial
-
I1
];
const
auto
dilation_x
=
arg
.
conv_filter_dilations_
[
NDimSpatial
-
I1
];
const
auto
stride_x
=
arg
.
conv_filter_strides_
[
NDimSpatial
-
I1
];
const
auto
stride_x
=
arg
.
conv_filter_strides_
[
NDimSpatial
-
I1
];
bool
is_w_packed
=
arg
.
i
nput
_g_n_c_wis_strides_
[
NDimSpatial
+
I2
]
==
arg
.
C_
;
bool
is_w_packed
=
arg
.
i
mage
_g_n_c_wis_strides_
[
NDimSpatial
+
I2
]
==
arg
.
C_
;
bool
is_c_packed
=
arg
.
i
nput
_g_n_c_wis_strides_
[
I2
]
==
1
;
bool
is_c_packed
=
arg
.
i
mage
_g_n_c_wis_strides_
[
I2
]
==
1
;
// check vector acces with c not packed
// check vector acces with c not packed
if
(
!
is_c_packed
&&
ScalarPerVector
!=
1
)
if
(
!
is_c_packed
&&
ScalarPerVector
!=
1
)
...
@@ -310,7 +292,7 @@ struct DeviceImageToColumnImpl
...
@@ -310,7 +292,7 @@ struct DeviceImageToColumnImpl
if
(
dilation_x
>
1
&&
arg
.
C_
%
ScalarPerVector
!=
0
)
if
(
dilation_x
>
1
&&
arg
.
C_
%
ScalarPerVector
!=
0
)
return
false
;
return
false
;
return
Gridwise
ImageToColumn
Kernel
::
CheckValidity
(
arg
.
in_grid_desc_m_k_
,
return
Gridwise
TensorRearrange
Kernel
::
CheckValidity
(
arg
.
in_grid_desc_m_k_
,
arg
.
out_grid_desc_m_k_
);
arg
.
out_grid_desc_m_k_
);
}
}
...
@@ -320,14 +302,14 @@ struct DeviceImageToColumnImpl
...
@@ -320,14 +302,14 @@ struct DeviceImageToColumnImpl
}
}
static
auto
MakeArgument
(
const
void
*
p_in
,
// input image
static
auto
MakeArgument
(
const
void
*
p_in
,
// input image
void
*
p_out
,
//
output image
void
*
p_out
,
//
gemm form
const
ck
::
index_t
N
,
const
ck
::
index_t
N
,
const
ck
::
index_t
C
,
const
ck
::
index_t
C
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_spatial_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_spatial_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
filter_spatial_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
filter_spatial_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
output_spatial_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
output_spatial_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
i
nput
_g_n_c_wis_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
i
mage
_g_n_c_wis_strides
,
const
std
::
array
<
index_t
,
2
>&
output
_m_k_strides
,
const
std
::
array
<
index_t
,
2
>&
gemm
_m_k_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_dilations
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_dilations
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_left_pads
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_left_pads
,
...
@@ -340,8 +322,8 @@ struct DeviceImageToColumnImpl
...
@@ -340,8 +322,8 @@ struct DeviceImageToColumnImpl
input_spatial_lengths
,
input_spatial_lengths
,
filter_spatial_lengths
,
filter_spatial_lengths
,
output_spatial_lengths
,
output_spatial_lengths
,
i
nput
_g_n_c_wis_strides
,
i
mage
_g_n_c_wis_strides
,
output
_m_k_strides
,
gemm
_m_k_strides
,
conv_filter_strides
,
conv_filter_strides
,
conv_filter_dilations
,
conv_filter_dilations
,
input_left_pads
,
input_left_pads
,
...
@@ -352,14 +334,14 @@ struct DeviceImageToColumnImpl
...
@@ -352,14 +334,14 @@ struct DeviceImageToColumnImpl
std
::
unique_ptr
<
BaseArgument
>
std
::
unique_ptr
<
BaseArgument
>
MakeArgumentPointer
(
const
void
*
p_in
,
// input image
MakeArgumentPointer
(
const
void
*
p_in
,
// input image
void
*
p_out
,
//
output image
void
*
p_out
,
//
gemm form
const
ck
::
index_t
N
,
const
ck
::
index_t
N
,
const
ck
::
index_t
C
,
const
ck
::
index_t
C
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_spatial_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_spatial_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
filter_spatial_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
filter_spatial_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
output_spatial_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
output_spatial_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
i
nput
_g_n_c_wis_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
i
mage
_g_n_c_wis_strides
,
const
std
::
array
<
index_t
,
2
>&
output
_m_k_strides
,
const
std
::
array
<
index_t
,
2
>&
gemm
_m_k_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_dilations
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_dilations
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_left_pads
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_left_pads
,
...
@@ -372,8 +354,8 @@ struct DeviceImageToColumnImpl
...
@@ -372,8 +354,8 @@ struct DeviceImageToColumnImpl
input_spatial_lengths
,
input_spatial_lengths
,
filter_spatial_lengths
,
filter_spatial_lengths
,
output_spatial_lengths
,
output_spatial_lengths
,
i
nput
_g_n_c_wis_strides
,
i
mage
_g_n_c_wis_strides
,
output
_m_k_strides
,
gemm
_m_k_strides
,
conv_filter_strides
,
conv_filter_strides
,
conv_filter_dilations
,
conv_filter_dilations
,
input_left_pads
,
input_left_pads
,
...
...
include/ck/tensor_operation/gpu/grid/gridwise_
image_to_column
.hpp
→
include/ck/tensor_operation/gpu/grid/gridwise_
tensor_rearrange
.hpp
View file @
ad24acb6
...
@@ -16,6 +16,36 @@
...
@@ -16,6 +16,36 @@
namespace
ck
{
namespace
ck
{
template
<
typename
InputGridDesc
,
typename
InputDataType
,
typename
OutputGridDesc
,
typename
OutputDataType
,
typename
Block2ETileMap
,
typename
GridwiseTensorRearrangeKernel
>
__global__
void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__
(
CK_MAX_THREAD_PER_BLOCK
,
CK_MIN_BLOCK_PER_CU
)
#endif
kernel_tensor_rearrange
(
const
InputGridDesc
in_grid_desc
,
const
InputDataType
*
__restrict__
p_in_global
,
const
OutputGridDesc
out_grid_desc
,
OutputDataType
*
__restrict__
p_out_global
,
const
Block2ETileMap
block_2_tile_map
)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \
defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx1030__) || defined(__gfx1100__) || \
defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx941__) || defined(__gfx942__))
GridwiseTensorRearrangeKernel
::
Run
(
in_grid_desc
,
p_in_global
,
out_grid_desc
,
p_out_global
,
block_2_tile_map
);
#else
ignore
=
in_grid_desc
;
ignore
=
p_in_global
;
ignore
=
out_grid_desc
;
ignore
=
p_out_global
;
ignore
=
block_2_tile_map
;
#endif
}
template
<
typename
InputGridDesc
,
template
<
typename
InputGridDesc
,
typename
InputDataType
,
typename
InputDataType
,
typename
OutputGridDesc
,
typename
OutputGridDesc
,
...
@@ -25,8 +55,9 @@ template <typename InputGridDesc,
...
@@ -25,8 +55,9 @@ template <typename InputGridDesc,
index_t
KPerBlock
,
index_t
KPerBlock
,
typename
ThreadClusterLengths
,
typename
ThreadClusterLengths
,
index_t
ScalarPerVector
,
index_t
ScalarPerVector
,
InMemoryDataOperationEnum
DstInMemOp
,
typename
Block2ETileMap
>
typename
Block2ETileMap
>
struct
Gridwise
ImageToColumn
struct
Gridwise
TensorRearrange
{
{
static
constexpr
auto
I0
=
Number
<
0
>
{};
static
constexpr
auto
I0
=
Number
<
0
>
{};
...
@@ -55,14 +86,14 @@ struct GridwiseImageToColumn
...
@@ -55,14 +86,14 @@ struct GridwiseImageToColumn
auto
out_global_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
auto
out_global_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
p_out_global
,
out_grid_desc
.
GetElementSpaceSize
());
p_out_global
,
out_grid_desc
.
GetElementSpaceSize
());
auto
copy_global_to_global
=
ThreadGroupTensorSliceTransfer_v7
<
auto
copy_global_to_global
=
ThisThreadBlock
,
ThreadGroupTensorSliceTransfer_v7
<
ThisThreadBlock
,
Tuple
<
InputDataType
>
,
Tuple
<
InputDataType
>
,
Tuple
<
OutputDataType
>
,
Tuple
<
OutputDataType
>
,
decltype
(
tie
(
in_grid_desc
)),
decltype
(
tie
(
in_grid_desc
)),
decltype
(
tie
(
out_grid_desc
)),
decltype
(
tie
(
out_grid_desc
)),
tensor_operation
::
element_wise
::
PassThrough
,
tensor_operation
::
element_wise
::
PassThrough
,
Sequence
<
static_cast
<
index_t
>
(
InMem
oryDataOperationEnum
::
Set
)
>
,
Sequence
<
static_cast
<
index_t
>
(
Dst
InMem
Op
)
>
,
Sequence
<
MPerBlock
,
KPerBlock
>
,
Sequence
<
MPerBlock
,
KPerBlock
>
,
ThreadClusterLengths
,
ThreadClusterLengths
,
Sequence
<
0
,
1
>
,
Sequence
<
0
,
1
>
,
...
...
include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp
View file @
ad24acb6
...
@@ -20,348 +20,13 @@ struct TransformConvFwdToGemm
...
@@ -20,348 +20,13 @@ struct TransformConvFwdToGemm
static
constexpr
auto
I0
=
Number
<
0
>
{};
static
constexpr
auto
I0
=
Number
<
0
>
{};
static
constexpr
auto
I1
=
Number
<
1
>
{};
static
constexpr
auto
I1
=
Number
<
1
>
{};
template
<
typename
ALayout
,
typename
std
::
enable_if
<
NDimSpatial
==
1
&&
is_same_v
<
ALayout
,
tensor_layout
::
convolution
::
GNWC
>,
bool
>::
type
=
false
>
static
auto
MakeADescriptor_M_K
(
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
/* a_g_n_c_wis_strides */
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
/* b_g_k_c_xs_strides */
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
c_g_n_k_wos_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
/* c_g_n_k_wos_strides */
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_dilations
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_left_pads
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_right_pads
)
{
const
index_t
N
=
a_g_n_c_wis_lengths
[
1
];
const
index_t
C
=
a_g_n_c_wis_lengths
[
2
];
const
index_t
Wi
=
a_g_n_c_wis_lengths
[
3
];
const
index_t
Wo
=
c_g_n_k_wos_lengths
[
3
];
const
index_t
ConvStrideW
=
conv_filter_strides
[
0
];
if
constexpr
(
ConvForwardSpecialization
==
device
::
ConvolutionForwardSpecialization
::
Filter1x1Stride1Pad0
)
{
const
index_t
NWo
=
N
*
ck
::
accumulate_n
<
index_t
>
(
c_g_n_k_wos_lengths
.
begin
()
+
3
,
NDimSpatial
,
1
,
std
::
multiplies
<>
());
const
auto
in_gemmm_gemmk_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
NWo
,
C
));
return
in_gemmm_gemmk_desc
;
}
else
if
constexpr
(
ConvForwardSpecialization
==
device
::
ConvolutionForwardSpecialization
::
Filter1x1Pad0
)
{
const
auto
in_n_wi_c_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
N
,
Wi
,
C
));
const
auto
in_n_wo_c_desc
=
transform_tensor_descriptor
(
in_n_wi_c_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_embed_transform
(
make_tuple
(
Wo
),
make_tuple
(
ConvStrideW
)),
make_pass_through_transform
(
C
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{}));
const
auto
in_gemmm_gemmk_desc
=
transform_tensor_descriptor
(
in_n_wo_c_desc
,
make_tuple
(
make_merge_transform
(
make_tuple
(
N
,
Wo
)),
make_pass_through_transform
(
C
)),
make_tuple
(
Sequence
<
0
,
1
>
{},
Sequence
<
2
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
return
in_gemmm_gemmk_desc
;
}
else
{
const
index_t
X
=
b_g_k_c_xs_lengths
[
3
];
const
index_t
ConvDilationW
=
conv_filter_dilations
[
0
];
const
index_t
InLeftPadW
=
input_left_pads
[
0
];
const
index_t
InRightPadW
=
input_right_pads
[
0
];
const
auto
in_n_wi_c_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
N
,
Wi
,
C
));
const
auto
in_n_wip_c_desc
=
transform_tensor_descriptor
(
in_n_wi_c_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_pad_transform
(
Wi
,
InLeftPadW
,
InRightPadW
),
make_pass_through_transform
(
C
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{}));
const
auto
in_n_x_wo_c_desc
=
transform_tensor_descriptor
(
in_n_wip_c_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_embed_transform
(
make_tuple
(
X
,
Wo
),
make_tuple
(
ConvDilationW
,
ConvStrideW
)),
make_pass_through_transform
(
C
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
,
2
>
{},
Sequence
<
3
>
{}));
const
auto
in_gemmm_gemmk_desc
=
transform_tensor_descriptor
(
in_n_x_wo_c_desc
,
make_tuple
(
make_merge_transform
(
make_tuple
(
N
,
Wo
)),
make_merge_transform
(
make_tuple
(
X
,
C
))),
make_tuple
(
Sequence
<
0
,
2
>
{},
Sequence
<
1
,
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
return
in_gemmm_gemmk_desc
;
}
}
template
<
typename
ALayout
,
typename
std
::
enable_if
<
NDimSpatial
==
2
&&
is_same_v
<
ALayout
,
tensor_layout
::
convolution
::
GNHWC
>,
bool
>::
type
=
false
>
static
auto
MakeADescriptor_M_K
(
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
/* a_g_n_c_wis_strides */
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
/* b_g_k_c_xs_strides */
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
c_g_n_k_wos_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
/* c_g_n_k_wos_strides */
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_dilations
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_left_pads
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_right_pads
)
{
const
index_t
N
=
a_g_n_c_wis_lengths
[
1
];
const
index_t
C
=
a_g_n_c_wis_lengths
[
2
];
const
index_t
Hi
=
a_g_n_c_wis_lengths
[
3
];
const
index_t
Wi
=
a_g_n_c_wis_lengths
[
4
];
const
index_t
Ho
=
c_g_n_k_wos_lengths
[
3
];
const
index_t
Wo
=
c_g_n_k_wos_lengths
[
4
];
const
index_t
ConvStrideH
=
conv_filter_strides
[
0
];
const
index_t
ConvStrideW
=
conv_filter_strides
[
1
];
if
constexpr
(
ConvForwardSpecialization
==
device
::
ConvolutionForwardSpecialization
::
Filter1x1Stride1Pad0
)
{
const
index_t
NHoWo
=
N
*
ck
::
accumulate_n
<
index_t
>
(
c_g_n_k_wos_lengths
.
begin
()
+
3
,
NDimSpatial
,
1
,
std
::
multiplies
<>
());
const
auto
in_gemmm_gemmk_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
NHoWo
,
C
));
return
in_gemmm_gemmk_desc
;
}
else
if
constexpr
(
ConvForwardSpecialization
==
device
::
ConvolutionForwardSpecialization
::
Filter1x1Pad0
)
{
const
auto
in_n_hi_wi_c_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
N
,
Hi
,
Wi
,
C
));
const
auto
in_n_ho_wo_c_desc
=
transform_tensor_descriptor
(
in_n_hi_wi_c_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_embed_transform
(
make_tuple
(
Ho
),
make_tuple
(
ConvStrideH
)),
make_embed_transform
(
make_tuple
(
Wo
),
make_tuple
(
ConvStrideW
)),
make_pass_through_transform
(
C
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
const
auto
in_gemmm_gemmk_desc
=
transform_tensor_descriptor
(
in_n_ho_wo_c_desc
,
make_tuple
(
make_merge_transform
(
make_tuple
(
N
,
Ho
,
Wo
)),
make_pass_through_transform
(
C
)),
make_tuple
(
Sequence
<
0
,
1
,
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
return
in_gemmm_gemmk_desc
;
}
else
{
const
index_t
Y
=
b_g_k_c_xs_lengths
[
3
];
const
index_t
X
=
b_g_k_c_xs_lengths
[
4
];
const
index_t
ConvDilationH
=
conv_filter_dilations
[
0
];
const
index_t
ConvDilationW
=
conv_filter_dilations
[
1
];
const
index_t
InLeftPadH
=
input_left_pads
[
0
];
const
index_t
InLeftPadW
=
input_left_pads
[
1
];
const
index_t
InRightPadH
=
input_right_pads
[
0
];
const
index_t
InRightPadW
=
input_right_pads
[
1
];
const
auto
in_n_hi_wi_c_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
N
,
Hi
,
Wi
,
C
));
const
auto
in_n_hip_wip_c_desc
=
transform_tensor_descriptor
(
in_n_hi_wi_c_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_pad_transform
(
Hi
,
InLeftPadH
,
InRightPadH
),
make_pad_transform
(
Wi
,
InLeftPadW
,
InRightPadW
),
make_pass_through_transform
(
C
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
const
auto
in_n_y_ho_x_wo_c_desc
=
transform_tensor_descriptor
(
in_n_hip_wip_c_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_embed_transform
(
make_tuple
(
Y
,
Ho
),
make_tuple
(
ConvDilationH
,
ConvStrideH
)),
make_embed_transform
(
make_tuple
(
X
,
Wo
),
make_tuple
(
ConvDilationW
,
ConvStrideW
)),
make_pass_through_transform
(
C
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
,
2
>
{},
Sequence
<
3
,
4
>
{},
Sequence
<
5
>
{}));
const
auto
in_gemmm_gemmk_desc
=
transform_tensor_descriptor
(
in_n_y_ho_x_wo_c_desc
,
make_tuple
(
make_merge_transform
(
make_tuple
(
N
,
Ho
,
Wo
)),
make_merge_transform
(
make_tuple
(
Y
,
X
,
C
))),
make_tuple
(
Sequence
<
0
,
2
,
4
>
{},
Sequence
<
1
,
3
,
5
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
return
in_gemmm_gemmk_desc
;
}
}
template
<
typename
ALayout
,
typename
std
::
enable_if
<
NDimSpatial
==
3
&&
is_same_v
<
ALayout
,
tensor_layout
::
convolution
::
GNDHWC
>,
bool
>::
type
=
false
>
static
auto
MakeADescriptor_M_K
(
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
/* a_g_n_c_wis_strides */
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
/* b_g_k_c_xs_strides */
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
c_g_n_k_wos_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
/* c_g_n_k_wos_strides */
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_dilations
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_left_pads
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_right_pads
)
{
const
index_t
N
=
a_g_n_c_wis_lengths
[
1
];
const
index_t
C
=
a_g_n_c_wis_lengths
[
2
];
const
index_t
Di
=
a_g_n_c_wis_lengths
[
3
];
const
index_t
Hi
=
a_g_n_c_wis_lengths
[
4
];
const
index_t
Wi
=
a_g_n_c_wis_lengths
[
5
];
const
index_t
Do
=
c_g_n_k_wos_lengths
[
3
];
const
index_t
Ho
=
c_g_n_k_wos_lengths
[
4
];
const
index_t
Wo
=
c_g_n_k_wos_lengths
[
5
];
const
index_t
ConvStrideD
=
conv_filter_strides
[
0
];
const
index_t
ConvStrideH
=
conv_filter_strides
[
1
];
const
index_t
ConvStrideW
=
conv_filter_strides
[
2
];
if
constexpr
(
ConvForwardSpecialization
==
device
::
ConvolutionForwardSpecialization
::
Filter1x1Stride1Pad0
)
{
const
index_t
NDoHoWo
=
N
*
ck
::
accumulate_n
<
index_t
>
(
c_g_n_k_wos_lengths
.
begin
()
+
3
,
NDimSpatial
,
1
,
std
::
multiplies
<>
());
const
auto
in_gemmm_gemmk_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
NDoHoWo
,
C
));
return
in_gemmm_gemmk_desc
;
}
else
if
constexpr
(
ConvForwardSpecialization
==
device
::
ConvolutionForwardSpecialization
::
Filter1x1Pad0
)
{
const
auto
in_n_di_hi_wi_c_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
N
,
Di
,
Hi
,
Wi
,
C
));
const
auto
in_n_do_ho_wo_c_desc
=
transform_tensor_descriptor
(
in_n_di_hi_wi_c_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_embed_transform
(
make_tuple
(
Do
),
make_tuple
(
ConvStrideD
)),
make_embed_transform
(
make_tuple
(
Ho
),
make_tuple
(
ConvStrideH
)),
make_embed_transform
(
make_tuple
(
Wo
),
make_tuple
(
ConvStrideW
)),
make_pass_through_transform
(
C
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{}));
const
auto
in_gemmm_gemmk_desc
=
transform_tensor_descriptor
(
in_n_do_ho_wo_c_desc
,
make_tuple
(
make_merge_transform
(
make_tuple
(
N
,
Do
,
Ho
,
Wo
)),
make_pass_through_transform
(
C
)),
make_tuple
(
Sequence
<
0
,
1
,
2
,
3
>
{},
Sequence
<
4
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
return
in_gemmm_gemmk_desc
;
}
else
{
const
index_t
Z
=
b_g_k_c_xs_lengths
[
3
];
const
index_t
Y
=
b_g_k_c_xs_lengths
[
4
];
const
index_t
X
=
b_g_k_c_xs_lengths
[
5
];
const
index_t
ConvDilationD
=
conv_filter_dilations
[
0
];
const
index_t
ConvDilationH
=
conv_filter_dilations
[
1
];
const
index_t
ConvDilationW
=
conv_filter_dilations
[
2
];
const
index_t
InLeftPadD
=
input_left_pads
[
0
];
const
index_t
InLeftPadH
=
input_left_pads
[
1
];
const
index_t
InLeftPadW
=
input_left_pads
[
2
];
const
index_t
InRightPadD
=
input_right_pads
[
0
];
const
index_t
InRightPadH
=
input_right_pads
[
1
];
const
index_t
InRightPadW
=
input_right_pads
[
2
];
const
auto
in_n_di_hi_wi_c_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
N
,
Di
,
Hi
,
Wi
,
C
));
const
auto
in_n_hip_wip_c_desc
=
transform_tensor_descriptor
(
in_n_di_hi_wi_c_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_pad_transform
(
Di
,
InLeftPadD
,
InRightPadD
),
make_pad_transform
(
Hi
,
InLeftPadH
,
InRightPadH
),
make_pad_transform
(
Wi
,
InLeftPadW
,
InRightPadW
),
make_pass_through_transform
(
C
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{}));
const
auto
in_n_z_do_y_ho_x_wo_c_desc
=
transform_tensor_descriptor
(
in_n_hip_wip_c_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_embed_transform
(
make_tuple
(
Z
,
Do
),
make_tuple
(
ConvDilationD
,
ConvStrideD
)),
make_embed_transform
(
make_tuple
(
Y
,
Ho
),
make_tuple
(
ConvDilationH
,
ConvStrideH
)),
make_embed_transform
(
make_tuple
(
X
,
Wo
),
make_tuple
(
ConvDilationW
,
ConvStrideW
)),
make_pass_through_transform
(
C
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
,
2
>
{},
Sequence
<
3
,
4
>
{},
Sequence
<
5
,
6
>
{},
Sequence
<
7
>
{}));
const
auto
in_gemmm_gemmk_desc
=
transform_tensor_descriptor
(
in_n_z_do_y_ho_x_wo_c_desc
,
make_tuple
(
make_merge_transform
(
make_tuple
(
N
,
Do
,
Ho
,
Wo
)),
make_merge_transform
(
make_tuple
(
Z
,
Y
,
X
,
C
))),
make_tuple
(
Sequence
<
0
,
2
,
4
,
6
>
{},
Sequence
<
1
,
3
,
5
,
7
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
return
in_gemmm_gemmk_desc
;
}
}
// TODO: implement ck::tensor_layout::convolution that describe packed/strided dimemsion as
// TODO: implement ck::tensor_layout::convolution that describe packed/strided dimemsion as
// properties
// properties
template
<
typename
ALayout
,
template
<
typename
ALayout
,
typename
std
::
enable_if
<
NDimSpatial
==
1
&&
typename
std
::
enable_if
<
NDimSpatial
==
1
&&
(
is_same_v
<
ALayout
,
tensor_layout
::
convolution
::
G_NW_C
>
||
(
is_same_v
<
ALayout
,
tensor_layout
::
convolution
::
G_NW_C
>
||
is_same_v
<
ALayout
,
tensor_layout
::
convolution
::
NWGC
>
),
is_same_v
<
ALayout
,
tensor_layout
::
convolution
::
NWGC
>
||
is_same_v
<
ALayout
,
tensor_layout
::
convolution
::
GNWC
>
),
bool
>::
type
=
false
>
bool
>::
type
=
false
>
static
auto
static
auto
MakeADescriptor_M_K
(
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_lengths
,
MakeADescriptor_M_K
(
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_lengths
,
...
@@ -473,7 +138,8 @@ struct TransformConvFwdToGemm
...
@@ -473,7 +138,8 @@ struct TransformConvFwdToGemm
template
<
typename
ALayout
,
template
<
typename
ALayout
,
typename
std
::
enable_if
<
typename
std
::
enable_if
<
NDimSpatial
==
2
&&
(
is_same_v
<
ALayout
,
tensor_layout
::
convolution
::
G_NHW_C
>
||
NDimSpatial
==
2
&&
(
is_same_v
<
ALayout
,
tensor_layout
::
convolution
::
G_NHW_C
>
||
is_same_v
<
ALayout
,
tensor_layout
::
convolution
::
NHWGC
>
),
is_same_v
<
ALayout
,
tensor_layout
::
convolution
::
NHWGC
>
||
is_same_v
<
ALayout
,
tensor_layout
::
convolution
::
GNHWC
>
),
bool
>::
type
=
false
>
bool
>::
type
=
false
>
static
auto
static
auto
MakeADescriptor_M_K
(
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_lengths
,
MakeADescriptor_M_K
(
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_lengths
,
...
@@ -601,7 +267,8 @@ struct TransformConvFwdToGemm
...
@@ -601,7 +267,8 @@ struct TransformConvFwdToGemm
template
<
typename
ALayout
,
template
<
typename
ALayout
,
typename
std
::
enable_if
<
typename
std
::
enable_if
<
NDimSpatial
==
3
&&
(
is_same_v
<
ALayout
,
tensor_layout
::
convolution
::
G_NDHW_C
>
||
NDimSpatial
==
3
&&
(
is_same_v
<
ALayout
,
tensor_layout
::
convolution
::
G_NDHW_C
>
||
is_same_v
<
ALayout
,
tensor_layout
::
convolution
::
NDHWGC
>
),
is_same_v
<
ALayout
,
tensor_layout
::
convolution
::
NDHWGC
>
||
is_same_v
<
ALayout
,
tensor_layout
::
convolution
::
GNDHWC
>
),
bool
>::
type
=
false
>
bool
>::
type
=
false
>
static
auto
static
auto
MakeADescriptor_M_K
(
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_lengths
,
MakeADescriptor_M_K
(
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_lengths
,
...
...
include/ck/utility/dynamic_buffer.hpp
View file @
ad24acb6
...
@@ -141,9 +141,35 @@ struct DynamicBuffer
...
@@ -141,9 +141,35 @@ struct DynamicBuffer
else
if
constexpr
(
Op
==
InMemoryDataOperationEnum
::
Add
)
else
if
constexpr
(
Op
==
InMemoryDataOperationEnum
::
Add
)
{
{
auto
tmp
=
this
->
template
Get
<
X
>(
i
,
is_valid_element
);
auto
tmp
=
this
->
template
Get
<
X
>(
i
,
is_valid_element
);
using
scalar_t
=
typename
scalar_type
<
remove_cvref_t
<
T
>>::
type
;
// handle bfloat addition
if
constexpr
(
is_same_v
<
scalar_t
,
bhalf_t
>
)
{
if
constexpr
(
is_scalar_type
<
X
>::
value
)
{
// Scalar type
auto
result
=
type_convert
<
X
>
(
type_convert
<
float
>
(
x
)
+
type_convert
<
float
>
(
tmp
));
this
->
template
Set
<
X
>(
i
,
is_valid_element
,
result
);
}
else
{
// Vector type
constexpr
auto
vector_size
=
scalar_type
<
remove_cvref_t
<
X
>>::
vector_size
;
const
vector_type
<
scalar_t
,
vector_size
>
a_vector
{
tmp
};
const
vector_type
<
scalar_t
,
vector_size
>
b_vector
{
x
};
static_for
<
0
,
vector_size
,
1
>
{}([
&
](
auto
idx
)
{
auto
result
=
type_convert
<
scalar_t
>
(
type_convert
<
float
>
(
a_vector
.
template
AsType
<
scalar_t
>()[
idx
])
+
type_convert
<
float
>
(
b_vector
.
template
AsType
<
scalar_t
>()[
idx
]));
this
->
template
Set
<
scalar_t
>(
i
+
idx
,
is_valid_element
,
result
);
});
}
}
else
{
this
->
template
Set
<
X
>(
i
,
is_valid_element
,
x
+
tmp
);
this
->
template
Set
<
X
>(
i
,
is_valid_element
,
x
+
tmp
);
// tmp += x;
}
// this->template Set<X>(i, is_valid_element, tmp);
}
}
}
}
...
...
library/include/ck/library/reference_tensor_operation/cpu/reference_column_to_image.hpp
0 → 100644
View file @
ad24acb6
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <type_traits>
#include <sstream>
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/library/utility/host_tensor.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
host
{
/**
* \brief Reference implementation for column to image.
*
* Input tensor descriptor has [N * Do * Ho * Wo, Z * Y * X * C] data layout.
* Memory layout is the same.
* Output tensor descriptor has [G, N, C, Di, Hi, Wi] data layout.
* G must be equal to 1. Memory layout is [G, N, Di, Hi, Wi, C].
*
* \tparam NDimSpatial Number of spatial dimensions.
* \tparam ImageLayout Image Layout.
* \tparam InDataType Input Data Type.
* \tparam OutDataType Output Data Type.
*/
template
<
ck
::
index_t
NDimSpatial
,
typename
ImageLayout
,
typename
InDataType
,
typename
OutDataType
,
typename
std
::
enable_if
<
NDimSpatial
>
=
1
&&
NDimSpatial
<=
3
,
bool
>::
type
=
false
>
struct
ReferenceColumnToImage
:
public
device
::
BaseOperator
{
// Argument
struct
Argument
:
public
device
::
BaseArgument
{
public:
Argument
(
const
Tensor
<
InDataType
>&
input
,
Tensor
<
OutDataType
>&
output
,
std
::
vector
<
ck
::
index_t
>
filter_spatial_lengths
,
std
::
vector
<
ck
::
index_t
>
conv_filter_strides
,
std
::
vector
<
ck
::
index_t
>
conv_filter_dilations
,
std
::
vector
<
ck
::
index_t
>
input_left_pads
,
std
::
vector
<
ck
::
index_t
>
input_right_pads
)
:
input_
{
input
},
output_
{
output
},
conv_strides_
{
conv_filter_strides
},
conv_dilations_
{
conv_filter_dilations
},
in_left_pads_
{
input_left_pads
},
in_right_pads_
{
input_right_pads
},
filter_spatial_lengths_
{
filter_spatial_lengths
}
{
initOutputSpatialLengths
();
}
const
Tensor
<
InDataType
>&
input_
;
Tensor
<
OutDataType
>&
output_
;
std
::
vector
<
index_t
>
conv_strides_
;
std
::
vector
<
index_t
>
conv_dilations_
;
std
::
vector
<
index_t
>
in_left_pads_
;
std
::
vector
<
index_t
>
in_right_pads_
;
std
::
vector
<
index_t
>
filter_spatial_lengths_
;
std
::
vector
<
index_t
>
output_spatial_lengths_
;
private:
void
initOutputSpatialLengths
()
{
constexpr
auto
input_offset_to_spatial
=
3
;
for
(
ck
::
index_t
i
=
0
;
i
<
NDimSpatial
;
++
i
)
{
// XEff = (X - 1) * conv_dilation_w + 1;
// Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
const
ck
::
index_t
x_eff
=
(
filter_spatial_lengths_
[
i
]
-
1
)
*
conv_dilations_
[
i
]
+
1
;
output_spatial_lengths_
.
push_back
(
(
output_
.
GetLengths
()[
i
+
input_offset_to_spatial
]
+
in_left_pads_
[
i
]
+
in_right_pads_
[
i
]
-
x_eff
)
/
conv_strides_
[
i
]
+
1
);
}
}
};
struct
Invoker
:
public
device
::
BaseInvoker
{
using
Argument
=
ReferenceColumnToImage
::
Argument
;
float
Run
(
const
Argument
&
arg
)
{
if
(
!
(
arg
.
output_
.
GetNumOfDimension
()
==
NDimSpatial
+
3
&&
arg
.
input_
.
GetNumOfDimension
()
==
2
))
{
throw
std
::
runtime_error
(
"wrong! inconsistent dimension"
);
}
const
index_t
N
=
arg
.
output_
.
GetLengths
()[
1
];
const
index_t
C
=
arg
.
output_
.
GetLengths
()[
2
];
if
constexpr
(
NDimSpatial
==
1
)
{
const
index_t
Wo
=
arg
.
output_spatial_lengths_
[
0
];
auto
func
=
[
&
](
auto
n
)
{
for
(
index_t
wo
=
0
;
wo
<
Wo
;
++
wo
)
{
index_t
row
=
n
*
Wo
+
wo
;
index_t
column
=
0
;
for
(
index_t
x
=
0
;
x
<
arg
.
filter_spatial_lengths_
[
0
];
++
x
)
{
auto
wi
=
static_cast
<
ck
::
long_index_t
>
(
wo
*
arg
.
conv_strides_
[
0
])
+
static_cast
<
ck
::
long_index_t
>
(
x
*
arg
.
conv_dilations_
[
0
])
-
static_cast
<
ck
::
long_index_t
>
(
arg
.
in_left_pads_
[
0
]);
for
(
index_t
c
=
0
;
c
<
C
;
++
c
)
{
if
(
wi
>=
0
&&
ck
::
type_convert
<
std
::
size_t
>
(
wi
)
<
arg
.
output_
.
GetLengths
()[
3
])
{
float
v_in
=
ck
::
type_convert
<
float
>
(
arg
.
input_
(
row
,
column
));
float
v_out
=
ck
::
type_convert
<
float
>
(
arg
.
output_
(
0
,
n
,
c
,
wi
));
arg
.
output_
(
0
,
n
,
c
,
wi
)
=
ck
::
type_convert
<
OutDataType
>
(
v_in
+
v_out
);
}
column
++
;
}
}
}
};
make_ParallelTensorFunctor
(
func
,
N
)(
std
::
thread
::
hardware_concurrency
());
return
0
;
}
else
if
constexpr
(
NDimSpatial
==
2
)
{
const
index_t
Ho
=
arg
.
output_spatial_lengths_
[
0
];
const
index_t
Wo
=
arg
.
output_spatial_lengths_
[
1
];
auto
func
=
[
&
](
auto
n
)
{
for
(
index_t
ho
=
0
;
ho
<
Ho
;
++
ho
)
{
for
(
index_t
wo
=
0
;
wo
<
Wo
;
++
wo
)
{
index_t
row
=
n
*
Ho
*
Wo
+
ho
*
Wo
+
wo
;
index_t
column
=
0
;
for
(
index_t
y
=
0
;
y
<
arg
.
filter_spatial_lengths_
[
0
];
++
y
)
{
auto
hi
=
static_cast
<
ck
::
long_index_t
>
(
ho
*
arg
.
conv_strides_
[
0
])
+
static_cast
<
ck
::
long_index_t
>
(
y
*
arg
.
conv_dilations_
[
0
])
-
static_cast
<
ck
::
long_index_t
>
(
arg
.
in_left_pads_
[
0
]);
for
(
index_t
x
=
0
;
x
<
arg
.
filter_spatial_lengths_
[
1
];
++
x
)
{
auto
wi
=
static_cast
<
ck
::
long_index_t
>
(
wo
*
arg
.
conv_strides_
[
1
])
+
static_cast
<
ck
::
long_index_t
>
(
x
*
arg
.
conv_dilations_
[
1
])
-
static_cast
<
ck
::
long_index_t
>
(
arg
.
in_left_pads_
[
1
]);
for
(
index_t
c
=
0
;
c
<
C
;
++
c
)
{
if
(
hi
>=
0
&&
ck
::
type_convert
<
std
::
size_t
>
(
hi
)
<
arg
.
output_
.
GetLengths
()[
3
]
&&
wi
>=
0
&&
ck
::
type_convert
<
std
::
size_t
>
(
wi
)
<
arg
.
output_
.
GetLengths
()[
4
])
{
float
v_in
=
ck
::
type_convert
<
float
>
(
arg
.
input_
(
row
,
column
));
float
v_out
=
ck
::
type_convert
<
float
>
(
arg
.
output_
(
0
,
n
,
c
,
hi
,
wi
));
arg
.
output_
(
0
,
n
,
c
,
hi
,
wi
)
=
ck
::
type_convert
<
OutDataType
>
(
v_in
+
v_out
);
}
column
++
;
}
}
}
}
}
};
make_ParallelTensorFunctor
(
func
,
N
)(
std
::
thread
::
hardware_concurrency
());
return
0
;
}
else
if
constexpr
(
NDimSpatial
==
3
)
{
const
index_t
Do
=
arg
.
output_spatial_lengths_
[
0
];
const
index_t
Ho
=
arg
.
output_spatial_lengths_
[
1
];
const
index_t
Wo
=
arg
.
output_spatial_lengths_
[
2
];
auto
func
=
[
&
](
auto
n
)
{
for
(
index_t
d_o
=
0
;
d_o
<
Do
;
++
d_o
)
{
for
(
index_t
ho
=
0
;
ho
<
Ho
;
++
ho
)
{
for
(
index_t
wo
=
0
;
wo
<
Wo
;
++
wo
)
{
index_t
row
=
n
*
Do
*
Ho
*
Wo
+
d_o
*
Ho
*
Wo
+
ho
*
Wo
+
wo
;
index_t
column
=
0
;
for
(
index_t
z
=
0
;
z
<
arg
.
filter_spatial_lengths_
[
0
];
++
z
)
{
auto
di
=
static_cast
<
ck
::
long_index_t
>
(
d_o
*
arg
.
conv_strides_
[
0
])
+
static_cast
<
ck
::
long_index_t
>
(
z
*
arg
.
conv_dilations_
[
0
])
-
static_cast
<
ck
::
long_index_t
>
(
arg
.
in_left_pads_
[
0
]);
for
(
index_t
y
=
0
;
y
<
arg
.
filter_spatial_lengths_
[
1
];
++
y
)
{
auto
hi
=
static_cast
<
ck
::
long_index_t
>
(
ho
*
arg
.
conv_strides_
[
1
])
+
static_cast
<
ck
::
long_index_t
>
(
y
*
arg
.
conv_dilations_
[
1
])
-
static_cast
<
ck
::
long_index_t
>
(
arg
.
in_left_pads_
[
1
]);
for
(
index_t
x
=
0
;
x
<
arg
.
filter_spatial_lengths_
[
2
];
++
x
)
{
auto
wi
=
static_cast
<
ck
::
long_index_t
>
(
wo
*
arg
.
conv_strides_
[
2
])
+
static_cast
<
ck
::
long_index_t
>
(
x
*
arg
.
conv_dilations_
[
2
])
-
static_cast
<
ck
::
long_index_t
>
(
arg
.
in_left_pads_
[
2
]);
for
(
index_t
c
=
0
;
c
<
C
;
++
c
)
{
if
(
di
>=
0
&&
ck
::
type_convert
<
std
::
size_t
>
(
di
)
<
arg
.
output_
.
GetLengths
()[
3
]
&&
hi
>=
0
&&
ck
::
type_convert
<
std
::
size_t
>
(
hi
)
<
arg
.
output_
.
GetLengths
()[
4
]
&&
wi
>=
0
&&
ck
::
type_convert
<
std
::
size_t
>
(
wi
)
<
arg
.
output_
.
GetLengths
()[
5
])
{
float
v_in
=
ck
::
type_convert
<
float
>
(
arg
.
input_
(
row
,
column
));
float
v_out
=
ck
::
type_convert
<
float
>
(
arg
.
output_
(
0
,
n
,
c
,
di
,
hi
,
wi
));
arg
.
output_
(
0
,
n
,
c
,
di
,
hi
,
wi
)
=
ck
::
type_convert
<
OutDataType
>
(
v_in
+
v_out
);
}
column
++
;
}
}
}
}
}
}
}
};
make_ParallelTensorFunctor
(
func
,
N
)(
std
::
thread
::
hardware_concurrency
());
return
0
;
}
}
float
Run
(
const
device
::
BaseArgument
*
p_arg
,
const
StreamConfig
&
/*stream_config*/
=
StreamConfig
{})
override
{
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
));
}
};
static
constexpr
bool
IsValidCompilationParameter
()
{
using
namespace
tensor_layout
::
convolution
;
if
constexpr
(
!
(
std
::
is_same_v
<
ImageLayout
,
GNWC
>
||
std
::
is_same_v
<
ImageLayout
,
GNHWC
>
||
std
::
is_same_v
<
ImageLayout
,
GNDHWC
>
))
{
return
false
;
}
if
constexpr
(
!
(
NDimSpatial
>=
1
&&
NDimSpatial
<=
3
))
{
return
false
;
}
return
true
;
}
bool
IsSupportedArgument
(
const
Argument
&
arg
)
{
const
ck
::
index_t
G
=
arg
.
output_
.
GetLengths
()[
0
];
const
ck
::
index_t
N
=
arg
.
output_
.
GetLengths
()[
1
];
const
ck
::
index_t
C
=
arg
.
output_
.
GetLengths
()[
2
];
const
index_t
NDoHoWo
=
N
*
ck
::
accumulate_n
<
index_t
>
(
arg
.
output_spatial_lengths_
.
begin
(),
NDimSpatial
,
1
,
std
::
multiplies
<>
());
const
index_t
CZYX
=
C
*
ck
::
accumulate_n
<
index_t
>
(
arg
.
filter_spatial_lengths_
.
begin
(),
NDimSpatial
,
1
,
std
::
multiplies
<>
());
if
(
!
(
arg
.
input_
.
GetLengths
()[
0
]
==
static_cast
<
std
::
size_t
>
(
NDoHoWo
)
&&
arg
.
input_
.
GetLengths
()[
1
]
==
static_cast
<
std
::
size_t
>
(
CZYX
)))
{
return
false
;
}
if
(
G
!=
1
)
{
return
false
;
}
return
true
;
}
bool
IsSupportedArgument
(
const
device
::
BaseArgument
*
p_arg
)
override
{
return
IsSupportedArgument
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
));
}
static
auto
MakeArgument
(
const
Tensor
<
InDataType
>&
input
,
Tensor
<
OutDataType
>&
output
,
std
::
vector
<
ck
::
index_t
>
filter_spatial_lengths
,
std
::
vector
<
ck
::
index_t
>
conv_filter_strides
,
std
::
vector
<
ck
::
index_t
>
conv_filter_dilations
,
std
::
vector
<
ck
::
index_t
>
input_left_pads
,
std
::
vector
<
ck
::
index_t
>
input_right_pads
)
{
return
Argument
{
input
,
output
,
filter_spatial_lengths
,
conv_filter_strides
,
conv_filter_dilations
,
input_left_pads
,
input_right_pads
};
}
static
auto
MakeInvoker
()
{
return
Invoker
{};
}
virtual
std
::
unique_ptr
<
device
::
BaseInvoker
>
MakeInvokerPointer
()
{
return
std
::
make_unique
<
Invoker
>
(
Invoker
{});
}
std
::
string
GetTypeString
()
const
override
{
auto
str
=
std
::
stringstream
();
// clang-format off
str
<<
"ReferenceColumnToImage"
<<
std
::
endl
;
// clang-format on
return
str
.
str
();
}
};
}
// namespace host
}
// namespace tensor_operation
}
// namespace ck
library/include/ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp
View file @
ad24acb6
...
@@ -18,16 +18,18 @@ namespace host {
...
@@ -18,16 +18,18 @@ namespace host {
/**
/**
* \brief Reference implementation for image to column.
* \brief Reference implementation for image to column.
*
*
*
T
ensor descriptor has [G, N, C, Di, Hi, Wi] data layout.
*
Input t
ensor descriptor has [G, N, C, Di, Hi, Wi] data layout.
* G must be equal to 1. Memory layout is [G, N, Di, Hi, Wi, C].
* G must be equal to 1. Memory layout is [G, N, Di, Hi, Wi, C].
* Output tensor descriptor has [N * Do * Ho * Wo, Z * Y * X * C] data layout.
* Memory layout is the same.
*
*
* \tparam NDimSpatial Number of spatial dimensions.
* \tparam NDimSpatial Number of spatial dimensions.
* \tparam I
nput
Layout I
nput
Layout.
* \tparam I
mage
Layout I
mage
Layout.
* \tparam InDataType Input Data Type.
* \tparam InDataType Input Data Type.
* \tparam OutDataType Output Data Type.
* \tparam OutDataType Output Data Type.
*/
*/
template
<
ck
::
index_t
NDimSpatial
,
template
<
ck
::
index_t
NDimSpatial
,
typename
I
nput
Layout
,
typename
I
mage
Layout
,
typename
InDataType
,
typename
InDataType
,
typename
OutDataType
,
typename
OutDataType
,
typename
std
::
enable_if
<
NDimSpatial
>
=
1
&&
NDimSpatial
<=
3
,
bool
>::
type
=
false
>
typename
std
::
enable_if
<
NDimSpatial
>
=
1
&&
NDimSpatial
<=
3
,
bool
>::
type
=
false
>
...
@@ -240,8 +242,8 @@ struct ReferenceImageToColumn : public device::BaseOperator
...
@@ -240,8 +242,8 @@ struct ReferenceImageToColumn : public device::BaseOperator
{
{
using
namespace
tensor_layout
::
convolution
;
using
namespace
tensor_layout
::
convolution
;
if
constexpr
(
!
(
std
::
is_same_v
<
I
nput
Layout
,
GNWC
>
||
std
::
is_same_v
<
I
nput
Layout
,
GNHWC
>
||
if
constexpr
(
!
(
std
::
is_same_v
<
I
mage
Layout
,
GNWC
>
||
std
::
is_same_v
<
I
mage
Layout
,
GNHWC
>
||
std
::
is_same_v
<
I
nput
Layout
,
GNDHWC
>
))
std
::
is_same_v
<
I
mage
Layout
,
GNDHWC
>
))
{
{
return
false
;
return
false
;
}
}
...
...
library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp
0 → 100644
View file @
ad24acb6
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include <memory>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
namespace
ck
::
conv_tensor_rearrange_op
;
// Image to Column
// nhwc, 1d
void
add_device_image_to_column_nhwc_1d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
GNWC
,
BF16
,
BF16
,
ImageToColumn
>>>&
instances
);
void
add_device_image_to_column_nhwc_1d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
GNWC
,
F16
,
F16
,
ImageToColumn
>>>&
instances
);
void
add_device_image_to_column_nhwc_1d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
GNWC
,
F32
,
F32
,
ImageToColumn
>>>&
instances
);
void
add_device_image_to_column_nhwc_1d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
GNWC
,
int8_t
,
int8_t
,
ImageToColumn
>>>&
instances
);
// nhwc, 2d
void
add_device_image_to_column_nhwc_2d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
GNHWC
,
BF16
,
BF16
,
ImageToColumn
>>>&
instances
);
void
add_device_image_to_column_nhwc_2d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
GNHWC
,
F16
,
F16
,
ImageToColumn
>>>&
instances
);
void
add_device_image_to_column_nhwc_2d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
GNHWC
,
F32
,
F32
,
ImageToColumn
>>>&
instances
);
void
add_device_image_to_column_nhwc_2d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
GNHWC
,
int8_t
,
int8_t
,
ImageToColumn
>>>&
instances
);
// nhwc, 3d
void
add_device_image_to_column_nhwc_3d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
GNDHWC
,
BF16
,
BF16
,
ImageToColumn
>>>&
instances
);
void
add_device_image_to_column_nhwc_3d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
GNDHWC
,
F16
,
F16
,
ImageToColumn
>>>&
instances
);
void
add_device_image_to_column_nhwc_3d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
GNDHWC
,
F32
,
F32
,
ImageToColumn
>>>&
instances
);
void
add_device_image_to_column_nhwc_3d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
GNDHWC
,
int8_t
,
int8_t
,
ImageToColumn
>>>&
instances
);
// Column to Image
// nhwc, 1d
void
add_device_column_to_image_nhwc_1d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
GNWC
,
BF16
,
BF16
,
ColumnToImage
>>>&
instances
);
void
add_device_column_to_image_nhwc_1d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
GNWC
,
F16
,
F16
,
ColumnToImage
>>>&
instances
);
void
add_device_column_to_image_nhwc_1d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
GNWC
,
F32
,
F32
,
ColumnToImage
>>>&
instances
);
void
add_device_column_to_image_nhwc_1d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
GNWC
,
int8_t
,
int8_t
,
ColumnToImage
>>>&
instances
);
// nhwc, 2d
void
add_device_column_to_image_nhwc_2d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
GNHWC
,
BF16
,
BF16
,
ColumnToImage
>>>&
instances
);
void
add_device_column_to_image_nhwc_2d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
GNHWC
,
F16
,
F16
,
ColumnToImage
>>>&
instances
);
void
add_device_column_to_image_nhwc_2d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
GNHWC
,
F32
,
F32
,
ColumnToImage
>>>&
instances
);
void
add_device_column_to_image_nhwc_2d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
GNHWC
,
int8_t
,
int8_t
,
ColumnToImage
>>>&
instances
);
// nhwc, 3d
void
add_device_column_to_image_nhwc_3d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
GNDHWC
,
BF16
,
BF16
,
ColumnToImage
>>>&
instances
);
void
add_device_column_to_image_nhwc_3d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
GNDHWC
,
F16
,
F16
,
ColumnToImage
>>>&
instances
);
void
add_device_column_to_image_nhwc_3d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
GNDHWC
,
F32
,
F32
,
ColumnToImage
>>>&
instances
);
void
add_device_column_to_image_nhwc_3d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
GNDHWC
,
int8_t
,
int8_t
,
ColumnToImage
>>>&
instances
);
template
<
ck
::
index_t
NumDimSpatial
,
typename
ImageLayout
,
typename
InDataType
,
typename
OutDataType
,
typename
ConvTensorRearrangeOp
>
struct
DeviceOperationInstanceFactory
<
ck
::
tensor_operation
::
device
::
DeviceConvTensorRearrange
<
NumDimSpatial
,
ImageLayout
,
InDataType
,
OutDataType
,
ConvTensorRearrangeOp
>>
{
using
DeviceOp
=
DeviceConvTensorRearrange
<
NumDimSpatial
,
ImageLayout
,
InDataType
,
OutDataType
,
ConvTensorRearrangeOp
>
;
static
auto
GetInstances
()
{
std
::
vector
<
std
::
unique_ptr
<
DeviceOp
>>
op_ptrs
;
if
constexpr
(
is_same_v
<
ConvTensorRearrangeOp
,
ImageToColumn
>
)
{
if
constexpr
(
NumDimSpatial
==
1
&&
is_same_v
<
ImageLayout
,
GNWC
>
)
{
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_image_to_column_nhwc_1d_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
add_device_image_to_column_nhwc_1d_f16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
add_device_image_to_column_nhwc_1d_bf16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_image_to_column_nhwc_1d_i8_instances
(
op_ptrs
);
}
}
else
if
constexpr
(
NumDimSpatial
==
2
&&
is_same_v
<
ImageLayout
,
GNHWC
>
)
{
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_image_to_column_nhwc_2d_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
add_device_image_to_column_nhwc_2d_f16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
add_device_image_to_column_nhwc_2d_bf16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_image_to_column_nhwc_2d_i8_instances
(
op_ptrs
);
}
}
else
if
constexpr
(
NumDimSpatial
==
3
&&
is_same_v
<
ImageLayout
,
GNDHWC
>
)
{
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_image_to_column_nhwc_3d_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
add_device_image_to_column_nhwc_3d_f16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
add_device_image_to_column_nhwc_3d_bf16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_image_to_column_nhwc_3d_i8_instances
(
op_ptrs
);
}
}
}
else
if
constexpr
(
is_same_v
<
ConvTensorRearrangeOp
,
ColumnToImage
>
)
{
if
constexpr
(
NumDimSpatial
==
1
&&
is_same_v
<
ImageLayout
,
GNWC
>
)
{
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_column_to_image_nhwc_1d_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
add_device_column_to_image_nhwc_1d_f16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
add_device_column_to_image_nhwc_1d_bf16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_column_to_image_nhwc_1d_i8_instances
(
op_ptrs
);
}
}
else
if
constexpr
(
NumDimSpatial
==
2
&&
is_same_v
<
ImageLayout
,
GNHWC
>
)
{
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_column_to_image_nhwc_2d_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
add_device_column_to_image_nhwc_2d_f16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
add_device_column_to_image_nhwc_2d_bf16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_column_to_image_nhwc_2d_i8_instances
(
op_ptrs
);
}
}
else
if
constexpr
(
NumDimSpatial
==
3
&&
is_same_v
<
ImageLayout
,
GNDHWC
>
)
{
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_column_to_image_nhwc_3d_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
add_device_column_to_image_nhwc_3d_f16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
add_device_column_to_image_nhwc_3d_bf16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_column_to_image_nhwc_3d_i8_instances
(
op_ptrs
);
}
}
}
return
op_ptrs
;
}
};
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp
0 → 100644
View file @
ad24acb6
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
namespace
ck
::
tensor_layout
::
convolution
;
using
namespace
ck
::
conv_tensor_rearrange_op
;
using
BF16
=
ck
::
bhalf_t
;
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
template
<
ck
::
index_t
NDimSpatial
,
typename
InLayout
>
using
device_column_to_image_bf16_instances
=
std
::
tuple
<
// clang-format off
//#####################| Num| InLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar| ConvTensor|
//#####################| Dim| | | | Size| Block| Block| Cluster| Per| RearrangeOp|
//#####################| Spatial| | | | | | | Lengths| Vector| |
//#####################| | | | | | | | | | |
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
64
,
8
,
8
,
S
<
8
,
8
>
,
1
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
64
,
16
,
16
,
S
<
8
,
8
>
,
1
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
64
,
32
,
32
,
S
<
8
,
8
>
,
4
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
64
,
64
,
64
,
S
<
8
,
8
>
,
8
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
128
,
16
,
16
,
S
<
8
,
16
>
,
1
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
128
,
64
,
64
,
S
<
8
,
16
>
,
1
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
128
,
32
,
64
,
S
<
8
,
16
>
,
4
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
128
,
64
,
128
,
S
<
8
,
16
>
,
8
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
256
,
16
,
16
,
S
<
16
,
16
>
,
1
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
256
,
64
,
64
,
S
<
16
,
16
>
,
1
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
256
,
128
,
128
,
S
<
16
,
16
>
,
1
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
256
,
64
,
64
,
S
<
16
,
16
>
,
4
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
256
,
128
,
128
,
S
<
16
,
16
>
,
4
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
256
,
128
,
128
,
S
<
16
,
16
>
,
8
,
ColumnToImage
>
// clang-format on
>
;
template
<
ck
::
index_t
NDimSpatial
,
typename
InLayout
>
using
device_column_to_image_f16_instances
=
std
::
tuple
<
// clang-format off
//#####################| Num| InLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar| ConvTensor|
//#####################| Dim| | | | Size| Block| Block| Cluster| Per| RearrangeOp|
//#####################| Spatial| | | | | | | Lengths| Vector| |
//#####################| | | | | | | | | | |
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
64
,
8
,
8
,
S
<
8
,
8
>
,
1
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
64
,
16
,
16
,
S
<
8
,
8
>
,
1
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
64
,
32
,
32
,
S
<
8
,
8
>
,
4
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
64
,
64
,
64
,
S
<
8
,
8
>
,
8
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
128
,
16
,
16
,
S
<
8
,
16
>
,
1
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
128
,
64
,
64
,
S
<
8
,
16
>
,
1
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
128
,
32
,
64
,
S
<
8
,
16
>
,
4
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
128
,
64
,
128
,
S
<
8
,
16
>
,
8
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
256
,
16
,
16
,
S
<
16
,
16
>
,
1
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
256
,
64
,
64
,
S
<
16
,
16
>
,
1
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
256
,
128
,
128
,
S
<
16
,
16
>
,
1
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
256
,
64
,
64
,
S
<
16
,
16
>
,
4
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
256
,
128
,
128
,
S
<
16
,
16
>
,
4
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
256
,
128
,
128
,
S
<
16
,
16
>
,
8
,
ColumnToImage
>
// clang-format on
>
;
template
<
ck
::
index_t
NDimSpatial
,
typename
InLayout
>
using
device_column_to_image_f32_instances
=
std
::
tuple
<
// clang-format off
//#####################| Num| InLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar| ConvTensor|
//#####################| Dim| | | | Size| Block| Block| Cluster| Per| RearrangeOp|
//#####################| Spatial| | | | | | | Lengths| Vector| |
//#####################| | | | | | | | | | |
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
64
,
8
,
8
,
S
<
8
,
8
>
,
1
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
64
,
16
,
16
,
S
<
8
,
8
>
,
1
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
64
,
32
,
32
,
S
<
8
,
8
>
,
4
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
128
,
16
,
16
,
S
<
8
,
16
>
,
1
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
128
,
64
,
64
,
S
<
8
,
16
>
,
1
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
128
,
32
,
64
,
S
<
8
,
16
>
,
4
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
256
,
16
,
16
,
S
<
16
,
16
>
,
1
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
256
,
64
,
64
,
S
<
16
,
16
>
,
1
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
256
,
128
,
128
,
S
<
16
,
16
>
,
1
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
256
,
64
,
64
,
S
<
16
,
16
>
,
4
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
256
,
128
,
128
,
S
<
16
,
16
>
,
4
,
ColumnToImage
>
// clang-format on
>
;
template
<
ck
::
index_t
NDimSpatial
,
typename
InLayout
>
using
device_column_to_image_i8_instances
=
std
::
tuple
<
// clang-format off
//#####################| Num| InLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar| ConvTensor|
//#####################| Dim| | | | Size| Block| Block| Cluster| Per| RearrangeOp|
//#####################| Spatial| | | | | | | Lengths| Vector| |
//#####################| | | | | | | | | | |
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
64
,
8
,
8
,
S
<
8
,
8
>
,
1
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
64
,
16
,
16
,
S
<
8
,
8
>
,
1
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
64
,
32
,
32
,
S
<
8
,
8
>
,
4
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
64
,
64
,
64
,
S
<
8
,
8
>
,
8
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
128
,
16
,
16
,
S
<
8
,
16
>
,
1
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
128
,
64
,
64
,
S
<
8
,
16
>
,
1
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
128
,
32
,
64
,
S
<
8
,
16
>
,
4
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
128
,
64
,
128
,
S
<
8
,
16
>
,
8
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
256
,
16
,
16
,
S
<
16
,
16
>
,
1
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
256
,
64
,
64
,
S
<
16
,
16
>
,
1
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
256
,
128
,
128
,
S
<
16
,
16
>
,
1
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
256
,
64
,
64
,
S
<
16
,
16
>
,
4
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
256
,
128
,
128
,
S
<
16
,
16
>
,
4
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
256
,
128
,
128
,
S
<
16
,
16
>
,
8
,
ColumnToImage
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
256
,
256
,
256
,
S
<
16
,
16
>
,
16
,
ColumnToImage
>
// clang-format on
>
;
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/include/ck/library/tensor_operation_instance/gpu/
image_to_column
/device_image_to_column_instance.hpp
→
library/include/ck/library/tensor_operation_instance/gpu/
conv_tensor_rearrange
/device_image_to_column_instance.hpp
View file @
ad24acb6
...
@@ -13,6 +13,7 @@ namespace device {
...
@@ -13,6 +13,7 @@ namespace device {
namespace
instance
{
namespace
instance
{
using
namespace
ck
::
tensor_layout
::
convolution
;
using
namespace
ck
::
tensor_layout
::
convolution
;
using
namespace
ck
::
conv_tensor_rearrange_op
;
using
BF16
=
ck
::
bhalf_t
;
using
BF16
=
ck
::
bhalf_t
;
using
F16
=
ck
::
half_t
;
using
F16
=
ck
::
half_t
;
...
@@ -24,94 +25,94 @@ using S = ck::Sequence<Is...>;
...
@@ -24,94 +25,94 @@ using S = ck::Sequence<Is...>;
template
<
ck
::
index_t
NDimSpatial
,
typename
InLayout
>
template
<
ck
::
index_t
NDimSpatial
,
typename
InLayout
>
using
device_image_to_column_bf16_instances
=
std
::
tuple
<
using
device_image_to_column_bf16_instances
=
std
::
tuple
<
// clang-format off
// clang-format off
//#####################| Num| InLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar|
//#####################| Num| InLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar|
ConvTensor|
//#####################| Dim| | | | Size| Block| Block| Cluster| Per|
//#####################| Dim| | | | Size| Block| Block| Cluster| Per|
RearrangeOp|
//#####################| Spatial| | | | | | | Lengths| Vector|
//#####################| Spatial| | | | | | | Lengths| Vector|
|
//#####################| | | | | | | | | |
//#####################| | | | | | | | | |
|
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
64
,
8
,
8
,
S
<
8
,
8
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
64
,
8
,
8
,
S
<
8
,
8
>
,
1
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
64
,
16
,
16
,
S
<
8
,
8
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
64
,
16
,
16
,
S
<
8
,
8
>
,
1
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
64
,
32
,
32
,
S
<
8
,
8
>
,
4
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
64
,
32
,
32
,
S
<
8
,
8
>
,
4
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
64
,
64
,
64
,
S
<
8
,
8
>
,
8
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
64
,
64
,
64
,
S
<
8
,
8
>
,
8
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
128
,
16
,
16
,
S
<
8
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
128
,
16
,
16
,
S
<
8
,
16
>
,
1
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
128
,
64
,
64
,
S
<
8
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
128
,
64
,
64
,
S
<
8
,
16
>
,
1
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
128
,
32
,
64
,
S
<
8
,
16
>
,
4
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
128
,
32
,
64
,
S
<
8
,
16
>
,
4
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
128
,
64
,
128
,
S
<
8
,
16
>
,
8
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
128
,
64
,
128
,
S
<
8
,
16
>
,
8
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
256
,
16
,
16
,
S
<
16
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
256
,
16
,
16
,
S
<
16
,
16
>
,
1
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
256
,
64
,
64
,
S
<
16
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
256
,
64
,
64
,
S
<
16
,
16
>
,
1
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
256
,
128
,
128
,
S
<
16
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
256
,
128
,
128
,
S
<
16
,
16
>
,
1
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
256
,
64
,
64
,
S
<
16
,
16
>
,
4
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
256
,
64
,
64
,
S
<
16
,
16
>
,
4
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
256
,
128
,
128
,
S
<
16
,
16
>
,
4
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
256
,
128
,
128
,
S
<
16
,
16
>
,
4
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
256
,
128
,
128
,
S
<
16
,
16
>
,
8
>
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
256
,
128
,
128
,
S
<
16
,
16
>
,
8
,
ImageToColumn
>
// clang-format on
// clang-format on
>
;
>
;
template
<
ck
::
index_t
NDimSpatial
,
typename
InLayout
>
template
<
ck
::
index_t
NDimSpatial
,
typename
InLayout
>
using
device_image_to_column_f16_instances
=
std
::
tuple
<
using
device_image_to_column_f16_instances
=
std
::
tuple
<
// clang-format off
// clang-format off
//#####################| Num| InLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar|
//#####################| Num| InLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar|
ConvTensor|
//#####################| Dim| | | | Size| Block| Block| Cluster| Per|
//#####################| Dim| | | | Size| Block| Block| Cluster| Per|
RearrangeOp|
//#####################| Spatial| | | | | | | Lengths| Vector|
//#####################| Spatial| | | | | | | Lengths| Vector|
|
//#####################| | | | | | | | | |
//#####################| | | | | | | | | |
|
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
64
,
8
,
8
,
S
<
8
,
8
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
64
,
8
,
8
,
S
<
8
,
8
>
,
1
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
64
,
16
,
16
,
S
<
8
,
8
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
64
,
16
,
16
,
S
<
8
,
8
>
,
1
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
64
,
32
,
32
,
S
<
8
,
8
>
,
4
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
64
,
32
,
32
,
S
<
8
,
8
>
,
4
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
64
,
64
,
64
,
S
<
8
,
8
>
,
8
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
64
,
64
,
64
,
S
<
8
,
8
>
,
8
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
128
,
16
,
16
,
S
<
8
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
128
,
16
,
16
,
S
<
8
,
16
>
,
1
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
128
,
64
,
64
,
S
<
8
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
128
,
64
,
64
,
S
<
8
,
16
>
,
1
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
128
,
32
,
64
,
S
<
8
,
16
>
,
4
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
128
,
32
,
64
,
S
<
8
,
16
>
,
4
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
128
,
64
,
128
,
S
<
8
,
16
>
,
8
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
128
,
64
,
128
,
S
<
8
,
16
>
,
8
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
256
,
16
,
16
,
S
<
16
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
256
,
16
,
16
,
S
<
16
,
16
>
,
1
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
256
,
64
,
64
,
S
<
16
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
256
,
64
,
64
,
S
<
16
,
16
>
,
1
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
256
,
128
,
128
,
S
<
16
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
256
,
128
,
128
,
S
<
16
,
16
>
,
1
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
256
,
64
,
64
,
S
<
16
,
16
>
,
4
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
256
,
64
,
64
,
S
<
16
,
16
>
,
4
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
256
,
128
,
128
,
S
<
16
,
16
>
,
4
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
256
,
128
,
128
,
S
<
16
,
16
>
,
4
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
256
,
128
,
128
,
S
<
16
,
16
>
,
8
>
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
256
,
128
,
128
,
S
<
16
,
16
>
,
8
,
ImageToColumn
>
// clang-format on
// clang-format on
>
;
>
;
template
<
ck
::
index_t
NDimSpatial
,
typename
InLayout
>
template
<
ck
::
index_t
NDimSpatial
,
typename
InLayout
>
using
device_image_to_column_f32_instances
=
std
::
tuple
<
using
device_image_to_column_f32_instances
=
std
::
tuple
<
// clang-format off
// clang-format off
//#####################| Num| InLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar|
//#####################| Num| InLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar|
ConvTensor|
//#####################| Dim| | | | Size| Block| Block| Cluster| Per|
//#####################| Dim| | | | Size| Block| Block| Cluster| Per|
RearrangeOp|
//#####################| Spatial| | | | | | | Lengths| Vector|
//#####################| Spatial| | | | | | | Lengths| Vector|
|
//#####################| | | | | | | | | |
//#####################| | | | | | | | | |
|
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
64
,
8
,
8
,
S
<
8
,
8
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
64
,
8
,
8
,
S
<
8
,
8
>
,
1
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
64
,
16
,
16
,
S
<
8
,
8
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
64
,
16
,
16
,
S
<
8
,
8
>
,
1
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
64
,
32
,
32
,
S
<
8
,
8
>
,
4
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
64
,
32
,
32
,
S
<
8
,
8
>
,
4
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
128
,
16
,
16
,
S
<
8
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
128
,
16
,
16
,
S
<
8
,
16
>
,
1
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
128
,
64
,
64
,
S
<
8
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
128
,
64
,
64
,
S
<
8
,
16
>
,
1
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
128
,
32
,
64
,
S
<
8
,
16
>
,
4
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
128
,
32
,
64
,
S
<
8
,
16
>
,
4
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
256
,
16
,
16
,
S
<
16
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
256
,
16
,
16
,
S
<
16
,
16
>
,
1
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
256
,
64
,
64
,
S
<
16
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
256
,
64
,
64
,
S
<
16
,
16
>
,
1
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
256
,
128
,
128
,
S
<
16
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
256
,
128
,
128
,
S
<
16
,
16
>
,
1
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
256
,
64
,
64
,
S
<
16
,
16
>
,
4
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
256
,
64
,
64
,
S
<
16
,
16
>
,
4
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
256
,
128
,
128
,
S
<
16
,
16
>
,
4
>
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
256
,
128
,
128
,
S
<
16
,
16
>
,
4
,
ImageToColumn
>
// clang-format on
// clang-format on
>
;
>
;
template
<
ck
::
index_t
NDimSpatial
,
typename
InLayout
>
template
<
ck
::
index_t
NDimSpatial
,
typename
InLayout
>
using
device_image_to_column_i8_instances
=
std
::
tuple
<
using
device_image_to_column_i8_instances
=
std
::
tuple
<
// clang-format off
// clang-format off
//#####################| Num| InLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar|
//#####################| Num| InLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar|
ConvTensor|
//#####################| Dim| | | | Size| Block| Block| Cluster| Per|
//#####################| Dim| | | | Size| Block| Block| Cluster| Per|
RearrangeOp|
//#####################| Spatial| | | | | | | Lengths| Vector|
//#####################| Spatial| | | | | | | Lengths| Vector|
|
//#####################| | | | | | | | | |
//#####################| | | | | | | | | |
|
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
64
,
8
,
8
,
S
<
8
,
8
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
64
,
8
,
8
,
S
<
8
,
8
>
,
1
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
64
,
16
,
16
,
S
<
8
,
8
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
64
,
16
,
16
,
S
<
8
,
8
>
,
1
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
64
,
32
,
32
,
S
<
8
,
8
>
,
4
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
64
,
32
,
32
,
S
<
8
,
8
>
,
4
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
64
,
64
,
64
,
S
<
8
,
8
>
,
8
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
64
,
64
,
64
,
S
<
8
,
8
>
,
8
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
128
,
16
,
16
,
S
<
8
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
128
,
16
,
16
,
S
<
8
,
16
>
,
1
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
128
,
64
,
64
,
S
<
8
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
128
,
64
,
64
,
S
<
8
,
16
>
,
1
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
128
,
32
,
64
,
S
<
8
,
16
>
,
4
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
128
,
32
,
64
,
S
<
8
,
16
>
,
4
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
128
,
64
,
128
,
S
<
8
,
16
>
,
8
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
128
,
64
,
128
,
S
<
8
,
16
>
,
8
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
256
,
16
,
16
,
S
<
16
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
256
,
16
,
16
,
S
<
16
,
16
>
,
1
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
256
,
64
,
64
,
S
<
16
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
256
,
64
,
64
,
S
<
16
,
16
>
,
1
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
256
,
128
,
128
,
S
<
16
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
256
,
128
,
128
,
S
<
16
,
16
>
,
1
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
256
,
64
,
64
,
S
<
16
,
16
>
,
4
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
256
,
64
,
64
,
S
<
16
,
16
>
,
4
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
256
,
128
,
128
,
S
<
16
,
16
>
,
4
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
256
,
128
,
128
,
S
<
16
,
16
>
,
4
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
256
,
128
,
128
,
S
<
16
,
16
>
,
8
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
256
,
128
,
128
,
S
<
16
,
16
>
,
8
,
ImageToColumn
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
256
,
256
,
256
,
S
<
16
,
16
>
,
16
>
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
256
,
256
,
256
,
S
<
16
,
16
>
,
16
,
ImageToColumn
>
// clang-format on
// clang-format on
>
;
>
;
...
...
library/include/ck/library/tensor_operation_instance/gpu/image_to_column.hpp
deleted
100644 → 0
View file @
bba085d2
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include <memory>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_image_to_column.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
// nhwc, 1d
void
add_device_image_to_column_nhwc_1d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceImageToColumn
<
1
,
GNWC
,
BF16
,
BF16
>>>&
instances
);
void
add_device_image_to_column_nhwc_1d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceImageToColumn
<
1
,
GNWC
,
F16
,
F16
>>>&
instances
);
void
add_device_image_to_column_nhwc_1d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceImageToColumn
<
1
,
GNWC
,
F32
,
F32
>>>&
instances
);
void
add_device_image_to_column_nhwc_1d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceImageToColumn
<
1
,
GNWC
,
int8_t
,
int8_t
>>>&
instances
);
// nhwc, 2d
void
add_device_image_to_column_nhwc_2d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceImageToColumn
<
2
,
GNHWC
,
BF16
,
BF16
>>>&
instances
);
void
add_device_image_to_column_nhwc_2d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceImageToColumn
<
2
,
GNHWC
,
F16
,
F16
>>>&
instances
);
void
add_device_image_to_column_nhwc_2d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceImageToColumn
<
2
,
GNHWC
,
F32
,
F32
>>>&
instances
);
void
add_device_image_to_column_nhwc_2d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceImageToColumn
<
2
,
GNHWC
,
int8_t
,
int8_t
>>>&
instances
);
// nhwc, 3d
void
add_device_image_to_column_nhwc_3d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceImageToColumn
<
3
,
GNDHWC
,
BF16
,
BF16
>>>&
instances
);
void
add_device_image_to_column_nhwc_3d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceImageToColumn
<
3
,
GNDHWC
,
F16
,
F16
>>>&
instances
);
void
add_device_image_to_column_nhwc_3d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceImageToColumn
<
3
,
GNDHWC
,
F32
,
F32
>>>&
instances
);
void
add_device_image_to_column_nhwc_3d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceImageToColumn
<
3
,
GNDHWC
,
int8_t
,
int8_t
>>>&
instances
);
template
<
ck
::
index_t
NumDimSpatial
,
typename
InLayout
,
typename
InDataType
,
typename
OutDataType
>
struct
DeviceOperationInstanceFactory
<
ck
::
tensor_operation
::
device
::
DeviceImageToColumn
<
NumDimSpatial
,
InLayout
,
InDataType
,
OutDataType
>>
{
using
DeviceOp
=
DeviceImageToColumn
<
NumDimSpatial
,
InLayout
,
InDataType
,
OutDataType
>
;
static
auto
GetInstances
()
{
std
::
vector
<
std
::
unique_ptr
<
DeviceOp
>>
op_ptrs
;
if
constexpr
(
NumDimSpatial
==
1
&&
is_same_v
<
InLayout
,
GNWC
>
)
{
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_image_to_column_nhwc_1d_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
add_device_image_to_column_nhwc_1d_f16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
add_device_image_to_column_nhwc_1d_bf16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_image_to_column_nhwc_1d_i8_instances
(
op_ptrs
);
}
}
else
if
constexpr
(
NumDimSpatial
==
2
&&
is_same_v
<
InLayout
,
GNHWC
>
)
{
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_image_to_column_nhwc_2d_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
add_device_image_to_column_nhwc_2d_f16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
add_device_image_to_column_nhwc_2d_bf16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_image_to_column_nhwc_2d_i8_instances
(
op_ptrs
);
}
}
else
if
constexpr
(
NumDimSpatial
==
3
&&
is_same_v
<
InLayout
,
GNDHWC
>
)
{
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_image_to_column_nhwc_3d_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
add_device_image_to_column_nhwc_3d_f16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
add_device_image_to_column_nhwc_3d_bf16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_image_to_column_nhwc_3d_i8_instances
(
op_ptrs
);
}
}
return
op_ptrs
;
}
};
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment