Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
0f84c0c7
Unverified
Commit
0f84c0c7
authored
Sep 05, 2023
by
zjing14
Committed by
GitHub
Sep 05, 2023
Browse files
Merge branch 'develop' into fp8_gemm_generic_instance
parents
04058bf6
0077eeb3
Changes
32
Hide whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
853 additions
and
0 deletions
+853
-0
library/src/tensor_operation_instance/gpu/image_to_column/CMakeLists.txt
...sor_operation_instance/gpu/image_to_column/CMakeLists.txt
+5
-0
library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_1d_instance.cpp
...age_to_column/device_image_to_column_nhwc_1d_instance.cpp
+39
-0
library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_2d_instance.cpp
...age_to_column/device_image_to_column_nhwc_2d_instance.cpp
+39
-0
library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_3d_instance.cpp
...age_to_column/device_image_to_column_nhwc_3d_instance.cpp
+39
-0
profiler/README.md
profiler/README.md
+38
-0
profiler/include/profiler/profile_image_to_column_impl.hpp
profiler/include/profiler/profile_image_to_column_impl.hpp
+200
-0
profiler/src/CMakeLists.txt
profiler/src/CMakeLists.txt
+2
-0
profiler/src/profile_image_to_column.cpp
profiler/src/profile_image_to_column.cpp
+169
-0
test/CMakeLists.txt
test/CMakeLists.txt
+1
-0
test/image_to_column/CMakeLists.txt
test/image_to_column/CMakeLists.txt
+4
-0
test/image_to_column/test_image_to_column.cpp
test/image_to_column/test_image_to_column.cpp
+121
-0
test/image_to_column/test_image_to_column_interface.cpp
test/image_to_column/test_image_to_column_interface.cpp
+196
-0
No files found.
library/src/tensor_operation_instance/gpu/image_to_column/CMakeLists.txt
0 → 100644
View file @
0f84c0c7
add_instance_library
(
device_image_to_column_instance
device_image_to_column_nhwc_1d_instance.cpp
device_image_to_column_nhwc_2d_instance.cpp
device_image_to_column_nhwc_3d_instance.cpp
)
library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_1d_instance.cpp
0 → 100644
View file @
0f84c0c7
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/image_to_column/device_image_to_column_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
void
add_device_image_to_column_nhwc_1d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceImageToColumn
<
1
,
GNWC
,
BF16
,
BF16
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_image_to_column_bf16_instances
<
1
,
GNWC
>
{});
}
void
add_device_image_to_column_nhwc_1d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceImageToColumn
<
1
,
GNWC
,
F16
,
F16
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_image_to_column_f16_instances
<
1
,
GNWC
>
{});
}
void
add_device_image_to_column_nhwc_1d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceImageToColumn
<
1
,
GNWC
,
F32
,
F32
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_image_to_column_f32_instances
<
1
,
GNWC
>
{});
}
void
add_device_image_to_column_nhwc_1d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceImageToColumn
<
1
,
GNWC
,
int8_t
,
int8_t
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_image_to_column_i8_instances
<
1
,
GNWC
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_2d_instance.cpp
0 → 100644
View file @
0f84c0c7
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/image_to_column/device_image_to_column_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
void
add_device_image_to_column_nhwc_2d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceImageToColumn
<
2
,
GNHWC
,
BF16
,
BF16
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_image_to_column_bf16_instances
<
2
,
GNHWC
>
{});
}
void
add_device_image_to_column_nhwc_2d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceImageToColumn
<
2
,
GNHWC
,
F16
,
F16
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_image_to_column_f16_instances
<
2
,
GNHWC
>
{});
}
void
add_device_image_to_column_nhwc_2d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceImageToColumn
<
2
,
GNHWC
,
F32
,
F32
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_image_to_column_f32_instances
<
2
,
GNHWC
>
{});
}
void
add_device_image_to_column_nhwc_2d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceImageToColumn
<
2
,
GNHWC
,
int8_t
,
int8_t
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_image_to_column_i8_instances
<
2
,
GNHWC
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_3d_instance.cpp
0 → 100644
View file @
0f84c0c7
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/image_to_column/device_image_to_column_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
void
add_device_image_to_column_nhwc_3d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceImageToColumn
<
3
,
GNDHWC
,
BF16
,
BF16
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_image_to_column_bf16_instances
<
3
,
GNDHWC
>
{});
}
void
add_device_image_to_column_nhwc_3d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceImageToColumn
<
3
,
GNDHWC
,
F16
,
F16
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_image_to_column_f16_instances
<
3
,
GNDHWC
>
{});
}
void
add_device_image_to_column_nhwc_3d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceImageToColumn
<
3
,
GNDHWC
,
F32
,
F32
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_image_to_column_f32_instances
<
3
,
GNDHWC
>
{});
}
void
add_device_image_to_column_nhwc_3d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceImageToColumn
<
3
,
GNDHWC
,
int8_t
,
int8_t
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_image_to_column_i8_instances
<
3
,
GNDHWC
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
profiler/README.md
View file @
0f84c0c7
...
...
@@ -184,3 +184,41 @@ tflops: 95.337
GB/s: 69.2301
```
Note: This kernel use atomic add, this will cause output buffer to be accumulated multiple times, causing verification failure. To work around it, do not use CK's own timer and do verification at the same time.
## Profile image to column kernels
```
bash
# arg1: tensor operation (" OP_NAME ": " OP_DESC ")
# arg2: data type (0: Input fp32, Weight fp32, Output fp32
# 1: Input fp16, Weight fp16, Output fp16
# 2: Input bf16, Weight bf16, Output bf16
# 3: Input int8, Weight int8, Output int8)
# arg3: tensor layout (0: Input[N, Hi, Wi, C], Output[N * Ho * Wo, Y * X * C])
# arg4: verification (0: no, 1: yes)
# arg5: initialization (0: no init, 1: integer value, 2: decimal value)
# arg6: print tensor value (0: no; 1: yes)
# arg7: time kernel (0: no, 1: yes)
# Following arguments (depending on number of spatial dims):
# Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)
# G, N, K, C,
# <filter spatial dimensions>, (ie Y, X for 2D)
# <input image spatial dimensions>, (ie Hi, Wi for 2D)
# <strides>, (ie Sy, Sx for 2D)
# <dilations>, (ie Dy, Dx for 2D)
# <left padding>, (ie LeftPy, LeftPx for 2D)
# <right padding>, (ie RightPy, RightPx for 2D)
################ op datatype layout verify init log time Ndims G N K C Y X Hi Wi Sy Sx Dy Dx LeftPy LeftPx RightPy RightPx
./bin/ckProfiler image_to_column 0 0 1 1 0 1 2 1 256 1 512 3 3 28 28 1 1 1 1 0 0 0 0
```
Result
(
MI210, FP32, NHWC
)
```
input: dim 5, lengths {1, 256, 512, 28, 28}, strides {102760448, 401408, 1, 14336, 512}
output: dim 2, lengths {173056, 4608}, strides {4608, 1}
....
Best configuration parameters:
name: DeviceImageToColumn
<
128,
32,
64,
4
>
avg_time: 3.12326
GB/s: 2042.59
```
profiler/include/profiler/profile_image_to_column_impl.hpp
0 → 100644
View file @
0f84c0c7
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include <iostream>
#include <typeinfo>
#include <limits>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_image_to_column.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp"
#include "ck/library/tensor_operation_instance/gpu/image_to_column.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp"
namespace
ck
{
namespace
profiler
{
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
template
<
index_t
NDimSpatial
,
typename
InputLayout
,
typename
InputDataType
,
typename
OutputDataType
>
bool
profile_image_to_column_impl
(
int
do_verification
,
int
init_method
,
bool
do_log
,
bool
time_kernel
,
const
ck
::
utils
::
conv
::
ConvParam
&
conv_param
)
{
const
ck
::
index_t
NDoHoWo
=
conv_param
.
N_
*
ck
::
accumulate_n
<
ck
::
index_t
>
(
conv_param
.
output_spatial_lengths_
.
begin
(),
NDimSpatial
,
1
,
std
::
multiplies
<>
());
const
ck
::
index_t
CZYX
=
conv_param
.
C_
*
ck
::
accumulate_n
<
ck
::
index_t
>
(
conv_param
.
filter_spatial_lengths_
.
begin
(),
NDimSpatial
,
1
,
std
::
multiplies
<>
());
const
auto
in_desc
=
ck
::
utils
::
conv
::
make_input_host_tensor_descriptor_g_n_c_wis_packed
<
InputLayout
>
(
conv_param
);
const
auto
out_desc
=
HostTensorDescriptor
({
NDoHoWo
,
CZYX
});
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_spatial_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
filter_spatial_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
output_spatial_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
input_g_n_c_wis_strides
{};
std
::
array
<
ck
::
index_t
,
2
>
output_m_k_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
conv_filter_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
conv_filter_dilations
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_left_pads
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_right_pads
{};
auto
copy
=
[](
const
auto
&
x
,
auto
&
y
)
{
std
::
copy
(
x
.
begin
(),
x
.
end
(),
y
.
begin
());
};
copy
(
conv_param
.
input_spatial_lengths_
,
input_spatial_lengths
);
copy
(
conv_param
.
filter_spatial_lengths_
,
filter_spatial_lengths
);
copy
(
conv_param
.
output_spatial_lengths_
,
output_spatial_lengths
);
copy
(
in_desc
.
GetStrides
(),
input_g_n_c_wis_strides
);
copy
(
out_desc
.
GetStrides
(),
output_m_k_strides
);
copy
(
conv_param
.
conv_filter_strides_
,
conv_filter_strides
);
copy
(
conv_param
.
conv_filter_dilations_
,
conv_filter_dilations
);
copy
(
conv_param
.
input_left_pads_
,
input_left_pads
);
copy
(
conv_param
.
input_right_pads_
,
input_right_pads
);
Tensor
<
InputDataType
>
input
(
in_desc
);
Tensor
<
OutputDataType
>
host_output
(
out_desc
);
Tensor
<
OutputDataType
>
device_output
(
out_desc
);
std
::
cout
<<
"input: "
<<
input
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"output: "
<<
host_output
.
mDesc
<<
std
::
endl
;
switch
(
init_method
)
{
case
0
:
break
;
case
1
:
input
.
GenerateTensorValue
(
GeneratorTensor_2
<
InputDataType
>
{
-
5
,
5
});
break
;
default:
input
.
GenerateTensorValue
(
GeneratorTensor_3
<
InputDataType
>
{
0.0
,
1.0
});
}
DeviceMem
in_device_buf
(
sizeof
(
InputDataType
)
*
input
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
out_device_buf
(
sizeof
(
OutputDataType
)
*
device_output
.
mDesc
.
GetElementSpaceSize
());
in_device_buf
.
ToDevice
(
input
.
mData
.
data
());
// run reference op
if
(
do_verification
)
{
auto
ref_image_to_column
=
ck
::
tensor_operation
::
host
::
ReferenceImageToColumn
<
NDimSpatial
,
InputLayout
,
InputDataType
,
OutputDataType
>
{};
auto
ref_invoker
=
ref_image_to_column
.
MakeInvoker
();
auto
ref_argument
=
ref_image_to_column
.
MakeArgument
(
input
,
host_output
,
conv_param
.
filter_spatial_lengths_
,
conv_param
.
conv_filter_strides_
,
conv_param
.
conv_filter_dilations_
,
conv_param
.
input_left_pads_
,
conv_param
.
input_right_pads_
);
// init host output to zero
host_output
.
SetZero
();
ref_invoker
.
Run
(
ref_argument
);
}
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceImageToColumn
<
NDimSpatial
,
InputLayout
,
InputDataType
,
OutputDataType
>
;
// get device op instances
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
DeviceOp
>::
GetInstances
();
std
::
cout
<<
"found "
<<
op_ptrs
.
size
()
<<
" instances"
<<
std
::
endl
;
std
::
string
best_op_name
;
float
best_avg_time
=
std
::
numeric_limits
<
float
>::
max
();
float
best_gb_per_sec
=
0
;
// profile device op instances
bool
pass
=
true
;
bool
is_supporting_instance
=
false
;
for
(
auto
&
op_ptr
:
op_ptrs
)
{
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
static_cast
<
InputDataType
*>
(
in_device_buf
.
GetDeviceBuffer
()),
static_cast
<
OutputDataType
*>
(
out_device_buf
.
GetDeviceBuffer
()),
conv_param
.
N_
,
conv_param
.
C_
,
input_spatial_lengths
,
filter_spatial_lengths
,
output_spatial_lengths
,
input_g_n_c_wis_strides
,
output_m_k_strides
,
conv_filter_strides
,
conv_filter_dilations
,
input_left_pads
,
input_right_pads
);
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
is_supporting_instance
=
true
;
// re-init output to zero before profiling next kernel
out_device_buf
.
SetZero
();
std
::
string
op_name
=
op_ptr
->
GetTypeString
();
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
float
avg_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
std
::
size_t
num_btype
=
NDoHoWo
*
CZYX
*
(
sizeof
(
OutputDataType
)
+
sizeof
(
InputDataType
));
float
gb_per_sec
=
num_btype
/
1.E6
/
avg_time
;
std
::
cout
<<
"Perf: "
<<
std
::
setw
(
10
)
<<
avg_time
<<
" ms, "
<<
gb_per_sec
<<
" GB/s, "
<<
op_name
<<
std
::
endl
;
if
(
avg_time
<
best_avg_time
)
{
best_op_name
=
op_name
;
best_avg_time
=
avg_time
;
best_gb_per_sec
=
gb_per_sec
;
}
if
(
do_verification
)
{
out_device_buf
.
FromDevice
(
device_output
.
mData
.
data
());
pass
=
pass
&
ck
::
utils
::
check_err
(
device_output
,
host_output
);
if
(
do_log
)
{
LogRangeAsType
<
float
>
(
std
::
cout
<<
"input : "
,
input
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"host_output : "
,
host_output
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"device_output: "
,
device_output
.
mData
,
","
)
<<
std
::
endl
;
}
}
}
else
{
std
::
cout
<<
op_ptr
->
GetTypeString
()
<<
" does not support this problem"
<<
std
::
endl
;
}
}
std
::
cout
<<
"Best configuration parameters:"
<<
"
\n
name: "
<<
best_op_name
<<
"
\n
avg_time: "
<<
best_avg_time
<<
"
\n
GB/s: "
<<
best_gb_per_sec
<<
std
::
endl
;
return
is_supporting_instance
&&
pass
;
}
}
// namespace profiler
}
// namespace ck
profiler/src/CMakeLists.txt
View file @
0f84c0c7
...
...
@@ -28,6 +28,7 @@ set(PROFILER_SOURCES
profile_contraction_bilinear.cpp
profile_contraction_scale.cpp
profile_grouped_conv_bwd_data.cpp
profile_image_to_column.cpp
)
if
(
DL_KERNELS
)
list
(
APPEND PROFILER_SOURCES profile_batched_gemm_multi_d.cpp
)
...
...
@@ -82,6 +83,7 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_avg_pool3d_bwd_insta
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_max_pool_bwd_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_conv2d_bwd_data_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_conv3d_bwd_data_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_image_to_column_instance
)
if
(
DL_KERNELS
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_batched_gemm_multi_d_instance
)
endif
()
...
...
profiler/src/profile_image_to_column.cpp
0 → 100644
View file @
0f84c0c7
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/profile_image_to_column_impl.hpp"
#include "profiler_operation_registry.hpp"
namespace
{
enum
struct
ConvLayout
{
NHWC
,
// 0
};
enum
struct
DataType
{
F32_F32
,
// 0
F16_F16
,
// 1
BF16_BF16
,
// 2
INT8_INT8
,
// 3
};
#define OP_NAME "image_to_column"
#define OP_DESC "Image To Column"
static
void
print_helper_msg
()
{
std
::
cout
// clang-format off
<<
"arg1: tensor operation ("
OP_NAME
": "
OP_DESC
")
\n
"
<<
"arg2: data type (0: Input fp32, Weight fp32, Output fp32
\n
"
<<
" 1: Input fp16, Weight fp16, Output fp16
\n
"
<<
" 2: Input bf16, Weight bf16, Output bf16
\n
"
<<
" 3: Input int8, Weight int8, Output int8)
\n
"
<<
"arg3: tensor layout (0: Input[N, Hi, Wi, C], Output[N * Ho * Wo, Y * X * C])
\n
"
<<
"arg4: verification (0: no, 1: yes)
\n
"
<<
"arg5: initialization (0: no init, 1: integer value, 2: decimal value)
\n
"
<<
"arg6: print tensor value (0: no; 1: yes)
\n
"
<<
"arg7: time kernel (0: no, 1: yes)
\n
"
<<
ck
::
utils
::
conv
::
get_conv_param_parser_helper_msg
()
<<
std
::
endl
;
// clang-format on
}
}
// namespace
int
profile_image_to_column
(
int
argc
,
char
*
argv
[])
{
// 8 for control, 1 for num_dim_spatial
if
(
argc
<
9
)
{
print_helper_msg
();
return
1
;
}
const
auto
data_type
=
static_cast
<
DataType
>
(
std
::
stoi
(
argv
[
2
]));
const
auto
layout
=
static_cast
<
ConvLayout
>
(
std
::
stoi
(
argv
[
3
]));
const
bool
do_verification
=
std
::
stoi
(
argv
[
4
]);
const
int
init_method
=
std
::
stoi
(
argv
[
5
]);
const
bool
do_log
=
std
::
stoi
(
argv
[
6
]);
const
bool
time_kernel
=
std
::
stoi
(
argv
[
7
]);
const
int
num_dim_spatial
=
std
::
stoi
(
argv
[
8
]);
// 8 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial
if
(
argc
!=
8
+
1
+
4
+
6
*
num_dim_spatial
)
{
print_helper_msg
();
return
1
;
}
const
auto
params
=
ck
::
utils
::
conv
::
parse_conv_param
(
num_dim_spatial
,
9
,
argv
);
using
F32
=
float
;
using
F16
=
ck
::
half_t
;
using
BF16
=
ck
::
bhalf_t
;
using
INT8
=
int8_t
;
using
namespace
ck
::
tensor_layout
::
convolution
;
constexpr
auto
I1
=
ck
::
Number
<
1
>
{};
constexpr
auto
I2
=
ck
::
Number
<
2
>
{};
constexpr
auto
I3
=
ck
::
Number
<
3
>
{};
auto
profile
=
[
&
](
auto
num_dim_spatial_tmp
,
auto
in_layout
,
auto
in_type
,
auto
out_type
)
{
constexpr
ck
::
index_t
NDimSpatial
=
num_dim_spatial_tmp
.
value
;
using
InLayout
=
decltype
(
in_layout
);
using
InDataType
=
decltype
(
in_type
);
using
OutDataType
=
decltype
(
out_type
);
bool
pass
=
ck
::
profiler
::
profile_image_to_column_impl
<
NDimSpatial
,
InLayout
,
InDataType
,
OutDataType
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
params
);
return
pass
?
0
:
1
;
};
// NHWC
if
(
layout
==
ConvLayout
::
NHWC
)
{
if
(
num_dim_spatial
==
1
)
{
if
(
data_type
==
DataType
::
F32_F32
)
{
return
profile
(
I1
,
GNWC
{},
F32
{},
F32
{});
}
else
if
(
data_type
==
DataType
::
F16_F16
)
{
return
profile
(
I1
,
GNWC
{},
F16
{},
F16
{});
}
else
if
(
data_type
==
DataType
::
BF16_BF16
)
{
return
profile
(
I1
,
GNWC
{},
BF16
{},
BF16
{});
}
else
if
(
data_type
==
DataType
::
INT8_INT8
)
{
return
profile
(
I1
,
GNWC
{},
INT8
{},
INT8
{});
}
}
else
if
(
num_dim_spatial
==
2
)
{
if
(
data_type
==
DataType
::
F32_F32
)
{
return
profile
(
I2
,
GNHWC
{},
F32
{},
F32
{});
}
else
if
(
data_type
==
DataType
::
F16_F16
)
{
return
profile
(
I2
,
GNHWC
{},
F16
{},
F16
{});
}
else
if
(
data_type
==
DataType
::
BF16_BF16
)
{
return
profile
(
I2
,
GNHWC
{},
BF16
{},
BF16
{});
}
else
if
(
data_type
==
DataType
::
INT8_INT8
)
{
return
profile
(
I2
,
GNHWC
{},
INT8
{},
INT8
{});
}
}
else
if
(
num_dim_spatial
==
3
)
{
if
(
data_type
==
DataType
::
F32_F32
)
{
return
profile
(
I3
,
GNDHWC
{},
F32
{},
F32
{});
}
else
if
(
data_type
==
DataType
::
F16_F16
)
{
return
profile
(
I3
,
GNDHWC
{},
F16
{},
F16
{});
}
else
if
(
data_type
==
DataType
::
BF16_BF16
)
{
return
profile
(
I3
,
GNDHWC
{},
BF16
{},
BF16
{});
}
else
if
(
data_type
==
DataType
::
INT8_INT8
)
{
return
profile
(
I3
,
GNDHWC
{},
INT8
{},
INT8
{});
}
}
}
std
::
cout
<<
"this data_type & layout is not implemented"
<<
std
::
endl
;
return
1
;
}
REGISTER_PROFILER_OPERATION
(
OP_NAME
,
OP_DESC
,
profile_image_to_column
);
test/CMakeLists.txt
View file @
0f84c0c7
...
...
@@ -60,6 +60,7 @@ add_subdirectory(contraction)
add_subdirectory
(
pool
)
add_subdirectory
(
batched_gemm_multi_d
)
add_subdirectory
(
grouped_convnd_bwd_data
)
add_subdirectory
(
image_to_column
)
if
(
GPU_TARGETS MATCHES
"gfx11"
)
add_subdirectory
(
wmma_op
)
endif
()
test/image_to_column/CMakeLists.txt
0 → 100644
View file @
0f84c0c7
add_gtest_executable
(
test_image_to_column test_image_to_column.cpp
)
target_link_libraries
(
test_image_to_column PRIVATE utility device_image_to_column_instance
)
add_gtest_executable
(
test_image_to_column_interface test_image_to_column_interface.cpp
)
target_link_libraries
(
test_image_to_column_interface PRIVATE utility
)
test/image_to_column/test_image_to_column.cpp
0 → 100644
View file @
0f84c0c7
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <tuple>
#include <vector>
#include <gtest/gtest.h>
#include "profiler/profile_image_to_column_impl.hpp"
template
<
typename
Tuple
>
class
TestImageToColumn
:
public
::
testing
::
Test
{
protected:
using
InDataType
=
std
::
tuple_element_t
<
0
,
Tuple
>
;
using
OutDataType
=
std
::
tuple_element_t
<
1
,
Tuple
>
;
using
InLayout
=
std
::
tuple_element_t
<
2
,
Tuple
>
;
std
::
vector
<
ck
::
utils
::
conv
::
ConvParam
>
conv_params
;
template
<
ck
::
index_t
NDimSpatial
>
void
Run
()
{
EXPECT_FALSE
(
conv_params
.
empty
());
bool
pass
=
true
;
for
(
auto
&
param
:
conv_params
)
{
pass
=
pass
&&
ck
::
profiler
::
profile_image_to_column_impl
<
NDimSpatial
,
InLayout
,
InDataType
,
OutDataType
>
(
true
,
// do_verification
1
,
// init_method: integer value
false
,
// do_log
false
,
// time_kernel
param
);
}
EXPECT_TRUE
(
pass
);
}
};
using
namespace
ck
::
tensor_layout
::
convolution
;
using
KernelTypes1d
=
::
testing
::
Types
<
std
::
tuple
<
float
,
float
,
GNWC
>
,
std
::
tuple
<
ck
::
bhalf_t
,
ck
::
bhalf_t
,
GNWC
>
,
std
::
tuple
<
ck
::
half_t
,
ck
::
half_t
,
GNWC
>
,
std
::
tuple
<
int8_t
,
int8_t
,
GNWC
>>
;
using
KernelTypes2d
=
::
testing
::
Types
<
std
::
tuple
<
float
,
float
,
GNHWC
>
,
std
::
tuple
<
ck
::
bhalf_t
,
ck
::
bhalf_t
,
GNHWC
>
,
std
::
tuple
<
ck
::
half_t
,
ck
::
half_t
,
GNHWC
>
,
std
::
tuple
<
int8_t
,
int8_t
,
GNHWC
>>
;
using
KernelTypes3d
=
::
testing
::
Types
<
std
::
tuple
<
float
,
float
,
GNDHWC
>
,
std
::
tuple
<
ck
::
bhalf_t
,
ck
::
bhalf_t
,
GNDHWC
>
,
std
::
tuple
<
ck
::
half_t
,
ck
::
half_t
,
GNDHWC
>
,
std
::
tuple
<
int8_t
,
int8_t
,
GNDHWC
>>
;
template
<
typename
Tuple
>
class
TestImageToColumn1d
:
public
TestImageToColumn
<
Tuple
>
{
};
template
<
typename
Tuple
>
class
TestImageToColumn2d
:
public
TestImageToColumn
<
Tuple
>
{
};
template
<
typename
Tuple
>
class
TestImageToColumn3d
:
public
TestImageToColumn
<
Tuple
>
{
};
TYPED_TEST_SUITE
(
TestImageToColumn1d
,
KernelTypes1d
);
TYPED_TEST_SUITE
(
TestImageToColumn2d
,
KernelTypes2d
);
TYPED_TEST_SUITE
(
TestImageToColumn3d
,
KernelTypes3d
);
TYPED_TEST
(
TestImageToColumn1d
,
Test1D
)
{
this
->
conv_params
.
clear
();
this
->
conv_params
.
push_back
({
1
,
1
,
4
,
1
,
192
,
{
3
},
{
28
},
{
1
},
{
1
},
{
1
},
{
1
}});
this
->
conv_params
.
push_back
({
1
,
1
,
64
,
1
,
64
,
{
3
},
{
14
},
{
1
},
{
1
},
{
1
},
{
1
}});
this
->
conv_params
.
push_back
({
1
,
1
,
64
,
1
,
64
,
{
1
},
{
7
},
{
2
},
{
1
},
{
0
},
{
0
}});
this
->
conv_params
.
push_back
({
1
,
1
,
64
,
1
,
64
,
{
1
},
{
3
},
{
1
},
{
1
},
{
0
},
{
0
}});
// ScalarPerVector should be 1
this
->
conv_params
.
push_back
({
1
,
1
,
4
,
1
,
1
,
{
3
},
{
28
},
{
1
},
{
1
},
{
1
},
{
1
}});
// stride != 1
this
->
conv_params
.
push_back
({
1
,
1
,
1
,
1
,
4
,
{
3
},
{
28
},
{
2
},
{
1
},
{
1
},
{
1
}});
// dilation != 1
this
->
conv_params
.
push_back
({
1
,
1
,
1
,
1
,
4
,
{
3
},
{
28
},
{
1
},
{
2
},
{
1
},
{
1
}});
this
->
template
Run
<
1
>();
}
TYPED_TEST
(
TestImageToColumn2d
,
Test2D
)
{
this
->
conv_params
.
clear
();
this
->
conv_params
.
push_back
(
{
2
,
1
,
4
,
1
,
192
,
{
3
,
3
},
{
28
,
28
},
{
1
,
1
},
{
1
,
1
},
{
1
,
1
},
{
1
,
1
}});
this
->
conv_params
.
push_back
(
{
2
,
1
,
64
,
1
,
64
,
{
3
,
3
},
{
14
,
14
},
{
1
,
1
},
{
1
,
1
},
{
1
,
1
},
{
1
,
1
}});
this
->
conv_params
.
push_back
({
2
,
1
,
64
,
1
,
64
,
{
1
,
1
},
{
7
,
7
},
{
2
,
2
},
{
1
,
1
},
{
0
,
0
},
{
0
,
0
}});
this
->
conv_params
.
push_back
({
2
,
1
,
64
,
1
,
64
,
{
1
,
1
},
{
3
,
3
},
{
1
,
1
},
{
1
,
1
},
{
0
,
0
},
{
0
,
0
}});
this
->
template
Run
<
2
>();
}
TYPED_TEST
(
TestImageToColumn3d
,
Test3D
)
{
this
->
conv_params
.
clear
();
this
->
conv_params
.
push_back
(
{
3
,
1
,
16
,
1
,
64
,
{
1
,
1
,
1
},
{
7
,
7
,
7
},
{
2
,
2
,
2
},
{
1
,
1
,
1
},
{
0
,
0
,
0
},
{
0
,
0
,
0
}});
this
->
conv_params
.
push_back
(
{
3
,
1
,
2
,
1
,
64
,
{
3
,
3
,
3
},
{
14
,
14
,
3
},
{
1
,
1
,
1
},
{
1
,
1
,
1
},
{
1
,
1
,
1
},
{
1
,
1
,
1
}});
this
->
conv_params
.
push_back
(
{
3
,
1
,
32
,
1
,
64
,
{
1
,
1
,
1
},
{
3
,
3
,
3
},
{
1
,
1
,
1
},
{
1
,
1
,
1
},
{
0
,
0
,
0
},
{
0
,
0
,
0
}});
this
->
template
Run
<
3
>();
}
test/image_to_column/test_image_to_column_interface.cpp
0 → 100644
View file @
0f84c0c7
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <tuple>
#include <vector>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include <gtest/gtest.h>
using
DataType
=
float
;
using
InLayout
=
ck
::
tensor_layout
::
convolution
::
GNWC
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
template
<
ck
::
index_t
ScalarPerVector
,
bool
IsCPacked
>
class
TestImageToColumnInterface
:
public
::
testing
::
Test
{
protected:
static
constexpr
ck
::
index_t
NDimSpatial
=
1
;
// clang-format off
using
DeviceImgToColInstance
=
ck
::
tensor_operation
::
device
::
DeviceImageToColumnImpl
//#####################| Num| InLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar|
//#####################| Dim| | | | Size| Block| Block| Cluster| Per|
//#####################| Spatial| | | | | | | Lengths| Vector|
//#####################| | | | | | | | | |
<
NDimSpatial
,
InLayout
,
DataType
,
DataType
,
256
,
128
,
128
,
S
<
16
,
16
>
,
ScalarPerVector
>
;
// clang-format on
ck
::
utils
::
conv
::
ConvParam
conv_param
;
bool
Run
()
{
const
auto
N
=
conv_param
.
N_
;
const
auto
C
=
conv_param
.
C_
;
const
auto
FakeC
=
conv_param
.
C_
/
2
;
// Fake C to simulate the behavior that C is not packed
const
ck
::
index_t
NDoHoWo
=
N
*
ck
::
accumulate_n
<
ck
::
index_t
>
(
conv_param
.
output_spatial_lengths_
.
begin
(),
NDimSpatial
,
1
,
std
::
multiplies
<>
());
const
ck
::
index_t
CZYX
=
C
*
ck
::
accumulate_n
<
ck
::
index_t
>
(
conv_param
.
filter_spatial_lengths_
.
begin
(),
NDimSpatial
,
1
,
std
::
multiplies
<>
());
const
auto
in_desc
=
ck
::
utils
::
conv
::
make_input_host_tensor_descriptor_g_n_c_wis_packed
<
InLayout
>
(
conv_param
);
const
auto
out_desc
=
HostTensorDescriptor
({
NDoHoWo
,
CZYX
});
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_spatial_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
filter_spatial_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
output_spatial_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
input_g_n_c_wis_strides
{};
std
::
array
<
ck
::
index_t
,
2
>
output_m_k_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
conv_filter_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
conv_filter_dilations
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_left_pads
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_right_pads
{};
auto
copy
=
[](
const
auto
&
x
,
auto
&
y
)
{
std
::
copy
(
x
.
begin
(),
x
.
end
(),
y
.
begin
());
};
copy
(
conv_param
.
input_spatial_lengths_
,
input_spatial_lengths
);
copy
(
conv_param
.
filter_spatial_lengths_
,
filter_spatial_lengths
);
copy
(
conv_param
.
output_spatial_lengths_
,
output_spatial_lengths
);
copy
(
in_desc
.
GetStrides
(),
input_g_n_c_wis_strides
);
copy
(
out_desc
.
GetStrides
(),
output_m_k_strides
);
copy
(
conv_param
.
conv_filter_strides_
,
conv_filter_strides
);
copy
(
conv_param
.
conv_filter_dilations_
,
conv_filter_dilations
);
copy
(
conv_param
.
input_left_pads_
,
input_left_pads
);
copy
(
conv_param
.
input_right_pads_
,
input_right_pads
);
auto
img2col
=
DeviceImgToColInstance
{};
auto
argument
=
img2col
.
MakeArgument
(
nullptr
,
nullptr
,
N
,
IsCPacked
?
C
:
FakeC
,
input_spatial_lengths
,
filter_spatial_lengths
,
output_spatial_lengths
,
input_g_n_c_wis_strides
,
output_m_k_strides
,
conv_filter_strides
,
conv_filter_dilations
,
input_left_pads
,
input_right_pads
);
return
img2col
.
IsSupportedArgument
(
argument
);
}
};
class
TestImageToColumnInterface1ScalarPerVector
:
public
TestImageToColumnInterface
<
1
,
true
>
{
};
class
TestImageToColumnInterface4ScalarPerVector
:
public
TestImageToColumnInterface
<
4
,
true
>
{
};
class
TestImageToColumnInterface4ScalarPerVectorFakeC
:
public
TestImageToColumnInterface
<
4
,
false
>
{
};
TEST_F
(
TestImageToColumnInterface1ScalarPerVector
,
X1ScalarPerVector
)
{
// vector load C * X % ScalarPerVector
this
->
conv_param
=
{
1
,
1
,
1
,
1
,
1
,
{
3
},
{
3
},
{
1
},
{
1
},
{
0
},
{
0
}};
bool
is_supported
=
this
->
Run
();
EXPECT_TRUE
(
is_supported
);
// vector load C * left_pad_x % ScalarPerVector
this
->
conv_param
=
{
1
,
1
,
1
,
1
,
1
,
{
4
},
{
3
},
{
1
},
{
1
},
{
3
},
{
0
}};
is_supported
=
this
->
Run
();
EXPECT_TRUE
(
is_supported
);
// vector load C * right_pad_x % ScalarPerVector
this
->
conv_param
=
{
1
,
1
,
1
,
1
,
1
,
{
4
},
{
3
},
{
1
},
{
1
},
{
0
},
{
3
}};
is_supported
=
this
->
Run
();
EXPECT_TRUE
(
is_supported
);
// vector load C % ScalarPerVector, right_pad and stride
this
->
conv_param
=
{
1
,
1
,
1
,
1
,
1
,
{
4
},
{
3
},
{
2
},
{
1
},
{
0
},
{
3
}};
is_supported
=
this
->
Run
();
EXPECT_TRUE
(
is_supported
);
// vector load C % ScalarPerVector, left_pad and stride
this
->
conv_param
=
{
1
,
1
,
1
,
1
,
1
,
{
4
},
{
3
},
{
2
},
{
1
},
{
3
},
{
0
}};
is_supported
=
this
->
Run
();
EXPECT_TRUE
(
is_supported
);
// vector load C % ScalarPerVector, dilation
this
->
conv_param
=
{
1
,
1
,
1
,
1
,
1
,
{
4
},
{
3
},
{
1
},
{
2
},
{
0
},
{
0
}};
is_supported
=
this
->
Run
();
EXPECT_TRUE
(
is_supported
);
// C = 4
this
->
conv_param
=
{
1
,
1
,
1
,
1
,
4
,
{
3
},
{
3
},
{
1
},
{
1
},
{
3
},
{
3
}};
is_supported
=
this
->
Run
();
EXPECT_TRUE
(
is_supported
);
}
TEST_F
(
TestImageToColumnInterface4ScalarPerVector
,
X4ScalarPerVector
)
{
// vector load C * X % ScalarPerVector
this
->
conv_param
=
{
1
,
1
,
1
,
1
,
1
,
{
3
},
{
3
},
{
1
},
{
1
},
{
0
},
{
0
}};
bool
is_supported
=
this
->
Run
();
EXPECT_FALSE
(
is_supported
);
// vector load C * left_pad_x % ScalarPerVector
this
->
conv_param
=
{
1
,
1
,
1
,
1
,
1
,
{
4
},
{
3
},
{
1
},
{
1
},
{
3
},
{
0
}};
is_supported
=
this
->
Run
();
EXPECT_FALSE
(
is_supported
);
// vector load C * right_pad_x % ScalarPerVector
this
->
conv_param
=
{
1
,
1
,
1
,
1
,
1
,
{
4
},
{
3
},
{
1
},
{
1
},
{
0
},
{
3
}};
is_supported
=
this
->
Run
();
EXPECT_FALSE
(
is_supported
);
// vector load C % ScalarPerVector, right_pad and stride
this
->
conv_param
=
{
1
,
1
,
1
,
1
,
1
,
{
4
},
{
3
},
{
2
},
{
1
},
{
0
},
{
3
}};
is_supported
=
this
->
Run
();
EXPECT_FALSE
(
is_supported
);
// vector load C % ScalarPerVector, left_pad and stride
this
->
conv_param
=
{
1
,
1
,
1
,
1
,
1
,
{
4
},
{
3
},
{
2
},
{
1
},
{
3
},
{
0
}};
is_supported
=
this
->
Run
();
EXPECT_FALSE
(
is_supported
);
// vector load C % ScalarPerVector, dilation
this
->
conv_param
=
{
1
,
1
,
1
,
1
,
1
,
{
4
},
{
3
},
{
1
},
{
2
},
{
0
},
{
0
}};
is_supported
=
this
->
Run
();
EXPECT_FALSE
(
is_supported
);
// C = 4
this
->
conv_param
=
{
1
,
1
,
1
,
1
,
4
,
{
3
},
{
3
},
{
1
},
{
1
},
{
3
},
{
3
}};
is_supported
=
this
->
Run
();
EXPECT_TRUE
(
is_supported
);
}
TEST_F
(
TestImageToColumnInterface4ScalarPerVectorFakeC
,
X4ScalarPerVectorFakeC
)
{
// C = 3
this
->
conv_param
=
{
1
,
1
,
1
,
1
,
3
,
{
4
},
{
3
},
{
1
},
{
1
},
{
0
},
{
0
}};
bool
is_supported
=
this
->
Run
();
EXPECT_FALSE
(
is_supported
);
// C = 4
this
->
conv_param
=
{
1
,
1
,
1
,
1
,
8
,
{
4
},
{
3
},
{
1
},
{
1
},
{
0
},
{
0
}};
is_supported
=
this
->
Run
();
EXPECT_TRUE
(
is_supported
);
}
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment