Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
705d5a08
Commit
705d5a08
authored
Sep 08, 2023
by
Jun Liu
Browse files
Merge branch 'develop' into amd-develop
parents
d4ad52d6
8f84a012
Changes
168
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1859 additions
and
7 deletions
+1859
-7
library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_bf16_instance.cpp
.../pool3d_fwd/device_avg_pool3d_fwd_ndhwc_bf16_instance.cpp
+25
-0
library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_bf16_instance.cpp
.../pool3d_fwd/device_max_pool3d_fwd_ndhwc_bf16_instance.cpp
+34
-0
library/src/tensor_operation_instance/gpu/pool3d_fwd/pool_fwd_instance_common.hpp
...tion_instance/gpu/pool3d_fwd/pool_fwd_instance_common.hpp
+1
-0
library/src/utility/device_memory.cpp
library/src/utility/device_memory.cpp
+10
-0
profiler/README.md
profiler/README.md
+38
-0
profiler/include/profiler/profile_avg_pool3d_bwd_impl.hpp
profiler/include/profiler/profile_avg_pool3d_bwd_impl.hpp
+253
-0
profiler/include/profiler/profile_gemm_multiply_add_impl.hpp
profiler/include/profiler/profile_gemm_multiply_add_impl.hpp
+242
-0
profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp
profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp
+1
-1
profiler/include/profiler/profile_image_to_column_impl.hpp
profiler/include/profiler/profile_image_to_column_impl.hpp
+200
-0
profiler/include/profiler/profile_max_pool3d_bwd_impl.hpp
profiler/include/profiler/profile_max_pool3d_bwd_impl.hpp
+288
-0
profiler/src/CMakeLists.txt
profiler/src/CMakeLists.txt
+8
-0
profiler/src/profile_avg_pool3d_bwd.cpp
profiler/src/profile_avg_pool3d_bwd.cpp
+175
-0
profiler/src/profile_gemm_bilinear.cpp
profiler/src/profile_gemm_bilinear.cpp
+19
-0
profiler/src/profile_gemm_multiply_add.cpp
profiler/src/profile_gemm_multiply_add.cpp
+153
-0
profiler/src/profile_image_to_column.cpp
profiler/src/profile_image_to_column.cpp
+169
-0
profiler/src/profile_max_pool3d_bwd.cpp
profiler/src/profile_max_pool3d_bwd.cpp
+177
-0
profiler/src/profile_max_pool3d_fwd.cpp
profiler/src/profile_max_pool3d_fwd.cpp
+60
-4
script/cmake-ck-dev.sh
script/cmake-ck-dev.sh
+0
-1
test/CMakeLists.txt
test/CMakeLists.txt
+2
-1
test/image_to_column/CMakeLists.txt
test/image_to_column/CMakeLists.txt
+4
-0
No files found.
library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_bf16_instance.cpp
0 → 100644
View file @
705d5a08
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "pool_fwd_instance_common.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
static
constexpr
auto
ReduceOpId
=
ck
::
ReduceTensorOp
::
AVG
;
void
add_device_pool3d_fwd_ndhwc_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DevicePoolFwd
<
5
,
3
,
BF16
,
BF16
,
I32
,
NDHWC
,
NDHWC
,
ReduceOpId
,
false
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_pool3d_fwd_ndhwc_instances
<
BF16
,
BF16
,
I32
,
F32
,
ReduceOpId
,
false
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_bf16_instance.cpp
0 → 100644
View file @
705d5a08
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "pool_fwd_instance_common.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
static
constexpr
auto
ReduceOpId
=
ck
::
ReduceTensorOp
::
MAX
;
void
add_device_pool3d_fwd_ndhwc_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DevicePoolFwd
<
5
,
3
,
BF16
,
BF16
,
I32
,
NDHWC
,
NDHWC
,
ReduceOpId
,
false
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_pool3d_fwd_ndhwc_instances
<
BF16
,
BF16
,
I32
,
BF16
,
ReduceOpId
,
false
>
{});
}
void
add_device_pool3d_fwd_ndhwc_index_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DevicePoolFwd
<
5
,
3
,
BF16
,
BF16
,
I32
,
NDHWC
,
NDHWC
,
ReduceOpId
,
true
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_pool3d_fwd_ndhwc_instances
<
BF16
,
BF16
,
I32
,
BF16
,
ReduceOpId
,
true
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/pool3d_fwd/pool_fwd_instance_common.hpp
View file @
705d5a08
...
...
@@ -17,6 +17,7 @@ namespace instance {
using
I32
=
int32_t
;
using
F16
=
ck
::
half_t
;
using
BF16
=
ck
::
bhalf_t
;
using
F32
=
float
;
using
NDHWC
=
ck
::
tensor_layout
::
convolution
::
NDHWC
;
...
...
library/src/utility/device_memory.cpp
View file @
705d5a08
...
...
@@ -37,6 +37,11 @@ void DeviceMem::ToDevice(const void* p) const
}
}
void
DeviceMem
::
ToDevice
(
const
void
*
p
,
const
std
::
size_t
cpySize
)
const
{
hip_check_error
(
hipMemcpy
(
mpDeviceBuf
,
const_cast
<
void
*>
(
p
),
cpySize
,
hipMemcpyHostToDevice
));
}
void
DeviceMem
::
FromDevice
(
void
*
p
)
const
{
if
(
mpDeviceBuf
)
...
...
@@ -49,6 +54,11 @@ void DeviceMem::FromDevice(void* p) const
}
}
void
DeviceMem
::
FromDevice
(
void
*
p
,
const
std
::
size_t
cpySize
)
const
{
hip_check_error
(
hipMemcpy
(
p
,
mpDeviceBuf
,
cpySize
,
hipMemcpyDeviceToHost
));
}
void
DeviceMem
::
SetZero
()
const
{
if
(
mpDeviceBuf
)
...
...
profiler/README.md
View file @
705d5a08
...
...
@@ -184,3 +184,41 @@ tflops: 95.337
GB/s: 69.2301
```
Note: This kernel use atomic add, this will cause output buffer to be accumulated multiple times, causing verification failure. To work around it, do not use CK's own timer and do verification at the same time.
## Profile image to column kernels
```
bash
# arg1: tensor operation (" OP_NAME ": " OP_DESC ")
# arg2: data type (0: Input fp32, Weight fp32, Output fp32
# 1: Input fp16, Weight fp16, Output fp16
# 2: Input bf16, Weight bf16, Output bf16
# 3: Input int8, Weight int8, Output int8)
# arg3: tensor layout (0: Input[N, Hi, Wi, C], Output[N * Ho * Wo, Y * X * C])
# arg4: verification (0: no, 1: yes)
# arg5: initialization (0: no init, 1: integer value, 2: decimal value)
# arg6: print tensor value (0: no; 1: yes)
# arg7: time kernel (0: no, 1: yes)
# Following arguments (depending on number of spatial dims):
# Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)
# G, N, K, C,
# <filter spatial dimensions>, (ie Y, X for 2D)
# <input image spatial dimensions>, (ie Hi, Wi for 2D)
# <strides>, (ie Sy, Sx for 2D)
# <dilations>, (ie Dy, Dx for 2D)
# <left padding>, (ie LeftPy, LeftPx for 2D)
# <right padding>, (ie RightPy, RightPx for 2D)
################ op datatype layout verify init log time Ndims G N K C Y X Hi Wi Sy Sx Dy Dx LeftPy LeftPx RightPy RightPx
./bin/ckProfiler image_to_column 0 0 1 1 0 1 2 1 256 1 512 3 3 28 28 1 1 1 1 0 0 0 0
```
Result
(
MI210, FP32, NHWC
)
```
input: dim 5, lengths {1, 256, 512, 28, 28}, strides {102760448, 401408, 1, 14336, 512}
output: dim 2, lengths {173056, 4608}, strides {4608, 1}
....
Best configuration parameters:
name: DeviceImageToColumn
<
128,
32,
64,
4
>
avg_time: 3.12326
GB/s: 2042.59
```
profiler/include/profiler/profile_avg_pool3d_bwd_impl.hpp
0 → 100644
View file @
705d5a08
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp"
#include "ck/library/tensor_operation_instance/gpu/avg_pool3d_bwd.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_avgpool_bwd.hpp"
namespace
ck
{
namespace
profiler
{
template
<
typename
TensorLayout
>
std
::
vector
<
ck
::
index_t
>
f_tensor_strides_ncdhw
(
ck
::
index_t
N_
,
ck
::
index_t
C_
,
ck
::
index_t
D
,
ck
::
index_t
H
,
ck
::
index_t
W
,
TensorLayout
layout
)
{
using
namespace
ck
::
literals
;
(
void
)
N_
;
if
constexpr
(
ck
::
is_same
<
decltype
(
layout
),
ck
::
tensor_layout
::
convolution
::
NDHWC
>::
value
)
return
{
D
*
C_
*
H
*
W
,
1
_uz
,
C_
*
H
*
W
,
W
*
C_
,
C_
};
else
throw
std
::
runtime_error
(
"not supported yet"
);
};
template
<
typename
DOutDataType
,
typename
DInDataType
,
typename
ComputeDataType
,
typename
DOutLayout
,
typename
DInLayout
>
bool
profile_avg_pool3d_bwd_impl
(
int
do_verification
,
int
init_method
,
bool
do_log
,
bool
time_kernel
,
std
::
vector
<
index_t
>
in_length
,
// NCDHW
std
::
vector
<
index_t
>
window_spatial_lengths
,
std
::
vector
<
index_t
>
window_strides
,
std
::
vector
<
index_t
>
window_dilations
,
std
::
vector
<
index_t
>
input_left_pads
,
std
::
vector
<
index_t
>
input_right_pads
)
{
constexpr
index_t
InOutRank
=
5
;
constexpr
index_t
WindowRank
=
3
;
if
(
in_length
.
size
()
!=
InOutRank
||
window_spatial_lengths
.
size
()
!=
WindowRank
||
window_strides
.
size
()
!=
WindowRank
||
window_dilations
.
size
()
!=
WindowRank
||
input_left_pads
.
size
()
!=
WindowRank
||
input_right_pads
.
size
()
!=
WindowRank
)
{
std
::
cout
<<
"Parameter is incorrect"
<<
std
::
endl
;
return
false
;
}
std
::
vector
<
index_t
>
out_length
(
InOutRank
);
int
N
=
in_length
[
0
];
int
C
=
in_length
[
1
];
out_length
[
0
]
=
N
;
out_length
[
1
]
=
C
;
// Calculate Do, Ho, Wo
for
(
int
i
=
2
;
i
<
InOutRank
;
++
i
)
{
auto
pad1
=
input_left_pads
[
i
-
2
];
auto
pad2
=
input_right_pads
[
i
-
2
];
auto
windows_size
=
window_spatial_lengths
[
i
-
2
];
auto
windows_stride
=
window_strides
[
i
-
2
];
auto
windows_dilation
=
window_dilations
[
i
-
2
];
auto
eff
=
(
windows_size
-
1
)
*
windows_dilation
+
1
;
out_length
[
i
]
=
(
in_length
[
i
]
+
pad1
+
pad2
-
eff
)
/
windows_stride
+
1
;
}
int
Di
=
in_length
[
2
];
int
Hi
=
in_length
[
3
];
int
Wi
=
in_length
[
4
];
int
Do
=
out_length
[
2
];
int
Ho
=
out_length
[
3
];
int
Wo
=
out_length
[
4
];
auto
f_host_tensor_descriptor
=
[](
std
::
size_t
N_
,
std
::
size_t
C_
,
std
::
size_t
D
,
std
::
size_t
H
,
std
::
size_t
W
)
{
using
namespace
ck
::
literals
;
return
HostTensorDescriptor
({
N_
,
C_
,
D
,
H
,
W
},
{
D
*
C_
*
H
*
W
,
1
_uz
,
C_
*
H
*
W
,
W
*
C_
,
C_
});
};
Tensor
<
DOutDataType
>
dout_n_c_do_ho_wo
(
f_host_tensor_descriptor
(
N
,
C
,
Do
,
Ho
,
Wo
));
Tensor
<
DInDataType
>
din_n_c_di_hi_wi_device
(
f_host_tensor_descriptor
(
N
,
C
,
Di
,
Hi
,
Wi
));
Tensor
<
DInDataType
>
din_n_c_di_hi_wi_host
(
f_host_tensor_descriptor
(
N
,
C
,
Di
,
Hi
,
Wi
));
switch
(
init_method
)
{
case
0
:
dout_n_c_do_ho_wo
.
GenerateTensorValue
(
GeneratorTensor_1
<
DOutDataType
>
{});
break
;
case
1
:
dout_n_c_do_ho_wo
.
GenerateTensorValue
(
GeneratorTensor_2
<
DOutDataType
>
{
-
5
,
5
});
break
;
default:
dout_n_c_do_ho_wo
.
GenerateTensorValue
(
GeneratorTensor_3
<
DOutDataType
>
{
-
0.5
,
0.5
});
}
DeviceMem
dout_device_buf
(
sizeof
(
DOutDataType
)
*
dout_n_c_do_ho_wo
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
din_device_buf
(
sizeof
(
DInDataType
)
*
din_n_c_di_hi_wi_device
.
mDesc
.
GetElementSpaceSize
());
dout_device_buf
.
ToDevice
(
dout_n_c_do_ho_wo
.
mData
.
data
());
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceAvgPoolBwd
<
3
,
DOutDataType
,
DInDataType
,
DOutLayout
,
DInLayout
>
;
// get device op instances
const
auto
instance_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
DeviceOp
>::
GetInstances
();
std
::
cout
<<
"found "
<<
instance_ptrs
.
size
()
<<
" instances"
<<
std
::
endl
;
std
::
string
best_instance_name
;
float
best_avg_time
=
std
::
numeric_limits
<
float
>::
max
();
float
best_gb_per_sec
=
0
;
if
(
do_verification
)
{
using
ReferencePoolingBwdInstance
=
ck
::
tensor_operation
::
host
::
ReferenceAvgPoolBwd
<
3
,
DInDataType
,
DOutDataType
>
;
ReferencePoolingBwdInstance
ref_pooling_bwd
;
auto
ref_pooling_bwd_argument
=
ref_pooling_bwd
.
MakeArgument
(
din_n_c_di_hi_wi_host
,
dout_n_c_do_ho_wo
,
window_spatial_lengths
,
window_strides
,
window_dilations
,
input_left_pads
,
input_right_pads
);
auto
ref_invoker
=
ref_pooling_bwd
.
MakeInvoker
();
ref_invoker
.
Run
(
ref_pooling_bwd_argument
);
}
int
num_kernel
=
0
;
for
(
auto
&
inst_ptr
:
instance_ptrs
)
{
auto
argument_ptr
=
inst_ptr
->
MakeArgumentPointer
(
static_cast
<
DOutDataType
*>
(
dout_device_buf
.
GetDeviceBuffer
()),
static_cast
<
DInDataType
*>
(
din_device_buf
.
GetDeviceBuffer
()),
{
N
,
C
,
Do
,
Ho
,
Wo
},
{
N
,
C
,
Di
,
Hi
,
Wi
},
f_tensor_strides_ncdhw
(
N
,
C
,
Do
,
Ho
,
Wo
,
DOutLayout
{}),
f_tensor_strides_ncdhw
(
N
,
C
,
Di
,
Hi
,
Wi
,
DInLayout
{}),
window_spatial_lengths
,
window_strides
,
window_dilations
,
input_left_pads
,
input_right_pads
);
if
(
inst_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
++
num_kernel
;
}
else
{
if
(
time_kernel
)
{
std
::
cout
<<
inst_ptr
->
GetTypeString
()
<<
" skipped due to unsupported argument: "
;
LogRange
(
std
::
cout
<<
"doutput lengths = "
,
out_length
,
", "
)
<<
std
::
endl
;
}
continue
;
}
din_device_buf
.
SetZero
();
auto
invoker_ptr
=
inst_ptr
->
MakeInvokerPointer
();
float
avg_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
std
::
size_t
num_bytes
=
dout_n_c_do_ho_wo
.
mDesc
.
GetElementSize
()
*
sizeof
(
DOutDataType
)
+
din_n_c_di_hi_wi_device
.
mDesc
.
GetElementSize
()
*
sizeof
(
DInDataType
);
float
gb_per_sec
=
num_bytes
/
1.E6
/
avg_time
;
if
(
time_kernel
)
std
::
cout
<<
"Perf: "
<<
std
::
setw
(
10
)
<<
avg_time
<<
" ms, "
<<
gb_per_sec
<<
" GB/s, "
<<
inst_ptr
->
GetTypeString
()
<<
std
::
endl
;
if
(
avg_time
<
best_avg_time
)
{
best_instance_name
=
inst_ptr
->
GetTypeString
();
best_avg_time
=
avg_time
;
best_gb_per_sec
=
gb_per_sec
;
}
if
(
do_verification
)
{
din_device_buf
.
FromDevice
(
din_n_c_di_hi_wi_device
.
mData
.
data
());
bool
pass
=
ck
::
utils
::
check_err
(
din_n_c_di_hi_wi_device
.
mData
,
din_n_c_di_hi_wi_host
.
mData
,
"Error: Incorrect results"
,
1e-3
,
1e-3
);
if
(
do_log
)
{
LogRangeAsType
<
float
>
(
std
::
cout
<<
"din_n_c_di_hi_wi_device: "
,
din_n_c_di_hi_wi_device
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"din_n_c_di_hi_wi_host: "
,
din_n_c_di_hi_wi_host
.
mData
,
","
)
<<
std
::
endl
;
}
if
(
!
pass
)
{
std
::
cout
<<
inst_ptr
->
GetTypeString
()
<<
" failed verification: "
;
LogRange
(
std
::
cout
<<
"doutput lengths = ["
,
out_length
,
", "
)
<<
"]."
<<
std
::
endl
;
return
false
;
}
else
{
if
(
time_kernel
)
std
::
cout
<<
"pass"
<<
std
::
endl
;
}
}
}
if
(
time_kernel
)
{
LogRange
(
std
::
cout
<<
"length = "
,
out_length
,
","
)
<<
std
::
endl
;
std
::
cout
<<
"best perf = "
<<
best_avg_time
<<
" ms, "
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_instance_name
<<
std
::
endl
;
}
if
(
num_kernel
==
0
)
{
std
::
cout
<<
"Error: No kernel is applicable"
<<
std
::
endl
;
return
false
;
}
return
true
;
}
}
// namespace profiler
}
// namespace ck
profiler/include/profiler/profile_gemm_multiply_add_impl.hpp
0 → 100644
View file @
705d5a08
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/gemm_multiply_add.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace
ck
{
namespace
profiler
{
template
<
typename
ADataType
,
typename
BDataType
,
typename
AccDataType
,
typename
D0DataType
,
typename
D1DataType
,
typename
EDataType
,
typename
ALayout
,
typename
BLayout
,
typename
D0Layout
,
typename
D1Layout
,
typename
ELayout
>
bool
profile_gemm_multiply_add_impl
(
int
do_verification
,
int
init_method
,
bool
/*do_log*/
,
bool
time_kernel
,
int
M
,
int
N
,
int
K
,
int
StrideA
,
int
StrideB
,
int
StrideD0
,
int
StrideD1
,
int
StrideE
)
{
auto
f_host_tensor_descriptor
=
[](
std
::
size_t
row
,
std
::
size_t
col
,
std
::
size_t
stride
,
auto
layout
)
{
using
namespace
ck
::
literals
;
if
(
is_same
<
decltype
(
layout
),
tensor_layout
::
gemm
::
RowMajor
>::
value
)
{
return
HostTensorDescriptor
({
row
,
col
},
{
stride
,
1
_uz
});
}
else
{
return
HostTensorDescriptor
({
row
,
col
},
{
1
_uz
,
stride
});
}
};
Tensor
<
ADataType
>
a_m_k
(
f_host_tensor_descriptor
(
M
,
K
,
StrideA
,
ALayout
{}));
Tensor
<
BDataType
>
b_k_n
(
f_host_tensor_descriptor
(
K
,
N
,
StrideB
,
BLayout
{}));
Tensor
<
D0DataType
>
d0_m_n
(
f_host_tensor_descriptor
(
M
,
N
,
StrideD0
,
D0Layout
{}));
Tensor
<
D1DataType
>
d1_m_n
(
f_host_tensor_descriptor
(
M
,
N
,
StrideD1
,
D1Layout
{}));
Tensor
<
EDataType
>
e_m_n_device_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideE
,
ELayout
{}));
Tensor
<
EDataType
>
e_m_n_host_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideE
,
ELayout
{}));
std
::
cout
<<
"a_m_k: "
<<
a_m_k
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"b_k_n: "
<<
b_k_n
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"d0_m_n: "
<<
d0_m_n
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"d1_m_n: "
<<
d1_m_n
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"e_m_n: "
<<
e_m_n_device_result
.
mDesc
<<
std
::
endl
;
switch
(
init_method
)
{
case
0
:
break
;
case
1
:
a_m_k
.
GenerateTensorValue
(
GeneratorTensor_2
<
ADataType
>
{
-
5
,
5
});
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_2
<
BDataType
>
{
-
5
,
5
});
d0_m_n
.
GenerateTensorValue
(
GeneratorTensor_2
<
D0DataType
>
{
-
5
,
5
});
d1_m_n
.
GenerateTensorValue
(
GeneratorTensor_2
<
D1DataType
>
{
-
1
,
1
});
break
;
default:
a_m_k
.
GenerateTensorValue
(
GeneratorTensor_3
<
ADataType
>
{
0.0
,
1.0
});
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_3
<
BDataType
>
{
-
0.5
,
0.5
});
d0_m_n
.
GenerateTensorValue
(
GeneratorTensor_3
<
D0DataType
>
{
0.0
,
1.0
});
d1_m_n
.
GenerateTensorValue
(
GeneratorTensor_3
<
D1DataType
>
{
0.0
,
1.0
});
}
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
MultiplyAdd
=
ck
::
tensor_operation
::
element_wise
::
MultiplyAdd
;
using
AElementOp
=
PassThrough
;
using
BElementOp
=
PassThrough
;
using
CDEElementOp
=
MultiplyAdd
;
const
auto
a_element_op
=
AElementOp
{};
const
auto
b_element_op
=
BElementOp
{};
const
auto
cde_element_op
=
CDEElementOp
{};
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceGemmMultipleD
<
ALayout
,
BLayout
,
ck
::
Tuple
<
D0Layout
,
D1Layout
>
,
ELayout
,
ADataType
,
BDataType
,
ck
::
Tuple
<
D0DataType
,
D1DataType
>
,
EDataType
,
PassThrough
,
PassThrough
,
CDEElementOp
>
;
// get device op instances
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
DeviceOp
>::
GetInstances
();
std
::
cout
<<
"found "
<<
op_ptrs
.
size
()
<<
" instances"
<<
std
::
endl
;
// run reference
if
(
do_verification
)
{
Tensor
<
AccDataType
>
c_m_n
({
M
,
N
});
using
ReferenceGemmInstance
=
ck
::
tensor_operation
::
host
::
ReferenceGemm
<
ADataType
,
BDataType
,
AccDataType
,
AccDataType
,
AElementOp
,
BElementOp
,
PassThrough
>
;
auto
ref_gemm
=
ReferenceGemmInstance
{};
auto
ref_invoker
=
ref_gemm
.
MakeInvoker
();
auto
ref_argument
=
ref_gemm
.
MakeArgument
(
a_m_k
,
b_k_n
,
c_m_n
,
a_element_op
,
b_element_op
,
PassThrough
{});
ref_invoker
.
Run
(
ref_argument
);
for
(
int
m
=
0
;
m
<
M
;
++
m
)
{
for
(
int
n
=
0
;
n
<
N
;
++
n
)
{
cde_element_op
(
e_m_n_host_result
(
m
,
n
),
c_m_n
(
m
,
n
),
d0_m_n
(
m
,
n
),
d1_m_n
(
m
,
n
));
}
}
}
DeviceMem
a_device_buf
(
sizeof
(
ADataType
)
*
a_m_k
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
b_device_buf
(
sizeof
(
BDataType
)
*
b_k_n
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
d0_m_n_device_buf
(
sizeof
(
D0DataType
)
*
d0_m_n
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
d1_m_n_device_buf
(
sizeof
(
D1DataType
)
*
d1_m_n
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
e_device_buf
(
sizeof
(
EDataType
)
*
e_m_n_device_result
.
mDesc
.
GetElementSpaceSize
());
a_device_buf
.
ToDevice
(
a_m_k
.
mData
.
data
());
b_device_buf
.
ToDevice
(
b_k_n
.
mData
.
data
());
d0_m_n_device_buf
.
ToDevice
(
d0_m_n
.
mData
.
data
());
d1_m_n_device_buf
.
ToDevice
(
d1_m_n
.
mData
.
data
());
std
::
string
best_op_name
;
float
best_ave_time
=
0
;
float
best_tflops
=
0
;
float
best_gb_per_sec
=
0
;
bool
pass
=
true
;
// profile device operation instances
for
(
auto
&
op_ptr
:
op_ptrs
)
{
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
a_device_buf
.
GetDeviceBuffer
(),
b_device_buf
.
GetDeviceBuffer
(),
std
::
array
<
const
void
*
,
2
>
{
d0_m_n_device_buf
.
GetDeviceBuffer
(),
d1_m_n_device_buf
.
GetDeviceBuffer
()},
e_device_buf
.
GetDeviceBuffer
(),
M
,
N
,
K
,
StrideA
,
StrideB
,
std
::
array
<
ck
::
index_t
,
2
>
{
StrideD0
,
StrideD1
},
StrideE
,
a_element_op
,
b_element_op
,
cde_element_op
);
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
std
::
string
op_name
=
op_ptr
->
GetTypeString
();
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
// re-init E to zero before profiling a kernel
e_device_buf
.
SetZero
();
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
std
::
size_t
flop
=
std
::
size_t
(
2
)
*
M
*
N
*
K
;
std
::
size_t
num_btype
=
sizeof
(
ADataType
)
*
M
*
K
+
sizeof
(
BDataType
)
*
K
*
N
+
sizeof
(
EDataType
)
*
M
*
N
;
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
ave_time
;
std
::
cout
<<
"Perf: "
<<
std
::
setw
(
10
)
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
op_name
<<
std
::
endl
;
if
(
tflops
>
best_tflops
)
{
best_op_name
=
op_name
;
best_tflops
=
tflops
;
best_ave_time
=
ave_time
;
best_gb_per_sec
=
gb_per_sec
;
}
if
(
do_verification
)
{
e_device_buf
.
FromDevice
(
e_m_n_device_result
.
mData
.
data
());
pass
=
pass
&&
ck
::
utils
::
check_err
(
e_m_n_device_result
,
e_m_n_host_result
);
}
}
else
{
std
::
cout
<<
op_name
<<
" does not support this problem"
<<
std
::
endl
;
}
}
std
::
cout
<<
"Best Perf: "
<<
best_ave_time
<<
" ms, "
<<
best_tflops
<<
" TFlops, "
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_op_name
<<
std
::
endl
;
return
pass
;
}
}
// namespace profiler
}
// namespace ck
profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp
View file @
705d5a08
...
...
@@ -215,7 +215,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
DeviceOp
>::
GetInstances
();
std
::
cout
<<
"
xdl
found "
<<
op_ptrs
.
size
()
<<
" instances"
<<
std
::
endl
;
std
::
cout
<<
"
ckProfiler
found "
<<
op_ptrs
.
size
()
<<
" instances"
<<
std
::
endl
;
for
(
auto
&
op_ptr
:
op_ptrs
)
{
...
...
profiler/include/profiler/profile_image_to_column_impl.hpp
0 → 100644
View file @
705d5a08
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include <iostream>
#include <typeinfo>
#include <limits>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_image_to_column.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp"
#include "ck/library/tensor_operation_instance/gpu/image_to_column.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp"
namespace
ck
{
namespace
profiler
{
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
template
<
index_t
NDimSpatial
,
typename
InputLayout
,
typename
InputDataType
,
typename
OutputDataType
>
bool
profile_image_to_column_impl
(
int
do_verification
,
int
init_method
,
bool
do_log
,
bool
time_kernel
,
const
ck
::
utils
::
conv
::
ConvParam
&
conv_param
)
{
const
ck
::
index_t
NDoHoWo
=
conv_param
.
N_
*
ck
::
accumulate_n
<
ck
::
index_t
>
(
conv_param
.
output_spatial_lengths_
.
begin
(),
NDimSpatial
,
1
,
std
::
multiplies
<>
());
const
ck
::
index_t
CZYX
=
conv_param
.
C_
*
ck
::
accumulate_n
<
ck
::
index_t
>
(
conv_param
.
filter_spatial_lengths_
.
begin
(),
NDimSpatial
,
1
,
std
::
multiplies
<>
());
const
auto
in_desc
=
ck
::
utils
::
conv
::
make_input_host_tensor_descriptor_g_n_c_wis_packed
<
InputLayout
>
(
conv_param
);
const
auto
out_desc
=
HostTensorDescriptor
({
NDoHoWo
,
CZYX
});
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_spatial_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
filter_spatial_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
output_spatial_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
input_g_n_c_wis_strides
{};
std
::
array
<
ck
::
index_t
,
2
>
output_m_k_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
conv_filter_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
conv_filter_dilations
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_left_pads
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_right_pads
{};
auto
copy
=
[](
const
auto
&
x
,
auto
&
y
)
{
std
::
copy
(
x
.
begin
(),
x
.
end
(),
y
.
begin
());
};
copy
(
conv_param
.
input_spatial_lengths_
,
input_spatial_lengths
);
copy
(
conv_param
.
filter_spatial_lengths_
,
filter_spatial_lengths
);
copy
(
conv_param
.
output_spatial_lengths_
,
output_spatial_lengths
);
copy
(
in_desc
.
GetStrides
(),
input_g_n_c_wis_strides
);
copy
(
out_desc
.
GetStrides
(),
output_m_k_strides
);
copy
(
conv_param
.
conv_filter_strides_
,
conv_filter_strides
);
copy
(
conv_param
.
conv_filter_dilations_
,
conv_filter_dilations
);
copy
(
conv_param
.
input_left_pads_
,
input_left_pads
);
copy
(
conv_param
.
input_right_pads_
,
input_right_pads
);
Tensor
<
InputDataType
>
input
(
in_desc
);
Tensor
<
OutputDataType
>
host_output
(
out_desc
);
Tensor
<
OutputDataType
>
device_output
(
out_desc
);
std
::
cout
<<
"input: "
<<
input
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"output: "
<<
host_output
.
mDesc
<<
std
::
endl
;
switch
(
init_method
)
{
case
0
:
break
;
case
1
:
input
.
GenerateTensorValue
(
GeneratorTensor_2
<
InputDataType
>
{
-
5
,
5
});
break
;
default:
input
.
GenerateTensorValue
(
GeneratorTensor_3
<
InputDataType
>
{
0.0
,
1.0
});
}
DeviceMem
in_device_buf
(
sizeof
(
InputDataType
)
*
input
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
out_device_buf
(
sizeof
(
OutputDataType
)
*
device_output
.
mDesc
.
GetElementSpaceSize
());
in_device_buf
.
ToDevice
(
input
.
mData
.
data
());
// run reference op
if
(
do_verification
)
{
auto
ref_image_to_column
=
ck
::
tensor_operation
::
host
::
ReferenceImageToColumn
<
NDimSpatial
,
InputLayout
,
InputDataType
,
OutputDataType
>
{};
auto
ref_invoker
=
ref_image_to_column
.
MakeInvoker
();
auto
ref_argument
=
ref_image_to_column
.
MakeArgument
(
input
,
host_output
,
conv_param
.
filter_spatial_lengths_
,
conv_param
.
conv_filter_strides_
,
conv_param
.
conv_filter_dilations_
,
conv_param
.
input_left_pads_
,
conv_param
.
input_right_pads_
);
// init host output to zero
host_output
.
SetZero
();
ref_invoker
.
Run
(
ref_argument
);
}
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceImageToColumn
<
NDimSpatial
,
InputLayout
,
InputDataType
,
OutputDataType
>
;
// get device op instances
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
DeviceOp
>::
GetInstances
();
std
::
cout
<<
"found "
<<
op_ptrs
.
size
()
<<
" instances"
<<
std
::
endl
;
std
::
string
best_op_name
;
float
best_avg_time
=
std
::
numeric_limits
<
float
>::
max
();
float
best_gb_per_sec
=
0
;
// profile device op instances
bool
pass
=
true
;
bool
is_supporting_instance
=
false
;
for
(
auto
&
op_ptr
:
op_ptrs
)
{
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
static_cast
<
InputDataType
*>
(
in_device_buf
.
GetDeviceBuffer
()),
static_cast
<
OutputDataType
*>
(
out_device_buf
.
GetDeviceBuffer
()),
conv_param
.
N_
,
conv_param
.
C_
,
input_spatial_lengths
,
filter_spatial_lengths
,
output_spatial_lengths
,
input_g_n_c_wis_strides
,
output_m_k_strides
,
conv_filter_strides
,
conv_filter_dilations
,
input_left_pads
,
input_right_pads
);
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
is_supporting_instance
=
true
;
// re-init output to zero before profiling next kernel
out_device_buf
.
SetZero
();
std
::
string
op_name
=
op_ptr
->
GetTypeString
();
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
float
avg_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
std
::
size_t
num_btype
=
NDoHoWo
*
CZYX
*
(
sizeof
(
OutputDataType
)
+
sizeof
(
InputDataType
));
float
gb_per_sec
=
num_btype
/
1.E6
/
avg_time
;
std
::
cout
<<
"Perf: "
<<
std
::
setw
(
10
)
<<
avg_time
<<
" ms, "
<<
gb_per_sec
<<
" GB/s, "
<<
op_name
<<
std
::
endl
;
if
(
avg_time
<
best_avg_time
)
{
best_op_name
=
op_name
;
best_avg_time
=
avg_time
;
best_gb_per_sec
=
gb_per_sec
;
}
if
(
do_verification
)
{
out_device_buf
.
FromDevice
(
device_output
.
mData
.
data
());
pass
=
pass
&
ck
::
utils
::
check_err
(
device_output
,
host_output
);
if
(
do_log
)
{
LogRangeAsType
<
float
>
(
std
::
cout
<<
"input : "
,
input
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"host_output : "
,
host_output
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"device_output: "
,
device_output
.
mData
,
","
)
<<
std
::
endl
;
}
}
}
else
{
std
::
cout
<<
op_ptr
->
GetTypeString
()
<<
" does not support this problem"
<<
std
::
endl
;
}
}
std
::
cout
<<
"Best configuration parameters:"
<<
"
\n
name: "
<<
best_op_name
<<
"
\n
avg_time: "
<<
best_avg_time
<<
"
\n
GB/s: "
<<
best_gb_per_sec
<<
std
::
endl
;
return
is_supporting_instance
&&
pass
;
}
}
// namespace profiler
}
// namespace ck
profiler/include/profiler/profile_max_pool3d_bwd_impl.hpp
0 → 100644
View file @
705d5a08
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp"
#include "ck/library/tensor_operation_instance/gpu/max_pool_bwd.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_maxpool_bwd.hpp"
namespace
ck
{
namespace
profiler
{
template
<
typename
InDataType
,
typename
OutDataType
,
typename
IndexDataType
,
typename
DOutDataType
,
typename
DInDataType
,
bool
PropagateNan
>
bool
profile_max_pool3d_bwd_impl
(
int
do_verification
,
int
init_method
,
bool
do_log
,
bool
time_kernel
,
std
::
vector
<
index_t
>
in_length
,
// NCDHW
std
::
vector
<
index_t
>
window_spatial_lengths
,
std
::
vector
<
index_t
>
window_strides
,
std
::
vector
<
index_t
>
window_dilations
,
std
::
vector
<
index_t
>
input_left_pads
,
std
::
vector
<
index_t
>
input_right_pads
)
{
// AtomicAdd only support f32 for now. ComputeDataType must be float32
using
ComputeDataType
=
float
;
constexpr
index_t
InOutRank
=
5
;
constexpr
index_t
WindowRank
=
3
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
if
(
in_length
.
size
()
!=
InOutRank
||
window_spatial_lengths
.
size
()
!=
WindowRank
||
window_strides
.
size
()
!=
WindowRank
||
window_dilations
.
size
()
!=
WindowRank
||
input_left_pads
.
size
()
!=
WindowRank
||
input_right_pads
.
size
()
!=
WindowRank
)
{
std
::
cout
<<
"Parameter is incorrect"
<<
std
::
endl
;
return
false
;
}
std
::
vector
<
index_t
>
out_length
(
InOutRank
);
int
N
=
in_length
[
0
];
int
C
=
in_length
[
1
];
out_length
[
0
]
=
N
;
out_length
[
1
]
=
C
;
// Calculate Do, Ho, Wo
for
(
int
i
=
2
;
i
<
InOutRank
;
++
i
)
{
auto
pad1
=
input_left_pads
[
i
-
2
];
auto
pad2
=
input_right_pads
[
i
-
2
];
auto
windows_size
=
window_spatial_lengths
[
i
-
2
];
auto
windows_stride
=
window_strides
[
i
-
2
];
auto
windows_dilation
=
window_dilations
[
i
-
2
];
auto
eff
=
(
windows_size
-
1
)
*
windows_dilation
+
1
;
out_length
[
i
]
=
(
in_length
[
i
]
+
pad1
+
pad2
-
eff
)
/
windows_stride
+
1
;
}
int
Di
=
in_length
[
2
];
int
Hi
=
in_length
[
3
];
int
Wi
=
in_length
[
4
];
int
Do
=
out_length
[
2
];
int
Ho
=
out_length
[
3
];
int
Wo
=
out_length
[
4
];
auto
f_host_tensor_descriptor
=
[](
std
::
size_t
N_
,
std
::
size_t
C_
,
std
::
size_t
D
,
std
::
size_t
H
,
std
::
size_t
W
)
{
using
namespace
ck
::
literals
;
return
HostTensorDescriptor
({
N_
,
C_
,
D
,
H
,
W
},
{
D
*
C_
*
H
*
W
,
1
_uz
,
C_
*
H
*
W
,
W
*
C_
,
C_
});
};
Tensor
<
InDataType
>
in_n_c_di_hi_wi
(
f_host_tensor_descriptor
(
N
,
C
,
Di
,
Hi
,
Wi
));
Tensor
<
OutDataType
>
out_n_c_do_ho_wo
(
f_host_tensor_descriptor
(
N
,
C
,
Do
,
Ho
,
Wo
));
Tensor
<
IndexDataType
>
out_indices_n_c_do_ho_wo
(
f_host_tensor_descriptor
(
N
,
C
,
Do
,
Ho
,
Wo
));
Tensor
<
DOutDataType
>
dout_n_c_do_ho_wo
(
f_host_tensor_descriptor
(
N
,
C
,
Do
,
Ho
,
Wo
));
Tensor
<
DInDataType
>
din_n_c_di_hi_wi_host
(
f_host_tensor_descriptor
(
N
,
C
,
Di
,
Hi
,
Wi
));
Tensor
<
DInDataType
>
din_n_c_di_hi_wi_device
(
f_host_tensor_descriptor
(
N
,
C
,
Di
,
Hi
,
Wi
));
switch
(
init_method
)
{
case
0
:
in_n_c_di_hi_wi
.
GenerateTensorValue
(
GeneratorTensor_1
<
InDataType
>
{});
dout_n_c_do_ho_wo
.
GenerateTensorValue
(
GeneratorTensor_1
<
DOutDataType
>
{});
break
;
case
1
:
in_n_c_di_hi_wi
.
GenerateTensorValue
(
GeneratorTensor_2
<
InDataType
>
{
-
5
,
5
});
dout_n_c_do_ho_wo
.
GenerateTensorValue
(
GeneratorTensor_2
<
DOutDataType
>
{
-
5
,
5
});
break
;
default:
in_n_c_di_hi_wi
.
GenerateTensorValue
(
GeneratorTensor_3
<
InDataType
>
{
-
0.5
,
0.5
});
dout_n_c_do_ho_wo
.
GenerateTensorValue
(
GeneratorTensor_3
<
DOutDataType
>
{
-
0.5
,
0.5
});
}
DeviceMem
indices_device_buf
(
sizeof
(
IndexDataType
)
*
out_indices_n_c_do_ho_wo
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
dout_device_buf
(
sizeof
(
DOutDataType
)
*
dout_n_c_do_ho_wo
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
din_device_buf
(
sizeof
(
DInDataType
)
*
din_n_c_di_hi_wi_device
.
mDesc
.
GetElementSpaceSize
());
// Generate index data from forwarding
{
using
ReferencePoolingFwdInstance
=
ck
::
tensor_operation
::
host
::
ReferencePoolingFwd
<
InOutRank
,
WindowRank
,
InDataType
,
OutDataType
,
ComputeDataType
,
IndexDataType
,
ck
::
ReduceTensorOp
::
MAX
,
false
,
true
>
;
ReferencePoolingFwdInstance
ref_pooling_fwd
;
auto
ref_pooling_fwd_argument
=
ref_pooling_fwd
.
MakeArgument
(
in_n_c_di_hi_wi
,
out_n_c_do_ho_wo
,
out_indices_n_c_do_ho_wo
,
window_spatial_lengths
,
window_strides
,
window_dilations
,
input_left_pads
,
input_right_pads
);
auto
ref_pooling_fwd_invoker
=
ref_pooling_fwd
.
MakeInvoker
();
ref_pooling_fwd_invoker
.
Run
(
ref_pooling_fwd_argument
);
}
indices_device_buf
.
ToDevice
(
out_indices_n_c_do_ho_wo
.
mData
.
data
());
dout_device_buf
.
ToDevice
(
dout_n_c_do_ho_wo
.
mData
.
data
());
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceMaxPoolBwd
<
DOutDataType
,
IndexDataType
,
DInDataType
>
;
// get device op instances
const
auto
instance_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
DeviceOp
>::
GetInstances
();
std
::
cout
<<
"found "
<<
instance_ptrs
.
size
()
<<
" instances"
<<
std
::
endl
;
std
::
string
best_instance_name
;
float
best_avg_time
=
std
::
numeric_limits
<
float
>::
max
();
float
best_gb_per_sec
=
0
;
if
(
do_verification
)
{
using
ReferencePoolingBwdInstance
=
ck
::
tensor_operation
::
host
::
ReferenceMaxPoolBwd
<
DOutDataType
,
IndexDataType
,
ComputeDataType
,
DInDataType
,
PassThrough
>
;
ReferencePoolingBwdInstance
ref_pooling_bwd
;
auto
ref_pooling_bwd_argument
=
ref_pooling_bwd
.
MakeArgument
(
dout_n_c_do_ho_wo
,
out_indices_n_c_do_ho_wo
,
din_n_c_di_hi_wi_host
,
PassThrough
{});
auto
ref_invoker
=
ref_pooling_bwd
.
MakeInvoker
();
ref_invoker
.
Run
(
ref_pooling_bwd_argument
);
}
int
num_kernel
=
0
;
for
(
auto
&
inst_ptr
:
instance_ptrs
)
{
auto
argument_ptr
=
inst_ptr
->
MakeArgumentPointer
(
static_cast
<
DOutDataType
*>
(
dout_device_buf
.
GetDeviceBuffer
()),
static_cast
<
IndexDataType
*>
(
indices_device_buf
.
GetDeviceBuffer
()),
static_cast
<
DInDataType
*>
(
din_device_buf
.
GetDeviceBuffer
()),
dout_n_c_do_ho_wo
.
mDesc
.
GetElementSpaceSize
(),
din_n_c_di_hi_wi_device
.
mDesc
.
GetElementSpaceSize
(),
window_spatial_lengths
,
window_strides
,
window_dilations
);
if
(
inst_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
++
num_kernel
;
}
else
{
if
(
time_kernel
)
{
std
::
cout
<<
inst_ptr
->
GetTypeString
()
<<
" skipped due to unsupported argument: "
;
LogRange
(
std
::
cout
<<
"doutput lengths = "
,
out_length
,
", "
)
<<
std
::
endl
;
}
continue
;
}
size_t
workspace_sz
=
inst_ptr
->
GetWorkSpaceSize
(
argument_ptr
.
get
());
DeviceMem
workspace_device_buf
(
workspace_sz
);
inst_ptr
->
SetWorkSpacePointer
(
argument_ptr
.
get
(),
workspace_device_buf
.
GetDeviceBuffer
());
auto
invoker_ptr
=
inst_ptr
->
MakeInvokerPointer
();
float
avg_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
std
::
size_t
num_bytes
=
dout_n_c_do_ho_wo
.
mDesc
.
GetElementSize
()
*
sizeof
(
DOutDataType
)
+
out_indices_n_c_do_ho_wo
.
mDesc
.
GetElementSize
()
*
sizeof
(
IndexDataType
)
+
din_n_c_di_hi_wi_device
.
mDesc
.
GetElementSize
()
*
sizeof
(
DInDataType
);
float
gb_per_sec
=
num_bytes
/
1.E6
/
avg_time
;
if
(
time_kernel
)
std
::
cout
<<
"Perf: "
<<
std
::
setw
(
10
)
<<
avg_time
<<
" ms, "
<<
gb_per_sec
<<
" GB/s, "
<<
inst_ptr
->
GetTypeString
()
<<
std
::
endl
;
if
(
avg_time
<
best_avg_time
)
{
best_instance_name
=
inst_ptr
->
GetTypeString
();
best_avg_time
=
avg_time
;
best_gb_per_sec
=
gb_per_sec
;
}
if
(
do_verification
)
{
din_device_buf
.
FromDevice
(
din_n_c_di_hi_wi_device
.
mData
.
data
());
bool
pass
=
ck
::
utils
::
check_err
(
din_n_c_di_hi_wi_device
.
mData
,
din_n_c_di_hi_wi_host
.
mData
,
"Error: Incorrect results"
,
1e-3
,
1e-3
);
if
(
do_log
)
{
LogRangeAsType
<
float
>
(
std
::
cout
<<
"out_indices_n_c_do_ho_wo: "
,
out_indices_n_c_do_ho_wo
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"din_n_c_di_hi_wi_device: "
,
din_n_c_di_hi_wi_device
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"din_n_c_di_hi_wi_host: "
,
din_n_c_di_hi_wi_host
.
mData
,
","
)
<<
std
::
endl
;
}
if
(
!
pass
)
{
std
::
cout
<<
inst_ptr
->
GetTypeString
()
<<
" failed verification: "
;
LogRange
(
std
::
cout
<<
"doutput lengths = ["
,
out_length
,
", "
)
<<
"]."
<<
std
::
endl
;
return
false
;
}
else
{
if
(
time_kernel
)
std
::
cout
<<
"pass"
<<
std
::
endl
;
}
}
}
if
(
time_kernel
)
{
LogRange
(
std
::
cout
<<
"length = "
,
out_length
,
","
)
<<
std
::
endl
;
std
::
cout
<<
"best perf = "
<<
best_avg_time
<<
" ms, "
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_instance_name
<<
std
::
endl
;
}
if
(
num_kernel
==
0
)
{
std
::
cout
<<
"Error: No kernel is applicable"
<<
std
::
endl
;
return
false
;
}
return
true
;
}
}
// namespace profiler
}
// namespace ck
profiler/src/CMakeLists.txt
View file @
705d5a08
...
...
@@ -5,6 +5,7 @@ set(PROFILER_SOURCES
profile_gemm_splitk.cpp
profile_gemm_bias_add_reduce.cpp
profile_gemm_add_multiply.cpp
profile_gemm_multiply_add.cpp
profile_gemm_reduce.cpp
profile_batched_gemm.cpp
profile_batched_gemm_reduce.cpp
...
...
@@ -18,6 +19,8 @@ set(PROFILER_SOURCES
profile_groupnorm.cpp
profile_layernorm.cpp
profile_max_pool3d_fwd.cpp
profile_avg_pool3d_bwd.cpp
profile_max_pool3d_bwd.cpp
profile_softmax.cpp
profile_batchnorm_fwd.cpp
profile_batchnorm_bwd.cpp
...
...
@@ -25,6 +28,7 @@ set(PROFILER_SOURCES
profile_contraction_bilinear.cpp
profile_contraction_scale.cpp
profile_grouped_conv_bwd_data.cpp
profile_image_to_column.cpp
)
if
(
DL_KERNELS
)
list
(
APPEND PROFILER_SOURCES profile_batched_gemm_multi_d.cpp
)
...
...
@@ -51,6 +55,7 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE utility)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_splitk_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_add_multiply_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_multiply_add_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_reduce_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_bias_add_reduce_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_batched_gemm_instance
)
...
...
@@ -74,8 +79,11 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batchnorm_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_contraction_bilinear_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_contraction_scale_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_pool3d_fwd_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_avg_pool3d_bwd_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_max_pool_bwd_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_conv2d_bwd_data_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_conv3d_bwd_data_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_image_to_column_instance
)
if
(
DL_KERNELS
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_batched_gemm_multi_d_instance
)
endif
()
...
...
profiler/src/profile_avg_pool3d_bwd.cpp
0 → 100644
View file @
705d5a08
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <vector>
#include <unordered_map>
#include "profiler/data_type_enum.hpp"
#include "profiler/profile_avg_pool3d_bwd_impl.hpp"
#include "profiler_operation_registry.hpp"
using
ck
::
index_t
;
struct
maxPoolbwdArgParser
{
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
int
>>
long_opts
=
{{
"length"
,
{}},
{
"wsize"
,
{}},
{
"wstride"
,
{}},
{
"wdilation"
,
{}},
{
"pad1"
,
{}},
{
"pad2"
,
{}}};
bool
parse_opt
(
int
argc
,
char
*
argv
[],
const
std
::
string
&
key
,
int
i
)
{
if
(
std
::
string
(
"--"
)
+
key
==
argv
[
i
])
{
int
pos
=
i
;
while
(
++
i
<
argc
&&
argv
[
i
][
0
]
!=
'-'
)
{}
int
end
=
i
;
for
(
int
j
=
pos
+
1
;
j
<
end
;
j
++
)
{
long_opts
[
key
].
push_back
(
std
::
stoi
(
argv
[
j
]));
}
return
true
;
}
return
false
;
}
void
operator
()(
int
argc
,
char
*
argv
[])
{
for
(
auto
&
kv
:
long_opts
)
{
for
(
int
i
=
1
;
i
<
argc
;
i
++
)
{
if
(
parse_opt
(
argc
,
argv
,
kv
.
first
,
i
))
break
;
}
}
}
};
void
print_help_avg_pool3d_bwd
()
{
std
::
cout
<<
"arg1: data type (0: fp16; 1: fp32; 5: bf16)
\n
"
<<
"arg2: verification (0: no; 1: yes)
\n
"
<<
"arg3: initialization (0: no init; 1: integer value; 2: decimal value)
\n
"
<<
"arg4: print tensor value (0: no; 1: yes)
\n
"
<<
"arg5: time kernel (0=no, 1=yes)
\n
"
<<
"--length: input tensor length for NCDHW(e.g, --length 2 32 30 30 30)
\n
"
<<
"--wsize: window size for ZYX (e.g, --wsize 2 2 2)
\n
"
<<
"--wstride: window stride for DHW (e.g, --wstride 2 2 2)
\n
"
<<
"--wdilation: window dilation for DHW (e.g, --wdilation 1 1 1)
\n
"
<<
"--pad1: left side of padding in DHW (e.g, --pad1 1 1 1)
\n
"
<<
"--pad2: right side of padding in DHW (e.g, --pad2 1 1 1)
\n
"
<<
"eg: ckProfiler avg_pool3d_bwd 0 1 2 0 1 --length 2 32 30 30 30 --wsize 2 2 2 "
"--wstride 2 2 2 --wdilation 1 1 1 --pad1 1 1 1 --pad2 1 1 1"
<<
std
::
endl
;
}
int
profile_avg_pool3d_bwd
(
int
argc
,
char
*
argv
[])
{
ck
::
DataTypeEnum
data_type
=
ck
::
DataTypeEnum
::
Half
;
bool
do_verification
=
true
;
int
init_method
=
0
;
bool
do_log
=
false
;
bool
time_kernel
=
true
;
std
::
vector
<
index_t
>
in_length
=
{
2
,
32
,
30
,
30
,
30
};
std
::
vector
<
index_t
>
wsize
=
{
2
,
2
,
2
};
std
::
vector
<
index_t
>
wstride
=
{
2
,
2
,
2
};
std
::
vector
<
index_t
>
wdilation
=
{
1
,
1
,
1
};
std
::
vector
<
index_t
>
pad1
=
{
1
,
1
,
1
};
std
::
vector
<
index_t
>
pad2
=
{
1
,
1
,
1
};
if
(
argc
!=
2
&&
argc
!=
33
)
{
print_help_avg_pool3d_bwd
();
return
0
;
}
else
if
(
argc
==
33
)
{
data_type
=
static_cast
<
ck
::
DataTypeEnum
>
(
std
::
stoi
(
argv
[
2
]));
do_verification
=
std
::
stoi
(
argv
[
3
]);
init_method
=
std
::
stoi
(
argv
[
4
]);
do_log
=
std
::
stoi
(
argv
[
5
]);
time_kernel
=
std
::
stoi
(
argv
[
6
]);
// parse the long options
maxPoolbwdArgParser
arg_parser
;
arg_parser
(
argc
,
argv
);
in_length
=
arg_parser
.
long_opts
[
"length"
];
wsize
=
arg_parser
.
long_opts
[
"wsize"
];
wstride
=
arg_parser
.
long_opts
[
"wstride"
];
wdilation
=
arg_parser
.
long_opts
[
"wdilation"
];
pad1
=
arg_parser
.
long_opts
[
"pad1"
];
pad2
=
arg_parser
.
long_opts
[
"pad2"
];
}
#ifdef CK_ENABLE_FP16
using
F16
=
ck
::
half_t
;
#endif
#ifdef CK_ENABLE_BF16
using
BF16
=
ck
::
bhalf_t
;
#endif
#ifdef CK_ENABLE_FP32
using
F32
=
float
;
#endif
using
NDHWC
=
ck
::
tensor_layout
::
convolution
::
NDHWC
;
if
(
false
)
;
#ifdef CK_ENABLE_FP16
else
if
(
data_type
==
ck
::
DataTypeEnum
::
Half
)
{
ck
::
profiler
::
profile_avg_pool3d_bwd_impl
<
F16
,
F16
,
F16
,
NDHWC
,
NDHWC
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
in_length
,
wsize
,
wstride
,
wdilation
,
pad1
,
pad2
);
}
#endif
#ifdef CK_ENABLE_BF16
else
if
(
data_type
==
ck
::
DataTypeEnum
::
BFloat16
)
{
ck
::
profiler
::
profile_avg_pool3d_bwd_impl
<
BF16
,
BF16
,
BF16
,
NDHWC
,
NDHWC
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
in_length
,
wsize
,
wstride
,
wdilation
,
pad1
,
pad2
);
}
#endif
#ifdef CK_ENABLE_FP32
else
if
(
data_type
==
ck
::
DataTypeEnum
::
Float
)
{
ck
::
profiler
::
profile_avg_pool3d_bwd_impl
<
F32
,
F32
,
F32
,
NDHWC
,
NDHWC
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
in_length
,
wsize
,
wstride
,
wdilation
,
pad1
,
pad2
);
}
#endif
else
{
throw
std
::
runtime_error
(
"not implemented yet"
);
}
return
0
;
}
REGISTER_PROFILER_OPERATION
(
"avg_pool3d_bwd"
,
"max_pool bwd"
,
profile_avg_pool3d_bwd
);
profiler/src/profile_gemm_bilinear.cpp
View file @
705d5a08
...
...
@@ -71,6 +71,9 @@ int profile_gemm_bilinear(int argc, char* argv[])
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
I8
=
std
::
int8_t
;
using
I32
=
std
::
int32_t
;
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
...
...
@@ -141,6 +144,22 @@ int profile_gemm_bilinear(int argc, char* argv[])
{
return
profile
(
F16
{},
F16
{},
F32
{},
F16
{},
F16
{},
Col
{},
Col
{},
Row
{},
Row
{});
}
else
if
(
data_type
==
MatrixDataType
::
INT8_INT8_INT8_INT8
&&
layout
==
MatrixLayout
::
MK_KN_MN_MN
)
{
return
profile
(
I8
{},
I8
{},
I32
{},
I8
{},
I8
{},
Row
{},
Row
{},
Row
{},
Row
{});
}
else
if
(
data_type
==
MatrixDataType
::
INT8_INT8_INT8_INT8
&&
layout
==
MatrixLayout
::
MK_NK_MN_MN
)
{
return
profile
(
I8
{},
I8
{},
I32
{},
I8
{},
I8
{},
Row
{},
Col
{},
Row
{},
Row
{});
}
else
if
(
data_type
==
MatrixDataType
::
INT8_INT8_INT8_INT8
&&
layout
==
MatrixLayout
::
KM_KN_MN_MN
)
{
return
profile
(
I8
{},
I8
{},
I32
{},
I8
{},
I8
{},
Col
{},
Row
{},
Row
{},
Row
{});
}
else
if
(
data_type
==
MatrixDataType
::
INT8_INT8_INT8_INT8
&&
layout
==
MatrixLayout
::
KM_NK_MN_MN
)
{
return
profile
(
I8
{},
I8
{},
I32
{},
I8
{},
I8
{},
Col
{},
Col
{},
Row
{},
Row
{});
}
else
{
std
::
cout
<<
"this data_type & layout is not implemented"
<<
std
::
endl
;
...
...
profiler/src/profile_gemm_multiply_add.cpp
0 → 100644
View file @
705d5a08
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/profile_gemm_multiply_add_impl.hpp"
#include "profiler_operation_registry.hpp"
#define OP_NAME "gemm_multiply_add"
#define OP_DESC "GEMM+MULTIPLY+ADD"
int
profile_gemm_multiply_add
(
int
argc
,
char
*
argv
[])
{
enum
struct
MatrixLayout
{
MK_KN_MN_MN_MN
,
// 0
MK_NK_MN_MN_MN
,
// 1
};
enum
struct
MatrixDataType
{
F16_F16_F16_F16_F16
,
// 0
F16_F8_F32_F32_F16
,
// 1
};
if
(
argc
!=
16
)
{
// clang-format off
printf
(
"arg1: tensor operation ("
OP_NAME
": "
OP_DESC
")
\n
"
);
printf
(
"arg2: data type (0: fp16; 1: fp16Afp8B)
\n
"
);
printf
(
"arg3: matrix layout (0: E[m, n] = Multiply_Add((A[m, k] * B[k, n]) x D1[m, n] + D0[m, n]);
\n
"
);
printf
(
" 1: E[m, n] = Multiply_Add((A[m, k] * B[n, k]) x D1[m, n] + D0[m, n]);
\n
"
);
printf
(
"arg4: verification (0: no; 1: yes)
\n
"
);
printf
(
"arg5: initialization (0: no init; 1: integer value; 2: decimal value)
\n
"
);
printf
(
"arg6: print tensor value (0: no; 1: yes)
\n
"
);
printf
(
"arg7: time kernel (0=no, 1=yes)
\n
"
);
printf
(
"arg8 to 15: M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE
\n
"
);
// clang-format on
exit
(
1
);
}
const
auto
data_type
=
static_cast
<
MatrixDataType
>
(
std
::
stoi
(
argv
[
2
]));
const
auto
layout
=
static_cast
<
MatrixLayout
>
(
std
::
stoi
(
argv
[
3
]));
const
bool
do_verification
=
std
::
stoi
(
argv
[
4
]);
const
int
init_method
=
std
::
stoi
(
argv
[
5
]);
const
bool
do_log
=
std
::
stoi
(
argv
[
6
]);
const
bool
time_kernel
=
std
::
stoi
(
argv
[
7
]);
const
int
M
=
std
::
stoi
(
argv
[
8
]);
const
int
N
=
std
::
stoi
(
argv
[
9
]);
const
int
K
=
std
::
stoi
(
argv
[
10
]);
const
int
StrideA
=
std
::
stoi
(
argv
[
11
]);
const
int
StrideB
=
std
::
stoi
(
argv
[
12
]);
const
int
StrideD0
=
std
::
stoi
(
argv
[
13
]);
const
int
StrideD1
=
std
::
stoi
(
argv
[
14
]);
const
int
StrideE
=
std
::
stoi
(
argv
[
15
]);
using
F8
=
ck
::
f8_t
;
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
auto
profile
=
[
&
](
auto
a_type
,
auto
b_type
,
auto
acc_type
,
auto
d0_type
,
auto
d1_type
,
auto
e_type
,
auto
a_layout
,
auto
b_layout
,
auto
d0_layout
,
auto
d1_layout
,
auto
e_layout
)
{
using
ADataType
=
decltype
(
a_type
);
using
BDataType
=
decltype
(
b_type
);
using
AccDataType
=
decltype
(
acc_type
);
using
D0DataType
=
decltype
(
d0_type
);
using
D1DataType
=
decltype
(
d1_type
);
using
EDataType
=
decltype
(
e_type
);
using
ALayout
=
decltype
(
a_layout
);
using
BLayout
=
decltype
(
b_layout
);
using
D0Layout
=
decltype
(
d0_layout
);
using
D1Layout
=
decltype
(
d1_layout
);
using
ELayout
=
decltype
(
e_layout
);
const
int
DefaultStrideA
=
ck
::
is_same_v
<
ALayout
,
Row
>
?
K
:
M
;
const
int
DefaultStrideB
=
ck
::
is_same_v
<
BLayout
,
Row
>
?
N
:
K
;
const
int
DefaultStrideD0
=
ck
::
is_same_v
<
D0Layout
,
Row
>
?
N
:
M
;
const
int
DefaultStrideD1
=
ck
::
is_same_v
<
D1Layout
,
Row
>
?
N
:
M
;
const
int
DefaultStrideE
=
ck
::
is_same_v
<
ELayout
,
Row
>
?
N
:
M
;
bool
pass
=
ck
::
profiler
::
profile_gemm_multiply_add_impl
<
ADataType
,
BDataType
,
AccDataType
,
D0DataType
,
D1DataType
,
EDataType
,
ALayout
,
BLayout
,
D0Layout
,
D1Layout
,
ELayout
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
M
,
N
,
K
,
(
StrideA
<
0
)
?
DefaultStrideA
:
StrideA
,
(
StrideB
<
0
)
?
DefaultStrideB
:
StrideB
,
(
StrideD0
<
0
)
?
DefaultStrideD0
:
StrideD0
,
(
StrideD1
<
0
)
?
DefaultStrideD1
:
StrideD1
,
(
StrideE
<
0
)
?
DefaultStrideE
:
StrideE
);
return
pass
?
0
:
1
;
};
if
(
data_type
==
MatrixDataType
::
F16_F16_F16_F16_F16
&&
layout
==
MatrixLayout
::
MK_KN_MN_MN_MN
)
{
return
profile
(
F16
{},
F16
{},
F32
{},
F16
{},
F16
{},
F16
{},
Row
{},
Row
{},
Row
{},
Row
{},
Row
{});
}
else
if
(
data_type
==
MatrixDataType
::
F16_F16_F16_F16_F16
&&
layout
==
MatrixLayout
::
MK_NK_MN_MN_MN
)
{
return
profile
(
F16
{},
F16
{},
F32
{},
F16
{},
F16
{},
F16
{},
Row
{},
Col
{},
Row
{},
Row
{},
Row
{});
}
else
if
(
data_type
==
MatrixDataType
::
F16_F8_F32_F32_F16
&&
layout
==
MatrixLayout
::
MK_KN_MN_MN_MN
)
{
return
profile
(
F16
{},
F8
{},
F32
{},
F32
{},
F32
{},
F16
{},
Row
{},
Row
{},
Row
{},
Row
{},
Row
{});
}
else
if
(
data_type
==
MatrixDataType
::
F16_F8_F32_F32_F16
&&
layout
==
MatrixLayout
::
MK_NK_MN_MN_MN
)
{
return
profile
(
F16
{},
F8
{},
F32
{},
F32
{},
F32
{},
F16
{},
Row
{},
Col
{},
Row
{},
Row
{},
Row
{});
}
else
{
std
::
cout
<<
"this data_type & layout is not implemented"
<<
std
::
endl
;
return
1
;
}
}
REGISTER_PROFILER_OPERATION
(
OP_NAME
,
OP_DESC
,
profile_gemm_multiply_add
);
profiler/src/profile_image_to_column.cpp
0 → 100644
View file @
705d5a08
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/profile_image_to_column_impl.hpp"
#include "profiler_operation_registry.hpp"
namespace
{
enum
struct
ConvLayout
{
NHWC
,
// 0
};
enum
struct
DataType
{
F32_F32
,
// 0
F16_F16
,
// 1
BF16_BF16
,
// 2
INT8_INT8
,
// 3
};
#define OP_NAME "image_to_column"
#define OP_DESC "Image To Column"
static
void
print_helper_msg
()
{
std
::
cout
// clang-format off
<<
"arg1: tensor operation ("
OP_NAME
": "
OP_DESC
")
\n
"
<<
"arg2: data type (0: Input fp32, Weight fp32, Output fp32
\n
"
<<
" 1: Input fp16, Weight fp16, Output fp16
\n
"
<<
" 2: Input bf16, Weight bf16, Output bf16
\n
"
<<
" 3: Input int8, Weight int8, Output int8)
\n
"
<<
"arg3: tensor layout (0: Input[N, Hi, Wi, C], Output[N * Ho * Wo, Y * X * C])
\n
"
<<
"arg4: verification (0: no, 1: yes)
\n
"
<<
"arg5: initialization (0: no init, 1: integer value, 2: decimal value)
\n
"
<<
"arg6: print tensor value (0: no; 1: yes)
\n
"
<<
"arg7: time kernel (0: no, 1: yes)
\n
"
<<
ck
::
utils
::
conv
::
get_conv_param_parser_helper_msg
()
<<
std
::
endl
;
// clang-format on
}
}
// namespace
int
profile_image_to_column
(
int
argc
,
char
*
argv
[])
{
// 8 for control, 1 for num_dim_spatial
if
(
argc
<
9
)
{
print_helper_msg
();
return
1
;
}
const
auto
data_type
=
static_cast
<
DataType
>
(
std
::
stoi
(
argv
[
2
]));
const
auto
layout
=
static_cast
<
ConvLayout
>
(
std
::
stoi
(
argv
[
3
]));
const
bool
do_verification
=
std
::
stoi
(
argv
[
4
]);
const
int
init_method
=
std
::
stoi
(
argv
[
5
]);
const
bool
do_log
=
std
::
stoi
(
argv
[
6
]);
const
bool
time_kernel
=
std
::
stoi
(
argv
[
7
]);
const
int
num_dim_spatial
=
std
::
stoi
(
argv
[
8
]);
// 8 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial
if
(
argc
!=
8
+
1
+
4
+
6
*
num_dim_spatial
)
{
print_helper_msg
();
return
1
;
}
const
auto
params
=
ck
::
utils
::
conv
::
parse_conv_param
(
num_dim_spatial
,
9
,
argv
);
using
F32
=
float
;
using
F16
=
ck
::
half_t
;
using
BF16
=
ck
::
bhalf_t
;
using
INT8
=
int8_t
;
using
namespace
ck
::
tensor_layout
::
convolution
;
constexpr
auto
I1
=
ck
::
Number
<
1
>
{};
constexpr
auto
I2
=
ck
::
Number
<
2
>
{};
constexpr
auto
I3
=
ck
::
Number
<
3
>
{};
auto
profile
=
[
&
](
auto
num_dim_spatial_tmp
,
auto
in_layout
,
auto
in_type
,
auto
out_type
)
{
constexpr
ck
::
index_t
NDimSpatial
=
num_dim_spatial_tmp
.
value
;
using
InLayout
=
decltype
(
in_layout
);
using
InDataType
=
decltype
(
in_type
);
using
OutDataType
=
decltype
(
out_type
);
bool
pass
=
ck
::
profiler
::
profile_image_to_column_impl
<
NDimSpatial
,
InLayout
,
InDataType
,
OutDataType
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
params
);
return
pass
?
0
:
1
;
};
// NHWC
if
(
layout
==
ConvLayout
::
NHWC
)
{
if
(
num_dim_spatial
==
1
)
{
if
(
data_type
==
DataType
::
F32_F32
)
{
return
profile
(
I1
,
GNWC
{},
F32
{},
F32
{});
}
else
if
(
data_type
==
DataType
::
F16_F16
)
{
return
profile
(
I1
,
GNWC
{},
F16
{},
F16
{});
}
else
if
(
data_type
==
DataType
::
BF16_BF16
)
{
return
profile
(
I1
,
GNWC
{},
BF16
{},
BF16
{});
}
else
if
(
data_type
==
DataType
::
INT8_INT8
)
{
return
profile
(
I1
,
GNWC
{},
INT8
{},
INT8
{});
}
}
else
if
(
num_dim_spatial
==
2
)
{
if
(
data_type
==
DataType
::
F32_F32
)
{
return
profile
(
I2
,
GNHWC
{},
F32
{},
F32
{});
}
else
if
(
data_type
==
DataType
::
F16_F16
)
{
return
profile
(
I2
,
GNHWC
{},
F16
{},
F16
{});
}
else
if
(
data_type
==
DataType
::
BF16_BF16
)
{
return
profile
(
I2
,
GNHWC
{},
BF16
{},
BF16
{});
}
else
if
(
data_type
==
DataType
::
INT8_INT8
)
{
return
profile
(
I2
,
GNHWC
{},
INT8
{},
INT8
{});
}
}
else
if
(
num_dim_spatial
==
3
)
{
if
(
data_type
==
DataType
::
F32_F32
)
{
return
profile
(
I3
,
GNDHWC
{},
F32
{},
F32
{});
}
else
if
(
data_type
==
DataType
::
F16_F16
)
{
return
profile
(
I3
,
GNDHWC
{},
F16
{},
F16
{});
}
else
if
(
data_type
==
DataType
::
BF16_BF16
)
{
return
profile
(
I3
,
GNDHWC
{},
BF16
{},
BF16
{});
}
else
if
(
data_type
==
DataType
::
INT8_INT8
)
{
return
profile
(
I3
,
GNDHWC
{},
INT8
{},
INT8
{});
}
}
}
std
::
cout
<<
"this data_type & layout is not implemented"
<<
std
::
endl
;
return
1
;
}
REGISTER_PROFILER_OPERATION
(
OP_NAME
,
OP_DESC
,
profile_image_to_column
);
profiler/src/profile_max_pool3d_bwd.cpp
0 → 100644
View file @
705d5a08
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <vector>
#include <unordered_map>
#include "profiler/data_type_enum.hpp"
#include "profiler/profile_max_pool3d_bwd_impl.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "profiler_operation_registry.hpp"
using
ck
::
index_t
;
struct
maxPoolbwdArgParser
{
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
int
>>
long_opts
=
{{
"length"
,
{}},
{
"wsize"
,
{}},
{
"wstride"
,
{}},
{
"wdilation"
,
{}},
{
"pad1"
,
{}},
{
"pad2"
,
{}}};
bool
parse_opt
(
int
argc
,
char
*
argv
[],
const
std
::
string
&
key
,
int
i
)
{
if
(
std
::
string
(
"--"
)
+
key
==
argv
[
i
])
{
int
pos
=
i
;
while
(
++
i
<
argc
&&
argv
[
i
][
0
]
!=
'-'
)
{}
int
end
=
i
;
for
(
int
j
=
pos
+
1
;
j
<
end
;
j
++
)
{
long_opts
[
key
].
push_back
(
std
::
stoi
(
argv
[
j
]));
}
return
true
;
}
return
false
;
}
void
operator
()(
int
argc
,
char
*
argv
[])
{
for
(
auto
&
kv
:
long_opts
)
{
for
(
int
i
=
1
;
i
<
argc
;
i
++
)
{
if
(
parse_opt
(
argc
,
argv
,
kv
.
first
,
i
))
break
;
}
}
}
};
void
print_help_max_pool3d_bwd
()
{
std
::
cout
<<
"arg1: data type (0: fp16; 1: fp32; 5: bf16)
\n
"
<<
"arg2: verification (0: no; 1: yes)
\n
"
<<
"arg3: initialization (0: no init; 1: integer value; 2: decimal value)
\n
"
<<
"arg4: print tensor value (0: no; 1: yes)
\n
"
<<
"arg5: time kernel (0=no, 1=yes)
\n
"
<<
"--length: input tensor length for NCDHW(e.g, --length 2 32 30 30 30)
\n
"
<<
"--wsize: window size for ZYX (e.g, --wsize 2 2 2)
\n
"
<<
"--wstride: window stride for DHW (e.g, --wstride 2 2 2)
\n
"
<<
"--wdilation: window dilation for DHW (e.g, --wdilation 1 1 1)
\n
"
<<
"--pad1: left side of padding in DHW (e.g, --pad1 1 1 1)
\n
"
<<
"--pad2: right side of padding in DHW (e.g, --pad2 1 1 1)
\n
"
<<
"eg: ckProfiler max_pool3d_bwd 0 1 2 0 1 --length 2 32 30 30 30 --wsize 2 2 2 "
"--wstride 2 2 2 --wdilation 1 1 1 --pad1 1 1 1 --pad2 1 1 1"
<<
std
::
endl
;
}
int
profile_max_pool3d_bwd
(
int
argc
,
char
*
argv
[])
{
ck
::
DataTypeEnum
data_type
=
ck
::
DataTypeEnum
::
Half
;
bool
do_verification
=
true
;
int
init_method
=
0
;
bool
do_log
=
false
;
bool
time_kernel
=
true
;
std
::
vector
<
index_t
>
in_length
=
{
2
,
32
,
30
,
30
,
30
};
std
::
vector
<
index_t
>
wsize
=
{
2
,
2
,
2
};
std
::
vector
<
index_t
>
wstride
=
{
2
,
2
,
2
};
std
::
vector
<
index_t
>
wdilation
=
{
1
,
1
,
1
};
std
::
vector
<
index_t
>
pad1
=
{
1
,
1
,
1
};
std
::
vector
<
index_t
>
pad2
=
{
1
,
1
,
1
};
if
(
argc
!=
2
&&
argc
!=
33
)
{
print_help_max_pool3d_bwd
();
return
0
;
}
else
if
(
argc
==
33
)
{
data_type
=
static_cast
<
ck
::
DataTypeEnum
>
(
std
::
stoi
(
argv
[
2
]));
do_verification
=
std
::
stoi
(
argv
[
3
]);
init_method
=
std
::
stoi
(
argv
[
4
]);
do_log
=
std
::
stoi
(
argv
[
5
]);
time_kernel
=
std
::
stoi
(
argv
[
6
]);
// parse the long options
maxPoolbwdArgParser
arg_parser
;
arg_parser
(
argc
,
argv
);
in_length
=
arg_parser
.
long_opts
[
"length"
];
wsize
=
arg_parser
.
long_opts
[
"wsize"
];
wstride
=
arg_parser
.
long_opts
[
"wstride"
];
wdilation
=
arg_parser
.
long_opts
[
"wdilation"
];
pad1
=
arg_parser
.
long_opts
[
"pad1"
];
pad2
=
arg_parser
.
long_opts
[
"pad2"
];
}
#ifdef CK_ENABLE_FP16
using
F16
=
ck
::
half_t
;
#endif
#ifdef CK_ENABLE_BF16
using
BF16
=
ck
::
bhalf_t
;
#endif
#ifdef CK_ENABLE_FP32
using
F32
=
float
;
#endif
using
I32
=
int32_t
;
if
(
false
)
;
#ifdef CK_ENABLE_FP16
else
if
(
data_type
==
ck
::
DataTypeEnum
::
Half
)
{
ck
::
profiler
::
profile_max_pool3d_bwd_impl
<
F16
,
F16
,
I32
,
F16
,
F16
,
false
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
in_length
,
wsize
,
wstride
,
wdilation
,
pad1
,
pad2
);
}
#endif
#ifdef CK_ENABLE_BF16
else
if
(
data_type
==
ck
::
DataTypeEnum
::
BFloat16
)
{
ck
::
profiler
::
profile_max_pool3d_bwd_impl
<
BF16
,
BF16
,
I32
,
BF16
,
BF16
,
false
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
in_length
,
wsize
,
wstride
,
wdilation
,
pad1
,
pad2
);
}
#endif
#ifdef CK_ENABLE_FP32
else
if
(
data_type
==
ck
::
DataTypeEnum
::
Float
)
{
ck
::
profiler
::
profile_max_pool3d_bwd_impl
<
F32
,
F32
,
I32
,
F32
,
F32
,
false
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
in_length
,
wsize
,
wstride
,
wdilation
,
pad1
,
pad2
);
}
#endif
else
{
throw
std
::
runtime_error
(
"not implemented yet"
);
}
return
0
;
}
REGISTER_PROFILER_OPERATION
(
"max_pool3d_bwd"
,
"max_pool3d bwd"
,
profile_max_pool3d_bwd
);
profiler/src/profile_max_pool3d_fwd.cpp
View file @
705d5a08
...
...
@@ -51,7 +51,7 @@ struct maxPoolFwdArgParser
void
print_help_max_pool3d_fwd
()
{
std
::
cout
<<
"arg1: data type (0: fp16; 1: fp32)
\n
"
std
::
cout
<<
"arg1: data type (0: fp16; 1: fp32
; 5: bf16
)
\n
"
<<
"arg2: verification (0: no; 1: yes)
\n
"
<<
"arg3: initialization (0: no init; 1: integer value; 2: decimal value)
\n
"
<<
"arg4: print tensor value (0: no; 1: yes)
\n
"
...
...
@@ -109,8 +109,15 @@ int profile_max_pool3d_fwd(int argc, char* argv[])
pad2
=
arg_parser
.
long_opts
[
"pad2"
];
}
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
#ifdef CK_ENABLE_FP16
using
F16
=
ck
::
half_t
;
#endif
#ifdef CK_ENABLE_BF16
using
BF16
=
ck
::
bhalf_t
;
#endif
#ifdef CK_ENABLE_FP32
using
F32
=
float
;
#endif
using
I32
=
int32_t
;
using
NDHWC
=
ck
::
tensor_layout
::
convolution
::
NDHWC
;
...
...
@@ -120,7 +127,10 @@ int profile_max_pool3d_fwd(int argc, char* argv[])
constexpr
auto
ReduceOpId
=
ck
::
ReduceTensorOp
::
AVG
;
#endif
if
(
data_type
==
ck
::
DataTypeEnum
::
Half
)
if
(
false
)
;
#ifdef CK_ENABLE_FP16
else
if
(
data_type
==
ck
::
DataTypeEnum
::
Half
)
{
if
(
return_index
)
ck
::
profiler
::
...
...
@@ -149,6 +159,51 @@ int profile_max_pool3d_fwd(int argc, char* argv[])
pad1
,
pad2
);
}
#endif
#ifdef CK_ENABLE_BF16
else
if
(
data_type
==
ck
::
DataTypeEnum
::
BFloat16
)
{
if
(
return_index
)
ck
::
profiler
::
profile_pool3d_fwd_impl
<
BF16
,
BF16
,
BF16
,
I32
,
NDHWC
,
NDHWC
,
ReduceOpId
,
false
,
true
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
in_length
,
wsize
,
wstride
,
wdilation
,
pad1
,
pad2
);
else
ck
::
profiler
::
profile_pool3d_fwd_impl
<
BF16
,
BF16
,
BF16
,
I32
,
NDHWC
,
NDHWC
,
ReduceOpId
,
false
,
false
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
in_length
,
wsize
,
wstride
,
wdilation
,
pad1
,
pad2
);
}
#endif
#ifdef CK_ENABLE_FP32
else
if
(
data_type
==
ck
::
DataTypeEnum
::
Float
)
{
if
(
return_index
)
...
...
@@ -178,6 +233,7 @@ int profile_max_pool3d_fwd(int argc, char* argv[])
pad1
,
pad2
);
}
#endif
else
{
throw
std
::
runtime_error
(
"not implemented yet"
);
...
...
script/cmake-ck-dev.sh
View file @
705d5a08
...
...
@@ -16,4 +16,3 @@ cmake
-D
CMAKE_VERBOSE_MAKEFILE:BOOL
=
ON
\
-D
USE_BITINT_EXTENSION_INT4
=
OFF
\
${
MY_PROJECT_SOURCE
}
test/CMakeLists.txt
View file @
705d5a08
...
...
@@ -57,9 +57,10 @@ add_subdirectory(data_type)
add_subdirectory
(
elementwise_normalization
)
add_subdirectory
(
batchnorm
)
add_subdirectory
(
contraction
)
add_subdirectory
(
pool
_fwd
)
add_subdirectory
(
pool
)
add_subdirectory
(
batched_gemm_multi_d
)
add_subdirectory
(
grouped_convnd_bwd_data
)
add_subdirectory
(
image_to_column
)
if
(
GPU_TARGETS MATCHES
"gfx11"
)
add_subdirectory
(
wmma_op
)
endif
()
test/image_to_column/CMakeLists.txt
0 → 100644
View file @
705d5a08
add_gtest_executable
(
test_image_to_column test_image_to_column.cpp
)
target_link_libraries
(
test_image_to_column PRIVATE utility device_image_to_column_instance
)
add_gtest_executable
(
test_image_to_column_interface test_image_to_column_interface.cpp
)
target_link_libraries
(
test_image_to_column_interface PRIVATE utility
)
Prev
1
…
4
5
6
7
8
9
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment