Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
c7c47fd7
Commit
c7c47fd7
authored
Aug 10, 2023
by
Bartlomiej Wroblewski
Browse files
Merge branch 'develop' into bwroblew/dpp8
parents
f8eb91d7
578142db
Changes
183
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
459 additions
and
54 deletions
+459
-54
example/37_batched_gemm_add_add_relu_gemm_add/CMakeLists.txt
example/37_batched_gemm_add_add_relu_gemm_add/CMakeLists.txt
+3
-1
example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt
example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt
+3
-1
example/39_permute/CMakeLists.txt
example/39_permute/CMakeLists.txt
+9
-7
example/40_conv2d_fwd_quantization/CMakeLists.txt
example/40_conv2d_fwd_quantization/CMakeLists.txt
+14
-16
example/41_grouped_conv_conv_fwd/CMakeLists.txt
example/41_grouped_conv_conv_fwd/CMakeLists.txt
+12
-4
example/42_groupnorm/CMakeLists.txt
example/42_groupnorm/CMakeLists.txt
+5
-3
example/43_splitk_gemm_bias_e_permute/CMakeLists.txt
example/43_splitk_gemm_bias_e_permute/CMakeLists.txt
+6
-2
example/44_elementwise_permute/CMakeLists.txt
example/44_elementwise_permute/CMakeLists.txt
+4
-2
example/46_gemm_add_multiply/CMakeLists.txt
example/46_gemm_add_multiply/CMakeLists.txt
+6
-2
example/48_pool3d_fwd/CMakeLists.txt
example/48_pool3d_fwd/CMakeLists.txt
+3
-2
example/49_maxpool2d_bwd/CMakeLists.txt
example/49_maxpool2d_bwd/CMakeLists.txt
+9
-3
example/50_put_element/CMakeLists.txt
example/50_put_element/CMakeLists.txt
+3
-1
example/51_avgpool3d_bwd/CMakeLists.txt
example/51_avgpool3d_bwd/CMakeLists.txt
+3
-0
example/51_avgpool3d_bwd/avgpool3d_bwd_bf16.cpp
example/51_avgpool3d_bwd/avgpool3d_bwd_bf16.cpp
+62
-0
example/51_avgpool3d_bwd/avgpool3d_bwd_common.hpp
example/51_avgpool3d_bwd/avgpool3d_bwd_common.hpp
+147
-0
example/51_avgpool3d_bwd/avgpool3d_bwd_fp16.cpp
example/51_avgpool3d_bwd/avgpool3d_bwd_fp16.cpp
+62
-0
example/51_avgpool3d_bwd/avgpool3d_bwd_fp32.cpp
example/51_avgpool3d_bwd/avgpool3d_bwd_fp32.cpp
+62
-0
include/ck/ck.hpp
include/ck/ck.hpp
+1
-1
include/ck/tensor_operation/gpu/device/device_avgpool_bwd.hpp
...ude/ck/tensor_operation/gpu/device/device_avgpool_bwd.hpp
+39
-0
include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight.hpp
...r_operation/gpu/device/device_grouped_conv_bwd_weight.hpp
+6
-9
No files found.
example/37_batched_gemm_add_add_relu_gemm_add/CMakeLists.txt
View file @
c7c47fd7
add_example_executable
(
example_batched_gemm_add_add_relu_gemm_add_xdl_fp16 batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp
)
if
(
DTYPES MATCHES
"fp16"
OR NOT DEFINED DTYPES
)
add_example_executable
(
example_batched_gemm_add_add_relu_gemm_add_xdl_fp16 batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp
)
endif
()
example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt
View file @
c7c47fd7
if
(
DTYPES MATCHES
"fp16"
OR NOT DEFINED DTYPES
)
list
(
APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942
)
list
(
APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942
)
set
(
target 0
)
set
(
target 0
)
foreach
(
gpu IN LISTS GPU_TARGETS
)
foreach
(
gpu IN LISTS GPU_TARGETS
)
...
@@ -10,4 +11,5 @@ foreach(gpu IN LISTS GPU_TARGETS)
...
@@ -10,4 +11,5 @@ foreach(gpu IN LISTS GPU_TARGETS)
add_dependencies
(
example_grouped_conv_bwd_data example_grouped_conv_bwd_data_bias_relu_fp16
)
add_dependencies
(
example_grouped_conv_bwd_data example_grouped_conv_bwd_data_bias_relu_fp16
)
set
(
target 1
)
set
(
target 1
)
endif
()
endif
()
endforeach
()
endforeach
()
\ No newline at end of file
endif
()
example/39_permute/CMakeLists.txt
View file @
c7c47fd7
add_custom_target
(
example_permute
)
if
(
DTYPES MATCHES
"fp16"
OR NOT DEFINED DTYPES
)
add_custom_target
(
example_permute
)
add_example_executable
(
example_permute_1xHxW_fp16 permute_1xHxW_fp16.cpp
)
add_example_executable
(
example_permute_1xHxW_fp16 permute_1xHxW_fp16.cpp
)
add_example_executable
(
example_permute_NxHxW_fp16 permute_NxHxW_fp16.cpp
)
add_example_executable
(
example_permute_NxHxW_fp16 permute_NxHxW_fp16.cpp
)
add_example_executable
(
example_permute_HxWx4_fp16 permute_HxWx4_fp16.cpp
)
add_example_executable
(
example_permute_HxWx4_fp16 permute_HxWx4_fp16.cpp
)
add_dependencies
(
example_permute example_permute_1xHxW_fp16
)
add_dependencies
(
example_permute example_permute_1xHxW_fp16
)
add_dependencies
(
example_permute example_permute_NxHxW_fp16
)
add_dependencies
(
example_permute example_permute_NxHxW_fp16
)
add_dependencies
(
example_permute example_permute_HxWx4_fp16
)
add_dependencies
(
example_permute example_permute_HxWx4_fp16
)
endif
()
example/40_conv2d_fwd_quantization/CMakeLists.txt
View file @
c7c47fd7
...
@@ -10,21 +10,19 @@ foreach(gpu IN LISTS GPU_TARGETS)
...
@@ -10,21 +10,19 @@ foreach(gpu IN LISTS GPU_TARGETS)
set
(
target 1
)
set
(
target 1
)
endif
()
endif
()
endforeach
()
endforeach
()
# Conv perlayer quantization
add_example_executable
(
example_conv2d_fwd_dl_perlayer_quantization_int8 conv2d_fwd_dl_perlayer_quantization_int8.cpp
)
# Conv perchannel quantization
if
(
DL_KERNELS
)
add_example_executable
(
example_conv2d_fwd_dl_perchannel_quantization_int8 conv2d_fwd_dl_perchannel_
quantization
_int8.cpp
)
# Conv perlayer
quantization
add_example_executable
(
example_conv2d_fwd_dl_perlayer_quantization_int8 conv2d_fwd_dl_perlayer_quantization_int8.cpp
)
# Conv
+ bias + relu perlayer
quantization
# Conv
perchannel
quantization
add_example_executable
(
example_conv2d_fwd_dl_
bias_relu_perlayer
_quantization_int8 conv2d_fwd_dl_
bias_relu_perlayer
_quantization_int8.cpp
)
add_example_executable
(
example_conv2d_fwd_dl_
perchannel
_quantization_int8 conv2d_fwd_dl_
perchannel
_quantization_int8.cpp
)
# Conv + bias + relu perlayer quantization
# Conv +
bias
+
relu
per
channel
quantization
add_example_executable
(
example_conv2d_fwd_dl_
bias
_
relu
_
per
layer_quantization_int8 conv2d_fwd_dl_bias_relu_perlayer_
quantization
_int8.cpp
)
add_example_executable
(
example_conv2d_fwd_dl_
bias
_
relu
_
perchannel
_quantization_int8 conv2d_fwd_dl_bias_relu_perchannel_
quantization
_int8.cpp
)
# Conv +
bias
+
relu
perchannel
quantization
add_example_executable
(
example_conv2d_fwd_dl_bias_relu_perchannel_quantization_int8 conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp
)
# Conv + bias + tanh perlayer quantization
# Conv + bias + tanh perlayer quantization
add_example_executable
(
example_conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8 conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp
)
add_example_executable
(
example_conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8 conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp
)
# Conv + bias + tanh perchannel quantization
# Conv +
bias
+
tanh
perchannel
quantization
add_example_executable
(
example_conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8 conv2d_fwd_dl_
bias
_
tanh
_
perchannel
_
quantization
_int8.cpp
)
add_example_executable
(
example_conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8 conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp
)
endif
(
)
endif
()
endif
()
\ No newline at end of file
example/41_grouped_conv_conv_fwd/CMakeLists.txt
View file @
c7c47fd7
...
@@ -3,9 +3,15 @@ list(APPEND gpu_list2 gfx908 gfx90a)
...
@@ -3,9 +3,15 @@ list(APPEND gpu_list2 gfx908 gfx90a)
set
(
target 0
)
set
(
target 0
)
foreach
(
gpu IN LISTS GPU_TARGETS
)
foreach
(
gpu IN LISTS GPU_TARGETS
)
if
(
gpu IN_LIST gpu_list1 AND target EQUAL 0
)
if
(
gpu IN_LIST gpu_list1 AND target EQUAL 0
)
add_example_executable
(
example_grouped_conv_conv_fwd_xdl_fp32 grouped_conv_conv_fwd_xdl_fp32.cpp
)
if
(
DTYPES MATCHES
"fp32"
OR NOT DEFINED DTYPES
)
add_example_executable
(
example_grouped_conv_conv_fwd_xdl_fp16 grouped_conv_conv_fwd_xdl_fp16.cpp
)
add_example_executable
(
example_grouped_conv_conv_fwd_xdl_fp32 grouped_conv_conv_fwd_xdl_fp32.cpp
)
add_example_executable
(
example_grouped_conv_conv_fwd_xdl_bf16 grouped_conv_conv_fwd_xdl_bf16.cpp
)
endif
()
if
(
DTYPES MATCHES
"fp16"
OR NOT DEFINED DTYPES
)
add_example_executable
(
example_grouped_conv_conv_fwd_xdl_fp16 grouped_conv_conv_fwd_xdl_fp16.cpp
)
endif
()
if
(
DTYPES MATCHES
"bf16"
OR NOT DEFINED DTYPES
)
add_example_executable
(
example_grouped_conv_conv_fwd_xdl_bf16 grouped_conv_conv_fwd_xdl_bf16.cpp
)
endif
()
if
(
USE_BITINT_EXTENSION_INT4
)
if
(
USE_BITINT_EXTENSION_INT4
)
add_example_executable
(
example_grouped_conv_conv_fwd_xdl_int4 grouped_conv_conv_fwd_xdl_int4.cpp
)
add_example_executable
(
example_grouped_conv_conv_fwd_xdl_int4 grouped_conv_conv_fwd_xdl_int4.cpp
)
endif
(
USE_BITINT_EXTENSION_INT4
)
endif
(
USE_BITINT_EXTENSION_INT4
)
...
@@ -14,5 +20,7 @@ foreach(gpu IN LISTS GPU_TARGETS)
...
@@ -14,5 +20,7 @@ foreach(gpu IN LISTS GPU_TARGETS)
endforeach
()
endforeach
()
if
(
NOT GPU_TARGETS MATCHES
"gfx94"
AND NOT GPU_TARGETS MATCHES
"gfx1"
)
if
(
NOT GPU_TARGETS MATCHES
"gfx94"
AND NOT GPU_TARGETS MATCHES
"gfx1"
)
add_example_executable
(
example_grouped_conv_conv_fwd_xdl_int8 grouped_conv_conv_fwd_xdl_int8.cpp
)
if
(
DTYPES MATCHES
"int8"
OR NOT DEFINED DTYPES
)
add_example_executable
(
example_grouped_conv_conv_fwd_xdl_int8 grouped_conv_conv_fwd_xdl_int8.cpp
)
endif
()
endif
()
endif
()
example/42_groupnorm/CMakeLists.txt
View file @
c7c47fd7
add_example_executable
(
example_groupnorm_sigmoid_mul_fp16 groupnorm_sigmoid_mul_fp16.cpp
)
if
(
DTYPES MATCHES
"fp16"
OR NOT DEFINED DTYPES
)
add_example_executable
(
example_groupnorm_splitk_fp16 groupnorm_splitk_fp16.cpp
)
add_example_executable
(
example_groupnorm_sigmoid_mul_fp16 groupnorm_sigmoid_mul_fp16.cpp
)
add_example_executable
(
example_groupnorm_swish_fp16 groupnorm_swish_fp16.cpp
)
add_example_executable
(
example_groupnorm_splitk_fp16 groupnorm_splitk_fp16.cpp
)
add_example_executable
(
example_groupnorm_swish_fp16 groupnorm_swish_fp16.cpp
)
endif
()
example/43_splitk_gemm_bias_e_permute/CMakeLists.txt
View file @
c7c47fd7
add_example_executable
(
example_splitk_gemm_bias_e_permute_xdl_fp16 splitk_gemm_bias_e_permute_xdl_fp16.cpp
)
if
(
DTYPES MATCHES
"fp16"
OR NOT DEFINED DTYPES
)
add_example_executable
(
example_splitk_gemm_bias_e_permute_xdl_fp32 splitk_gemm_bias_e_permute_xdl_fp32.cpp
)
add_example_executable
(
example_splitk_gemm_bias_e_permute_xdl_fp16 splitk_gemm_bias_e_permute_xdl_fp16.cpp
)
endif
()
if
(
DTYPES MATCHES
"fp32"
OR NOT DEFINED DTYPES
)
add_example_executable
(
example_splitk_gemm_bias_e_permute_xdl_fp32 splitk_gemm_bias_e_permute_xdl_fp32.cpp
)
endif
()
example/44_elementwise_permute/CMakeLists.txt
View file @
c7c47fd7
add_example_executable
(
example_elementwise_permute_4D_fp16 elementwise_permute_4D_fp16.cpp
)
if
(
DTYPES MATCHES
"fp16"
OR NOT DEFINED DTYPES
)
add_example_executable
(
example_elementwise_permute_4D_fp16_2d elementwise_permute_4D_fp16_2d.cpp
)
add_example_executable
(
example_elementwise_permute_4D_fp16 elementwise_permute_4D_fp16.cpp
)
add_example_executable
(
example_elementwise_permute_4D_fp16_2d elementwise_permute_4D_fp16_2d.cpp
)
endif
()
example/46_gemm_add_multiply/CMakeLists.txt
View file @
c7c47fd7
add_example_executable
(
example_gemm_add_multiply_dl_fp16 gemm_add_multiply_dl_fp16.cpp
)
if
(
DTYPES MATCHES
"fp16"
OR NOT DEFINED DTYPES
)
add_example_executable
(
example_gemm_add_multiply_xdl_fp16 gemm_add_multiply_xdl_fp16.cpp
)
if
(
DL_KERNELS
)
add_example_executable
(
example_gemm_add_multiply_dl_fp16 gemm_add_multiply_dl_fp16.cpp
)
endif
()
add_example_executable
(
example_gemm_add_multiply_xdl_fp16 gemm_add_multiply_xdl_fp16.cpp
)
endif
()
example/48_pool3d_fwd/CMakeLists.txt
View file @
c7c47fd7
add_example_executable
(
example_pool3d_fwd_fp16 pool3d_fwd_fp16.cpp
)
if
(
DTYPES MATCHES
"fp16"
OR NOT DEFINED DTYPES
)
add_example_executable
(
example_pool3d_fwd_fp16 pool3d_fwd_fp16.cpp
)
endif
()
example/49_maxpool2d_bwd/CMakeLists.txt
View file @
c7c47fd7
add_example_executable
(
example_maxpool2d_bwd_bf16 maxpool2d_bwd_bf16.cpp
)
if
(
DTYPES MATCHES
"bf16"
OR NOT DEFINED DTYPES
)
add_example_executable
(
example_maxpool2d_bwd_fp16 maxpool2d_bwd_fp16.cpp
)
add_example_executable
(
example_maxpool2d_bwd_bf16 maxpool2d_bwd_bf16.cpp
)
add_example_executable
(
example_maxpool2d_bwd_fp32 maxpool2d_bwd_fp32.cpp
)
endif
()
if
(
DTYPES MATCHES
"fp16"
OR NOT DEFINED DTYPES
)
add_example_executable
(
example_maxpool2d_bwd_fp16 maxpool2d_bwd_fp16.cpp
)
endif
()
if
(
DTYPES MATCHES
"fp32"
OR NOT DEFINED DTYPES
)
add_example_executable
(
example_maxpool2d_bwd_fp32 maxpool2d_bwd_fp32.cpp
)
endif
()
example/50_put_element/CMakeLists.txt
View file @
c7c47fd7
add_example_executable
(
example_put_element_fp16 put_element_fp16.cpp
)
if
(
DTYPES MATCHES
"fp16"
OR NOT DEFINED DTYPES
)
add_example_executable
(
example_put_element_fp16 put_element_fp16.cpp
)
endif
()
example/51_avgpool3d_bwd/CMakeLists.txt
0 → 100644
View file @
c7c47fd7
# Register each avgpool3d backward example only when its data type is enabled:
# either DTYPES is not defined (build everything) or DTYPES matches the type name.
# This mirrors the DTYPES guards used by the other example directories.
if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
    add_example_executable(example_avgpool3d_bwd_bf16 avgpool3d_bwd_bf16.cpp)
endif()
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
    add_example_executable(example_avgpool3d_bwd_fp16 avgpool3d_bwd_fp16.cpp)
endif()
if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
    add_example_executable(example_avgpool3d_bwd_fp32 avgpool3d_bwd_fp32.cpp)
endif()
example/51_avgpool3d_bwd/avgpool3d_bwd_bf16.cpp
0 → 100644
View file @
c7c47fd7
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_avgpool3d_bwd_ndhwc_ndhwc.hpp"
#include "avgpool3d_bwd_common.hpp"
// Element types used by this example: ck::bhalf_t for the DOut and DIn
// tensors, float as the ComputeDataType handed to the device op.
using DOutDataType    = ck::bhalf_t;
using DInDataType     = ck::bhalf_t;
using ComputeDataType = float;

// Layout tags for both tensors. The NDHWC pair is active; changing the
// condition to 0 selects the NCDHW aliases instead.
#if 1
using DOutLayout = ck::tensor_layout::convolution::NDHWC;
using DInLayout  = ck::tensor_layout::convolution::NDHWC;
#else
using DOutLayout = ck::tensor_layout::convolution::NCDHW;
using DInLayout  = ck::tensor_layout::convolution::NCDHW;
#endif

// Device operation instance exercised by main().
using DevicePoolBwdInstance =
    ck::tensor_operation::device::DeviceAvgPool3dBwd_NDHWC_NDHWC<DOutDataType,
                                                                 DInDataType,
                                                                 ComputeDataType,
                                                                 64, // BlockSize
                                                                 64, // ReduceMThreadClusterSize
                                                                 1,  // ReduceKThreadClusterSize
                                                                 1,  // ReduceMThreadSliceSize
                                                                 1,  // ReduceKThreadSliceSize
                                                                 1>; // InSrcOutDstVectorSize
// Runs the bf16 avgpool3d backward example on one fixed problem size and
// reports the verification outcome through the process exit code
// (0 = pass, 1 = fail).
int main()
{
    // Pooling window parameters; each vector has three entries, indexed 0/1/2
    // by pool3d_bwd_test for the Di/Hi/Wi extents respectively.
    std::vector<ck::index_t> window_lengths    = {5, 5, 5};
    std::vector<ck::index_t> window_strides    = {2, 2, 2};
    std::vector<ck::index_t> window_dilations  = {2, 2, 2};
    std::vector<ck::index_t> dinput_left_pads  = {0, 0, 0};
    std::vector<ck::index_t> dinput_right_pads = {0, 0, 0};

    // Problem size.
    ck::index_t N  = 1;
    ck::index_t C  = 16;
    ck::index_t Di = 40;
    ck::index_t Hi = 40;
    ck::index_t Wi = 40;

    // do_verification = true, time_kernel = false.
    const bool pass =
        pool3d_bwd_test<DevicePoolBwdInstance, DOutDataType, DInDataType, DOutLayout, DInLayout>(
            true,
            false,
            N,
            C,
            Di,
            Hi,
            Wi,
            window_lengths,
            window_strides,
            window_dilations,
            dinput_left_pads,
            dinput_right_pads);

    // Propagate the verification result; discarding it would make a failing
    // run indistinguishable from a passing one for anything checking the exit code.
    return pass ? 0 : 1;
}
example/51_avgpool3d_bwd/avgpool3d_bwd_common.hpp
0 → 100644
View file @
c7c47fd7
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_avgpool_bwd.hpp"
/// Builds the five strides, listed in N, C, D, H, W order, of a tensor stored
/// with the memory layout selected by the tag type `TensorLayout`.
///
/// @param N_     Batch extent; it does not contribute to any stride.
/// @param C_     Channel extent.
/// @param D      Depth extent.
/// @param H      Height extent.
/// @param W      Width extent.
/// @param layout Layout tag object; only its type is inspected.
/// @return {strideN, strideC, strideD, strideH, strideW}.
///
/// Only the NCDHW and NDHWC tags are supported. Any other tag is rejected at
/// compile time instead of silently flowing off the end of the function.
template <typename TensorLayout>
std::vector<ck::index_t> f_tensor_strides_ncdhw(ck::index_t N_,
                                                ck::index_t C_,
                                                ck::index_t D,
                                                ck::index_t H,
                                                ck::index_t W,
                                                TensorLayout layout)
{
    using namespace ck::literals;

    (void)N_;

    // decltype(layout) keeps the parameter referenced, so it is not reported as unused.
    using Layout = decltype(layout);
    constexpr bool is_ncdhw =
        ck::is_same<Layout, ck::tensor_layout::convolution::NCDHW>::value;
    constexpr bool is_ndhwc =
        ck::is_same<Layout, ck::tensor_layout::convolution::NDHWC>::value;

    static_assert(is_ncdhw || is_ndhwc,
                  "f_tensor_strides_ncdhw: only NCDHW and NDHWC layouts are supported");

    if constexpr(is_ncdhw)
    {
        // W is the fastest-varying dimension.
        return {C_ * D * H * W, D * H * W, H * W, W, 1_uz};
    }
    else
    {
        // C is the fastest-varying dimension.
        return {D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_};
    }
}
/// Creates a HostTensorDescriptor with lengths {N_, C_, D, H, W} and the
/// strides that correspond to the memory layout selected by the tag type
/// `TensorLayout`.
///
/// @param N_     Batch extent.
/// @param C_     Channel extent.
/// @param D      Depth extent.
/// @param H      Height extent.
/// @param W      Width extent.
/// @param layout Layout tag object; only its type is inspected.
/// @return Descriptor built from the lengths and the layout-specific strides.
///
/// Only the NCDHW and NDHWC tags are supported. Any other tag is rejected at
/// compile time instead of silently flowing off the end of the function.
template <typename TensorLayout>
HostTensorDescriptor f_host_tensor_descriptor(std::size_t N_,
                                              std::size_t C_,
                                              std::size_t D,
                                              std::size_t H,
                                              std::size_t W,
                                              TensorLayout layout)
{
    using namespace ck::literals;

    // decltype(layout) keeps the parameter referenced, so it is not reported as unused.
    using Layout = decltype(layout);
    constexpr bool is_ncdhw =
        ck::is_same<Layout, ck::tensor_layout::convolution::NCDHW>::value;
    constexpr bool is_ndhwc =
        ck::is_same<Layout, ck::tensor_layout::convolution::NDHWC>::value;

    static_assert(is_ncdhw || is_ndhwc,
                  "f_host_tensor_descriptor: only NCDHW and NDHWC layouts are supported");

    if constexpr(is_ncdhw)
    {
        // W is the fastest-varying dimension.
        return HostTensorDescriptor({N_, C_, D, H, W},
                                    {C_ * D * H * W, D * H * W, H * W, W, 1_uz});
    }
    else
    {
        // C is the fastest-varying dimension.
        return HostTensorDescriptor({N_, C_, D, H, W},
                                    {D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_});
    }
}
template
<
typename
DevicePoolBwdInstance
,
typename
DOutDataType
,
typename
DInDataType
,
typename
DOutLayout
,
typename
DInLayout
>
bool
pool3d_bwd_test
(
bool
do_verification
,
bool
time_kernel
,
ck
::
index_t
N
,
ck
::
index_t
C
,
ck
::
index_t
Di
,
ck
::
index_t
Hi
,
ck
::
index_t
Wi
,
std
::
vector
<
ck
::
index_t
>
window_lengths
,
std
::
vector
<
ck
::
index_t
>
window_strides
,
std
::
vector
<
ck
::
index_t
>
window_dilations
,
std
::
vector
<
ck
::
index_t
>
dinput_left_pads
,
std
::
vector
<
ck
::
index_t
>
dinput_right_pads
)
{
auto
OutSpatialLength
=
[
&
](
auto
InSpatialLength
,
int
index
)
{
ck
::
index_t
left_pad
=
dinput_left_pads
[
index
];
ck
::
index_t
right_pad
=
dinput_right_pads
[
index
];
ck
::
index_t
window_len
=
window_lengths
[
index
];
ck
::
index_t
stride
=
window_strides
[
index
];
ck
::
index_t
dilation
=
window_dilations
[
index
];
ck
::
index_t
eff
=
(
window_len
-
1
)
*
dilation
+
1
;
return
(
InSpatialLength
+
left_pad
+
right_pad
-
eff
)
/
stride
+
1
;
};
ck
::
index_t
Do
=
OutSpatialLength
(
Di
,
0
);
ck
::
index_t
Ho
=
OutSpatialLength
(
Hi
,
1
);
ck
::
index_t
Wo
=
OutSpatialLength
(
Wi
,
2
);
Tensor
<
DOutDataType
>
dout
(
f_host_tensor_descriptor
(
N
,
C
,
Do
,
Ho
,
Wo
,
DOutLayout
{}));
Tensor
<
DInDataType
>
din_dev
(
f_host_tensor_descriptor
(
N
,
C
,
Di
,
Hi
,
Wi
,
DInLayout
{}));
Tensor
<
DInDataType
>
din_host
(
f_host_tensor_descriptor
(
N
,
C
,
Di
,
Hi
,
Wi
,
DInLayout
{}));
std
::
cout
<<
"dout: "
<<
dout
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"din_host: "
<<
din_host
.
mDesc
<<
std
::
endl
;
dout
.
GenerateTensorValue
(
GeneratorTensor_3
<
DOutDataType
>
{
0.0
,
1.0
});
DeviceMem
dout_device_buf
(
sizeof
(
DOutDataType
)
*
dout
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
din_device_buf
(
sizeof
(
DInDataType
)
*
din_dev
.
mDesc
.
GetElementSpaceSize
());
dout_device_buf
.
ToDevice
(
dout
.
mData
.
data
());
din_device_buf
.
SetZero
();
auto
pool
=
DevicePoolBwdInstance
{};
auto
invoker_ptr
=
pool
.
MakeInvokerPointer
();
auto
argument_ptr
=
pool
.
MakeArgumentPointer
(
static_cast
<
DOutDataType
*>
(
dout_device_buf
.
GetDeviceBuffer
()),
static_cast
<
DInDataType
*>
(
din_device_buf
.
GetDeviceBuffer
()),
{
N
,
C
,
Do
,
Ho
,
Wo
},
{
N
,
C
,
Di
,
Hi
,
Wi
},
f_tensor_strides_ncdhw
(
N
,
C
,
Do
,
Ho
,
Wo
,
DOutLayout
{}),
f_tensor_strides_ncdhw
(
N
,
C
,
Di
,
Hi
,
Wi
,
DInLayout
{}),
window_lengths
,
window_strides
,
window_dilations
,
dinput_left_pads
,
dinput_right_pads
);
if
(
!
pool
.
IsSupportedArgument
(
argument_ptr
.
get
()))
{
throw
std
::
runtime_error
(
"wrong! device_op with the specified compilation parameters does "
"not support this problem"
);
}
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
std
::
cout
<<
"Perf: "
<<
ave_time
<<
std
::
endl
;
bool
pass
=
true
;
if
(
do_verification
)
{
auto
ref_pool
=
ck
::
tensor_operation
::
host
::
ReferenceAvgPoolBwd
<
3
,
DInDataType
,
DOutDataType
>
();
auto
ref_invoker
=
ref_pool
.
MakeInvoker
();
auto
ref_argument
=
ref_pool
.
MakeArgument
(
din_host
,
dout
,
window_lengths
,
window_strides
,
window_dilations
,
dinput_left_pads
,
dinput_right_pads
);
ref_invoker
.
Run
(
ref_argument
);
din_device_buf
.
FromDevice
(
din_dev
.
mData
.
data
());
pass
=
ck
::
utils
::
check_err
(
din_dev
,
din_host
);
}
return
pass
;
}
example/51_avgpool3d_bwd/avgpool3d_bwd_fp16.cpp
0 → 100644
View file @
c7c47fd7
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_avgpool3d_bwd_ndhwc_ndhwc.hpp"
#include "avgpool3d_bwd_common.hpp"
// Element types used by this example: ck::half_t for the DOut and DIn
// tensors, float as the ComputeDataType handed to the device op.
using DOutDataType    = ck::half_t;
using DInDataType     = ck::half_t;
using ComputeDataType = float;

// Layout tags for both tensors. The NDHWC pair is active; changing the
// condition to 0 selects the NCDHW aliases instead.
#if 1
using DOutLayout = ck::tensor_layout::convolution::NDHWC;
using DInLayout  = ck::tensor_layout::convolution::NDHWC;
#else
using DOutLayout = ck::tensor_layout::convolution::NCDHW;
using DInLayout  = ck::tensor_layout::convolution::NCDHW;
#endif

// Device operation instance exercised by main().
using DevicePoolBwdInstance =
    ck::tensor_operation::device::DeviceAvgPool3dBwd_NDHWC_NDHWC<DOutDataType,
                                                                 DInDataType,
                                                                 ComputeDataType,
                                                                 64, // BlockSize
                                                                 64, // ReduceMThreadClusterSize
                                                                 1,  // ReduceKThreadClusterSize
                                                                 1,  // ReduceMThreadSliceSize
                                                                 1,  // ReduceKThreadSliceSize
                                                                 1>; // InSrcOutDstVectorSize
// Runs the fp16 avgpool3d backward example on one fixed problem size and
// reports the verification outcome through the process exit code
// (0 = pass, 1 = fail).
int main()
{
    // Pooling window parameters; each vector has three entries, indexed 0/1/2
    // by pool3d_bwd_test for the Di/Hi/Wi extents respectively.
    std::vector<ck::index_t> window_lengths    = {5, 5, 5};
    std::vector<ck::index_t> window_strides    = {2, 2, 2};
    std::vector<ck::index_t> window_dilations  = {2, 2, 2};
    std::vector<ck::index_t> dinput_left_pads  = {0, 0, 0};
    std::vector<ck::index_t> dinput_right_pads = {0, 0, 0};

    // Problem size.
    ck::index_t N  = 1;
    ck::index_t C  = 16;
    ck::index_t Di = 40;
    ck::index_t Hi = 40;
    ck::index_t Wi = 40;

    // do_verification = true, time_kernel = false.
    const bool pass =
        pool3d_bwd_test<DevicePoolBwdInstance, DOutDataType, DInDataType, DOutLayout, DInLayout>(
            true,
            false,
            N,
            C,
            Di,
            Hi,
            Wi,
            window_lengths,
            window_strides,
            window_dilations,
            dinput_left_pads,
            dinput_right_pads);

    // Propagate the verification result; discarding it would make a failing
    // run indistinguishable from a passing one for anything checking the exit code.
    return pass ? 0 : 1;
}
example/51_avgpool3d_bwd/avgpool3d_bwd_fp32.cpp
0 → 100644
View file @
c7c47fd7
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_avgpool3d_bwd_ndhwc_ndhwc.hpp"
#include "avgpool3d_bwd_common.hpp"
// Element types used by this example: float for the DOut and DIn tensors and
// for the ComputeDataType handed to the device op.
using DOutDataType    = float;
using DInDataType     = float;
using ComputeDataType = float;

// Layout tags for both tensors. The NDHWC pair is active; changing the
// condition to 0 selects the NCDHW aliases instead.
#if 1
using DOutLayout = ck::tensor_layout::convolution::NDHWC;
using DInLayout  = ck::tensor_layout::convolution::NDHWC;
#else
using DOutLayout = ck::tensor_layout::convolution::NCDHW;
using DInLayout  = ck::tensor_layout::convolution::NCDHW;
#endif

// Device operation instance exercised by main().
using DevicePoolBwdInstance =
    ck::tensor_operation::device::DeviceAvgPool3dBwd_NDHWC_NDHWC<DOutDataType,
                                                                 DInDataType,
                                                                 ComputeDataType,
                                                                 64, // BlockSize
                                                                 64, // ReduceMThreadClusterSize
                                                                 1,  // ReduceKThreadClusterSize
                                                                 1,  // ReduceMThreadSliceSize
                                                                 1,  // ReduceKThreadSliceSize
                                                                 1>; // InSrcOutDstVectorSize
// Runs the fp32 avgpool3d backward example on one fixed problem size and
// reports the verification outcome through the process exit code
// (0 = pass, 1 = fail).
int main()
{
    // Pooling window parameters; each vector has three entries, indexed 0/1/2
    // by pool3d_bwd_test for the Di/Hi/Wi extents respectively.
    std::vector<ck::index_t> window_lengths    = {5, 5, 5};
    std::vector<ck::index_t> window_strides    = {2, 2, 2};
    std::vector<ck::index_t> window_dilations  = {2, 2, 2};
    std::vector<ck::index_t> dinput_left_pads  = {0, 0, 0};
    std::vector<ck::index_t> dinput_right_pads = {0, 0, 0};

    // Problem size.
    ck::index_t N  = 1;
    ck::index_t C  = 16;
    ck::index_t Di = 40;
    ck::index_t Hi = 40;
    ck::index_t Wi = 40;

    // do_verification = true, time_kernel = false.
    const bool pass =
        pool3d_bwd_test<DevicePoolBwdInstance, DOutDataType, DInDataType, DOutLayout, DInLayout>(
            true,
            false,
            N,
            C,
            Di,
            Hi,
            Wi,
            window_lengths,
            window_strides,
            window_dilations,
            dinput_left_pads,
            dinput_right_pads);

    // Propagate the verification result; discarding it would make a failing
    // run indistinguishable from a passing one for anything checking the exit code.
    return pass ? 0 : 1;
}
include/ck/ck.hpp
View file @
c7c47fd7
...
@@ -201,7 +201,7 @@
...
@@ -201,7 +201,7 @@
#define CK_WORKAROUND_SWDEV_388832 1
#define CK_WORKAROUND_SWDEV_388832 1
// workaround: Grouped Conv2d_bwd_data fails for already implemented instance
// workaround: Grouped Conv2d_bwd_data fails for already implemented instance
#define CK_WORKAROUND_
SWDEV_3318619 0
#define CK_WORKAROUND_
GITHUB_ISSUE_824 1
// flag to enable (1) or disable (0) the debugging output in some kernels
// flag to enable (1) or disable (0) the debugging output in some kernels
#define DEBUG_LOG 0
#define DEBUG_LOG 0
...
...
include/ck/tensor_operation/gpu/device/device_avgpool_bwd.hpp
0 → 100644
View file @
c7c47fd7
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include "ck/tensor_operation/gpu/device/device_base.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
template
<
index_t
NDimSpatial
,
typename
DOutDataType
,
typename
DInDataType
,
typename
DOutLayout
,
typename
DInLayout
>
struct
DeviceAvgPoolBwd
:
public
BaseOperator
{
virtual
std
::
unique_ptr
<
BaseArgument
>
MakeArgumentPointer
(
const
void
*
p_dout
,
void
*
p_din
,
std
::
vector
<
ck
::
index_t
>
dout_n_k_wos_lengths
,
std
::
vector
<
ck
::
index_t
>
dout_n_k_wos_strides
,
std
::
vector
<
ck
::
index_t
>
din_n_k_wos_length
,
std
::
vector
<
ck
::
index_t
>
din_n_k_wos_strides
,
std
::
vector
<
ck
::
index_t
>
window_k_c_xs_lengths
,
std
::
vector
<
ck
::
index_t
>
window_strides
,
std
::
vector
<
ck
::
index_t
>
window_dilations
,
std
::
vector
<
ck
::
index_t
>
input_left_pads
,
std
::
vector
<
ck
::
index_t
>
input_right_pads
)
=
0
;
virtual
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
=
0
;
};
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight.hpp
View file @
c7c47fd7
...
@@ -27,15 +27,12 @@ struct DeviceGroupedConvBwdWeight : public BaseOperator
...
@@ -27,15 +27,12 @@ struct DeviceGroupedConvBwdWeight : public BaseOperator
MakeArgumentPointer
(
const
void
*
p_in
,
MakeArgumentPointer
(
const
void
*
p_in
,
void
*
p_wei
,
void
*
p_wei
,
const
void
*
p_out
,
const
void
*
p_out
,
const
ck
::
index_t
G
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_lengths
,
// input
const
ck
::
index_t
N
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_strides
,
const
ck
::
index_t
K
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_lengths
,
// weight
const
ck
::
index_t
C
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_strides
,
const
std
::
array
<
ck
::
index_t
,
NDimSpatial
>&
input_spatial_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
e_g_n_k_wos_lengths
,
// output
const
std
::
array
<
ck
::
index_t
,
NDimSpatial
>&
filter_spatial_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
e_g_n_k_wos_strides
,
const
std
::
array
<
ck
::
index_t
,
NDimSpatial
>&
output_spatial_lengths
,
const
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>&
input_strides
,
const
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>&
output_strides
,
const
std
::
array
<
ck
::
index_t
,
NDimSpatial
>&
conv_filter_strides
,
const
std
::
array
<
ck
::
index_t
,
NDimSpatial
>&
conv_filter_strides
,
const
std
::
array
<
ck
::
index_t
,
NDimSpatial
>&
conv_filter_dilations
,
const
std
::
array
<
ck
::
index_t
,
NDimSpatial
>&
conv_filter_dilations
,
const
std
::
array
<
ck
::
index_t
,
NDimSpatial
>&
input_left_pads
,
const
std
::
array
<
ck
::
index_t
,
NDimSpatial
>&
input_left_pads
,
...
...
Prev
1
2
3
4
5
6
7
…
10
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment