Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
8f9c0243
Commit
8f9c0243
authored
Sep 22, 2023
by
Alan Turner
Browse files
Merge branch 'develop' into migx-jit-lib
parents
181ea79a
c8a8385f
Changes
609
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
135 additions
and
99 deletions
+135
-99
example/37_batched_gemm_add_add_relu_gemm_add/CMakeLists.txt
example/37_batched_gemm_add_add_relu_gemm_add/CMakeLists.txt
+3
-1
example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp
..._gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp
+3
-1
example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt
example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt
+3
-1
example/39_permute/CMakeLists.txt
example/39_permute/CMakeLists.txt
+9
-7
example/40_conv2d_fwd_quantization/CMakeLists.txt
example/40_conv2d_fwd_quantization/CMakeLists.txt
+16
-16
example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp
.../conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp
+1
-1
example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp
...on/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp
+1
-1
example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp
.../conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp
+1
-1
example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp
...on/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp
+1
-1
example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp
...antization/conv2d_fwd_dl_perchannel_quantization_int8.cpp
+1
-1
example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp
...quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp
+1
-1
example/41_grouped_conv_conv_fwd/CMakeLists.txt
example/41_grouped_conv_conv_fwd/CMakeLists.txt
+14
-10
example/42_groupnorm/CMakeLists.txt
example/42_groupnorm/CMakeLists.txt
+5
-3
example/43_splitk_gemm_bias_e_permute/CMakeLists.txt
example/43_splitk_gemm_bias_e_permute/CMakeLists.txt
+6
-2
example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp16.cpp
...mm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp16.cpp
+1
-1
example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp32.cpp
...mm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp32.cpp
+1
-1
example/44_elementwise_permute/CMakeLists.txt
example/44_elementwise_permute/CMakeLists.txt
+4
-2
example/46_gemm_add_multiply/CMakeLists.txt
example/46_gemm_add_multiply/CMakeLists.txt
+6
-2
example/48_pool3d_fwd/CMakeLists.txt
example/48_pool3d_fwd/CMakeLists.txt
+3
-2
example/48_pool3d_fwd/pool3d_fwd_common.hpp
example/48_pool3d_fwd/pool3d_fwd_common.hpp
+55
-44
No files found.
example/37_batched_gemm_add_add_relu_gemm_add/CMakeLists.txt
View file @
8f9c0243
add_example_executable
(
example_batched_gemm_add_add_relu_gemm_add_xdl_fp16 batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp
)
if
(
DTYPES MATCHES
"fp16"
OR NOT DEFINED DTYPES
)
add_example_executable
(
example_batched_gemm_add_add_relu_gemm_add_xdl_fp16 batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp
)
endif
()
example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp
View file @
8f9c0243
...
...
@@ -173,6 +173,8 @@ using DeviceGemmInstance =
8
,
8
,
true
,
9
,
// D0sTransferSrcVectorDim
4
,
// D0sTransferSrcScalaerPerVector
S
<
8
,
32
,
1
>
,
// B1BlockTransfer
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
...
...
@@ -189,7 +191,7 @@ int main(int argc, char* argv[])
{
bool
do_verification
=
true
;
int
init_method
=
1
;
bool
time_kernel
=
fals
e
;
bool
time_kernel
=
tru
e
;
// GEMM shape
ck
::
index_t
M
=
1024
;
...
...
example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt
View file @
8f9c0243
if
(
DTYPES MATCHES
"fp16"
OR NOT DEFINED DTYPES
)
list
(
APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942
)
set
(
target 0
)
foreach
(
gpu IN LISTS GPU_TARGETS
)
...
...
@@ -10,4 +11,5 @@ foreach(gpu IN LISTS GPU_TARGETS)
add_dependencies
(
example_grouped_conv_bwd_data example_grouped_conv_bwd_data_bias_relu_fp16
)
set
(
target 1
)
endif
()
endforeach
()
\ No newline at end of file
endforeach
()
endif
()
example/39_permute/CMakeLists.txt
View file @
8f9c0243
add_custom_target
(
example_permute
)
if
(
DTYPES MATCHES
"fp16"
OR NOT DEFINED DTYPES
)
add_custom_target
(
example_permute
)
add_example_executable
(
example_permute_1xHxW_fp16 permute_1xHxW_fp16.cpp
)
add_example_executable
(
example_permute_NxHxW_fp16 permute_NxHxW_fp16.cpp
)
add_example_executable
(
example_permute_HxWx4_fp16 permute_HxWx4_fp16.cpp
)
add_example_executable
(
example_permute_1xHxW_fp16 permute_1xHxW_fp16.cpp
)
add_example_executable
(
example_permute_NxHxW_fp16 permute_NxHxW_fp16.cpp
)
add_example_executable
(
example_permute_HxWx4_fp16 permute_HxWx4_fp16.cpp
)
add_dependencies
(
example_permute example_permute_1xHxW_fp16
)
add_dependencies
(
example_permute example_permute_NxHxW_fp16
)
add_dependencies
(
example_permute example_permute_HxWx4_fp16
)
add_dependencies
(
example_permute example_permute_1xHxW_fp16
)
add_dependencies
(
example_permute example_permute_NxHxW_fp16
)
add_dependencies
(
example_permute example_permute_HxWx4_fp16
)
endif
()
example/40_conv2d_fwd_quantization/CMakeLists.txt
View file @
8f9c0243
if
(
DTYPES MATCHES
"int8"
OR NOT DEFINED DTYPES
)
list
(
APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942
)
set
(
target 0
)
foreach
(
gpu IN LISTS GPU_TARGETS
)
...
...
@@ -9,20 +10,19 @@ foreach(gpu IN LISTS GPU_TARGETS)
set
(
target 1
)
endif
()
endforeach
()
# Conv perlayer quantization
add_example_executable
(
example_conv2d_fwd_dl_perlayer_quantization_int8 conv2d_fwd_dl_perlayer_quantization_int8.cpp
)
# Conv perchannel quantization
add_example_executable
(
example_conv2d_fwd_dl_perchannel_quantization_int8 conv2d_fwd_dl_perchannel_quantization_int8.cpp
)
# Conv + bias + relu perlayer quantization
add_example_executable
(
example_conv2d_fwd_dl_bias_relu_perlayer_quantization_int8 conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp
)
# Conv + bias + relu perchannel quantization
add_example_executable
(
example_conv2d_fwd_dl_bias_relu_perchannel_quantization_int8 conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp
)
# Conv + bias + tanh perlayer quantization
add_example_executable
(
example_conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8 conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp
)
# Conv + bias + tanh perchannel quantization
add_example_executable
(
example_conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8 conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp
)
\ No newline at end of file
if
(
DL_KERNELS
)
# Conv perlayer quantization
add_example_executable
(
example_conv2d_fwd_dl_perlayer_quantization_int8 conv2d_fwd_dl_perlayer_quantization_int8.cpp
)
# Conv perchannel quantization
add_example_executable
(
example_conv2d_fwd_dl_perchannel_quantization_int8 conv2d_fwd_dl_perchannel_quantization_int8.cpp
)
# Conv + bias + relu perlayer quantization
add_example_executable
(
example_conv2d_fwd_dl_bias_relu_perlayer_quantization_int8 conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp
)
# Conv + bias + relu perchannel quantization
add_example_executable
(
example_conv2d_fwd_dl_bias_relu_perchannel_quantization_int8 conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp
)
# Conv + bias + tanh perlayer quantization
add_example_executable
(
example_conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8 conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp
)
# Conv + bias + tanh perchannel quantization
add_example_executable
(
example_conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8 conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp
)
endif
()
endif
()
\ No newline at end of file
example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp
View file @
8f9c0243
...
...
@@ -2,7 +2,7 @@
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
#include "ck/tensor_operation/gpu/device/
impl/
device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
using
InDataType
=
int8_t
;
using
WeiDataType
=
int8_t
;
...
...
example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp
View file @
8f9c0243
...
...
@@ -2,7 +2,7 @@
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
#include "ck/tensor_operation/gpu/device/
impl/
device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
using
InDataType
=
int8_t
;
using
WeiDataType
=
int8_t
;
...
...
example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp
View file @
8f9c0243
...
...
@@ -2,7 +2,7 @@
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
#include "ck/tensor_operation/gpu/device/
impl/
device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
using
InDataType
=
int8_t
;
using
WeiDataType
=
int8_t
;
...
...
example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp
View file @
8f9c0243
...
...
@@ -2,7 +2,7 @@
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
#include "ck/tensor_operation/gpu/device/
impl/
device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
using
InDataType
=
int8_t
;
using
WeiDataType
=
int8_t
;
...
...
example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp
View file @
8f9c0243
...
...
@@ -2,7 +2,7 @@
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
#include "ck/tensor_operation/gpu/device/
impl/
device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
using
InDataType
=
int8_t
;
using
WeiDataType
=
int8_t
;
...
...
example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp
View file @
8f9c0243
...
...
@@ -2,7 +2,7 @@
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
#include "ck/tensor_operation/gpu/device/
impl/
device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
using
InDataType
=
int8_t
;
using
WeiDataType
=
int8_t
;
...
...
example/41_grouped_conv_conv_fwd/CMakeLists.txt
View file @
8f9c0243
...
...
@@ -3,9 +3,15 @@ list(APPEND gpu_list2 gfx908 gfx90a)
set
(
target 0
)
foreach
(
gpu IN LISTS GPU_TARGETS
)
if
(
gpu IN_LIST gpu_list1 AND target EQUAL 0
)
add_example_executable
(
example_grouped_conv_conv_fwd_xdl_fp32 grouped_conv_conv_fwd_xdl_fp32.cpp
)
add_example_executable
(
example_grouped_conv_conv_fwd_xdl_fp16 grouped_conv_conv_fwd_xdl_fp16.cpp
)
add_example_executable
(
example_grouped_conv_conv_fwd_xdl_bf16 grouped_conv_conv_fwd_xdl_bf16.cpp
)
if
(
DTYPES MATCHES
"fp32"
OR NOT DEFINED DTYPES
)
add_example_executable
(
example_grouped_conv_conv_fwd_xdl_fp32 grouped_conv_conv_fwd_xdl_fp32.cpp
)
endif
()
if
(
DTYPES MATCHES
"fp16"
OR NOT DEFINED DTYPES
)
add_example_executable
(
example_grouped_conv_conv_fwd_xdl_fp16 grouped_conv_conv_fwd_xdl_fp16.cpp
)
endif
()
if
(
DTYPES MATCHES
"bf16"
OR NOT DEFINED DTYPES
)
add_example_executable
(
example_grouped_conv_conv_fwd_xdl_bf16 grouped_conv_conv_fwd_xdl_bf16.cpp
)
endif
()
if
(
USE_BITINT_EXTENSION_INT4
)
add_example_executable
(
example_grouped_conv_conv_fwd_xdl_int4 grouped_conv_conv_fwd_xdl_int4.cpp
)
endif
(
USE_BITINT_EXTENSION_INT4
)
...
...
@@ -13,10 +19,8 @@ foreach(gpu IN LISTS GPU_TARGETS)
endif
()
endforeach
()
set
(
target 0
)
foreach
(
gpu IN LISTS GPU_TARGETS
)
if
(
gpu IN_LIST gpu_list2 AND target EQUAL 0
)
add_example_executable
(
example_grouped_conv_conv_fwd_xdl_int8 grouped_conv_conv_fwd_xdl_int8.cpp
)
set
(
target 1
)
endif
()
endforeach
()
if
(
NOT GPU_TARGETS MATCHES
"gfx94"
AND NOT GPU_TARGETS MATCHES
"gfx1"
)
if
(
DTYPES MATCHES
"int8"
OR NOT DEFINED DTYPES
)
add_example_executable
(
example_grouped_conv_conv_fwd_xdl_int8 grouped_conv_conv_fwd_xdl_int8.cpp
)
endif
()
endif
()
example/42_groupnorm/CMakeLists.txt
View file @
8f9c0243
add_example_executable
(
example_groupnorm_sigmoid_mul_fp16 groupnorm_sigmoid_mul_fp16.cpp
)
add_example_executable
(
example_groupnorm_splitk_fp16 groupnorm_splitk_fp16.cpp
)
add_example_executable
(
example_groupnorm_swish_fp16 groupnorm_swish_fp16.cpp
)
if
(
DTYPES MATCHES
"fp16"
OR NOT DEFINED DTYPES
)
add_example_executable
(
example_groupnorm_sigmoid_mul_fp16 groupnorm_sigmoid_mul_fp16.cpp
)
add_example_executable
(
example_groupnorm_splitk_fp16 groupnorm_splitk_fp16.cpp
)
add_example_executable
(
example_groupnorm_swish_fp16 groupnorm_swish_fp16.cpp
)
endif
()
example/43_splitk_gemm_bias_e_permute/CMakeLists.txt
View file @
8f9c0243
add_example_executable
(
example_splitk_gemm_bias_e_permute_xdl_fp16 splitk_gemm_bias_e_permute_xdl_fp16.cpp
)
add_example_executable
(
example_splitk_gemm_bias_e_permute_xdl_fp32 splitk_gemm_bias_e_permute_xdl_fp32.cpp
)
if
(
DTYPES MATCHES
"fp16"
OR NOT DEFINED DTYPES
)
add_example_executable
(
example_splitk_gemm_bias_e_permute_xdl_fp16 splitk_gemm_bias_e_permute_xdl_fp16.cpp
)
endif
()
if
(
DTYPES MATCHES
"fp32"
OR NOT DEFINED DTYPES
)
add_example_executable
(
example_splitk_gemm_bias_e_permute_xdl_fp32 splitk_gemm_bias_e_permute_xdl_fp32.cpp
)
endif
()
example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp16.cpp
View file @
8f9c0243
...
...
@@ -8,7 +8,7 @@
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/
impl/
device_splitk_contraction_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
...
...
example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp32.cpp
View file @
8f9c0243
...
...
@@ -8,7 +8,7 @@
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/
impl/
device_splitk_contraction_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
...
...
example/44_elementwise_permute/CMakeLists.txt
View file @
8f9c0243
add_example_executable
(
example_elementwise_permute_4D_fp16 elementwise_permute_4D_fp16.cpp
)
add_example_executable
(
example_elementwise_permute_4D_fp16_2d elementwise_permute_4D_fp16_2d.cpp
)
if
(
DTYPES MATCHES
"fp16"
OR NOT DEFINED DTYPES
)
add_example_executable
(
example_elementwise_permute_4D_fp16 elementwise_permute_4D_fp16.cpp
)
add_example_executable
(
example_elementwise_permute_4D_fp16_2d elementwise_permute_4D_fp16_2d.cpp
)
endif
()
example/46_gemm_add_multiply/CMakeLists.txt
View file @
8f9c0243
add_example_executable
(
example_gemm_add_multiply_dl_fp16 gemm_add_multiply_dl_fp16.cpp
)
add_example_executable
(
example_gemm_add_multiply_xdl_fp16 gemm_add_multiply_xdl_fp16.cpp
)
if
(
DTYPES MATCHES
"fp16"
OR NOT DEFINED DTYPES
)
if
(
DL_KERNELS
)
add_example_executable
(
example_gemm_add_multiply_dl_fp16 gemm_add_multiply_dl_fp16.cpp
)
endif
()
add_example_executable
(
example_gemm_add_multiply_xdl_fp16 gemm_add_multiply_xdl_fp16.cpp
)
endif
()
example/48_pool3d_fwd/CMakeLists.txt
View file @
8f9c0243
add_example_executable
(
example_pool3d_fwd_fp16 pool3d_fwd_fp16.cpp
)
if
(
DTYPES MATCHES
"fp16"
OR NOT DEFINED DTYPES
)
add_example_executable
(
example_pool3d_fwd_fp16 pool3d_fwd_fp16.cpp
)
endif
()
example/48_pool3d_fwd/pool3d_fwd_common.hpp
View file @
8f9c0243
...
...
@@ -18,7 +18,45 @@
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp"
template
<
typename
InDataType
,
template
<
typename
TensorLayout
>
std
::
vector
<
ck
::
index_t
>
f_tensor_strides_ncdhw
(
ck
::
index_t
N_
,
ck
::
index_t
C_
,
ck
::
index_t
D
,
ck
::
index_t
H
,
ck
::
index_t
W
,
TensorLayout
layout
)
{
using
namespace
ck
::
literals
;
(
void
)
N_
;
if
constexpr
(
ck
::
is_same
<
decltype
(
layout
),
ck
::
tensor_layout
::
convolution
::
NCDHW
>::
value
)
return
{
C_
*
D
*
H
*
W
,
D
*
H
*
W
,
H
*
W
,
W
,
1
_uz
};
else
if
constexpr
(
ck
::
is_same
<
decltype
(
layout
),
ck
::
tensor_layout
::
convolution
::
NDHWC
>::
value
)
return
{
D
*
C_
*
H
*
W
,
1
_uz
,
C_
*
H
*
W
,
W
*
C_
,
C_
};
};
template
<
typename
TensorLayout
>
HostTensorDescriptor
f_host_tensor_descriptor
(
std
::
size_t
N_
,
std
::
size_t
C_
,
std
::
size_t
D
,
std
::
size_t
H
,
std
::
size_t
W
,
TensorLayout
layout
)
{
using
namespace
ck
::
literals
;
if
constexpr
(
ck
::
is_same
<
decltype
(
layout
),
ck
::
tensor_layout
::
convolution
::
NCDHW
>::
value
)
{
return
HostTensorDescriptor
({
N_
,
C_
,
D
,
H
,
W
},
{
C_
*
D
*
H
*
W
,
D
*
H
*
W
,
H
*
W
,
W
,
1
_uz
});
}
else
if
constexpr
(
ck
::
is_same
<
decltype
(
layout
),
ck
::
tensor_layout
::
convolution
::
NDHWC
>::
value
)
{
return
HostTensorDescriptor
({
N_
,
C_
,
D
,
H
,
W
},
{
D
*
C_
*
H
*
W
,
1
_uz
,
C_
*
H
*
W
,
W
*
C_
,
C_
});
}
};
template
<
typename
DevicePoolFwdInstance
,
typename
InDataType
,
typename
OutDataType
,
typename
ComputeDataType
,
typename
IndexDataType
,
...
...
@@ -40,6 +78,9 @@ bool pool3d_test(bool do_verification,
ck
::
index_t
window_stride_d
,
ck
::
index_t
window_stride_h
,
ck
::
index_t
window_stride_w
,
ck
::
index_t
window_dilation_d
,
ck
::
index_t
window_dilation_h
,
ck
::
index_t
window_dilation_w
,
ck
::
index_t
in_left_pad_d
,
ck
::
index_t
in_left_pad_h
,
ck
::
index_t
in_left_pad_w
,
...
...
@@ -47,53 +88,21 @@ bool pool3d_test(bool do_verification,
ck
::
index_t
in_right_pad_h
,
ck
::
index_t
in_right_pad_w
)
{
using
DevicePoolFwdInstance
=
ck
::
tensor_operation
::
device
::
DevicePool3dFwd_Input_N_Di_Hi_Wi_C_Output_N_Do_Ho_Wo_C
<
InDataType
,
// InDataType
OutDataType
,
// OutDataType
IndexDataType
,
// IndexDataType
ComputeDataType
,
// ComputeDataType
ReduceOpId
,
OutputIndex
,
64
,
// BlockSize
64
,
// ReduceMThreadClusterSize
1
,
// ReduceKThreadClusterSize
4
,
// ReduceMThreadSliceSize
1
,
// ReduceKThreadSliceSize
4
>
;
// InSrcOutDstVectorSize
const
ck
::
index_t
Do
=
(
Di
+
in_left_pad_d
+
in_right_pad_d
-
Z
)
/
window_stride_d
+
1
;
const
ck
::
index_t
Ho
=
(
Hi
+
in_left_pad_h
+
in_right_pad_h
-
Y
)
/
window_stride_h
+
1
;
const
ck
::
index_t
Wo
=
(
Wi
+
in_left_pad_w
+
in_right_pad_w
-
X
)
/
window_stride_w
+
1
;
const
ck
::
index_t
Zs
=
(
Z
-
1
)
*
window_dilation_d
+
1
;
const
ck
::
index_t
Ys
=
(
Y
-
1
)
*
window_dilation_h
+
1
;
const
ck
::
index_t
Xs
=
(
X
-
1
)
*
window_dilation_w
+
1
;
const
ck
::
index_t
Do
=
(
Di
+
in_left_pad_d
+
in_right_pad_d
-
Zs
)
/
window_stride_d
+
1
;
const
ck
::
index_t
Ho
=
(
Hi
+
in_left_pad_h
+
in_right_pad_h
-
Ys
)
/
window_stride_h
+
1
;
const
ck
::
index_t
Wo
=
(
Wi
+
in_left_pad_w
+
in_right_pad_w
-
Xs
)
/
window_stride_w
+
1
;
const
std
::
vector
<
ck
::
index_t
>
window_spatial_lengths
{
Z
,
Y
,
X
};
const
std
::
vector
<
ck
::
index_t
>
window_strides
{
window_stride_d
,
window_stride_h
,
window_stride_w
};
const
std
::
vector
<
ck
::
index_t
>
window_dilations
{
window_dilation_d
,
window_dilation_h
,
window_dilation_w
};
const
std
::
vector
<
ck
::
index_t
>
input_left_pads
{
in_left_pad_d
,
in_left_pad_h
,
in_left_pad_w
};
const
std
::
vector
<
ck
::
index_t
>
input_right_pads
{
in_right_pad_d
,
in_right_pad_h
,
in_right_pad_w
};
// tensor layout
auto
f_host_tensor_descriptor
=
[](
std
::
size_t
N_
,
std
::
size_t
C_
,
std
::
size_t
D
,
std
::
size_t
H
,
std
::
size_t
W
,
auto
layout
)
{
using
namespace
ck
::
literals
;
if
constexpr
(
ck
::
is_same
<
decltype
(
layout
),
ck
::
tensor_layout
::
convolution
::
NCDHW
>::
value
)
{
return
HostTensorDescriptor
({
N_
,
C_
,
D
,
H
,
W
},
{
C_
*
D
*
H
*
W
,
D
*
H
*
W
,
H
*
W
,
W
,
1
_uz
});
}
else
if
constexpr
(
ck
::
is_same
<
decltype
(
layout
),
ck
::
tensor_layout
::
convolution
::
NDHWC
>::
value
)
{
return
HostTensorDescriptor
({
N_
,
C_
,
D
,
H
,
W
},
{
D
*
C_
*
H
*
W
,
1
_uz
,
C_
*
H
*
W
,
W
*
C_
,
C_
});
}
};
Tensor
<
InDataType
>
in_n_c_di_hi_wi
(
f_host_tensor_descriptor
(
N
,
C
,
Di
,
Hi
,
Wi
,
InLayout
{}));
Tensor
<
OutDataType
>
out_n_c_do_ho_wo_host
(
f_host_tensor_descriptor
(
N
,
C
,
Do
,
Ho
,
Wo
,
OutLayout
{}));
...
...
@@ -126,10 +135,11 @@ bool pool3d_test(bool do_verification,
{
N
,
C
,
Di
,
Hi
,
Wi
},
{
Z
,
Y
,
X
},
{
N
,
C
,
Do
,
Ho
,
Wo
},
{
Di
*
C
*
Hi
*
Wi
,
1
,
C
*
Hi
*
Wi
,
Wi
*
C
,
C
}
,
{
Do
*
C
*
Ho
*
Wo
,
1
,
C
*
Ho
*
Wo
,
Wo
*
C
,
C
}
,
{
Do
*
C
*
Ho
*
Wo
,
1
,
C
*
Ho
*
Wo
,
Wo
*
C
,
C
}
,
f_tensor_strides_ncdhw
(
N
,
C
,
Di
,
Hi
,
Wi
,
InLayout
{})
,
f_tensor_strides_ncdhw
(
N
,
C
,
Do
,
Ho
,
Wo
,
OutLayout
{})
,
f_tensor_strides_ncdhw
(
N
,
C
,
Do
,
Ho
,
Wo
,
OutLayout
{})
,
window_strides
,
window_dilations
,
input_left_pads
,
input_right_pads
,
{
2
,
3
,
4
});
...
...
@@ -165,6 +175,7 @@ bool pool3d_test(bool do_verification,
out_indices_n_c_do_ho_wo_host
,
window_spatial_lengths
,
window_strides
,
window_dilations
,
input_left_pads
,
input_right_pads
);
...
...
Prev
1
2
3
4
5
6
7
8
9
…
31
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment