Commit 56863b9a authored by Jing Zhang

add fp8 support

parents 54df59bf d4c84256
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
if(DL_KERNELS)
add_example_executable(example_gemm_add_multiply_dl_fp16 gemm_add_multiply_dl_fp16.cpp)
endif()
add_example_executable(example_gemm_add_multiply_xdl_fp16 gemm_add_multiply_xdl_fp16.cpp)
endif()
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
add_example_executable(example_pool3d_fwd_fp16 pool3d_fwd_fp16.cpp)
endif()
@@ -18,7 +18,45 @@
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp"

template <typename TensorLayout>
std::vector<ck::index_t> f_tensor_strides_ncdhw(ck::index_t N_,
ck::index_t C_,
ck::index_t D,
ck::index_t H,
ck::index_t W,
TensorLayout layout)
{
using namespace ck::literals;
(void)N_;
if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCDHW>::value)
return {C_ * D * H * W, D * H * W, H * W, W, 1_uz};
else if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NDHWC>::value)
return {D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_};
};
template <typename TensorLayout>
HostTensorDescriptor f_host_tensor_descriptor(std::size_t N_,
std::size_t C_,
std::size_t D,
std::size_t H,
std::size_t W,
TensorLayout layout)
{
using namespace ck::literals;
if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCDHW>::value)
{
return HostTensorDescriptor({N_, C_, D, H, W}, {C_ * D * H * W, D * H * W, H * W, W, 1_uz});
}
else if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NDHWC>::value)
{
return HostTensorDescriptor({N_, C_, D, H, W},
{D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_});
}
};
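// Worked example of the two layouts handled above, with hypothetical sizes
// N = 1, C = 2, D = 3, H = 4, W = 5 (illustration only, not part of the commit):
//   NCDHW strides: {C*D*H*W, D*H*W, H*W, W, 1} = {120, 60, 20, 5, 1}
//   NDHWC strides: {D*C*H*W, 1, C*H*W, W*C, C} = {120, 1, 40, 10, 2}
// NDHWC keeps the channel dimension contiguous (stride 1 for C), while NCDHW
// keeps the innermost W dimension contiguous (stride 1 for W).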
template <typename DevicePoolFwdInstance,
typename InDataType,
typename OutDataType,
typename ComputeDataType,
typename IndexDataType,
@@ -40,6 +78,9 @@ bool pool3d_test(bool do_verification,
ck::index_t window_stride_d,
ck::index_t window_stride_h,
ck::index_t window_stride_w,
ck::index_t window_dilation_d,
ck::index_t window_dilation_h,
ck::index_t window_dilation_w,
ck::index_t in_left_pad_d,
ck::index_t in_left_pad_h,
ck::index_t in_left_pad_w,
@@ -47,53 +88,21 @@ bool pool3d_test(bool do_verification,
ck::index_t in_right_pad_h,
ck::index_t in_right_pad_w)
{
const ck::index_t Zs = (Z - 1) * window_dilation_d + 1;
const ck::index_t Ys = (Y - 1) * window_dilation_h + 1;
const ck::index_t Xs = (X - 1) * window_dilation_w + 1;

const ck::index_t Do = (Di + in_left_pad_d + in_right_pad_d - Zs) / window_stride_d + 1;
const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Ys) / window_stride_h + 1;
const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - Xs) / window_stride_w + 1;
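// The dilated window spans Zs/Ys/Xs input elements, so the output extent
// follows the usual sliding-window formula. For example, with the defaults
// used in main() below (Di = 30, pads = 1, Z = 2, dilation = 1, stride = 2):
//   Zs = (2 - 1) * 1 + 1 = 2
//   Do = (30 + 1 + 1 - 2) / 2 + 1 = 16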
const std::vector<ck::index_t> window_spatial_lengths{Z, Y, X};
const std::vector<ck::index_t> window_strides{window_stride_d, window_stride_h, window_stride_w};
const std::vector<ck::index_t> window_dilations{window_dilation_d, window_dilation_h, window_dilation_w};
const std::vector<ck::index_t> input_left_pads{in_left_pad_d, in_left_pad_h, in_left_pad_w};
const std::vector<ck::index_t> input_right_pads{in_right_pad_d, in_right_pad_h, in_right_pad_w};

Tensor<InDataType> in_n_c_di_hi_wi(f_host_tensor_descriptor(N, C, Di, Hi, Wi, InLayout{}));
Tensor<OutDataType> out_n_c_do_ho_wo_host(f_host_tensor_descriptor(N, C, Do, Ho, Wo, OutLayout{}));
@@ -126,10 +135,11 @@ bool pool3d_test(bool do_verification,
{N, C, Di, Hi, Wi},
{Z, Y, X},
{N, C, Do, Ho, Wo},
f_tensor_strides_ncdhw(N, C, Di, Hi, Wi, InLayout{}),
f_tensor_strides_ncdhw(N, C, Do, Ho, Wo, OutLayout{}),
f_tensor_strides_ncdhw(N, C, Do, Ho, Wo, OutLayout{}),
window_strides,
window_dilations,
input_left_pads,
input_right_pads,
{2, 3, 4});
@@ -165,6 +175,7 @@ bool pool3d_test(bool do_verification,
out_indices_n_c_do_ho_wo_host,
window_spatial_lengths,
window_strides,
window_dilations,
input_left_pads,
input_right_pads);
...
@@ -27,31 +27,49 @@ static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
static constexpr bool OutputIndex = false;
static constexpr bool PropagateNan = false;
using DevicePoolFwdInstance =
ck::tensor_operation::device::DevicePool3dFwd_NDHWC_NDHWC<InDataType,
OutDataType,
IndexDataType,
ComputeDataType,
ReduceOpId,
OutputIndex,
64, // BlockSize
64, // ReduceMThreadClusterSize
1, // ReduceKThreadClusterSize
1, // ReduceMThreadSliceSize
1, // ReduceKThreadSliceSize
1>; // InSrcOutDstVectorSize
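// Note on the tuning parameters above: BlockSize 64 with a 64x1 (M x K)
// thread cluster and 1x1 thread slices means each thread reduces one output
// point, walking the window sequentially; InSrcOutDstVectorSize = 1 keeps
// loads and stores scalar, so the instance is usable regardless of the
// alignment of the contiguous (C) dimension.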
int main()
{
bool do_verification = true;
bool time_kernel = false;

// Pool shape
ck::index_t N = 2;
ck::index_t C = 32;
ck::index_t Z = 2;
ck::index_t Y = 2;
ck::index_t X = 2;
ck::index_t Di = 30;
ck::index_t Hi = 30;
ck::index_t Wi = 30;
ck::index_t window_stride_d = 2;
ck::index_t window_stride_h = 2;
ck::index_t window_stride_w = 2;
ck::index_t window_dilation_d = 1;
ck::index_t window_dilation_h = 1;
ck::index_t window_dilation_w = 1;
ck::index_t in_left_pad_d = 1;
ck::index_t in_left_pad_h = 1;
ck::index_t in_left_pad_w = 1;
ck::index_t in_right_pad_d = 1;
ck::index_t in_right_pad_h = 1;
ck::index_t in_right_pad_w = 1;
bool pass = pool3d_test<DevicePoolFwdInstance,
InDataType,
OutDataType,
ComputeDataType,
IndexDataType,
@@ -72,6 +90,9 @@ int main()
window_stride_d,
window_stride_h,
window_stride_w,
window_dilation_d,
window_dilation_h,
window_dilation_w,
in_left_pad_d,
in_left_pad_h,
in_left_pad_w,
...
if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
add_example_executable(example_maxpool2d_bwd_bf16 maxpool2d_bwd_bf16.cpp)
endif()
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
add_example_executable(example_maxpool2d_bwd_fp16 maxpool2d_bwd_fp16.cpp)
endif()
if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
add_example_executable(example_maxpool2d_bwd_fp32 maxpool2d_bwd_fp32.cpp)
endif()
@@ -24,18 +24,20 @@ int main()
bool time_kernel = false;

// Pool shape
ck::index_t N = 1;
ck::index_t C = 1;
ck::index_t Y = 3;
ck::index_t X = 3;
ck::index_t Hi = 32;
ck::index_t Wi = 32;
ck::index_t window_stride_h = 1;
ck::index_t window_stride_w = 1;
ck::index_t window_dilation_h = 1;
ck::index_t window_dilation_w = 1;
ck::index_t in_left_pad_h = 0;
ck::index_t in_left_pad_w = 0;
ck::index_t in_right_pad_h = 0;
ck::index_t in_right_pad_w = 0;
bool pass = maxpool_bwd_test<InDataType,
OutDataType,
@@ -53,6 +55,8 @@ int main()
Wi,
window_stride_h,
window_stride_w,
window_dilation_h,
window_dilation_w,
in_left_pad_h,
in_left_pad_w,
in_right_pad_h,
...
@@ -36,6 +36,8 @@ bool maxpool_bwd_test(bool do_verification,
ck::index_t Wi,
ck::index_t window_stride_h,
ck::index_t window_stride_w,
ck::index_t window_dilation_h,
ck::index_t window_dilation_w,
ck::index_t in_left_pad_h,
ck::index_t in_left_pad_w,
ck::index_t in_right_pad_h,
@@ -44,28 +46,30 @@ bool maxpool_bwd_test(bool do_verification,
using PassThrough = ck::tensor_operation::element_wise::PassThrough;

using DevicePoolFwdInstance =
ck::tensor_operation::device::DevicePool2dFwd_NHWC_NHWC<InDataType, // InDataType
OutDataType, // OutDataType
IndexDataType, // IndexDataType
ComputeDataType, // ComputeDataType
ck::ReduceTensorOp::MAX,
true, // OutputIndex
64, // BlockSize
64, // ReduceMThreadClusterSize
1, // ReduceKThreadClusterSize
4, // ReduceMThreadSliceSize
1, // ReduceKThreadSliceSize
1>; // InSrcOutDstVectorSize

using DeviceMaxPoolBwdInstance = ck::tensor_operation::device::
DeviceIndexPoolBwdImpl<DOutDataType, IndexDataType, DInDataType, 4>;

const ck::index_t Ys = (Y - 1) * window_dilation_h + 1;
const ck::index_t Xs = (X - 1) * window_dilation_w + 1;
const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Ys) / window_stride_h + 1;
const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - Xs) / window_stride_w + 1;

const std::vector<ck::index_t> window_spatial_lengths{Y, X};
const std::vector<ck::index_t> window_strides{window_stride_h, window_stride_w};
const std::vector<ck::index_t> window_dilations{window_dilation_h, window_dilation_w};
const std::vector<ck::index_t> input_left_pads{in_left_pad_h, in_left_pad_w};
const std::vector<ck::index_t> input_right_pads{in_right_pad_h, in_right_pad_w};
@@ -128,6 +132,7 @@ bool maxpool_bwd_test(bool do_verification,
{C * Ho * Wo, 1, Wo * C, C},
{C * Ho * Wo, 1, Wo * C, C},
window_strides,
window_dilations,
input_left_pads,
input_right_pads,
{2, 3});
@@ -191,6 +196,7 @@ bool maxpool_bwd_test(bool do_verification,
indices_n_c_ho_wo_host,
window_spatial_lengths,
window_strides,
window_dilations,
input_left_pads,
input_right_pads);
ref_pooling_fwd_invoker.Run(ref_pooling_fwd_argument);
...
@@ -24,18 +24,20 @@ int main()
bool time_kernel = false;

// Pool shape
ck::index_t N = 1;
ck::index_t C = 1;
ck::index_t Y = 3;
ck::index_t X = 3;
ck::index_t Hi = 32;
ck::index_t Wi = 32;
ck::index_t window_stride_h = 1;
ck::index_t window_stride_w = 1;
ck::index_t window_dilation_h = 1;
ck::index_t window_dilation_w = 1;
ck::index_t in_left_pad_h = 0;
ck::index_t in_left_pad_w = 0;
ck::index_t in_right_pad_h = 0;
ck::index_t in_right_pad_w = 0;
bool pass = maxpool_bwd_test<InDataType,
OutDataType,
@@ -53,6 +55,8 @@ int main()
Wi,
window_stride_h,
window_stride_w,
window_dilation_h,
window_dilation_w,
in_left_pad_h,
in_left_pad_w,
in_right_pad_h,
...
@@ -24,18 +24,20 @@ int main()
bool time_kernel = false;

// Pool shape
ck::index_t N = 1;
ck::index_t C = 1;
ck::index_t Y = 2;
ck::index_t X = 2;
ck::index_t Hi = 32;
ck::index_t Wi = 32;
ck::index_t window_stride_h = 2;
ck::index_t window_stride_w = 2;
ck::index_t window_dilation_h = 1;
ck::index_t window_dilation_w = 1;
ck::index_t in_left_pad_h = 0;
ck::index_t in_left_pad_w = 0;
ck::index_t in_right_pad_h = 0;
ck::index_t in_right_pad_w = 0;
bool pass = maxpool_bwd_test<InDataType,
OutDataType,
@@ -53,6 +55,8 @@ int main()
Wi,
window_stride_h,
window_stride_w,
window_dilation_h,
window_dilation_w,
in_left_pad_h,
in_left_pad_w,
in_right_pad_h,
...
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
add_example_executable(example_put_element_fp16 put_element_fp16.cpp)
endif()
add_example_executable(example_avgpool3d_bwd_bf16 avgpool3d_bwd_bf16.cpp)
add_example_executable(example_avgpool3d_bwd_fp16 avgpool3d_bwd_fp16.cpp)
add_example_executable(example_avgpool3d_bwd_fp32 avgpool3d_bwd_fp32.cpp)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_avgpool3d_bwd_ndhwc_ndhwc.hpp"
#include "avgpool3d_bwd_common.hpp"
using DOutDataType = ck::bhalf_t;
using DInDataType = ck::bhalf_t;
using ComputeDataType = float;
#if 1
using DOutLayout = ck::tensor_layout::convolution::NDHWC;
using DInLayout = ck::tensor_layout::convolution::NDHWC;
#else
using DOutLayout = ck::tensor_layout::convolution::NCDHW;
using DInLayout = ck::tensor_layout::convolution::NCDHW;
#endif
using DevicePoolBwdInstance =
ck::tensor_operation::device::DeviceAvgPool3dBwd_NDHWC_NDHWC<DOutDataType,
DInDataType,
ComputeDataType,
64, // BlockSize
64, // ReduceMThreadClusterSize
1, // ReduceKThreadClusterSize
1, // ReduceMThreadSliceSize
1, // ReduceKThreadSliceSize
1>; // InSrcOutDstVectorSize
int main()
{
std::vector<ck::index_t> window_lengths = {5, 5, 5};
std::vector<ck::index_t> window_strides = {2, 2, 2};
std::vector<ck::index_t> window_dilations = {2, 2, 2};
std::vector<ck::index_t> dinput_left_pads = {0, 0, 0};
std::vector<ck::index_t> dinput_right_pads = {0, 0, 0};
ck::index_t N = 1;
ck::index_t C = 16;
ck::index_t Di = 40;
ck::index_t Hi = 40;
ck::index_t Wi = 40;
pool3d_bwd_test<DevicePoolBwdInstance, DOutDataType, DInDataType, DOutLayout, DInLayout>(
true,
false,
N,
C,
Di,
Hi,
Wi,
window_lengths,
window_strides,
window_dilations,
dinput_left_pads,
dinput_right_pads);
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_avgpool_bwd.hpp"
template <typename TensorLayout>
std::vector<ck::index_t> f_tensor_strides_ncdhw(ck::index_t N_,
ck::index_t C_,
ck::index_t D,
ck::index_t H,
ck::index_t W,
TensorLayout layout)
{
using namespace ck::literals;
(void)N_;
if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCDHW>::value)
return {C_ * D * H * W, D * H * W, H * W, W, 1_uz};
else if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NDHWC>::value)
return {D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_};
};
template <typename TensorLayout>
HostTensorDescriptor f_host_tensor_descriptor(std::size_t N_,
std::size_t C_,
std::size_t D,
std::size_t H,
std::size_t W,
TensorLayout layout)
{
using namespace ck::literals;
if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCDHW>::value)
{
return HostTensorDescriptor({N_, C_, D, H, W}, {C_ * D * H * W, D * H * W, H * W, W, 1_uz});
}
else if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NDHWC>::value)
{
return HostTensorDescriptor({N_, C_, D, H, W},
{D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_});
}
};
template <typename DevicePoolBwdInstance,
typename DOutDataType,
typename DInDataType,
typename DOutLayout,
typename DInLayout>
bool pool3d_bwd_test(bool do_verification,
bool time_kernel,
ck::index_t N,
ck::index_t C,
ck::index_t Di,
ck::index_t Hi,
ck::index_t Wi,
std::vector<ck::index_t> window_lengths,
std::vector<ck::index_t> window_strides,
std::vector<ck::index_t> window_dilations,
std::vector<ck::index_t> dinput_left_pads,
std::vector<ck::index_t> dinput_right_pads)
{
auto OutSpatialLength = [&](auto InSpatialLength, int index) {
ck::index_t left_pad = dinput_left_pads[index];
ck::index_t right_pad = dinput_right_pads[index];
ck::index_t window_len = window_lengths[index];
ck::index_t stride = window_strides[index];
ck::index_t dilation = window_dilations[index];
ck::index_t eff = (window_len - 1) * dilation + 1;
return (InSpatialLength + left_pad + right_pad - eff) / stride + 1;
};
ck::index_t Do = OutSpatialLength(Di, 0);
ck::index_t Ho = OutSpatialLength(Hi, 1);
ck::index_t Wo = OutSpatialLength(Wi, 2);
Tensor<DOutDataType> dout(f_host_tensor_descriptor(N, C, Do, Ho, Wo, DOutLayout{}));
Tensor<DInDataType> din_dev(f_host_tensor_descriptor(N, C, Di, Hi, Wi, DInLayout{}));
Tensor<DInDataType> din_host(f_host_tensor_descriptor(N, C, Di, Hi, Wi, DInLayout{}));
std::cout << "dout: " << dout.mDesc << std::endl;
std::cout << "din_host: " << din_host.mDesc << std::endl;
dout.GenerateTensorValue(GeneratorTensor_3<DOutDataType>{0.0, 1.0});
DeviceMem dout_device_buf(sizeof(DOutDataType) * dout.mDesc.GetElementSpaceSize());
DeviceMem din_device_buf(sizeof(DInDataType) * din_dev.mDesc.GetElementSpaceSize());
dout_device_buf.ToDevice(dout.mData.data());
din_device_buf.SetZero();
auto pool = DevicePoolBwdInstance{};
auto invoker_ptr = pool.MakeInvokerPointer();
auto argument_ptr =
pool.MakeArgumentPointer(static_cast<DOutDataType*>(dout_device_buf.GetDeviceBuffer()),
static_cast<DInDataType*>(din_device_buf.GetDeviceBuffer()),
{N, C, Do, Ho, Wo},
{N, C, Di, Hi, Wi},
f_tensor_strides_ncdhw(N, C, Do, Ho, Wo, DOutLayout{}),
f_tensor_strides_ncdhw(N, C, Di, Hi, Wi, DInLayout{}),
window_lengths,
window_strides,
window_dilations,
dinput_left_pads,
dinput_right_pads);
if(!pool.IsSupportedArgument(argument_ptr.get()))
{
throw std::runtime_error("wrong! device_op with the specified compilation parameters does "
"not support this problem");
}
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::cout << "Perf: " << ave_time << std::endl;
bool pass = true;
if(do_verification)
{
auto ref_pool =
ck::tensor_operation::host::ReferenceAvgPoolBwd<3, DInDataType, DOutDataType>();
auto ref_invoker = ref_pool.MakeInvoker();
auto ref_argument = ref_pool.MakeArgument(din_host,
dout,
window_lengths,
window_strides,
window_dilations,
dinput_left_pads,
dinput_right_pads);
ref_invoker.Run(ref_argument);
din_device_buf.FromDevice(din_dev.mData.data());
pass = ck::utils::check_err(din_dev, din_host);
}
return pass;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_avgpool3d_bwd_ndhwc_ndhwc.hpp"
#include "avgpool3d_bwd_common.hpp"
using DOutDataType = ck::half_t;
using DInDataType = ck::half_t;
using ComputeDataType = float;
#if 1
using DOutLayout = ck::tensor_layout::convolution::NDHWC;
using DInLayout = ck::tensor_layout::convolution::NDHWC;
#else
using DOutLayout = ck::tensor_layout::convolution::NCDHW;
using DInLayout = ck::tensor_layout::convolution::NCDHW;
#endif
using DevicePoolBwdInstance =
ck::tensor_operation::device::DeviceAvgPool3dBwd_NDHWC_NDHWC<DOutDataType,
DInDataType,
ComputeDataType,
64, // BlockSize
64, // ReduceMThreadClusterSize
1, // ReduceKThreadClusterSize
1, // ReduceMThreadSliceSize
1, // ReduceKThreadSliceSize
1>; // InSrcOutDstVectorSize
int main()
{
std::vector<ck::index_t> window_lengths = {5, 5, 5};
std::vector<ck::index_t> window_strides = {2, 2, 2};
std::vector<ck::index_t> window_dilations = {2, 2, 2};
std::vector<ck::index_t> dinput_left_pads = {0, 0, 0};
std::vector<ck::index_t> dinput_right_pads = {0, 0, 0};
ck::index_t N = 1;
ck::index_t C = 16;
ck::index_t Di = 40;
ck::index_t Hi = 40;
ck::index_t Wi = 40;
pool3d_bwd_test<DevicePoolBwdInstance, DOutDataType, DInDataType, DOutLayout, DInLayout>(
true,
false,
N,
C,
Di,
Hi,
Wi,
window_lengths,
window_strides,
window_dilations,
dinput_left_pads,
dinput_right_pads);
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_avgpool3d_bwd_ndhwc_ndhwc.hpp"
#include "avgpool3d_bwd_common.hpp"
using DOutDataType = float;
using DInDataType = float;
using ComputeDataType = float;
#if 1
using DOutLayout = ck::tensor_layout::convolution::NDHWC;
using DInLayout = ck::tensor_layout::convolution::NDHWC;
#else
using DOutLayout = ck::tensor_layout::convolution::NCDHW;
using DInLayout = ck::tensor_layout::convolution::NCDHW;
#endif
using DevicePoolBwdInstance =
ck::tensor_operation::device::DeviceAvgPool3dBwd_NDHWC_NDHWC<DOutDataType,
DInDataType,
ComputeDataType,
64, // BlockSize
64, // ReduceMThreadClusterSize
1, // ReduceKThreadClusterSize
1, // ReduceMThreadSliceSize
1, // ReduceKThreadSliceSize
1>; // InSrcOutDstVectorSize
int main()
{
std::vector<ck::index_t> window_lengths = {5, 5, 5};
std::vector<ck::index_t> window_strides = {2, 2, 2};
std::vector<ck::index_t> window_dilations = {2, 2, 2};
std::vector<ck::index_t> dinput_left_pads = {0, 0, 0};
std::vector<ck::index_t> dinput_right_pads = {0, 0, 0};
ck::index_t N = 1;
ck::index_t C = 16;
ck::index_t Di = 40;
ck::index_t Hi = 40;
ck::index_t Wi = 40;
pool3d_bwd_test<DevicePoolBwdInstance, DOutDataType, DInDataType, DOutLayout, DInLayout>(
true,
false,
N,
C,
Di,
Hi,
Wi,
window_lengths,
window_strides,
window_dilations,
dinput_left_pads,
dinput_right_pads);
}
@@ -125,6 +125,9 @@
// `s_nop`s to avoid hazard
#define CK_USE_AMD_V_DOT_INLINE_ASM 0

// inner product using V_DOT with DPP8 modifiers
#define CK_USE_AMD_V_DOT_DPP8_INLINE_ASM 1

// block synchronization only s_wait lgkmcnt(0), not vmcnt(0)
#define CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1
@@ -198,7 +201,7 @@
#define CK_WORKAROUND_SWDEV_388832 1

// workaround: Grouped Conv2d_bwd_data fails for already implemented instance
#define CK_WORKAROUND_GITHUB_ISSUE_824 1

// flag to enable (1) or disable (0) the debugging output in some kernels
#define DEBUG_LOG 0
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/amd_gemm_dpp.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_adaptor.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v4r1.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_contraction_dl_dpp8.hpp"
namespace ck {
/**
* DPP8 version of blockwise GEMM algorithm. It uses DPP8 instruction modifier to limit
* the data loaded from LDS to registers.
*
* The algorithm groups threads into groups of size `dpp8::lane_group_size` and splits the matrix C
* between them in such a way that threads from the same group need the same chunk of either
* matrix A (or B, respectively). Without the usage of DPP8, each thread would need to load the
* whole chunk from LDS to its own register space.
* Usage of DPP8 modifiers allows each thread to load less data, exactly `1 / dpp8::lane_group_size`
* of the chunk, and then share that data with other threads from the same lane group.
*
* Assumptions coming from the usage of DPP8:
* 1. `BM10BN10ThreadClusterBM10Xs[1] == dpp8::lane_group_size` or
*    `BM10BN10ThreadClusterBN10Xs[1] == dpp8::lane_group_size`:
*    - it makes consecutive `dpp8::lane_group_size` threads use the same chunk of either
*      matrix A or B;
*    - based on these values we determine which matrix to share.
* 2. `BM1PerThreadBM11 % dpp8::lane_group_size == 0` (if sharing A) or
*    `BN1PerThreadBN11 % dpp8::lane_group_size == 0` (if sharing B):
*    - we have to make sure that the data to split is divisible by the number of
*      threads in the group.
*
* General algorithm:
* C[BM0, BM1, BN0, BN1] += transpose(A[K, BM0, BM1]) * B[K, BN0, BN1]
* A and B are visible to the whole block, C is distributed among each thread
* Assume:
* 1. A:
* 1. ABlockDesc_BK0_BM_BK1 is known at compile-time
* 2. ABlockBuffer is DynamicBuffer
* 2. B:
* 1. BBlockDesc_BK0_BN_BK1 is known at compile-time
* 2. BBlockBuffer is DynamicBuffer
* 3. C:
* 1. CThreadDesc_BM0_BM11_BN0_BN11 is known at compile-time
* 2. CThreadBuffer is StaticBuffer
* 4. BM10BN10ThreadClusterBM10Xs::Size() = BM10BN10ThreadClusterBN10Xs::Size() == 2
*/
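// Concrete illustration (assuming dpp8::lane_group_size == 8): if matrix A is
// shared and BM1PerThreadBM11 == 8, each thread loads only
// BM1PerThread = 8 / 8 = 1 element of the per-thread A chunk from LDS; the
// remaining 7 elements are obtained from the other lanes of the lane group via
// DPP8 cross-lane operands instead of 8 separate LDS loads per thread.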
template <index_t BlockSize,
typename FloatA,
typename FloatB,
typename FloatC,
typename ABlockDesc_BK0_BM_BK1,
typename BBlockDesc_BK0_BN_BK1,
index_t BM1PerThreadBM11,
index_t BN1PerThreadBN11,
index_t BK0PerThread,
typename BM10BN10ThreadClusterBM10Xs, // Sequence<BM10BN10ThreadClusterBM100,
// BM10BN10ThreadClusterBM101, ...>
typename BM10BN10ThreadClusterBN10Xs, // Sequence<BM10BN10ThreadClusterBN100,
// BM10BN10ThreadClusterBN101, ...>
index_t AThreadCopyScalarPerVector_BM11,
index_t BThreadCopyScalarPerVector_BN11,
typename enable_if<ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&
BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(),
bool>::type = false>
struct BlockwiseGemmDlDpp8_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_loop_BM0_BN0
{
using AIndex = MultiIndex<4>;
using BIndex = MultiIndex<4>;
using CIndex = MultiIndex<4>;
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
static constexpr index_t BK0 = ABlockDesc_BK0_BM_BK1{}.GetLength(I0);
static constexpr index_t BK1 = ABlockDesc_BK0_BM_BK1{}.GetLength(I2);
static constexpr index_t BM = ABlockDesc_BK0_BM_BK1{}.GetLength(I1);
static constexpr index_t BN = BBlockDesc_BK0_BN_BK1{}.GetLength(I1);
static constexpr index_t BM100 = BM10BN10ThreadClusterBM10Xs{}[I0];
static constexpr index_t BN100 = BM10BN10ThreadClusterBN10Xs{}[I0];
static constexpr index_t BM101 = BM10BN10ThreadClusterBM10Xs{}[I1];
static constexpr index_t BN101 = BM10BN10ThreadClusterBN10Xs{}[I1];
static constexpr index_t BM11 = BM1PerThreadBM11;
static constexpr index_t BN11 = BN1PerThreadBN11;
static constexpr index_t BM1 = BM100 * BM101 * BM11;
static constexpr index_t BN1 = BN100 * BN101 * BN11;
static constexpr index_t BM0 = BM / BM1;
static constexpr index_t BN0 = BN / BN1;
// We assume that either `BM101` or `BN101` is equal to `dpp8::lane_group_size`. This makes all
// threads in a lane group need the same chunk of matrix B or A, so that chunk can be shared
// using DPP.
static_assert(BM101 == dpp8::lane_group_size || BN101 == dpp8::lane_group_size);
static constexpr bool ShareB = (BM101 == dpp8::lane_group_size);
static constexpr bool ShareA = !ShareB;
// If DPP shares A (or B, respectively), each lane group gets `BM1PerThreadBM11`
// (`BN1PerThreadBN11`) elements; we split them between the threads of the lane group so that
// each thread loads less data from LDS.
static constexpr index_t BM1PerThread =
ShareA ? BM1PerThreadBM11 / dpp8::lane_group_size : BM1PerThreadBM11;
static constexpr index_t BN1PerThread =
ShareB ? BN1PerThreadBN11 / dpp8::lane_group_size : BN1PerThreadBN11;
__host__ __device__ static constexpr auto
MakeABlockDescriptor_BK0_BM0_BM1_BK1(const ABlockDesc_BK0_BM_BK1& a_block_desc_bk0_bm_bk1)
{
const auto a_block_bk0_bm0_bm1_bk1 = transform_tensor_descriptor(
a_block_desc_bk0_bm_bk1,
make_tuple(make_pass_through_transform(Number<BK0>{}),
make_unmerge_transform(make_tuple(Number<BM0>{}, Number<BM1>{})),
make_pass_through_transform(Number<BK1>{})),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));
return a_block_bk0_bm0_bm1_bk1;
}
__host__ __device__ static constexpr auto
MakeBBlockDescriptor_BK0_BN0_BN1_BK1(const BBlockDesc_BK0_BN_BK1& b_block_desc_bk0_bn_bk1)
{
const auto b_block_desc_bk0_bn0_bn1_bk1 = transform_tensor_descriptor(
b_block_desc_bk0_bn_bk1,
make_tuple(make_pass_through_transform(Number<BK0>{}),
make_unmerge_transform(make_tuple(Number<BN0>{}, Number<BN1>{})),
make_pass_through_transform(Number<BK1>{})),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));
return b_block_desc_bk0_bn0_bn1_bk1;
}
__host__ __device__ static constexpr auto
MakeCBlockAdaptor_BM0_BM100_BM101_BM11_BN0_BN100_BN101_BN11_To_BM_BN()
{
// upper: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11]
// lower: [BM, BN]
constexpr auto c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m_n =
make_single_stage_tensor_adaptor(
make_tuple(make_unmerge_transform(make_tuple(
Number<BM0>{}, Number<BM100>{}, Number<BM101>{}, Number<BM11>{})),
make_unmerge_transform(make_tuple(
Number<BN0>{}, Number<BN100>{}, Number<BN101>{}, Number<BN11>{}))),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 1, 2, 3>{}, Sequence<4, 5, 6, 7>{}));
return c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m_n;
}
__host__ __device__ static constexpr auto
MakeCBlockAdaptor_BM0_BM100_BM101_BM11_BN0_BN100_BN101_BN11_To_BM0_BM1_BN0_BN1()
{
// upper: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11]
// lower: [BM0, BM1, BN0, BN1]
constexpr auto c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m0_m1_n0_n1 =
make_single_stage_tensor_adaptor(
make_tuple(make_pass_through_transform(Number<BM0>{}),
make_unmerge_transform(
make_tuple(Number<BM100>{}, Number<BM101>{}, Number<BM11>{})),
make_pass_through_transform(Number<BN0>{}),
make_unmerge_transform(
make_tuple(Number<BN100>{}, Number<BN101>{}, Number<BN11>{}))),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{}, Sequence<5, 6, 7>{}));
return c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m0_m1_n0_n1;
}
__host__ __device__ static constexpr auto GetCThreadTensorLengths_BM0_BM1_BN0_BN1()
{
return Sequence<BM0, BM11, BN0, BN11>{};
}
static constexpr auto a_block_desc_bk0_bm0_bm1_bk1_ =
MakeABlockDescriptor_BK0_BM0_BM1_BK1(ABlockDesc_BK0_BM_BK1{});
static constexpr auto b_block_desc_bk0_bn0_bn1_bk1_ =
MakeBBlockDescriptor_BK0_BN0_BN1_BK1(BBlockDesc_BK0_BN_BK1{});
public:
__device__ BlockwiseGemmDlDpp8_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_loop_BM0_BN0()
: c_thread_origin_data_idx_{CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1(
get_thread_local_1d_id())},
a_thread_copy_{CalculateAThreadOriginOnBlock_BK0_BM0_BM1_BK1()},
b_thread_copy_{CalculateBThreadOriginOnBlock_BK0_BN0_BN1_BK1()}
{
static_assert(ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&
BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(),
"wrong! Desc should be known at compile-time");
static_assert(BM % BM1 == 0 && BN % BN1 == 0, "wrong!");
static_assert(ABlockDesc_BK0_BM_BK1{}.GetLength(I0) ==
BBlockDesc_BK0_BN_BK1{}.GetLength(I0),
"wrong! K dimension not consistent");
static_assert(BM10BN10ThreadClusterBM10Xs::Size() == 2 &&
BM10BN10ThreadClusterBN10Xs::Size() == 2,
"wrong!");
}
__device__ static CIndex CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1(index_t thread_id)
{
// lower: [BM0, BM1, BN0, BN1]
// upper: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11]
constexpr auto adaptor0 =
MakeCBlockAdaptor_BM0_BM100_BM101_BM11_BN0_BN100_BN101_BN11_To_BM0_BM1_BN0_BN1();
// lower: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11]
// upper: [Tid, BM0, BM11, BN0, BN11]
constexpr auto adaptor1 = make_single_stage_tensor_adaptor(
make_tuple(make_merge_transform(make_tuple(BM100, BN100, BM101, BN101)),
make_pass_through_transform(BM0),
make_pass_through_transform(BM11),
make_pass_through_transform(BN0),
make_pass_through_transform(BN11)),
make_tuple(
Sequence<1, 5, 2, 6>{}, Sequence<0>{}, Sequence<3>{}, Sequence<4>{}, Sequence<7>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
constexpr auto adaptor = chain_tensor_adaptors(adaptor0, adaptor1);
return adaptor.CalculateBottomIndex(make_multi_index(thread_id, 0, 0, 0, 0));
}
__device__ AIndex CalculateAThreadOriginOnBlock_BK0_BM0_BM1_BK1()
{
const auto offsetBM0 = c_thread_origin_data_idx_[I0];
// If sharing matrix A, we need a separate BM1 offset for each thread in lane group.
const auto offsetBM1 = ShareA ? c_thread_origin_data_idx_[I1] +
dpp8::get_thread_idx_in_lane_group() * BM1PerThread
: c_thread_origin_data_idx_[I1];
return make_tuple(0, offsetBM0, offsetBM1, 0);
}
__device__ BIndex CalculateBThreadOriginOnBlock_BK0_BN0_BN1_BK1()
{
const auto offsetBN0 = c_thread_origin_data_idx_[I2];
// If sharing matrix B, we need a separate BN1 offset for each thread in lane group.
const auto offsetBN1 = ShareB ? c_thread_origin_data_idx_[I3] +
dpp8::get_thread_idx_in_lane_group() * BN1PerThread
: c_thread_origin_data_idx_[I3];
return make_tuple(0, offsetBN0, offsetBN1, 0);
}
template <typename CThreadDesc_BM0_BM11_BN0_BN11,
typename ABlockBuffer,
typename BBlockBuffer,
typename CThreadBuffer>
__device__ void Run(const CThreadDesc_BM0_BM11_BN0_BN11&,
const ABlockBuffer& a_block_buf,
const BBlockBuffer& b_block_buf,
CThreadBuffer& c_thread_buf) const
{
static_assert(CThreadDesc_BM0_BM11_BN0_BN11::IsKnownAtCompileTime(),
"wrong! Desc should be known at compile-time");
auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatA>(
a_thread_desc_bk0_bm0_bm1_bk1_.GetElementSpaceSize());
auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatB>(
b_thread_desc_bk0_bn0_bn1_bk1_.GetElementSpaceSize());
constexpr auto threadwise_contraction =
ThreadwiseContractionDlDpp8_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1<
FloatA,
FloatB,
FloatC,
decltype(a_thread_desc_bk0_bm0_bm1_bk1_),
decltype(b_thread_desc_bk0_bn0_bn1_bk1_),
CThreadDesc_BM0_BM11_BN0_BN11,
Sequence<BK0PerThread, BK1>,
Sequence<1, BM1PerThreadBM11>,
Sequence<1, BN1PerThreadBN11>,
ShareA>{};
static_for<0, BN0, 1>{}([&](auto bn0) {
static_for<0, BM0, 1>{}([&](auto bm0) {
a_thread_copy_.Run(a_block_desc_bk0_bm0_bm1_bk1_,
make_tuple(I0, bm0, I0, I0),
a_block_buf,
a_thread_desc_bk0_bm0_bm1_bk1_,
make_tuple(I0, I0, I0, I0),
a_thread_buf);
b_thread_copy_.Run(b_block_desc_bk0_bn0_bn1_bk1_,
make_tuple(I0, bn0, I0, I0),
b_block_buf,
b_thread_desc_bk0_bn0_bn1_bk1_,
make_tuple(I0, I0, I0, I0),
b_thread_buf);
threadwise_contraction.Run(a_thread_buf,
make_tuple(I0, I0, I0, I0),
b_thread_buf,
make_tuple(I0, I0, I0, I0),
c_thread_buf,
make_tuple(bm0, I0, bn0, I0));
static_for<BK0PerThread, BK0, BK0PerThread>{}([&](auto bk0) {
a_thread_copy_.Run(a_block_desc_bk0_bm0_bm1_bk1_,
make_tuple(bk0, bm0, I0, I0),
a_block_buf,
a_thread_desc_bk0_bm0_bm1_bk1_,
make_tuple(I0, I0, I0, I0),
a_thread_buf);
b_thread_copy_.Run(b_block_desc_bk0_bn0_bn1_bk1_,
make_tuple(bk0, bn0, I0, I0),
b_block_buf,
b_thread_desc_bk0_bn0_bn1_bk1_,
make_tuple(I0, I0, I0, I0),
b_thread_buf);
threadwise_contraction.Run(a_thread_buf,
make_tuple(I0, I0, I0, I0),
b_thread_buf,
make_tuple(I0, I0, I0, I0),
c_thread_buf,
make_tuple(bm0, I0, bn0, I0));
});
});
});
}
private:
// A[BK0, BM0, BM1, BK1]
static constexpr auto a_thread_desc_bk0_bm0_bm1_bk1_ = make_naive_tensor_descriptor_packed(
make_tuple(Number<BK0PerThread>{}, Number<BM0>{}, Number<BM1PerThread>{}, Number<BK1>{}));
// B[BK0, BN0, BN1, BK1]
static constexpr auto b_thread_desc_bk0_bn0_bn1_bk1_ = make_naive_tensor_descriptor_packed(
make_tuple(Number<BK0PerThread>{}, Number<BN0>{}, Number<BN1PerThread>{}, Number<BK1>{}));
using AThreadCopy = ThreadwiseTensorSliceTransfer_v4r1<
FloatA,
FloatA,
decltype(a_block_desc_bk0_bm0_bm1_bk1_),
decltype(a_thread_desc_bk0_bm0_bm1_bk1_),
Sequence<BK0PerThread, 1, BM1PerThread, BK1>, // SliceLengths
Sequence<0, 1, 2, 3>, // DimAccessOrder
Sequence<1, 1, BM1PerThread, BK1>, // SrcVectorTensorLengths
Sequence<0, 1, 2, 3>>; // SrcVectorTensorContiguousDimOrder
using BThreadCopy = ThreadwiseTensorSliceTransfer_v4r1<
FloatB,
FloatB,
decltype(b_block_desc_bk0_bn0_bn1_bk1_),
decltype(b_thread_desc_bk0_bn0_bn1_bk1_),
Sequence<BK0PerThread, 1, BN1PerThread, BK1>, // SliceLengths
Sequence<0, 1, 2, 3>, // DimAccessOrder
Sequence<1, 1, BN1PerThread, BK1>, // SrcVectorTensorLengths
Sequence<0, 1, 2, 3>>; // SrcVectorTensorContiguousDimOrder
CIndex c_thread_origin_data_idx_;
AThreadCopy a_thread_copy_;
BThreadCopy b_thread_copy_;
};
} // namespace ck
@@ -11,7 +11,7 @@
namespace ck {

// C[BM0, BM1, BN0, BN1] += transpose(A[K, BM0, BM1]) * B[K, BN0, BN1]
// A and B are visible to the whole block, C is distributed among each thread
// Assume:
// 1. A:
// 1. ABlockDesc_BK0_BM_BK1 is known at compile-time
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include "ck/tensor_operation/gpu/device/device_base.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <index_t NDimSpatial,
typename DOutDataType,
typename DInDataType,
typename DOutLayout,
typename DInLayout>
struct DeviceAvgPoolBwd : public BaseOperator
{
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_dout,
void* p_din,
std::vector<ck::index_t> dout_n_k_wos_lengths,
std::vector<ck::index_t> dout_n_k_wos_strides,
std::vector<ck::index_t> din_n_k_wos_lengths,
std::vector<ck::index_t> din_n_k_wos_strides,
std::vector<ck::index_t> window_k_c_xs_lengths,
std::vector<ck::index_t> window_strides,
std::vector<ck::index_t> window_dilations,
std::vector<ck::index_t> input_left_pads,
std::vector<ck::index_t> input_right_pads) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
@@ -27,15 +27,12 @@ struct DeviceGroupedConvBwdWeight : public BaseOperator
MakeArgumentPointer(const void* p_in,
void* p_wei,
const void* p_out,
const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths, // input
const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths, // weight
const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,
const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_lengths, // output
const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_strides,
const std::array<ck::index_t, NDimSpatial>& conv_filter_strides,
const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations,
const std::array<ck::index_t, NDimSpatial>& input_left_pads,
...
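A minimal sketch of how a caller might populate the new array-style arguments
for a 2D grouped convolution (NDimSpatial = 2; all sizes here are hypothetical,
chosen only for illustration):

// G = 1, N = 2, C = 8, K = 16, Hi = Wi = 28, Y = X = 3, stride 1, no padding
std::array<ck::index_t, 5> a_g_n_c_wis_lengths{1, 2, 8, 28, 28};  // input  [G, N, C, Hi, Wi]
std::array<ck::index_t, 5> b_g_k_c_xs_lengths{1, 16, 8, 3, 3};    // weight [G, K, C, Y, X]
std::array<ck::index_t, 5> e_g_n_k_wos_lengths{1, 2, 16, 26, 26}; // output [G, N, K, Ho, Wo], Ho = Wo = 28 - 3 + 1 = 26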