Commit 56863b9a authored by Jing Zhang

add fp8 support

parents 54df59bf d4c84256
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
if(DL_KERNELS)
add_example_executable(example_gemm_add_multiply_dl_fp16 gemm_add_multiply_dl_fp16.cpp)
endif()
add_example_executable(example_gemm_add_multiply_xdl_fp16 gemm_add_multiply_xdl_fp16.cpp)
endif()
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
add_example_executable(example_pool3d_fwd_fp16 pool3d_fwd_fp16.cpp)
endif()
@@ -18,7 +18,45 @@
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp"

template <typename TensorLayout>
std::vector<ck::index_t> f_tensor_strides_ncdhw(ck::index_t N_,
ck::index_t C_,
ck::index_t D,
ck::index_t H,
ck::index_t W,
TensorLayout layout)
{
using namespace ck::literals;
(void)N_;
if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCDHW>::value)
return {C_ * D * H * W, D * H * W, H * W, W, 1_uz};
else if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NDHWC>::value)
return {D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_};
};
template <typename TensorLayout>
HostTensorDescriptor f_host_tensor_descriptor(std::size_t N_,
std::size_t C_,
std::size_t D,
std::size_t H,
std::size_t W,
TensorLayout layout)
{
using namespace ck::literals;
if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCDHW>::value)
{
return HostTensorDescriptor({N_, C_, D, H, W}, {C_ * D * H * W, D * H * W, H * W, W, 1_uz});
}
else if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NDHWC>::value)
{
return HostTensorDescriptor({N_, C_, D, H, W},
{D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_});
}
};
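// Worked example of the two layouts handled above, with hypothetical sizes
// N = 1, C = 2, D = 3, H = 4, W = 5 (illustration only, not part of the commit):
//   NCDHW strides: {C*D*H*W, D*H*W, H*W, W, 1} = {120, 60, 20, 5, 1}
//   NDHWC strides: {D*C*H*W, 1, C*H*W, W*C, C} = {120, 1, 40, 10, 2}
// NDHWC keeps the channel dimension contiguous (stride 1 for C), while NCDHW
// keeps the innermost W dimension contiguous (stride 1 for W).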
template <typename DevicePoolFwdInstance,
typename InDataType,
typename OutDataType,
typename ComputeDataType,
typename IndexDataType,
@@ -40,6 +78,9 @@ bool pool3d_test(bool do_verification,
ck::index_t window_stride_d,
ck::index_t window_stride_h,
ck::index_t window_stride_w,
ck::index_t window_dilation_d,
ck::index_t window_dilation_h,
ck::index_t window_dilation_w,
ck::index_t in_left_pad_d,
ck::index_t in_left_pad_h,
ck::index_t in_left_pad_w,
@@ -47,53 +88,21 @@ bool pool3d_test(bool do_verification,
ck::index_t in_right_pad_h,
ck::index_t in_right_pad_w)
{
const ck::index_t Zs = (Z - 1) * window_dilation_d + 1;
const ck::index_t Ys = (Y - 1) * window_dilation_h + 1;
const ck::index_t Xs = (X - 1) * window_dilation_w + 1;

const ck::index_t Do = (Di + in_left_pad_d + in_right_pad_d - Zs) / window_stride_d + 1;
const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Ys) / window_stride_h + 1;
const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - Xs) / window_stride_w + 1;
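// The dilated window spans Zs/Ys/Xs input elements, so the output extent
// follows the usual sliding-window formula. For example, with the defaults
// used in main() below (Di = 30, pads = 1, Z = 2, dilation = 1, stride = 2):
//   Zs = (2 - 1) * 1 + 1 = 2
//   Do = (30 + 1 + 1 - 2) / 2 + 1 = 16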
const std::vector<ck::index_t> window_spatial_lengths{Z, Y, X};
const std::vector<ck::index_t> window_strides{window_stride_d, window_stride_h, window_stride_w};
const std::vector<ck::index_t> window_dilations{window_dilation_d, window_dilation_h, window_dilation_w};
const std::vector<ck::index_t> input_left_pads{in_left_pad_d, in_left_pad_h, in_left_pad_w};
const std::vector<ck::index_t> input_right_pads{in_right_pad_d, in_right_pad_h, in_right_pad_w};

Tensor<InDataType> in_n_c_di_hi_wi(f_host_tensor_descriptor(N, C, Di, Hi, Wi, InLayout{}));
Tensor<OutDataType> out_n_c_do_ho_wo_host(f_host_tensor_descriptor(N, C, Do, Ho, Wo, OutLayout{}));
@@ -126,10 +135,11 @@ bool pool3d_test(bool do_verification,
{N, C, Di, Hi, Wi},
{Z, Y, X},
{N, C, Do, Ho, Wo},
f_tensor_strides_ncdhw(N, C, Di, Hi, Wi, InLayout{}),
f_tensor_strides_ncdhw(N, C, Do, Ho, Wo, OutLayout{}),
f_tensor_strides_ncdhw(N, C, Do, Ho, Wo, OutLayout{}),
window_strides,
window_dilations,
input_left_pads,
input_right_pads,
{2, 3, 4});
@@ -165,6 +175,7 @@ bool pool3d_test(bool do_verification,
out_indices_n_c_do_ho_wo_host,
window_spatial_lengths,
window_strides,
window_dilations,
input_left_pads,
input_right_pads);
...
@@ -27,31 +27,49 @@ static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
static constexpr bool OutputIndex = false;
static constexpr bool PropagateNan = false;
using DevicePoolFwdInstance =
ck::tensor_operation::device::DevicePool3dFwd_NDHWC_NDHWC<InDataType,
OutDataType,
IndexDataType,
ComputeDataType,
ReduceOpId,
OutputIndex,
64, // BlockSize
64, // ReduceMThreadClusterSize
1, // ReduceKThreadClusterSize
1, // ReduceMThreadSliceSize
1, // ReduceKThreadSliceSize
1>; // InSrcOutDstVectorSize
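// Note on the tuning parameters above: BlockSize 64 with a 64x1 (M x K)
// thread cluster and 1x1 thread slices means each thread reduces one output
// point, walking the window sequentially; InSrcOutDstVectorSize = 1 keeps
// loads and stores scalar, so the instance is usable regardless of the
// alignment of the contiguous (C) dimension.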
int main()
{
bool do_verification = true;
bool time_kernel = false;

// Pool shape
ck::index_t N = 2;
ck::index_t C = 32;
ck::index_t Z = 2;
ck::index_t Y = 2;
ck::index_t X = 2;
ck::index_t Di = 30;
ck::index_t Hi = 30;
ck::index_t Wi = 30;
ck::index_t window_stride_d = 2;
ck::index_t window_stride_h = 2;
ck::index_t window_stride_w = 2;
ck::index_t window_dilation_d = 1;
ck::index_t window_dilation_h = 1;
ck::index_t window_dilation_w = 1;
ck::index_t in_left_pad_d = 1;
ck::index_t in_left_pad_h = 1;
ck::index_t in_left_pad_w = 1;
ck::index_t in_right_pad_d = 1;
ck::index_t in_right_pad_h = 1;
ck::index_t in_right_pad_w = 1;
bool pass = pool3d_test<DevicePoolFwdInstance,
InDataType,
OutDataType,
ComputeDataType,
IndexDataType,
@@ -72,6 +90,9 @@ int main()
window_stride_d,
window_stride_h,
window_stride_w,
window_dilation_d,
window_dilation_h,
window_dilation_w,
in_left_pad_d,
in_left_pad_h,
in_left_pad_w,
...
if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
add_example_executable(example_maxpool2d_bwd_bf16 maxpool2d_bwd_bf16.cpp)
endif()
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
add_example_executable(example_maxpool2d_bwd_fp16 maxpool2d_bwd_fp16.cpp)
endif()
if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
add_example_executable(example_maxpool2d_bwd_fp32 maxpool2d_bwd_fp32.cpp)
endif()
@@ -24,18 +24,20 @@ int main()
bool time_kernel = false;

// Pool shape
ck::index_t N = 1;
ck::index_t C = 1;
ck::index_t Y = 3;
ck::index_t X = 3;
ck::index_t Hi = 32;
ck::index_t Wi = 32;
ck::index_t window_stride_h = 1;
ck::index_t window_stride_w = 1;
ck::index_t window_dilation_h = 1;
ck::index_t window_dilation_w = 1;
ck::index_t in_left_pad_h = 0;
ck::index_t in_left_pad_w = 0;
ck::index_t in_right_pad_h = 0;
ck::index_t in_right_pad_w = 0;
bool pass = maxpool_bwd_test<InDataType,
OutDataType,
@@ -53,6 +55,8 @@ int main()
Wi,
window_stride_h,
window_stride_w,
window_dilation_h,
window_dilation_w,
in_left_pad_h,
in_left_pad_w,
in_right_pad_h,
...
@@ -36,6 +36,8 @@ bool maxpool_bwd_test(bool do_verification,
ck::index_t Wi,
ck::index_t window_stride_h,
ck::index_t window_stride_w,
ck::index_t window_dilation_h,
ck::index_t window_dilation_w,
ck::index_t in_left_pad_h,
ck::index_t in_left_pad_w,
ck::index_t in_right_pad_h,
@@ -44,28 +46,30 @@ bool maxpool_bwd_test(bool do_verification,
using PassThrough = ck::tensor_operation::element_wise::PassThrough;

using DevicePoolFwdInstance =
ck::tensor_operation::device::DevicePool2dFwd_NHWC_NHWC<InDataType, // InDataType
OutDataType, // OutDataType
IndexDataType, // IndexDataType
ComputeDataType, // ComputeDataType
ck::ReduceTensorOp::MAX,
true, // OutputIndex
64, // BlockSize
64, // ReduceMThreadClusterSize
1, // ReduceKThreadClusterSize
4, // ReduceMThreadSliceSize
1, // ReduceKThreadSliceSize
1>; // InSrcOutDstVectorSize

using DeviceMaxPoolBwdInstance = ck::tensor_operation::device::
DeviceIndexPoolBwdImpl<DOutDataType, IndexDataType, DInDataType, 4>;

const ck::index_t Ys = (Y - 1) * window_dilation_h + 1;
const ck::index_t Xs = (X - 1) * window_dilation_w + 1;
const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Ys) / window_stride_h + 1;
const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - Xs) / window_stride_w + 1;

const std::vector<ck::index_t> window_spatial_lengths{Y, X};
const std::vector<ck::index_t> window_strides{window_stride_h, window_stride_w};
const std::vector<ck::index_t> window_dilations{window_dilation_h, window_dilation_w};
const std::vector<ck::index_t> input_left_pads{in_left_pad_h, in_left_pad_w};
const std::vector<ck::index_t> input_right_pads{in_right_pad_h, in_right_pad_w};
@@ -128,6 +132,7 @@ bool maxpool_bwd_test(bool do_verification,
{C * Ho * Wo, 1, Wo * C, C},
{C * Ho * Wo, 1, Wo * C, C},
window_strides,
window_dilations,
input_left_pads,
input_right_pads,
{2, 3});
@@ -191,6 +196,7 @@ bool maxpool_bwd_test(bool do_verification,
indices_n_c_ho_wo_host,
window_spatial_lengths,
window_strides,
window_dilations,
input_left_pads,
input_right_pads);
ref_pooling_fwd_invoker.Run(ref_pooling_fwd_argument);
...
@@ -24,18 +24,20 @@ int main()
bool time_kernel = false;

// Pool shape
ck::index_t N = 1;
ck::index_t C = 1;
ck::index_t Y = 3;
ck::index_t X = 3;
ck::index_t Hi = 32;
ck::index_t Wi = 32;
ck::index_t window_stride_h = 1;
ck::index_t window_stride_w = 1;
ck::index_t window_dilation_h = 1;
ck::index_t window_dilation_w = 1;
ck::index_t in_left_pad_h = 0;
ck::index_t in_left_pad_w = 0;
ck::index_t in_right_pad_h = 0;
ck::index_t in_right_pad_w = 0;
bool pass = maxpool_bwd_test<InDataType,
OutDataType,
@@ -53,6 +55,8 @@ int main()
Wi,
window_stride_h,
window_stride_w,
window_dilation_h,
window_dilation_w,
in_left_pad_h,
in_left_pad_w,
in_right_pad_h,
...
@@ -24,18 +24,20 @@ int main()
bool time_kernel = false;

// Pool shape
ck::index_t N = 1;
ck::index_t C = 1;
ck::index_t Y = 2;
ck::index_t X = 2;
ck::index_t Hi = 32;
ck::index_t Wi = 32;
ck::index_t window_stride_h = 2;
ck::index_t window_stride_w = 2;
ck::index_t window_dilation_h = 1;
ck::index_t window_dilation_w = 1;
ck::index_t in_left_pad_h = 0;
ck::index_t in_left_pad_w = 0;
ck::index_t in_right_pad_h = 0;
ck::index_t in_right_pad_w = 0;
bool pass = maxpool_bwd_test<InDataType,
OutDataType,
@@ -53,6 +55,8 @@ int main()
Wi,
window_stride_h,
window_stride_w,
window_dilation_h,
window_dilation_w,
in_left_pad_h,
in_left_pad_w,
in_right_pad_h,
...
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
add_example_executable(example_put_element_fp16 put_element_fp16.cpp)
endif()
add_example_executable(example_avgpool3d_bwd_bf16 avgpool3d_bwd_bf16.cpp)
add_example_executable(example_avgpool3d_bwd_fp16 avgpool3d_bwd_fp16.cpp)
add_example_executable(example_avgpool3d_bwd_fp32 avgpool3d_bwd_fp32.cpp)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_avgpool3d_bwd_ndhwc_ndhwc.hpp"
#include "avgpool3d_bwd_common.hpp"
using DOutDataType = ck::bhalf_t;
using DInDataType = ck::bhalf_t;
using ComputeDataType = float;
#if 1
using DOutLayout = ck::tensor_layout::convolution::NDHWC;
using DInLayout = ck::tensor_layout::convolution::NDHWC;
#else
using DOutLayout = ck::tensor_layout::convolution::NCDHW;
using DInLayout = ck::tensor_layout::convolution::NCDHW;
#endif
using DevicePoolBwdInstance =
ck::tensor_operation::device::DeviceAvgPool3dBwd_NDHWC_NDHWC<DOutDataType,
DInDataType,
ComputeDataType,
64, // BlockSize
64, // ReduceMThreadClusterSize
1, // ReduceKThreadClusterSize
1, // ReduceMThreadSliceSize
1, // ReduceKThreadSliceSize
1>; // InSrcOutDstVectorSize
int main()
{
std::vector<ck::index_t> window_lengths = {5, 5, 5};
std::vector<ck::index_t> window_strides = {2, 2, 2};
std::vector<ck::index_t> window_dilations = {2, 2, 2};
std::vector<ck::index_t> dinput_left_pads = {0, 0, 0};
std::vector<ck::index_t> dinput_right_pads = {0, 0, 0};
ck::index_t N = 1;
ck::index_t C = 16;
ck::index_t Di = 40;
ck::index_t Hi = 40;
ck::index_t Wi = 40;
pool3d_bwd_test<DevicePoolBwdInstance, DOutDataType, DInDataType, DOutLayout, DInLayout>(
true,
false,
N,
C,
Di,
Hi,
Wi,
window_lengths,
window_strides,
window_dilations,
dinput_left_pads,
dinput_right_pads);
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_avgpool_bwd.hpp"
template <typename TensorLayout>
std::vector<ck::index_t> f_tensor_strides_ncdhw(ck::index_t N_,
ck::index_t C_,
ck::index_t D,
ck::index_t H,
ck::index_t W,
TensorLayout layout)
{
using namespace ck::literals;
(void)N_;
if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCDHW>::value)
return {C_ * D * H * W, D * H * W, H * W, W, 1_uz};
else if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NDHWC>::value)
return {D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_};
};
template <typename TensorLayout>
HostTensorDescriptor f_host_tensor_descriptor(std::size_t N_,
std::size_t C_,
std::size_t D,
std::size_t H,
std::size_t W,
TensorLayout layout)
{
using namespace ck::literals;
if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCDHW>::value)
{
return HostTensorDescriptor({N_, C_, D, H, W}, {C_ * D * H * W, D * H * W, H * W, W, 1_uz});
}
else if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NDHWC>::value)
{
return HostTensorDescriptor({N_, C_, D, H, W},
{D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_});
}
};
template <typename DevicePoolBwdInstance,
typename DOutDataType,
typename DInDataType,
typename DOutLayout,
typename DInLayout>
bool pool3d_bwd_test(bool do_verification,
bool time_kernel,
ck::index_t N,
ck::index_t C,
ck::index_t Di,
ck::index_t Hi,
ck::index_t Wi,
std::vector<ck::index_t> window_lengths,
std::vector<ck::index_t> window_strides,
std::vector<ck::index_t> window_dilations,
std::vector<ck::index_t> dinput_left_pads,
std::vector<ck::index_t> dinput_right_pads)
{
auto OutSpatialLength = [&](auto InSpatialLength, int index) {
ck::index_t left_pad = dinput_left_pads[index];
ck::index_t right_pad = dinput_right_pads[index];
ck::index_t window_len = window_lengths[index];
ck::index_t stride = window_strides[index];
ck::index_t dilation = window_dilations[index];
ck::index_t eff = (window_len - 1) * dilation + 1;
return (InSpatialLength + left_pad + right_pad - eff) / stride + 1;
};
ck::index_t Do = OutSpatialLength(Di, 0);
ck::index_t Ho = OutSpatialLength(Hi, 1);
ck::index_t Wo = OutSpatialLength(Wi, 2);
Tensor<DOutDataType> dout(f_host_tensor_descriptor(N, C, Do, Ho, Wo, DOutLayout{}));
Tensor<DInDataType> din_dev(f_host_tensor_descriptor(N, C, Di, Hi, Wi, DInLayout{}));
Tensor<DInDataType> din_host(f_host_tensor_descriptor(N, C, Di, Hi, Wi, DInLayout{}));
std::cout << "dout: " << dout.mDesc << std::endl;
std::cout << "din_host: " << din_host.mDesc << std::endl;
dout.GenerateTensorValue(GeneratorTensor_3<DOutDataType>{0.0, 1.0});
DeviceMem dout_device_buf(sizeof(DOutDataType) * dout.mDesc.GetElementSpaceSize());
DeviceMem din_device_buf(sizeof(DInDataType) * din_dev.mDesc.GetElementSpaceSize());
dout_device_buf.ToDevice(dout.mData.data());
din_device_buf.SetZero();
auto pool = DevicePoolBwdInstance{};
auto invoker_ptr = pool.MakeInvokerPointer();
auto argument_ptr =
pool.MakeArgumentPointer(static_cast<DOutDataType*>(dout_device_buf.GetDeviceBuffer()),
static_cast<DInDataType*>(din_device_buf.GetDeviceBuffer()),
{N, C, Do, Ho, Wo},
{N, C, Di, Hi, Wi},
f_tensor_strides_ncdhw(N, C, Do, Ho, Wo, DOutLayout{}),
f_tensor_strides_ncdhw(N, C, Di, Hi, Wi, DInLayout{}),
window_lengths,
window_strides,
window_dilations,
dinput_left_pads,
dinput_right_pads);
if(!pool.IsSupportedArgument(argument_ptr.get()))
{
throw std::runtime_error("wrong! device_op with the specified compilation parameters does "
"not support this problem");
}
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::cout << "Perf: " << ave_time << std::endl;
bool pass = true;
if(do_verification)
{
auto ref_pool =
ck::tensor_operation::host::ReferenceAvgPoolBwd<3, DInDataType, DOutDataType>();
auto ref_invoker = ref_pool.MakeInvoker();
auto ref_argument = ref_pool.MakeArgument(din_host,
dout,
window_lengths,
window_strides,
window_dilations,
dinput_left_pads,
dinput_right_pads);
ref_invoker.Run(ref_argument);
din_device_buf.FromDevice(din_dev.mData.data());
pass = ck::utils::check_err(din_dev, din_host);
}
return pass;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_avgpool3d_bwd_ndhwc_ndhwc.hpp"
#include "avgpool3d_bwd_common.hpp"
using DOutDataType = ck::half_t;
using DInDataType = ck::half_t;
using ComputeDataType = float;
#if 1
using DOutLayout = ck::tensor_layout::convolution::NDHWC;
using DInLayout = ck::tensor_layout::convolution::NDHWC;
#else
using DOutLayout = ck::tensor_layout::convolution::NCDHW;
using DInLayout = ck::tensor_layout::convolution::NCDHW;
#endif
using DevicePoolBwdInstance =
ck::tensor_operation::device::DeviceAvgPool3dBwd_NDHWC_NDHWC<DOutDataType,
DInDataType,
ComputeDataType,
64, // BlockSize
64, // ReduceMThreadClusterSize
1, // ReduceKThreadClusterSize
1, // ReduceMThreadSliceSize
1, // ReduceKThreadSliceSize
1>; // InSrcOutDstVectorSize
int main()
{
std::vector<ck::index_t> window_lengths = {5, 5, 5};
std::vector<ck::index_t> window_strides = {2, 2, 2};
std::vector<ck::index_t> window_dilations = {2, 2, 2};
std::vector<ck::index_t> dinput_left_pads = {0, 0, 0};
std::vector<ck::index_t> dinput_right_pads = {0, 0, 0};
ck::index_t N = 1;
ck::index_t C = 16;
ck::index_t Di = 40;
ck::index_t Hi = 40;
ck::index_t Wi = 40;
pool3d_bwd_test<DevicePoolBwdInstance, DOutDataType, DInDataType, DOutLayout, DInLayout>(
true,
false,
N,
C,
Di,
Hi,
Wi,
window_lengths,
window_strides,
window_dilations,
dinput_left_pads,
dinput_right_pads);
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_avgpool3d_bwd_ndhwc_ndhwc.hpp"
#include "avgpool3d_bwd_common.hpp"
using DOutDataType = float;
using DInDataType = float;
using ComputeDataType = float;
#if 1
using DOutLayout = ck::tensor_layout::convolution::NDHWC;
using DInLayout = ck::tensor_layout::convolution::NDHWC;
#else
using DOutLayout = ck::tensor_layout::convolution::NCDHW;
using DInLayout = ck::tensor_layout::convolution::NCDHW;
#endif
using DevicePoolBwdInstance =
ck::tensor_operation::device::DeviceAvgPool3dBwd_NDHWC_NDHWC<DOutDataType,
DInDataType,
ComputeDataType,
64, // BlockSize
64, // ReduceMThreadClusterSize
1, // ReduceKThreadClusterSize
1, // ReduceMThreadSliceSize
1, // ReduceKThreadSliceSize
1>; // InSrcOutDstVectorSize
int main()
{
std::vector<ck::index_t> window_lengths = {5, 5, 5};
std::vector<ck::index_t> window_strides = {2, 2, 2};
std::vector<ck::index_t> window_dilations = {2, 2, 2};
std::vector<ck::index_t> dinput_left_pads = {0, 0, 0};
std::vector<ck::index_t> dinput_right_pads = {0, 0, 0};
ck::index_t N = 1;
ck::index_t C = 16;
ck::index_t Di = 40;
ck::index_t Hi = 40;
ck::index_t Wi = 40;
pool3d_bwd_test<DevicePoolBwdInstance, DOutDataType, DInDataType, DOutLayout, DInLayout>(
true,
false,
N,
C,
Di,
Hi,
Wi,
window_lengths,
window_strides,
window_dilations,
dinput_left_pads,
dinput_right_pads);
}
@@ -125,6 +125,9 @@
// `s_nop`s to avoid hazard
#define CK_USE_AMD_V_DOT_INLINE_ASM 0

// inner product using V_DOT with DPP8 modifiers
#define CK_USE_AMD_V_DOT_DPP8_INLINE_ASM 1

// block synchronization only s_wait lgkmcnt(0), not vmcnt(0)
#define CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1
@@ -198,7 +201,7 @@
#define CK_WORKAROUND_SWDEV_388832 1

// workaround: Grouped Conv2d_bwd_data fails for already implemented instance
#define CK_WORKAROUND_GITHUB_ISSUE_824 1

// flag to enable (1) or disable (0) the debugging output in some kernels
#define DEBUG_LOG 0
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/amd_gemm_dpp.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_adaptor.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v4r1.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_contraction_dl_dpp8.hpp"
namespace ck {
/**
* DPP8 version of blockwise GEMM algorithm. It uses DPP8 instruction modifier to limit
* the data loaded from LDS to registers.
*
* The algorithm groups threads into groups of size `dpp8::lane_group_size` and splits the matrix C
* between them in such a way that threads from the same group need the same chunk of either
* matrix A (or B, respectively). Without the usage of DPP8, each thread would need to load the
* whole chunk from LDS to its own register space.
* Usage of DPP8 modifiers allows each thread to load less data, exactly `1 / dpp8::lane_group_size`
* of the chunk, and then share that data with other threads from the same lane group.
*
* Assumptions coming from the usage of DPP8:
* 1. `BM10BN10ThreadClusterBM10Xs[1] == dpp8::lane_group_size` or
*    `BM10BN10ThreadClusterBN10Xs[1] == dpp8::lane_group_size`:
*    - it makes consecutive `dpp8::lane_group_size` threads use the same chunk of either
*      matrix A or B;
*    - based on these values we determine which matrix to share.
* 2. `BM1PerThreadBM11 % dpp8::lane_group_size == 0` (if sharing A) or
*    `BN1PerThreadBN11 % dpp8::lane_group_size == 0` (if sharing B):
*    - we have to make sure that the data to split is divisible by the number of
*      threads in the group.
*
* General algorithm:
* C[BM0, BM1, BN0, BN1] += transpose(A[K, BM0, BM1]) * B[K, BN0, BN1]
* A and B are visible to the whole block, C is distributed among each thread
* Assume:
* 1. A:
* 1. ABlockDesc_BK0_BM_BK1 is known at compile-time
* 2. ABlockBuffer is DynamicBuffer
* 2. B:
* 1. BBlockDesc_BK0_BN_BK1 is known at compile-time
* 2. BBlockBuffer is DynamicBuffer
* 3. C:
* 1. CThreadDesc_BM0_BM11_BN0_BN11 is known at compile-time
* 2. CThreadBuffer is StaticBuffer
* 4. BM10BN10ThreadClusterBM10Xs::Size() = BM10BN10ThreadClusterBN10Xs::Size() == 2
*/
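// Concrete illustration (assuming dpp8::lane_group_size == 8): if matrix A is
// shared and BM1PerThreadBM11 == 8, each thread loads only
// BM1PerThread = 8 / 8 = 1 element of the per-thread A chunk from LDS; the
// remaining 7 elements are obtained from the other lanes of the lane group via
// DPP8 cross-lane operands instead of 8 separate LDS loads per thread.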
template <index_t BlockSize,
typename FloatA,
typename FloatB,
typename FloatC,
typename ABlockDesc_BK0_BM_BK1,
typename BBlockDesc_BK0_BN_BK1,
index_t BM1PerThreadBM11,
index_t BN1PerThreadBN11,
index_t BK0PerThread,
typename BM10BN10ThreadClusterBM10Xs, // Sequence<BM10BN10ThreadClusterBM100,
// BM10BN10ThreadClusterBM101, ...>
typename BM10BN10ThreadClusterBN10Xs, // Sequence<BM10BN10ThreadClusterBN100,
// BM10BN10ThreadClusterBN101, ...>
index_t AThreadCopyScalarPerVector_BM11,
index_t BThreadCopyScalarPerVector_BN11,
typename enable_if<ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&
BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(),
bool>::type = false>
struct BlockwiseGemmDlDpp8_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_loop_BM0_BN0
{
using AIndex = MultiIndex<4>;
using BIndex = MultiIndex<4>;
using CIndex = MultiIndex<4>;
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
static constexpr index_t BK0 = ABlockDesc_BK0_BM_BK1{}.GetLength(I0);
static constexpr index_t BK1 = ABlockDesc_BK0_BM_BK1{}.GetLength(I2);
static constexpr index_t BM = ABlockDesc_BK0_BM_BK1{}.GetLength(I1);
static constexpr index_t BN = BBlockDesc_BK0_BN_BK1{}.GetLength(I1);
static constexpr index_t BM100 = BM10BN10ThreadClusterBM10Xs{}[I0];
static constexpr index_t BN100 = BM10BN10ThreadClusterBN10Xs{}[I0];
static constexpr index_t BM101 = BM10BN10ThreadClusterBM10Xs{}[I1];
static constexpr index_t BN101 = BM10BN10ThreadClusterBN10Xs{}[I1];
static constexpr index_t BM11 = BM1PerThreadBM11;
static constexpr index_t BN11 = BN1PerThreadBN11;
static constexpr index_t BM1 = BM100 * BM101 * BM11;
static constexpr index_t BN1 = BN100 * BN101 * BN11;
static constexpr index_t BM0 = BM / BM1;
static constexpr index_t BN0 = BN / BN1;
// We assume that either `BM101` or `BN101` is equal to `dpp8::lane_group_size`. This makes all
// threads in a lane group need the same chunk of matrix B or A, so that chunk can be shared
// using DPP.
static_assert(BM101 == dpp8::lane_group_size || BN101 == dpp8::lane_group_size);
static constexpr bool ShareB = (BM101 == dpp8::lane_group_size);
static constexpr bool ShareA = !ShareB;
// If DPP shares A (or B, respectively), each lane group gets `BM1PerThreadBM11`
// (`BN1PerThreadBN11`) elements; we split them between the threads of the lane group so that
// each thread loads less data from LDS.
static constexpr index_t BM1PerThread =
ShareA ? BM1PerThreadBM11 / dpp8::lane_group_size : BM1PerThreadBM11;
static constexpr index_t BN1PerThread =
ShareB ? BN1PerThreadBN11 / dpp8::lane_group_size : BN1PerThreadBN11;
__host__ __device__ static constexpr auto
MakeABlockDescriptor_BK0_BM0_BM1_BK1(const ABlockDesc_BK0_BM_BK1& a_block_desc_bk0_bm_bk1)
{
const auto a_block_bk0_bm0_bm1_bk1 = transform_tensor_descriptor(
a_block_desc_bk0_bm_bk1,
make_tuple(make_pass_through_transform(Number<BK0>{}),
make_unmerge_transform(make_tuple(Number<BM0>{}, Number<BM1>{})),
make_pass_through_transform(Number<BK1>{})),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));
return a_block_bk0_bm0_bm1_bk1;
}
__host__ __device__ static constexpr auto
MakeBBlockDescriptor_BK0_BN0_BN1_BK1(const BBlockDesc_BK0_BN_BK1& b_block_desc_bk0_bn_bk1)
{
const auto b_block_desc_bk0_bn0_bn1_bk1 = transform_tensor_descriptor(
b_block_desc_bk0_bn_bk1,
make_tuple(make_pass_through_transform(Number<BK0>{}),
make_unmerge_transform(make_tuple(Number<BN0>{}, Number<BN1>{})),
make_pass_through_transform(Number<BK1>{})),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));
return b_block_desc_bk0_bn0_bn1_bk1;
}
__host__ __device__ static constexpr auto
MakeCBlockAdaptor_BM0_BM100_BM101_BM11_BN0_BN100_BN101_BN11_To_BM_BN()
{
// upper: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11]
// lower: [BM, BN]
constexpr auto c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m_n =
make_single_stage_tensor_adaptor(
make_tuple(make_unmerge_transform(make_tuple(
Number<BM0>{}, Number<BM100>{}, Number<BM101>{}, Number<BM11>{})),
make_unmerge_transform(make_tuple(
Number<BN0>{}, Number<BN100>{}, Number<BN101>{}, Number<BN11>{}))),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 1, 2, 3>{}, Sequence<4, 5, 6, 7>{}));
return c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m_n;
}
__host__ __device__ static constexpr auto
MakeCBlockAdaptor_BM0_BM100_BM101_BM11_BN0_BN100_BN101_BN11_To_BM0_BM1_BN0_BN1()
{
// upper: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11]
// lower: [BM0, BM1, BN0, BN1]
constexpr auto c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m0_m1_n0_n1 =
make_single_stage_tensor_adaptor(
make_tuple(make_pass_through_transform(Number<BM0>{}),
make_unmerge_transform(
make_tuple(Number<BM100>{}, Number<BM101>{}, Number<BM11>{})),
make_pass_through_transform(Number<BN0>{}),
make_unmerge_transform(
make_tuple(Number<BN100>{}, Number<BN101>{}, Number<BN11>{}))),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{}, Sequence<5, 6, 7>{}));
return c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m0_m1_n0_n1;
}
__host__ __device__ static constexpr auto GetCThreadTensorLengths_BM0_BM1_BN0_BN1()
{
return Sequence<BM0, BM11, BN0, BN11>{};
}
static constexpr auto a_block_desc_bk0_bm0_bm1_bk1_ =
MakeABlockDescriptor_BK0_BM0_BM1_BK1(ABlockDesc_BK0_BM_BK1{});
static constexpr auto b_block_desc_bk0_bn0_bn1_bk1_ =
MakeBBlockDescriptor_BK0_BN0_BN1_BK1(BBlockDesc_BK0_BN_BK1{});
public:
__device__ BlockwiseGemmDlDpp8_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_loop_BM0_BN0()
: c_thread_origin_data_idx_{CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1(
get_thread_local_1d_id())},
a_thread_copy_{CalculateAThreadOriginOnBlock_BK0_BM0_BM1_BK1()},
b_thread_copy_{CalculateBThreadOriginOnBlock_BK0_BN0_BN1_BK1()}
{
static_assert(ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&
BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(),
"wrong! Desc should be known at compile-time");
static_assert(BM % BM1 == 0 && BN % BN1 == 0, "wrong!");
static_assert(ABlockDesc_BK0_BM_BK1{}.GetLength(I0) ==
BBlockDesc_BK0_BN_BK1{}.GetLength(I0),
"wrong! K dimension not consistent");
static_assert(BM10BN10ThreadClusterBM10Xs::Size() == 2 &&
BM10BN10ThreadClusterBN10Xs::Size() == 2,
"wrong!");
}
__device__ static CIndex CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1(index_t thread_id)
{
// lower: [BM0, BM1, BN0, BN1]
// upper: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11]
constexpr auto adaptor0 =
MakeCBlockAdaptor_BM0_BM100_BM101_BM11_BN0_BN100_BN101_BN11_To_BM0_BM1_BN0_BN1();
// lower: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11]
// upper: [Tid, BM0, BM11, BN0, BN11]
constexpr auto adaptor1 = make_single_stage_tensor_adaptor(
make_tuple(make_merge_transform(make_tuple(BM100, BN100, BM101, BN101)),
make_pass_through_transform(BM0),
make_pass_through_transform(BM11),
make_pass_through_transform(BN0),
make_pass_through_transform(BN11)),
make_tuple(
Sequence<1, 5, 2, 6>{}, Sequence<0>{}, Sequence<3>{}, Sequence<4>{}, Sequence<7>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
constexpr auto adaptor = chain_tensor_adaptors(adaptor0, adaptor1);
return adaptor.CalculateBottomIndex(make_multi_index(thread_id, 0, 0, 0, 0));
}
__device__ AIndex CalculateAThreadOriginOnBlock_BK0_BM0_BM1_BK1()
{
const auto offsetBM0 = c_thread_origin_data_idx_[I0];
// If sharing matrix A, we need a separate BM1 offset for each thread in lane group.
const auto offsetBM1 = ShareA ? c_thread_origin_data_idx_[I1] +
dpp8::get_thread_idx_in_lane_group() * BM1PerThread
: c_thread_origin_data_idx_[I1];
return make_tuple(0, offsetBM0, offsetBM1, 0);
}
__device__ BIndex CalculateBThreadOriginOnBlock_BK0_BN0_BN1_BK1()
{
const auto offsetBN0 = c_thread_origin_data_idx_[I2];
// If sharing matrix B, we need a separate BN1 offset for each thread in lane group.
const auto offsetBN1 = ShareB ? c_thread_origin_data_idx_[I3] +
dpp8::get_thread_idx_in_lane_group() * BN1PerThread
: c_thread_origin_data_idx_[I3];
return make_tuple(0, offsetBN0, offsetBN1, 0);
}
template <typename CThreadDesc_BM0_BM11_BN0_BN11,
typename ABlockBuffer,
typename BBlockBuffer,
typename CThreadBuffer>
__device__ void Run(const CThreadDesc_BM0_BM11_BN0_BN11&,
const ABlockBuffer& a_block_buf,
const BBlockBuffer& b_block_buf,
CThreadBuffer& c_thread_buf) const
{
static_assert(CThreadDesc_BM0_BM11_BN0_BN11::IsKnownAtCompileTime(),
"wrong! Desc should be known at compile-time");
auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatA>(
a_thread_desc_bk0_bm0_bm1_bk1_.GetElementSpaceSize());
auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatB>(
b_thread_desc_bk0_bn0_bn1_bk1_.GetElementSpaceSize());
constexpr auto threadwise_contraction =
ThreadwiseContractionDlDpp8_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1<
FloatA,
FloatB,
FloatC,
decltype(a_thread_desc_bk0_bm0_bm1_bk1_),
decltype(b_thread_desc_bk0_bn0_bn1_bk1_),
CThreadDesc_BM0_BM11_BN0_BN11,
Sequence<BK0PerThread, BK1>,
Sequence<1, BM1PerThreadBM11>,
Sequence<1, BN1PerThreadBN11>,
ShareA>{};
static_for<0, BN0, 1>{}([&](auto bn0) {
static_for<0, BM0, 1>{}([&](auto bm0) {
a_thread_copy_.Run(a_block_desc_bk0_bm0_bm1_bk1_,
make_tuple(I0, bm0, I0, I0),
a_block_buf,
a_thread_desc_bk0_bm0_bm1_bk1_,
make_tuple(I0, I0, I0, I0),
a_thread_buf);
b_thread_copy_.Run(b_block_desc_bk0_bn0_bn1_bk1_,
make_tuple(I0, bn0, I0, I0),
b_block_buf,
b_thread_desc_bk0_bn0_bn1_bk1_,
make_tuple(I0, I0, I0, I0),
b_thread_buf);
threadwise_contraction.Run(a_thread_buf,
make_tuple(I0, I0, I0, I0),
b_thread_buf,
make_tuple(I0, I0, I0, I0),
c_thread_buf,
make_tuple(bm0, I0, bn0, I0));
static_for<BK0PerThread, BK0, BK0PerThread>{}([&](auto bk0) {
a_thread_copy_.Run(a_block_desc_bk0_bm0_bm1_bk1_,
make_tuple(bk0, bm0, I0, I0),
a_block_buf,
a_thread_desc_bk0_bm0_bm1_bk1_,
make_tuple(I0, I0, I0, I0),
a_thread_buf);
b_thread_copy_.Run(b_block_desc_bk0_bn0_bn1_bk1_,
make_tuple(bk0, bn0, I0, I0),
b_block_buf,
b_thread_desc_bk0_bn0_bn1_bk1_,
make_tuple(I0, I0, I0, I0),
b_thread_buf);
threadwise_contraction.Run(a_thread_buf,
make_tuple(I0, I0, I0, I0),
b_thread_buf,
make_tuple(I0, I0, I0, I0),
c_thread_buf,
make_tuple(bm0, I0, bn0, I0));
});
});
});
}
private:
// A[BK0, BM0, BM1, BK1]
static constexpr auto a_thread_desc_bk0_bm0_bm1_bk1_ = make_naive_tensor_descriptor_packed(
make_tuple(Number<BK0PerThread>{}, Number<BM0>{}, Number<BM1PerThread>{}, Number<BK1>{}));
// B[BK0, BN0, BN1, BK1]
static constexpr auto b_thread_desc_bk0_bn0_bn1_bk1_ = make_naive_tensor_descriptor_packed(
make_tuple(Number<BK0PerThread>{}, Number<BN0>{}, Number<BN1PerThread>{}, Number<BK1>{}));
using AThreadCopy = ThreadwiseTensorSliceTransfer_v4r1<
FloatA,
FloatA,
decltype(a_block_desc_bk0_bm0_bm1_bk1_),
decltype(a_thread_desc_bk0_bm0_bm1_bk1_),
Sequence<BK0PerThread, 1, BM1PerThread, BK1>, // SliceLengths
Sequence<0, 1, 2, 3>, // DimAccessOrder
Sequence<1, 1, BM1PerThread, BK1>, // SrcVectorTensorLengths
Sequence<0, 1, 2, 3>>; // SrcVectorTensorContiguousDimOrder
using BThreadCopy = ThreadwiseTensorSliceTransfer_v4r1<
FloatB,
FloatB,
decltype(b_block_desc_bk0_bn0_bn1_bk1_),
decltype(b_thread_desc_bk0_bn0_bn1_bk1_),
Sequence<BK0PerThread, 1, BN1PerThread, BK1>, // SliceLengths
Sequence<0, 1, 2, 3>, // DimAccessOrder
Sequence<1, 1, BN1PerThread, BK1>, // SrcVectorTensorLengths
Sequence<0, 1, 2, 3>>; // SrcVectorTensorContiguousDimOrder
CIndex c_thread_origin_data_idx_;
AThreadCopy a_thread_copy_;
BThreadCopy b_thread_copy_;
};
} // namespace ck
@@ -11,7 +11,7 @@
namespace ck {

// C[BM0, BM1, BN0, BN1] += transpose(A[K, BM0, BM1]) * B[K, BN0, BN1]
// A and B are visible to the whole block, C is distributed among each thread
// Assume:
// 1. A:
// 1. ABlockDesc_BK0_BM_BK1 is known at compile-time
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include "ck/tensor_operation/gpu/device/device_base.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <index_t NDimSpatial,
typename DOutDataType,
typename DInDataType,
typename DOutLayout,
typename DInLayout>
struct DeviceAvgPoolBwd : public BaseOperator
{
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_dout,
void* p_din,
std::vector<ck::index_t> dout_n_k_wos_lengths,
std::vector<ck::index_t> dout_n_k_wos_strides,
std::vector<ck::index_t> din_n_k_wos_lengths,
std::vector<ck::index_t> din_n_k_wos_strides,
std::vector<ck::index_t> window_k_c_xs_lengths,
std::vector<ck::index_t> window_strides,
std::vector<ck::index_t> window_dilations,
std::vector<ck::index_t> input_left_pads,
std::vector<ck::index_t> input_right_pads) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
@@ -27,15 +27,12 @@ struct DeviceGroupedConvBwdWeight : public BaseOperator
MakeArgumentPointer(const void* p_in,
void* p_wei,
const void* p_out,
const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths, // input
const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths, // weight
const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,
const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_lengths, // output
const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_strides,
const std::array<ck::index_t, NDimSpatial>& conv_filter_strides,
const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations,
const std::array<ck::index_t, NDimSpatial>& input_left_pads,
...
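A minimal sketch of how a caller might populate the new array-style arguments
for a 2D grouped convolution (NDimSpatial = 2; all sizes here are hypothetical,
chosen only for illustration):

// G = 1, N = 2, C = 8, K = 16, Hi = Wi = 28, Y = X = 3, stride 1, no padding
std::array<ck::index_t, 5> a_g_n_c_wis_lengths{1, 2, 8, 28, 28};  // input  [G, N, C, Hi, Wi]
std::array<ck::index_t, 5> b_g_k_c_xs_lengths{1, 16, 8, 3, 3};    // weight [G, K, C, Y, X]
std::array<ck::index_t, 5> e_g_n_k_wos_lengths{1, 2, 16, 26, 26}; // output [G, N, K, Ho, Wo], Ho = Wo = 28 - 3 + 1 = 26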