Merge branch 'amd-develop' into amd-master

48fe8532 · Jun Liu · d88411d1 · 39002e9e · 48fe8532 · 48fe8532
Commit 48fe8532 authored Aug 18, 2023 by Jun Liu
20 changed files
--- a/example/49_maxpool2d_bwd/maxpool2d_bwd_fp32.cpp
+++ b/example/49_maxpool2d_bwd/maxpool2d_bwd_fp32.cpp
@@ -24,18 +24,20 @@ int main()
    bool time_kernel     = false;
    // Pool shape
-    ck::index_t N               = 1;
+    ck::index_t N                 = 1;
-    ck::index_t C               = 1;
+    ck::index_t C                 = 1;
-    ck::index_t Y               = 2;
+    ck::index_t Y                 = 2;
-    ck::index_t X               = 2;
+    ck::index_t X                 = 2;
-    ck::index_t Hi              = 32;
+    ck::index_t Hi                = 32;
-    ck::index_t Wi              = 32;
+    ck::index_t Wi                = 32;
-    ck::index_t window_stride_h = 2;
+    ck::index_t window_stride_h   = 2;
-    ck::index_t window_stride_w = 2;
+    ck::index_t window_stride_w   = 2;
-    ck::index_t in_left_pad_h   = 0;
+    ck::index_t window_dilation_h = 1;
-    ck::index_t in_left_pad_w   = 0;
+    ck::index_t window_dilation_w = 1;
-    ck::index_t in_right_pad_h  = 0;
+    ck::index_t in_left_pad_h     = 0;
-    ck::index_t in_right_pad_w  = 0;
+    ck::index_t in_left_pad_w     = 0;
+    ck::index_t in_right_pad_h    = 0;
+    ck::index_t in_right_pad_w    = 0;
    bool pass = maxpool_bwd_test<InDataType,
                                 OutDataType,
@@ -53,6 +55,8 @@ int main()
                                               Wi,
                                               window_stride_h,
                                               window_stride_w,
+                                               window_dilation_h,
+                                               window_dilation_w,
                                               in_left_pad_h,
                                               in_left_pad_w,
                                               in_right_pad_h,

--- a/example/51_avgpool3d_bwd/CMakeLists.txt
+++ b/example/51_avgpool3d_bwd/CMakeLists.txt
+add_example_executable(example_avgpool3d_bwd_bf16 avgpool3d_bwd_bf16.cpp)
+add_example_executable(example_avgpool3d_bwd_fp16 avgpool3d_bwd_fp16.cpp)
+add_example_executable(example_avgpool3d_bwd_fp32 avgpool3d_bwd_fp32.cpp)
--- a/example/51_avgpool3d_bwd/avgpool3d_bwd_bf16.cpp
+++ b/example/51_avgpool3d_bwd/avgpool3d_bwd_bf16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#include <iostream>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_avgpool3d_bwd_ndhwc_ndhwc.hpp"
+#include "avgpool3d_bwd_common.hpp"
+// Element types of the two gradient tensors handed to pool3d_bwd_test (bf16 in this example).
+// ComputeDataType is forwarded to the device op as its third template argument
+// (presumably the accumulation type - confirm against the device op).
+using DOutDataType    = ck::bhalf_t;
+using DInDataType     = ck::bhalf_t;
+using ComputeDataType = float;
+// Layout selection: the NDHWC aliases are the ones compiled; the NCDHW aliases are kept as a
+// disabled alternative.
+#if 1
+using DOutLayout = ck::tensor_layout::convolution::NDHWC;
+using DInLayout  = ck::tensor_layout::convolution::NDHWC;
+#else
+using DOutLayout = ck::tensor_layout::convolution::NCDHW;
+using DInLayout  = ck::tensor_layout::convolution::NCDHW;
+#endif
+using DevicePoolBwdInstance =
+    ck::tensor_operation::device::DeviceAvgPool3dBwd_NDHWC_NDHWC<DOutDataType,
+                                                                 DInDataType,
+                                                                 ComputeDataType,
+                                                                 64, // BlockSize
+                                                                 64, // ReduceMThreadClusterSize
+                                                                 1,  // ReduceKThreadClusterSize
+                                                                 1,  // ReduceMThreadSliceSize
+                                                                 1,  // ReduceKThreadSliceSize
+                                                                 1>; // InSrcOutDstVectorSize
+// Runs the bf16 average-pool-3D backward example on a fixed problem size.
+// Returns 0 when pool3d_bwd_test reports success and 1 otherwise.
+int main()
+{
+    // Pooling window description, one entry per spatial dimension.
+    std::vector<ck::index_t> window_lengths    = {5, 5, 5};
+    std::vector<ck::index_t> window_strides    = {2, 2, 2};
+    std::vector<ck::index_t> window_dilations  = {2, 2, 2};
+    std::vector<ck::index_t> dinput_left_pads  = {0, 0, 0};
+    std::vector<ck::index_t> dinput_right_pads = {0, 0, 0};
+    // Input-gradient tensor shape.
+    ck::index_t N  = 1;
+    ck::index_t C  = 16;
+    ck::index_t Di = 40;
+    ck::index_t Hi = 40;
+    ck::index_t Wi = 40;
+    const bool pass =
+        pool3d_bwd_test<DevicePoolBwdInstance, DOutDataType, DInDataType, DOutLayout, DInLayout>(
+            true,  // do_verification
+            false, // time_kernel
+            N,
+            C,
+            Di,
+            Hi,
+            Wi,
+            window_lengths,
+            window_strides,
+            window_dilations,
+            dinput_left_pads,
+            dinput_right_pads);
+    // Propagate the verification result through the exit code; previously it was discarded and
+    // the example always exited with 0.
+    return pass ? 0 : 1;
+}
--- a/example/51_avgpool3d_bwd/avgpool3d_bwd_common.hpp
+++ b/example/51_avgpool3d_bwd/avgpool3d_bwd_common.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#include <iostream>
+#include "ck/ck.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_avgpool_bwd.hpp"
+// Returns the strides of a 5-D tensor, listed in logical (N, C, D, H, W) order, for a tensor that
+// is stored packed in the memory layout selected by `layout`.
+//   N_     : batch size; unused, because no stride depends on it.
+//   C_     : number of channels.
+//   D, H, W: spatial lengths.
+//   layout : tag object; only its type is inspected (NCDHW or NDHWC).
+// Any other layout type is rejected at compile time instead of falling off the end of the
+// function without a return value.
+template <typename TensorLayout>
+std::vector<ck::index_t> f_tensor_strides_ncdhw(ck::index_t N_,
+                                                ck::index_t C_,
+                                                ck::index_t D,
+                                                ck::index_t H,
+                                                ck::index_t W,
+                                                TensorLayout layout)
+{
+    using namespace ck::literals;
+    (void)N_;
+    constexpr bool is_ncdhw =
+        ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCDHW>::value;
+    constexpr bool is_ndhwc =
+        ck::is_same<decltype(layout), ck::tensor_layout::convolution::NDHWC>::value;
+    static_assert(is_ncdhw || is_ndhwc, "wrong! only NCDHW and NDHWC layouts are supported");
+    if constexpr(is_ncdhw)
+        return {C_ * D * H * W, D * H * W, H * W, W, 1_uz};
+    else
+        // NDHWC: C is the fastest-varying dimension, then W, H, D, N.
+        return {D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_};
+}
+// Builds a HostTensorDescriptor with logical lengths (N, C, D, H, W) and the strides of a tensor
+// stored packed in the memory layout selected by `layout`.
+//   N_, C_ : batch size and number of channels.
+//   D, H, W: spatial lengths.
+//   layout : tag object; only its type is inspected (NCDHW or NDHWC).
+// Any other layout type is rejected at compile time instead of falling off the end of the
+// function without a return value.
+template <typename TensorLayout>
+HostTensorDescriptor f_host_tensor_descriptor(std::size_t N_,
+                                              std::size_t C_,
+                                              std::size_t D,
+                                              std::size_t H,
+                                              std::size_t W,
+                                              TensorLayout layout)
+{
+    using namespace ck::literals;
+    constexpr bool is_ncdhw =
+        ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCDHW>::value;
+    constexpr bool is_ndhwc =
+        ck::is_same<decltype(layout), ck::tensor_layout::convolution::NDHWC>::value;
+    static_assert(is_ncdhw || is_ndhwc, "wrong! only NCDHW and NDHWC layouts are supported");
+    if constexpr(is_ncdhw)
+    {
+        return HostTensorDescriptor({N_, C_, D, H, W}, {C_ * D * H * W, D * H * W, H * W, W, 1_uz});
+    }
+    else
+    {
+        // NDHWC: C is the fastest-varying dimension, then W, H, D, N.
+        return HostTensorDescriptor({N_, C_, D, H, W},
+                                    {D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_});
+    }
+}
+// Runs DevicePoolBwdInstance on a generated `dout` tensor and optionally compares the resulting
+// `din` against the host reference ReferenceAvgPoolBwd.
+//   do_verification  : when true, run the host reference and compare with the device result.
+//   time_kernel      : forwarded to StreamConfig for the device run.
+//   N, C, Di, Hi, Wi : shape of the input-gradient tensor `din`.
+//   window_lengths, window_strides, window_dilations,
+//   dinput_left_pads, dinput_right_pads : one entry per spatial dimension (D, H, W); each must
+//                                         contain exactly 3 elements.
+// Returns true when verification is skipped or check_err reports a match.
+// Throws std::runtime_error when a window vector has the wrong size or the device op rejects the
+// argument.
+template <typename DevicePoolBwdInstance,
+          typename DOutDataType,
+          typename DInDataType,
+          typename DOutLayout,
+          typename DInLayout>
+bool pool3d_bwd_test(bool do_verification,
+                     bool time_kernel,
+                     ck::index_t N,
+                     ck::index_t C,
+                     ck::index_t Di,
+                     ck::index_t Hi,
+                     ck::index_t Wi,
+                     const std::vector<ck::index_t>& window_lengths,
+                     const std::vector<ck::index_t>& window_strides,
+                     const std::vector<ck::index_t>& window_dilations,
+                     const std::vector<ck::index_t>& dinput_left_pads,
+                     const std::vector<ck::index_t>& dinput_right_pads)
+{
+    // The vectors are indexed with 0..2 below; reject anything else up front instead of reading
+    // out of bounds.
+    constexpr std::size_t NDimSpatial = 3;
+    if(window_lengths.size() != NDimSpatial || window_strides.size() != NDimSpatial ||
+       window_dilations.size() != NDimSpatial || dinput_left_pads.size() != NDimSpatial ||
+       dinput_right_pads.size() != NDimSpatial)
+    {
+        throw std::runtime_error("wrong! window/pad vectors must have exactly 3 elements");
+    }
+    // Output length of one spatial dimension for a dilated, strided, padded window.
+    auto OutSpatialLength = [&](auto InSpatialLength, int index) {
+        ck::index_t left_pad   = dinput_left_pads[index];
+        ck::index_t right_pad  = dinput_right_pads[index];
+        ck::index_t window_len = window_lengths[index];
+        ck::index_t stride     = window_strides[index];
+        ck::index_t dilation   = window_dilations[index];
+        // Extent covered by the dilated window.
+        ck::index_t eff        = (window_len - 1) * dilation + 1;
+        return (InSpatialLength + left_pad + right_pad - eff) / stride + 1;
+    };
+    ck::index_t Do = OutSpatialLength(Di, 0);
+    ck::index_t Ho = OutSpatialLength(Hi, 1);
+    ck::index_t Wo = OutSpatialLength(Wi, 2);
+    // dout is the op's input; din_dev receives the device result, din_host the reference result.
+    Tensor<DOutDataType> dout(f_host_tensor_descriptor(N, C, Do, Ho, Wo, DOutLayout{}));
+    Tensor<DInDataType> din_dev(f_host_tensor_descriptor(N, C, Di, Hi, Wi, DInLayout{}));
+    Tensor<DInDataType> din_host(f_host_tensor_descriptor(N, C, Di, Hi, Wi, DInLayout{}));
+    std::cout << "dout: " << dout.mDesc << std::endl;
+    std::cout << "din_host: " << din_host.mDesc << std::endl;
+    dout.GenerateTensorValue(GeneratorTensor_3<DOutDataType>{0.0, 1.0});
+    DeviceMem dout_device_buf(sizeof(DOutDataType) * dout.mDesc.GetElementSpaceSize());
+    DeviceMem din_device_buf(sizeof(DInDataType) * din_dev.mDesc.GetElementSpaceSize());
+    dout_device_buf.ToDevice(dout.mData.data());
+    // The device din buffer is zeroed before the kernel runs.
+    din_device_buf.SetZero();
+    auto pool        = DevicePoolBwdInstance{};
+    auto invoker_ptr = pool.MakeInvokerPointer();
+    // Argument order used here: dout lengths, din lengths, dout strides, din strides.
+    auto argument_ptr =
+        pool.MakeArgumentPointer(static_cast<DOutDataType*>(dout_device_buf.GetDeviceBuffer()),
+                                 static_cast<DInDataType*>(din_device_buf.GetDeviceBuffer()),
+                                 {N, C, Do, Ho, Wo},
+                                 {N, C, Di, Hi, Wi},
+                                 f_tensor_strides_ncdhw(N, C, Do, Ho, Wo, DOutLayout{}),
+                                 f_tensor_strides_ncdhw(N, C, Di, Hi, Wi, DInLayout{}),
+                                 window_lengths,
+                                 window_strides,
+                                 window_dilations,
+                                 dinput_left_pads,
+                                 dinput_right_pads);
+    if(!pool.IsSupportedArgument(argument_ptr.get()))
+    {
+        throw std::runtime_error("wrong! device_op with the specified compilation parameters does "
+                                 "not support this problem");
+    }
+    float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+    std::cout << "Perf: " << ave_time << std::endl;
+    bool pass = true;
+    if(do_verification)
+    {
+        auto ref_pool =
+            ck::tensor_operation::host::ReferenceAvgPoolBwd<3, DInDataType, DOutDataType>();
+        auto ref_invoker = ref_pool.MakeInvoker();
+        auto ref_argument = ref_pool.MakeArgument(din_host,
+                                                  dout,
+                                                  window_lengths,
+                                                  window_strides,
+                                                  window_dilations,
+                                                  dinput_left_pads,
+                                                  dinput_right_pads);
+        ref_invoker.Run(ref_argument);
+        // Fetch the device result and compare it with the host reference.
+        din_device_buf.FromDevice(din_dev.mData.data());
+        pass = ck::utils::check_err(din_dev, din_host);
+    }
+    return pass;
+}
--- a/example/51_avgpool3d_bwd/avgpool3d_bwd_fp16.cpp
+++ b/example/51_avgpool3d_bwd/avgpool3d_bwd_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#include <iostream>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_avgpool3d_bwd_ndhwc_ndhwc.hpp"
+#include "avgpool3d_bwd_common.hpp"
+// Element types of the two gradient tensors handed to pool3d_bwd_test (fp16 in this example).
+// ComputeDataType is forwarded to the device op as its third template argument
+// (presumably the accumulation type - confirm against the device op).
+using DOutDataType    = ck::half_t;
+using DInDataType     = ck::half_t;
+using ComputeDataType = float;
+// Layout selection: the NDHWC aliases are the ones compiled; the NCDHW aliases are kept as a
+// disabled alternative.
+#if 1
+using DOutLayout = ck::tensor_layout::convolution::NDHWC;
+using DInLayout  = ck::tensor_layout::convolution::NDHWC;
+#else
+using DOutLayout = ck::tensor_layout::convolution::NCDHW;
+using DInLayout  = ck::tensor_layout::convolution::NCDHW;
+#endif
+using DevicePoolBwdInstance =
+    ck::tensor_operation::device::DeviceAvgPool3dBwd_NDHWC_NDHWC<DOutDataType,
+                                                                 DInDataType,
+                                                                 ComputeDataType,
+                                                                 64, // BlockSize
+                                                                 64, // ReduceMThreadClusterSize
+                                                                 1,  // ReduceKThreadClusterSize
+                                                                 1,  // ReduceMThreadSliceSize
+                                                                 1,  // ReduceKThreadSliceSize
+                                                                 1>; // InSrcOutDstVectorSize
+// Runs the fp16 average-pool-3D backward example on a fixed problem size.
+// Returns 0 when pool3d_bwd_test reports success and 1 otherwise.
+int main()
+{
+    // Pooling window description, one entry per spatial dimension.
+    std::vector<ck::index_t> window_lengths    = {5, 5, 5};
+    std::vector<ck::index_t> window_strides    = {2, 2, 2};
+    std::vector<ck::index_t> window_dilations  = {2, 2, 2};
+    std::vector<ck::index_t> dinput_left_pads  = {0, 0, 0};
+    std::vector<ck::index_t> dinput_right_pads = {0, 0, 0};
+    // Input-gradient tensor shape.
+    ck::index_t N  = 1;
+    ck::index_t C  = 16;
+    ck::index_t Di = 40;
+    ck::index_t Hi = 40;
+    ck::index_t Wi = 40;
+    const bool pass =
+        pool3d_bwd_test<DevicePoolBwdInstance, DOutDataType, DInDataType, DOutLayout, DInLayout>(
+            true,  // do_verification
+            false, // time_kernel
+            N,
+            C,
+            Di,
+            Hi,
+            Wi,
+            window_lengths,
+            window_strides,
+            window_dilations,
+            dinput_left_pads,
+            dinput_right_pads);
+    // Propagate the verification result through the exit code; previously it was discarded and
+    // the example always exited with 0.
+    return pass ? 0 : 1;
+}
--- a/example/51_avgpool3d_bwd/avgpool3d_bwd_fp32.cpp
+++ b/example/51_avgpool3d_bwd/avgpool3d_bwd_fp32.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#include <iostream>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_avgpool3d_bwd_ndhwc_ndhwc.hpp"
+#include "avgpool3d_bwd_common.hpp"
+// Element types of the two gradient tensors handed to pool3d_bwd_test (fp32 in this example).
+// ComputeDataType is forwarded to the device op as its third template argument
+// (presumably the accumulation type - confirm against the device op).
+using DOutDataType    = float;
+using DInDataType     = float;
+using ComputeDataType = float;
+// Layout selection: the NDHWC aliases are the ones compiled; the NCDHW aliases are kept as a
+// disabled alternative.
+#if 1
+using DOutLayout = ck::tensor_layout::convolution::NDHWC;
+using DInLayout  = ck::tensor_layout::convolution::NDHWC;
+#else
+using DOutLayout = ck::tensor_layout::convolution::NCDHW;
+using DInLayout  = ck::tensor_layout::convolution::NCDHW;
+#endif
+using DevicePoolBwdInstance =
+    ck::tensor_operation::device::DeviceAvgPool3dBwd_NDHWC_NDHWC<DOutDataType,
+                                                                 DInDataType,
+                                                                 ComputeDataType,
+                                                                 64, // BlockSize
+                                                                 64, // ReduceMThreadClusterSize
+                                                                 1,  // ReduceKThreadClusterSize
+                                                                 1,  // ReduceMThreadSliceSize
+                                                                 1,  // ReduceKThreadSliceSize
+                                                                 1>; // InSrcOutDstVectorSize
+// Runs the fp32 average-pool-3D backward example on a fixed problem size.
+// Returns 0 when pool3d_bwd_test reports success and 1 otherwise.
+int main()
+{
+    // Pooling window description, one entry per spatial dimension.
+    std::vector<ck::index_t> window_lengths    = {5, 5, 5};
+    std::vector<ck::index_t> window_strides    = {2, 2, 2};
+    std::vector<ck::index_t> window_dilations  = {2, 2, 2};
+    std::vector<ck::index_t> dinput_left_pads  = {0, 0, 0};
+    std::vector<ck::index_t> dinput_right_pads = {0, 0, 0};
+    // Input-gradient tensor shape.
+    ck::index_t N  = 1;
+    ck::index_t C  = 16;
+    ck::index_t Di = 40;
+    ck::index_t Hi = 40;
+    ck::index_t Wi = 40;
+    const bool pass =
+        pool3d_bwd_test<DevicePoolBwdInstance, DOutDataType, DInDataType, DOutLayout, DInLayout>(
+            true,  // do_verification
+            false, // time_kernel
+            N,
+            C,
+            Di,
+            Hi,
+            Wi,
+            window_lengths,
+            window_strides,
+            window_dilations,
+            dinput_left_pads,
+            dinput_right_pads);
+    // Propagate the verification result through the exit code; previously it was discarded and
+    // the example always exited with 0.
+    return pass ? 0 : 1;
+}
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -125,6 +125,9 @@
 // `s_nop`s to avoid hazard
 #define CK_USE_AMD_V_DOT_INLINE_ASM 0
+// inner product using V_DOT with DPP8 modifiers
+#define CK_USE_AMD_V_DOT_DPP8_INLINE_ASM 1
 // block synchronization only s_wait lgkmcnt(0), not vmcnt(0)
 #define CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1

--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_dpp8.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_dpp8.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck/utility/amd_gemm_dpp.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_adaptor.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v4r1.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_contraction_dl_dpp8.hpp"
+namespace ck {
+/**
+ * DPP8 version of blockwise GEMM algorithm. It uses DPP8 instruction modifier to limit
+ * the data loaded from LDS to registers.
+ *
+ * The algorithm groups threads into groups of size `dpp8::lane_group_size` and splits the matrix C
+ * between them in such a way that threads from the same group need the same chunk of either
+ * matrix A (or B, respectively). Without the usage of DPP8, each thread would need to load the
+ * whole chunk from LDS to its own register space.
+ * Usage of DPP8 modifiers allows each thread to load less data, exactly `1 / dpp8::lane_group_size`
+ * of the chunk, and then share that data with other threads from the same lane group.
+ *
+ * Assumptions coming from the usage of DPP8:
+ *   1. `BM10BN10ThreadClusterBM10Xs[1] == dpp8::lane_group_size` or
+ *      `BM10BN10ThreadClusterBN10Xs[1] == dpp8::lane_group_size` -
+ *        - it makes consecutive `dpp8::lane_group_size` threads use the same chunk of either
+ *          matrix A or B;
+ *        - based on these values we determine which matrix to share.
+ *   2. `BM1PerThreadBM11 % dpp8::lane_group_size == 0` (if sharing A) or
+ *      `BN1PerThreadBN11 % dpp8::lane_group_size == 0` (if sharing B) -
+ *        - we have to make sure that the data to split is divisible by the number of
+ *          threads in the group.
+ *
+ * General algorithm:
+ * C[BM0, BM1, BN0, BN1] += transpose(A[K, BM0, BM1]) * B[K, BN0, BN1]
+ * A and B are visible to the whole block, C is distributed among each thread
+ * Assume:
+ *   1. A:
+ *     1. ABlockDesc_BK0_BM_BK1 is known at compile-time
+ *     2. ABlockBuffer is DynamicBuffer
+ *   2. B:
+ *     1. BBlockDesc_BK0_BN_BK1 is known at compile-time
+ *     2. BBlockBuffer is DynamicBuffer
+ *   3. C:
+ *     1. CThreadDesc_BM0_BM11_BN0_BN11 is known at compile-time
+ *     2. CThreadBuffer is StaticBuffer
+ *   4. BM10BN10ThreadClusterBM10Xs::Size() == 2 and BM10BN10ThreadClusterBN10Xs::Size() == 2
+ */
+template <index_t BlockSize,
+          typename FloatA,
+          typename FloatB,
+          typename FloatC,
+          typename ABlockDesc_BK0_BM_BK1,
+          typename BBlockDesc_BK0_BN_BK1,
+          index_t BM1PerThreadBM11,
+          index_t BN1PerThreadBN11,
+          index_t BK0PerThread,
+          typename BM10BN10ThreadClusterBM10Xs, // Sequence<BM10BN10ThreadClusterBM100,
+                                                //          BM10BN10ThreadClusterBM101, ...>
+          typename BM10BN10ThreadClusterBN10Xs, // Sequence<BM10BN10ThreadClusterBN100,
+                                                //          BM10BN10ThreadClusterBN101, ...>
+          index_t AThreadCopyScalarPerVector_BM11,
+          index_t BThreadCopyScalarPerVector_BN11,
+          typename enable_if<ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&
+                                 BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(),
+                             bool>::type = false>
+struct BlockwiseGemmDlDpp8_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_loop_BM0_BN0
+{
+    using AIndex = MultiIndex<4>;
+    using BIndex = MultiIndex<4>;
+    using CIndex = MultiIndex<4>;
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    static constexpr auto I2 = Number<2>{};
+    static constexpr auto I3 = Number<3>{};
+    // Block tile extents read from the compile-time block descriptors.
+    static constexpr index_t BK0 = ABlockDesc_BK0_BM_BK1{}.GetLength(I0);
+    static constexpr index_t BK1 = ABlockDesc_BK0_BM_BK1{}.GetLength(I2);
+    static constexpr index_t BM  = ABlockDesc_BK0_BM_BK1{}.GetLength(I1);
+    static constexpr index_t BN  = BBlockDesc_BK0_BN_BK1{}.GetLength(I1);
+    // Thread-cluster extents along M and N (two levels each).
+    static constexpr index_t BM100 = BM10BN10ThreadClusterBM10Xs{}[I0];
+    static constexpr index_t BN100 = BM10BN10ThreadClusterBN10Xs{}[I0];
+    static constexpr index_t BM101 = BM10BN10ThreadClusterBM10Xs{}[I1];
+    static constexpr index_t BN101 = BM10BN10ThreadClusterBN10Xs{}[I1];
+    static constexpr index_t BM11 = BM1PerThreadBM11;
+    static constexpr index_t BN11 = BN1PerThreadBN11;
+    static constexpr index_t BM1 = BM100 * BM101 * BM11;
+    static constexpr index_t BN1 = BN100 * BN101 * BN11;
+    static constexpr index_t BM0 = BM / BM1;
+    static constexpr index_t BN0 = BN / BN1;
+    // We assume that either `BM101` or `BN101` is equal to `dpp8::lane_group_size`. It makes all
+    // threads in a lane group need the same chunk of B or A matrices and we can share them using
+    // DPP.
+    static_assert(BM101 == dpp8::lane_group_size || BN101 == dpp8::lane_group_size);
+    static constexpr bool ShareB = BM101 == dpp8::lane_group_size;
+    static constexpr bool ShareA = !ShareB;
+    // The per-thread extent of the shared operand is divided by `dpp8::lane_group_size` below.
+    // Enforce divisibility so the integer division cannot silently truncate and drop elements.
+    static_assert(!ShareA || BM1PerThreadBM11 % dpp8::lane_group_size == 0,
+                  "wrong! BM1PerThreadBM11 must be divisible by dpp8::lane_group_size when "
+                  "sharing A");
+    static_assert(!ShareB || BN1PerThreadBN11 % dpp8::lane_group_size == 0,
+                  "wrong! BN1PerThreadBN11 must be divisible by dpp8::lane_group_size when "
+                  "sharing B");
+    // If DPP shares A (B, respectively), lane group gets `BM1PerThreadBM11` (`BN1PerThreadBN11`,
+    // respectively) elements, so we split them between threads in lane group so each thread loads
+    // less data from LDS.
+    static constexpr index_t BM1PerThread =
+        ShareA ? BM1PerThreadBM11 / dpp8::lane_group_size : BM1PerThreadBM11;
+    static constexpr index_t BN1PerThread =
+        ShareB ? BN1PerThreadBN11 / dpp8::lane_group_size : BN1PerThreadBN11;
+    // Reinterprets the A block descriptor [BK0, BM, BK1] as [BK0, BM0, BM1, BK1] by unmerging BM.
+    __host__ __device__ static constexpr auto
+    MakeABlockDescriptor_BK0_BM0_BM1_BK1(const ABlockDesc_BK0_BM_BK1& a_block_desc_bk0_bm_bk1)
+    {
+        const auto a_block_bk0_bm0_bm1_bk1 = transform_tensor_descriptor(
+            a_block_desc_bk0_bm_bk1,
+            make_tuple(make_pass_through_transform(Number<BK0>{}),
+                       make_unmerge_transform(make_tuple(Number<BM0>{}, Number<BM1>{})),
+                       make_pass_through_transform(Number<BK1>{})),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+            make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));
+        return a_block_bk0_bm0_bm1_bk1;
+    }
+    // Reinterprets the B block descriptor [BK0, BN, BK1] as [BK0, BN0, BN1, BK1] by unmerging BN.
+    __host__ __device__ static constexpr auto
+    MakeBBlockDescriptor_BK0_BN0_BN1_BK1(const BBlockDesc_BK0_BN_BK1& b_block_desc_bk0_bn_bk1)
+    {
+        const auto b_block_desc_bk0_bn0_bn1_bk1 = transform_tensor_descriptor(
+            b_block_desc_bk0_bn_bk1,
+            make_tuple(make_pass_through_transform(Number<BK0>{}),
+                       make_unmerge_transform(make_tuple(Number<BN0>{}, Number<BN1>{})),
+                       make_pass_through_transform(Number<BK1>{})),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+            make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));
+        return b_block_desc_bk0_bn0_bn1_bk1;
+    }
+    __host__ __device__ static constexpr auto
+    MakeCBlockAdaptor_BM0_BM100_BM101_BM11_BN0_BN100_BN101_BN11_To_BM_BN()
+    {
+        // upper: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11]
+        // lower: [BM, BN]
+        constexpr auto c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m_n =
+            make_single_stage_tensor_adaptor(
+                make_tuple(make_unmerge_transform(make_tuple(
+                               Number<BM0>{}, Number<BM100>{}, Number<BM101>{}, Number<BM11>{})),
+                           make_unmerge_transform(make_tuple(
+                               Number<BN0>{}, Number<BN100>{}, Number<BN101>{}, Number<BN11>{}))),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 1, 2, 3>{}, Sequence<4, 5, 6, 7>{}));
+        return c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m_n;
+    }
+    __host__ __device__ static constexpr auto
+    MakeCBlockAdaptor_BM0_BM100_BM101_BM11_BN0_BN100_BN101_BN11_To_BM0_BM1_BN0_BN1()
+    {
+        // upper: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11]
+        // lower: [BM0, BM1, BN0, BN1]
+        constexpr auto c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m0_m1_n0_n1 =
+            make_single_stage_tensor_adaptor(
+                make_tuple(make_pass_through_transform(Number<BM0>{}),
+                           make_unmerge_transform(
+                               make_tuple(Number<BM100>{}, Number<BM101>{}, Number<BM11>{})),
+                           make_pass_through_transform(Number<BN0>{}),
+                           make_unmerge_transform(
+                               make_tuple(Number<BN100>{}, Number<BN101>{}, Number<BN11>{}))),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+                make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{}, Sequence<5, 6, 7>{}));
+        return c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m0_m1_n0_n1;
+    }
+    // Lengths of the per-thread C tile in [BM0, BM1, BN0, BN1] order.
+    __host__ __device__ static constexpr auto GetCThreadTensorLengths_BM0_BM1_BN0_BN1()
+    {
+        return Sequence<BM0, BM11, BN0, BN11>{};
+    }
+    static constexpr auto a_block_desc_bk0_bm0_bm1_bk1_ =
+        MakeABlockDescriptor_BK0_BM0_BM1_BK1(ABlockDesc_BK0_BM_BK1{});
+    static constexpr auto b_block_desc_bk0_bn0_bn1_bk1_ =
+        MakeBBlockDescriptor_BK0_BN0_BN1_BK1(BBlockDesc_BK0_BN_BK1{});
+    public:
+    // Computes this thread's C origin from its local 1-D thread id and derives the A/B copy
+    // origins from it. The member initializers rely on `c_thread_origin_data_idx_` being declared
+    // (and therefore initialized) before the two copy objects.
+    __device__ BlockwiseGemmDlDpp8_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_loop_BM0_BN0()
+        : c_thread_origin_data_idx_{CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1(
+              get_thread_local_1d_id())},
+          a_thread_copy_{CalculateAThreadOriginOnBlock_BK0_BM0_BM1_BK1()},
+          b_thread_copy_{CalculateBThreadOriginOnBlock_BK0_BN0_BN1_BK1()}
+    {
+        static_assert(ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&
+                          BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(),
+                      "wrong! Desc should be known at compile-time");
+        static_assert(BM % BM1 == 0 && BN % BN1 == 0, "wrong!");
+        static_assert(ABlockDesc_BK0_BM_BK1{}.GetLength(I0) ==
+                          BBlockDesc_BK0_BN_BK1{}.GetLength(I0),
+                      "wrong! K dimension not consistent");
+        static_assert(BM10BN10ThreadClusterBM10Xs::Size() == 2 &&
+                          BM10BN10ThreadClusterBN10Xs::Size() == 2,
+                      "wrong!");
+    }
+    // Maps a 1-D thread id to the [BM0, BM1, BN0, BN1] origin of that thread's C tile.
+    __device__ static CIndex CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1(index_t thread_id)
+    {
+        // lower: [BM0, BM1, BN0, BN1]
+        // upper: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11]
+        constexpr auto adaptor0 =
+            MakeCBlockAdaptor_BM0_BM100_BM101_BM11_BN0_BN100_BN101_BN11_To_BM0_BM1_BN0_BN1();
+        // lower: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11]
+        // upper: [Tid, BM0, BM11, BN0, BN11]
+        constexpr auto adaptor1 = make_single_stage_tensor_adaptor(
+            make_tuple(make_merge_transform(make_tuple(BM100, BN100, BM101, BN101)),
+                       make_pass_through_transform(BM0),
+                       make_pass_through_transform(BM11),
+                       make_pass_through_transform(BN0),
+                       make_pass_through_transform(BN11)),
+            make_tuple(
+                Sequence<1, 5, 2, 6>{}, Sequence<0>{}, Sequence<3>{}, Sequence<4>{}, Sequence<7>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
+        constexpr auto adaptor = chain_tensor_adaptors(adaptor0, adaptor1);
+        return adaptor.CalculateBottomIndex(make_multi_index(thread_id, 0, 0, 0, 0));
+    }
+    // Origin of this thread's A slice in [BK0, BM0, BM1, BK1] coordinates.
+    __device__ AIndex CalculateAThreadOriginOnBlock_BK0_BM0_BM1_BK1()
+    {
+        const auto offsetBM0 = c_thread_origin_data_idx_[I0];
+        // If sharing matrix A, we need a separate BM1 offset for each thread in lane group.
+        const auto offsetBM1 = ShareA ? c_thread_origin_data_idx_[I1] +
+                                            dpp8::get_thread_idx_in_lane_group() * BM1PerThread
+                                      : c_thread_origin_data_idx_[I1];
+        return make_tuple(0, offsetBM0, offsetBM1, 0);
+    }
+    // Origin of this thread's B slice in [BK0, BN0, BN1, BK1] coordinates.
+    __device__ BIndex CalculateBThreadOriginOnBlock_BK0_BN0_BN1_BK1()
+    {
+        const auto offsetBN0 = c_thread_origin_data_idx_[I2];
+        // If sharing matrix B, we need a separate BN1 offset for each thread in lane group.
+        const auto offsetBN1 = ShareB ? c_thread_origin_data_idx_[I3] +
+                                            dpp8::get_thread_idx_in_lane_group() * BN1PerThread
+                                      : c_thread_origin_data_idx_[I3];
+        return make_tuple(0, offsetBN0, offsetBN1, 0);
+    }
+    // Accumulates this thread's part of the block GEMM into `c_thread_buf`.
+    // For every (bn0, bm0) pair the K dimension is walked in steps of `BK0PerThread`: each step
+    // copies one A slice and one B slice from the block buffers into thread buffers and runs the
+    // threadwise contraction on them.
+    template <typename CThreadDesc_BM0_BM11_BN0_BN11,
+              typename ABlockBuffer,
+              typename BBlockBuffer,
+              typename CThreadBuffer>
+    __device__ void Run(const CThreadDesc_BM0_BM11_BN0_BN11&,
+                        const ABlockBuffer& a_block_buf,
+                        const BBlockBuffer& b_block_buf,
+                        CThreadBuffer& c_thread_buf) const
+    {
+        static_assert(CThreadDesc_BM0_BM11_BN0_BN11::IsKnownAtCompileTime(),
+                      "wrong! Desc should be known at compile-time");
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatA>(
+            a_thread_desc_bk0_bm0_bm1_bk1_.GetElementSpaceSize());
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatB>(
+            b_thread_desc_bk0_bn0_bn1_bk1_.GetElementSpaceSize());
+        // The contraction is parameterized with the full BM11/BN11 extents, while the thread
+        // descriptors hold only the per-thread (possibly divided) extent of the shared operand.
+        constexpr auto threadwise_contraction =
+            ThreadwiseContractionDlDpp8_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1<
+                FloatA,
+                FloatB,
+                FloatC,
+                decltype(a_thread_desc_bk0_bm0_bm1_bk1_),
+                decltype(b_thread_desc_bk0_bn0_bn1_bk1_),
+                CThreadDesc_BM0_BM11_BN0_BN11,
+                Sequence<BK0PerThread, BK1>,
+                Sequence<1, BM1PerThreadBM11>,
+                Sequence<1, BN1PerThreadBN11>,
+                ShareA>{};
+        static_for<0, BN0, 1>{}([&](auto bn0) {
+            static_for<0, BM0, 1>{}([&](auto bm0) {
+                // First K step (bk0 == 0).
+                a_thread_copy_.Run(a_block_desc_bk0_bm0_bm1_bk1_,
+                                   make_tuple(I0, bm0, I0, I0),
+                                   a_block_buf,
+                                   a_thread_desc_bk0_bm0_bm1_bk1_,
+                                   make_tuple(I0, I0, I0, I0),
+                                   a_thread_buf);
+                b_thread_copy_.Run(b_block_desc_bk0_bn0_bn1_bk1_,
+                                   make_tuple(I0, bn0, I0, I0),
+                                   b_block_buf,
+                                   b_thread_desc_bk0_bn0_bn1_bk1_,
+                                   make_tuple(I0, I0, I0, I0),
+                                   b_thread_buf);
+                threadwise_contraction.Run(a_thread_buf,
+                                           make_tuple(I0, I0, I0, I0),
+                                           b_thread_buf,
+                                           make_tuple(I0, I0, I0, I0),
+                                           c_thread_buf,
+                                           make_tuple(bm0, I0, bn0, I0));
+                // Remaining K steps.
+                static_for<BK0PerThread, BK0, BK0PerThread>{}([&](auto bk0) {
+                    a_thread_copy_.Run(a_block_desc_bk0_bm0_bm1_bk1_,
+                                       make_tuple(bk0, bm0, I0, I0),
+                                       a_block_buf,
+                                       a_thread_desc_bk0_bm0_bm1_bk1_,
+                                       make_tuple(I0, I0, I0, I0),
+                                       a_thread_buf);
+                    b_thread_copy_.Run(b_block_desc_bk0_bn0_bn1_bk1_,
+                                       make_tuple(bk0, bn0, I0, I0),
+                                       b_block_buf,
+                                       b_thread_desc_bk0_bn0_bn1_bk1_,
+                                       make_tuple(I0, I0, I0, I0),
+                                       b_thread_buf);
+                    threadwise_contraction.Run(a_thread_buf,
+                                               make_tuple(I0, I0, I0, I0),
+                                               b_thread_buf,
+                                               make_tuple(I0, I0, I0, I0),
+                                               c_thread_buf,
+                                               make_tuple(bm0, I0, bn0, I0));
+                });
+            });
+        });
+    }
+    private:
+    // A[BK0, BM0, BM1, BK1]
+    static constexpr auto a_thread_desc_bk0_bm0_bm1_bk1_ = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<BK0PerThread>{}, Number<BM0>{}, Number<BM1PerThread>{}, Number<BK1>{}));
+    // B[BK0, BN0, BN1, BK1]
+    static constexpr auto b_thread_desc_bk0_bn0_bn1_bk1_ = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<BK0PerThread>{}, Number<BN0>{}, Number<BN1PerThread>{}, Number<BK1>{}));
+    using AThreadCopy = ThreadwiseTensorSliceTransfer_v4r1<
+        FloatA,
+        FloatA,
+        decltype(a_block_desc_bk0_bm0_bm1_bk1_),
+        decltype(a_thread_desc_bk0_bm0_bm1_bk1_),
+        Sequence<BK0PerThread, 1, BM1PerThread, BK1>, // SliceLengths
+        Sequence<0, 1, 2, 3>,                         // DimAccessOrder
+        Sequence<1, 1, BM1PerThread, BK1>,            // SrcVectorTensorLengths
+        Sequence<0, 1, 2, 3>>;                        // SrcVectorTensorContiguousDimOrder
+    using BThreadCopy = ThreadwiseTensorSliceTransfer_v4r1<
+        FloatB,
+        FloatB,
+        decltype(b_block_desc_bk0_bn0_bn1_bk1_),
+        decltype(b_thread_desc_bk0_bn0_bn1_bk1_),
+        Sequence<BK0PerThread, 1, BN1PerThread, BK1>, // SliceLengths
+        Sequence<0, 1, 2, 3>,                         // DimAccessOrder
+        Sequence<1, 1, BN1PerThread, BK1>,            // SrcVectorTensorLengths
+        Sequence<0, 1, 2, 3>>;                        // SrcVectorTensorContiguousDimOrder
+    // Declaration order matters: the constructor initializes the copies from the C origin.
+    CIndex c_thread_origin_data_idx_;
+    AThreadCopy a_thread_copy_;
+    BThreadCopy b_thread_copy_;
+};
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp
@@ -11,7 +11,7 @@
 namespace ck {
 // C[BM0, BM1, BN0, BN1] += transpose(A[K, BM0, BM1]) * B[K, BN0, BN1]
-// A and B are visable to the whole block, C is distributed among each thread
+// A and B are visible to the whole block, C is distributed among each thread
 // Assume:
 //   1. A:
 //     1. ABlockDesc_BK0_BM_BK1 is known at compile-time

--- a/include/ck/tensor_operation/gpu/device/device_avgpool_bwd.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_avgpool_bwd.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <vector>
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+// Abstract device-operator interface (by its name, for the backward pass of average
+// pooling). It declares two pure-virtual factory methods and no state of its own;
+// concrete operators derive from it and implement both.
+//
+// Template parameters:
+//   NDimSpatial  - compile-time index_t (presumably the number of spatial dimensions;
+//                  TODO confirm against the implementations).
+//   DOutDataType - element type tag (presumably of the buffer behind p_dout).
+//   DInDataType  - element type tag (presumably of the buffer behind p_din).
+//   DOutLayout   - layout tag (presumably for dout).
+//   DInLayout    - layout tag (presumably for din).
+template <index_t NDimSpatial,
+          typename DOutDataType,
+          typename DInDataType,
+          typename DOutLayout,
+          typename DInLayout>
+struct DeviceAvgPoolBwd : public BaseOperator
+{
+    // Creates the argument object for one problem instance and returns it as an owning
+    // pointer to BaseArgument.
+    //   p_dout - read-only buffer (const void*).
+    //   p_din  - writable buffer (void*).
+    // All remaining parameters are index vectors taken by value: lengths and strides for
+    // dout and din, window lengths, window strides, window dilations, and left/right input
+    // padding.
+    // NOTE(review): `din_n_k_wos_length` is singular and uses the `wos` suffix, unlike its
+    // sibling `..._lengths` names; presumably it carries the lengths of din -- confirm with
+    // the implementing class before relying on the name.
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_dout,
+                        void* p_din,
+                        std::vector<ck::index_t> dout_n_k_wos_lengths,
+                        std::vector<ck::index_t> dout_n_k_wos_strides,
+                        std::vector<ck::index_t> din_n_k_wos_length,
+                        std::vector<ck::index_t> din_n_k_wos_strides,
+                        std::vector<ck::index_t> window_k_c_xs_lengths,
+                        std::vector<ck::index_t> window_strides,
+                        std::vector<ck::index_t> window_dilations,
+                        std::vector<ck::index_t> input_left_pads,
+                        std::vector<ck::index_t> input_right_pads) = 0;
+    // Creates the invoker object and returns it as an owning pointer to BaseInvoker.
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_pool_fwd.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_pool_fwd.hpp
@@ -17,6 +17,8 @@ template <index_t InOutRank,
          typename InDataType,
          typename OutDataType,
          typename IndexDataType,
+          typename InLayout,
+          typename OutLayout,
          ReduceTensorOp ReduceOpId,
          bool OutputIndex>
 struct DevicePoolFwd : public BaseOperator
@@ -25,13 +27,14 @@ struct DevicePoolFwd : public BaseOperator
    MakeArgumentPointer(const void* p_in_dev,
                        void* p_out_dev,
                        void* p_out_indices_dev,
-                        std::vector<ck::index_t> input_lengths,
+                        std::vector<ck::index_t> input_n_c_wis_lengths,
-                        std::vector<ck::index_t> window_lengths,
+                        std::vector<ck::index_t> window_xs_lengths,
-                        std::vector<ck::index_t> output_lengths,
+                        std::vector<ck::index_t> output_n_c_wos_lengths,
-                        std::vector<ck::index_t> input_stride,
+                        std::vector<ck::index_t> input_n_c_wis_stride,
-                        std::vector<ck::index_t> output_stride,
+                        std::vector<ck::index_t> output_n_c_wis_stride,
-                        std::vector<ck::index_t> indices_stride,
+                        std::vector<ck::index_t> indices_n_c_wis_stride,
-                        std::vector<ck::index_t> window_strides,
+                        std::vector<ck::index_t> window_xs_strides,
+                        std::vector<ck::index_t> window_xs_dilations,
                        std::vector<ck::index_t> input_left_pads,
                        std::vector<ck::index_t> input_right_pads,
                        std::vector<ck::index_t> pooling_dims) = 0;

--- a/include/ck/tensor_operation/gpu/device/device_put_element.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_put_element.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once

--- a/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f16_instance.cpp
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-#include "pool_fwd_instance_common.hpp"
+#pragma once
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace instance {
-static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
+enum struct GemmDlAlgorithm
-void add_device_pool2d_fwd_nhwc_f16_instances(
-    std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F16, F16, I32, ReduceOpId, false>>>& instances)
 {
-    add_device_operation_instances(
+    Default, // Uses DOT vector instructions
-        instances, device_pool2d_fwd_nhwc_instances<F16, F16, I32, F32, ReduceOpId, false>{});
+    Dpp8,    // Uses DOT vector instructions with DPP8 SEL modifier to reduce data loads from LDS
-}
+};
-} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
--- a/include/ck/tensor_operation/gpu/device/impl/device_avgpool3d_bwd_ndhwc_ndhwc.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_avgpool3d_bwd_ndhwc_ndhwc.hpp
--- a/include/ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp
@@ -123,7 +123,8 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
        ALayout,
        BLayout,
        CLayout,
-        ADataType, // TODO: distinguish A/B datatype
+        ADataType,
+        BDataType,
        GemmAccDataType,
        CShuffleDataType,
        CDataType,
@@ -284,8 +285,11 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
            if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
            {
-                const auto kernel =
+                const auto kernel = kernel_gemm_xdl_cshuffle_v1<GridwiseGemm,
-                    kernel_gemm_xdl_cshuffle_v1<GridwiseGemm, ADataType, CDataType, true>;
+                                                                ADataType,
+                                                                BDataType,
+                                                                CDataType,
+                                                                true>;
                ave_time += launch_and_time_kernel(stream_config,
                                                   kernel,
@@ -357,8 +361,11 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
            }
            else
            {
-                const auto kernel =
+                const auto kernel = kernel_gemm_xdl_cshuffle_v1<GridwiseGemm,
-                    kernel_gemm_xdl_cshuffle_v1<GridwiseGemm, ADataType, CDataType, false>;
+                                                                ADataType,
+                                                                BDataType,
+                                                                CDataType,
+                                                                false>;
                ave_time += launch_and_time_kernel(stream_config,
                                                   kernel,

--- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
@@ -532,11 +532,12 @@ struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
            float ave_time = 0;
            const auto Run = [&](const auto& kernel) {
-                hipGetErrorString(hipMemset(
+                hipGetErrorString(hipMemsetAsync(
                    arg.p_c_grid_,
                    0,
                    arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() *
-                        sizeof(CDataType)));
+                        sizeof(CDataType),
+                    stream_config.stream_id_));
                ave_time =
                    launch_and_time_kernel(stream_config,

--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp
@@ -11,6 +11,7 @@
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_gemm.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_dl_algorithm.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_dl_v1r3.hpp"
 #include "ck/host_utility/device_prop.hpp"
@@ -59,6 +60,7 @@ template <
    typename CThreadTransferSrcDstAccessOrder,
    index_t CThreadTransferSrcDstVectorDim,
    index_t CThreadTransferDstScalarPerVector,
+    GemmDlAlgorithm GemmDlAlg = GemmDlAlgorithm::Default,
    enable_if_t<
        is_same_v<AElementwiseOperation, ck::tensor_operation::element_wise::PassThrough> &&
            is_same_v<BElementwiseOperation, ck::tensor_operation::element_wise::PassThrough> &&
@@ -236,7 +238,8 @@ struct DeviceGemmDl : public DeviceGemm<ALayout,
                                     BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1,
                                     CThreadTransferSrcDstAccessOrder,
                                     CThreadTransferSrcDstVectorDim,
-                                     CThreadTransferDstScalarPerVector>;
+                                     CThreadTransferDstScalarPerVector,
+                                     GemmDlAlg>;
    using AGridDesc_K0_M0_M1_K1 =
        decltype(GridwiseGemm::MakeAGridDescriptor_K0_M0_M1_K1(AGridDesc_K0_M_K1{}));
@@ -372,7 +375,8 @@ struct DeviceGemmDl : public DeviceGemm<ALayout,
                                        remove_reference_t<CGridDesc_M0_M10_M11_N0_N10_N11>,
                                        remove_reference_t<DefaultBlock2CTileMap>,
                                        true,
-                                        true>;
+                                        true,
+                                        GemmDlAlg>;
                ave_time = launch_and_time_kernel(stream_config,
                                                  kernel,
@@ -398,7 +402,8 @@ struct DeviceGemmDl : public DeviceGemm<ALayout,
                                        remove_reference_t<CGridDesc_M0_M10_M11_N0_N10_N11>,
                                        remove_reference_t<DefaultBlock2CTileMap>,
                                        true,
-                                        false>;
+                                        false,
+                                        GemmDlAlg>;
                ave_time = launch_and_time_kernel(stream_config,
                                                  kernel,
@@ -424,7 +429,8 @@ struct DeviceGemmDl : public DeviceGemm<ALayout,
                                        remove_reference_t<CGridDesc_M0_M10_M11_N0_N10_N11>,
                                        remove_reference_t<DefaultBlock2CTileMap>,
                                        false,
-                                        true>;
+                                        true,
+                                        GemmDlAlg>;
                ave_time = launch_and_time_kernel(stream_config,
                                                  kernel,
@@ -450,7 +456,8 @@ struct DeviceGemmDl : public DeviceGemm<ALayout,
                                        remove_reference_t<CGridDesc_M0_M10_M11_N0_N10_N11>,
                                        remove_reference_t<DefaultBlock2CTileMap>,
                                        false,
-                                        false>;
+                                        false,
+                                        GemmDlAlg>;
                ave_time = launch_and_time_kernel(stream_config,
                                                  kernel,
@@ -485,6 +492,16 @@ struct DeviceGemmDl : public DeviceGemm<ALayout,
    static bool IsSupportedArgument(const Argument& arg)
    {
+        if constexpr(GemmDlAlg == GemmDlAlgorithm::Dpp8)
+        {
+            if(ck::get_device_name() == "gfx1030")
+            {
+                return GridwiseGemm::CheckValidity(
+                    arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, arg.c_grid_desc_m_n_);
+            }
+            return false;
+        }
        if(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx1030" ||
           ck::get_device_name() == "gfx1100" || ck::get_device_name() == "gfx1101" ||
           ck::get_device_name() == "gfx1102")
@@ -492,10 +509,7 @@ struct DeviceGemmDl : public DeviceGemm<ALayout,
            return GridwiseGemm::CheckValidity(
                arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, arg.c_grid_desc_m_n_);
        }
-        else
+        return false;
-        {
-            return false;
-        }
    }
    // polymorphic
@@ -572,7 +586,7 @@ struct DeviceGemmDl : public DeviceGemm<ALayout,
    }
    // polymorphic
-    std::string GetTypeString() const override
+    virtual std::string GetTypeString() const override
    {
        auto str = std::stringstream();

--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl_dpp8.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl_dpp8.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <iostream>
+#include <sstream>
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_dl_algorithm.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_dl_v1r3.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+// DeviceGemmDl with its trailing GemmDlAlgorithm template argument fixed to
+// GemmDlAlgorithm::Dpp8. Every other template parameter is forwarded unchanged to the
+// base class; the only member added here is the GetTypeString() override, which reports
+// the "DeviceGemmDlDpp8" name together with the main tile parameters.
+template <
+    typename ADataType,
+    typename BDataType,
+    typename CDataType,
+    typename AccDataType,
+    typename ALayout,
+    typename BLayout,
+    typename CLayout,
+    typename AElementwiseOperation,
+    typename BElementwiseOperation,
+    typename CElementwiseOperation,
+    GemmSpecialization GemmSpec,
+    index_t BlockSize,
+    index_t MPerBlock,
+    index_t NPerBlock,
+    index_t K0PerBlock,
+    index_t K1,
+    index_t M1PerThread,
+    index_t N1PerThread,
+    index_t KPerThread,
+    typename M1N1ThreadClusterM1Xs,
+    typename M1N1ThreadClusterN1Xs,
+    typename ABlockTransferThreadSliceLengths_K0_M0_M1_K1,
+    typename ABlockTransferThreadClusterLengths_K0_M0_M1_K1,
+    typename ABlockTransferThreadClusterArrangeOrder,
+    typename ABlockTransferSrcAccessOrder,
+    typename ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1,
+    typename ABlockTransferSrcVectorTensorContiguousDimOrder,
+    typename ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1,
+    typename BBlockTransferThreadSliceLengths_K0_N0_N1_K1,
+    typename BBlockTransferThreadClusterLengths_K0_N0_N1_K1,
+    typename BBlockTransferThreadClusterArrangeOrder,
+    typename BBlockTransferSrcAccessOrder,
+    typename BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1,
+    typename BBlockTransferSrcVectorTensorContiguousDimOrder,
+    typename BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1,
+    typename CThreadTransferSrcDstAccessOrder,
+    index_t CThreadTransferSrcDstVectorDim,
+    index_t CThreadTransferDstScalarPerVector,
+    enable_if_t<
+        is_same_v<AElementwiseOperation, ck::tensor_operation::element_wise::PassThrough> &&
+            is_same_v<BElementwiseOperation, ck::tensor_operation::element_wise::PassThrough> &&
+            is_same_v<CElementwiseOperation, ck::tensor_operation::element_wise::PassThrough>,
+        bool> = false>
+struct DeviceGemmDlDpp8 : public DeviceGemmDl<ADataType,
+                                              BDataType,
+                                              CDataType,
+                                              AccDataType,
+                                              ALayout,
+                                              BLayout,
+                                              CLayout,
+                                              AElementwiseOperation,
+                                              BElementwiseOperation,
+                                              CElementwiseOperation,
+                                              GemmSpec,
+                                              BlockSize,
+                                              MPerBlock,
+                                              NPerBlock,
+                                              K0PerBlock,
+                                              K1,
+                                              M1PerThread,
+                                              N1PerThread,
+                                              KPerThread,
+                                              M1N1ThreadClusterM1Xs,
+                                              M1N1ThreadClusterN1Xs,
+                                              ABlockTransferThreadSliceLengths_K0_M0_M1_K1,
+                                              ABlockTransferThreadClusterLengths_K0_M0_M1_K1,
+                                              ABlockTransferThreadClusterArrangeOrder,
+                                              ABlockTransferSrcAccessOrder,
+                                              ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1,
+                                              ABlockTransferSrcVectorTensorContiguousDimOrder,
+                                              ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1,
+                                              BBlockTransferThreadSliceLengths_K0_N0_N1_K1,
+                                              BBlockTransferThreadClusterLengths_K0_N0_N1_K1,
+                                              BBlockTransferThreadClusterArrangeOrder,
+                                              BBlockTransferSrcAccessOrder,
+                                              BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1,
+                                              BBlockTransferSrcVectorTensorContiguousDimOrder,
+                                              BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1,
+                                              CThreadTransferSrcDstAccessOrder,
+                                              CThreadTransferSrcDstVectorDim,
+                                              CThreadTransferDstScalarPerVector,
+                                              GemmDlAlgorithm::Dpp8>
+{
+    // Returns "DeviceGemmDlDpp8<BlockSize, MPerBlock, NPerBlock, K0PerBlock, K1,
+    // M1PerThread, N1PerThread, KPerThread>" with the compile-time values substituted.
+    std::string GetTypeString() const override
+    {
+        std::stringstream type_name;
+        // Operator name and opening bracket.
+        type_name << "DeviceGemmDlDpp8";
+        type_name << "<";
+        // Block-level tile parameters.
+        type_name << BlockSize << ", " << MPerBlock << ", " << NPerBlock << ", ";
+        type_name << K0PerBlock << ", " << K1 << ", ";
+        // Thread-level tile parameters; the last one carries no trailing separator.
+        type_name << M1PerThread << ", " << N1PerThread << ", " << KPerThread;
+        type_name << ">";
+        return type_name.str();
+    }
+};
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp
@@ -65,7 +65,8 @@ template <typename ALayout,
          typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
          index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
          LoopScheduler LoopSched     = make_default_loop_scheduler(),
-          PipelineVersion PipelineVer = PipelineVersion::v1>
+          PipelineVersion PipelineVer = PipelineVersion::v1,
+          typename ComputeType        = CDataType>
 struct DeviceGemm_Xdl_CShuffle : public DeviceGemm<ALayout,
                                                   BLayout,
                                                   CLayout,
@@ -87,7 +88,8 @@ struct DeviceGemm_Xdl_CShuffle : public DeviceGemm<ALayout,
        ALayout,
        BLayout,
        CLayout,
-        ADataType, // TODO: distinguish A/B datatype
+        ADataType,
+        BDataType,
        GemmAccDataType,
        CShuffleDataType,
        CDataType,
@@ -128,7 +130,8 @@ struct DeviceGemm_Xdl_CShuffle : public DeviceGemm<ALayout,
        CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
        CShuffleBlockTransferScalarPerVector_NPerBlock,
        LoopSched,
-        PipelineVer>;
+        PipelineVer,
+        ComputeType>;
    using Argument = typename GridwiseGemm::Argument;

--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
@@ -158,8 +158,10 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
            const auto Run = [&](const auto& kernel) {
                if(kbatch > 1)
-                    hipGetErrorString(
+                    hipGetErrorString(hipMemsetAsync(karg.p_c_grid,
-                        hipMemset(karg.p_c_grid, 0, karg.M * karg.N * sizeof(CDataType)));
+                                                     0,
+                                                     karg.M * karg.N * sizeof(CDataType),
+                                                     stream_config.stream_id_));
                ave_time = launch_and_time_kernel(
                    stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, karg, b2c_map);