Commit b93575ca authored by Jing Zhang

merge develop

parents 54df59bf c8a8385f
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_avgpool3d_bwd_ndhwc_ndhwc.hpp"
#include "avgpool3d_bwd_common.hpp"
using DOutDataType = float;
using DInDataType = float;
using ComputeDataType = float;
#if 1
using DOutLayout = ck::tensor_layout::convolution::NDHWC;
using DInLayout = ck::tensor_layout::convolution::NDHWC;
#else
using DOutLayout = ck::tensor_layout::convolution::NCDHW;
using DInLayout = ck::tensor_layout::convolution::NCDHW;
#endif
using DevicePoolBwdInstance =
ck::tensor_operation::device::DeviceAvgPool3dBwd_NDHWC_NDHWC<DOutDataType,
DInDataType,
ComputeDataType,
64, // BlockSize
64, // ReduceMThreadClusterSize
1, // ReduceKThreadClusterSize
1, // ReduceMThreadSliceSize
1, // ReduceKThreadSliceSize
1>; // InSrcOutDstVectorSize
int main()
{
std::vector<ck::index_t> window_lengths = {5, 5, 5};
std::vector<ck::index_t> window_strides = {2, 2, 2};
std::vector<ck::index_t> window_dilations = {2, 2, 2};
std::vector<ck::index_t> dinput_left_pads = {0, 0, 0};
std::vector<ck::index_t> dinput_right_pads = {0, 0, 0};
ck::index_t N = 1;
ck::index_t C = 16;
ck::index_t Di = 40;
ck::index_t Hi = 40;
ck::index_t Wi = 40;
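// For reference: with a 5-wide window at dilation 2 the effective window extent is
// 2 * (5 - 1) + 1 = 9, so with stride 2 and no padding the corresponding forward-pool
// output would be Do = Ho = Wo = (40 - 9) / 2 + 1 = 16 (standard pooling size formula,
// stated here only as an assumption about what the test helper expects).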
pool3d_bwd_test<DevicePoolBwdInstance, DOutDataType, DInDataType, DOutLayout, DInLayout>(
true,
false,
N,
C,
Di,
Hi,
Wi,
window_lengths,
window_strides,
window_dilations,
dinput_left_pads,
dinput_right_pads);
}
@@ -3,6 +3,8 @@
#pragma once
#include "ck/config.h"
#ifndef CK_DONT_USE_HIP_RUNTIME_HEADERS
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
@@ -125,6 +127,9 @@
// `s_nop`s to avoid hazard
#define CK_USE_AMD_V_DOT_INLINE_ASM 0
// inner product using V_DOT with DPP8 modifiers
#define CK_USE_AMD_V_DOT_DPP8_INLINE_ASM 1
// block synchronization waits only on s_waitcnt lgkmcnt(0), not vmcnt(0)
#define CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1
@@ -197,9 +202,6 @@
// workaround: compiler issue on gfx908
#define CK_WORKAROUND_SWDEV_388832 1
// workaround: Grouped Conv2d_bwd_data fails for already implemented instance
#define CK_WORKAROUND_SWDEV_3318619 0
// flag to enable (1) or disable (0) the debugging output in some kernels
#define DEBUG_LOG 0
......
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2023 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_CONFIG_H_IN
#define CK_CONFIG_H_IN
// clang-format off
//
// Data types supported in the current CK build
//
#ifndef DTYPES
#cmakedefine DTYPES "@DTYPES@"
#endif
// if DTYPES is not defined, enable all data types in header files
#ifndef CK_ENABLE_ALL_DTYPES
#cmakedefine CK_ENABLE_ALL_DTYPES @CK_ENABLE_ALL_DTYPES@
#if defined(CK_ENABLE_ALL_DTYPES)
#ifndef CK_ENABLE_INT8
#define CK_ENABLE_INT8 "ON"
#endif
#ifndef CK_ENABLE_FP8
#define CK_ENABLE_FP8 "ON"
#endif
#ifndef CK_ENABLE_FP16
#define CK_ENABLE_FP16 "ON"
#endif
#ifndef CK_ENABLE_BF16
#define CK_ENABLE_BF16 "ON"
#endif
#ifndef CK_ENABLE_FP32
#define CK_ENABLE_FP32 "ON"
#endif
#ifndef CK_ENABLE_FP64
#define CK_ENABLE_FP64 "ON"
#endif
#endif
#endif
// if data types are selectively enabled via DTYPES
#ifndef CK_ENABLE_INT8
#cmakedefine CK_ENABLE_INT8 @CK_ENABLE_INT8@
#endif
#ifndef CK_ENABLE_FP8
#cmakedefine CK_ENABLE_FP8 @CK_ENABLE_FP8@
#endif
#ifndef CK_ENABLE_FP16
#cmakedefine CK_ENABLE_FP16 @CK_ENABLE_FP16@
#endif
#ifndef CK_ENABLE_BF16
#cmakedefine CK_ENABLE_BF16 @CK_ENABLE_BF16@
#endif
#ifndef CK_ENABLE_FP32
#cmakedefine CK_ENABLE_FP32 @CK_ENABLE_FP32@
#endif
#ifndef CK_ENABLE_FP64
#cmakedefine CK_ENABLE_FP64 @CK_ENABLE_FP64@
#endif
//
// Legacy DL kernel support in the current CK build
// by default, DL kernels are turned OFF
//
#ifndef CK_ENABLE_DL_KERNELS
#cmakedefine CK_ENABLE_DL_KERNELS @CK_ENABLE_DL_KERNELS@
#endif
//
// Instances supported in the current CK build
//
#ifndef CK_ENABLE_INSTANCES_ONLY
#cmakedefine CK_ENABLE_INSTANCES_ONLY @CK_ENABLE_INSTANCES_ONLY@
#endif
// clang-format on
#endif // CK_CONFIG_H_IN
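The CK_ENABLE_* macros above are meant to be consumed as plain preprocessor guards; a minimal consumer-side sketch (only the include path and macro names come from this header, the guarded bodies are illustrative):
#include "ck/config.h"

void register_available_instances()
{
#ifdef CK_ENABLE_FP16
    // compile / register fp16 instances only when fp16 was enabled at configure time
#endif
#ifdef CK_ENABLE_FP64
    // likewise for fp64
#endif
}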
@@ -1042,13 +1042,13 @@ struct Merge_v2_magic_division
using UpLengths =
decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies{}, Number<1>{})));
using LowLengthsMagicDivisorMultipiler = decltype(
generate_tuple(lambda_merge_generate_MagicDivision_calculate_magic_multiplier<LowLengths>{},
Number<NDimLow>{}));
using LowLengthsMagicDivisorMultipiler = decltype(generate_tuple(
lambda_merge_generate_MagicDivision_calculate_magic_multiplier<LowLengths>{},
Number<NDimLow>{}));
using LowLengthsMagicDivisorShift = decltype(
generate_tuple(lambda_merge_generate_MagicDivision_calculate_magic_shift<LowLengths>{},
Number<NDimLow>{}));
using LowLengthsMagicDivisorShift = decltype(generate_tuple(
lambda_merge_generate_MagicDivision_calculate_magic_shift<LowLengths>{},
Number<NDimLow>{}));
LowLengths low_lengths_;
LowLengthsMagicDivisorMultipiler low_lengths_magic_divisor_multiplier_;
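// Background (an assumption about intent, not text from this diff): "magic division" replaces
// integer division by a fixed low-dimension length with a precomputed multiply-and-shift;
// for a divisor d one can take m = floor(2^k / d) + 1 and evaluate n / d as (n * m) >> k,
// which is exact for all 0 <= n < 2^k / d. The multiplier and shift tuples above cache those
// constants per low dimension so merged-coordinate updates avoid hardware division.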
@@ -1201,9 +1201,9 @@ struct Merge_v2r2_magic_division
lambda_merge_generate_MagicDivision_calculate_magic_multiplier<LowLengthsScan>{},
Number<NDimLow>{}));
using LowLengthsScanMagicDivisorShift = decltype(
generate_tuple(lambda_merge_generate_MagicDivision_calculate_magic_shift<LowLengthsScan>{},
Number<NDimLow>{}));
using LowLengthsScanMagicDivisorShift = decltype(generate_tuple(
lambda_merge_generate_MagicDivision_calculate_magic_shift<LowLengthsScan>{},
Number<NDimLow>{}));
LowLengths low_lengths_;
LowLengthsScan low_lengths_scan_;
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/amd_gemm_dpp.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_adaptor.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v4r1.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_contraction_dl_dpp8.hpp"
namespace ck {
/**
* DPP8 version of the blockwise GEMM algorithm. It uses the DPP8 instruction modifier to limit
* the amount of data loaded from LDS into registers.
*
* The algorithm groups threads into groups of size `dpp8::lane_group_size` and splits matrix C
* between them in such a way that threads from the same group need the same chunk of either
* matrix A or matrix B. Without DPP8, each thread would need to load the whole chunk from LDS
* into its own register space.
* Using DPP8 modifiers lets each thread load less data, exactly `1 / dpp8::lane_group_size`
* of the chunk, and then share that data with the other threads of its lane group.
*
* Assumptions coming from the usage of DPP8:
* 1. `BM10BN10ThreadClusterBM10Xs[1] == dpp8::lane_group_size` or
* `BM10BN10ThreadClusterBN10Xs[1] == dpp8::lane_group_size` -
* - it makes consecutive `dpp8::lane_group_size` threads use the same chunk of either
* matrix A or B;
* - based on these values we determine which matrix to share.
* 2. `BM1PerThreadBM11 % dpp8::lane_group_size == 0` (if sharing A) or
* `BN1PerThreadBN11 % dpp8::lane_group_size == 0` (if sharing B) -
* - we have to make sure that the data to split is divisible by the number of
* threads in the group.
*
* General algorithm:
* C[BM0, BM1, BN0, BN1] += transpose(A[K, BM0, BM1]) * B[K, BN0, BN1]
* A and B are visible to the whole block, C is distributed among each thread
* Assume:
* 1. A:
* 1. ABlockDesc_BK0_BM_BK1 is known at compile-time
* 2. ABlockBuffer is DynamicBuffer
* 2. B:
* 1. BBlockDesc_BK0_BN_BK1 is known at compile-time
* 2. BBlockBuffer is DynamicBuffer
* 3. C:
* 1. CThreadDesc_BM0_BM11_BN0_BN11 is known at compile-time
* 2. CThreadBuffer is StaticBuffer
* 4. BM10BN10ThreadClusterBM10Xs::Size() = BM10BN10ThreadClusterBN10Xs::Size() == 2
*/
template <index_t BlockSize,
typename FloatA,
typename FloatB,
typename FloatC,
typename ABlockDesc_BK0_BM_BK1,
typename BBlockDesc_BK0_BN_BK1,
index_t BM1PerThreadBM11,
index_t BN1PerThreadBN11,
index_t BK0PerThread,
typename BM10BN10ThreadClusterBM10Xs, // Sequence<BM10BN10ThreadClusterBM100,
// BM10BN10ThreadClusterBM101, ...>
typename BM10BN10ThreadClusterBN10Xs, // Sequence<BM10BN10ThreadClusterBN100,
// BM10BN10ThreadClusterBN101, ...>
index_t AThreadCopyScalarPerVector_BM11,
index_t BThreadCopyScalarPerVector_BN11,
typename enable_if<ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&
BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(),
bool>::type = false>
struct BlockwiseGemmDlDpp8_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_loop_BM0_BN0
{
using AIndex = MultiIndex<4>;
using BIndex = MultiIndex<4>;
using CIndex = MultiIndex<4>;
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
static constexpr index_t BK0 = ABlockDesc_BK0_BM_BK1{}.GetLength(I0);
static constexpr index_t BK1 = ABlockDesc_BK0_BM_BK1{}.GetLength(I2);
static constexpr index_t BM = ABlockDesc_BK0_BM_BK1{}.GetLength(I1);
static constexpr index_t BN = BBlockDesc_BK0_BN_BK1{}.GetLength(I1);
static constexpr index_t BM100 = BM10BN10ThreadClusterBM10Xs{}[I0];
static constexpr index_t BN100 = BM10BN10ThreadClusterBN10Xs{}[I0];
static constexpr index_t BM101 = BM10BN10ThreadClusterBM10Xs{}[I1];
static constexpr index_t BN101 = BM10BN10ThreadClusterBN10Xs{}[I1];
static constexpr index_t BM11 = BM1PerThreadBM11;
static constexpr index_t BN11 = BN1PerThreadBN11;
static constexpr index_t BM1 = BM100 * BM101 * BM11;
static constexpr index_t BN1 = BN100 * BN101 * BN11;
static constexpr index_t BM0 = BM / BM1;
static constexpr index_t BN0 = BN / BN1;
// We assume that either `BM101` or `BN101` is equal to `dpp8::lane_group_size`. This makes all
// threads in a lane group need the same chunk of the A or B matrix, so that chunk can be shared
// using DPP.
static_assert(BM101 == dpp8::lane_group_size || BN101 == dpp8::lane_group_size);
static constexpr bool ShareB = BM101 == dpp8::lane_group_size ? true : false;
static constexpr bool ShareA = !ShareB;
// If DPP shares A (B, respectively), lane group gets `BM1PerThreadBM11` (`BN1PerThreadBN11`,
// respectively) elements, so we split them between threads in lane group so each thread loads
// less data from LDS.
static constexpr index_t BM1PerThread =
ShareA ? BM1PerThreadBM11 / dpp8::lane_group_size : BM1PerThreadBM11;
static constexpr index_t BN1PerThread =
ShareB ? BN1PerThreadBN11 / dpp8::lane_group_size : BN1PerThreadBN11;
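// Numeric example (assumed values): dpp8::lane_group_size is 8 for DPP8, so with
// BM1PerThreadBM11 == 16 and A being shared, BM1PerThread == 16 / 8 == 2; each thread then
// reads only 2 of the 16 BM1 elements from LDS and receives the remaining 14 from its lane
// group via DPP8-modified DOT instructions.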
__host__ __device__ static constexpr auto
MakeABlockDescriptor_BK0_BM0_BM1_BK1(const ABlockDesc_BK0_BM_BK1& a_block_desc_bk0_bm_bk1)
{
const auto a_block_bk0_bm0_bm1_bk1 = transform_tensor_descriptor(
a_block_desc_bk0_bm_bk1,
make_tuple(make_pass_through_transform(Number<BK0>{}),
make_unmerge_transform(make_tuple(Number<BM0>{}, Number<BM1>{})),
make_pass_through_transform(Number<BK1>{})),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));
return a_block_bk0_bm0_bm1_bk1;
}
__host__ __device__ static constexpr auto
MakeBBlockDescriptor_BK0_BN0_BN1_BK1(const BBlockDesc_BK0_BN_BK1& b_block_desc_bk0_bn_bk1)
{
const auto b_block_desc_bk0_bn0_bn1_bk1 = transform_tensor_descriptor(
b_block_desc_bk0_bn_bk1,
make_tuple(make_pass_through_transform(Number<BK0>{}),
make_unmerge_transform(make_tuple(Number<BN0>{}, Number<BN1>{})),
make_pass_through_transform(Number<BK1>{})),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));
return b_block_desc_bk0_bn0_bn1_bk1;
}
__host__ __device__ static constexpr auto
MakeCBlockAdaptor_BM0_BM100_BM101_BM11_BN0_BN100_BN101_BN11_To_BM_BN()
{
// upper: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11]
// lower: [BM, BN]
constexpr auto c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m_n =
make_single_stage_tensor_adaptor(
make_tuple(make_unmerge_transform(make_tuple(
Number<BM0>{}, Number<BM100>{}, Number<BM101>{}, Number<BM11>{})),
make_unmerge_transform(make_tuple(
Number<BN0>{}, Number<BN100>{}, Number<BN101>{}, Number<BN11>{}))),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 1, 2, 3>{}, Sequence<4, 5, 6, 7>{}));
return c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m_n;
}
__host__ __device__ static constexpr auto
MakeCBlockAdaptor_BM0_BM100_BM101_BM11_BN0_BN100_BN101_BN11_To_BM0_BM1_BN0_BN1()
{
// upper: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11]
// lower: [BM0, BM1, BN0, BN1]
constexpr auto c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m0_m1_n0_n1 =
make_single_stage_tensor_adaptor(
make_tuple(make_pass_through_transform(Number<BM0>{}),
make_unmerge_transform(
make_tuple(Number<BM100>{}, Number<BM101>{}, Number<BM11>{})),
make_pass_through_transform(Number<BN0>{}),
make_unmerge_transform(
make_tuple(Number<BN100>{}, Number<BN101>{}, Number<BN11>{}))),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{}, Sequence<5, 6, 7>{}));
return c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m0_m1_n0_n1;
}
__host__ __device__ static constexpr auto GetCThreadTensorLengths_BM0_BM1_BN0_BN1()
{
return Sequence<BM0, BM11, BN0, BN11>{};
}
static constexpr auto a_block_desc_bk0_bm0_bm1_bk1_ =
MakeABlockDescriptor_BK0_BM0_BM1_BK1(ABlockDesc_BK0_BM_BK1{});
static constexpr auto b_block_desc_bk0_bn0_bn1_bk1_ =
MakeBBlockDescriptor_BK0_BN0_BN1_BK1(BBlockDesc_BK0_BN_BK1{});
public:
__device__ BlockwiseGemmDlDpp8_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_loop_BM0_BN0()
: c_thread_origin_data_idx_{CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1(
get_thread_local_1d_id())},
a_thread_copy_{CalculateAThreadOriginOnBlock_BK0_BM0_BM1_BK1()},
b_thread_copy_{CalculateBThreadOriginOnBlock_BK0_BN0_BN1_BK1()}
{
static_assert(ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&
BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(),
"wrong! Desc should be known at compile-time");
static_assert(BM % BM1 == 0 && BN % BN1 == 0, "wrong!");
static_assert(ABlockDesc_BK0_BM_BK1{}.GetLength(I0) ==
BBlockDesc_BK0_BN_BK1{}.GetLength(I0),
"wrong! K dimension not consistent");
static_assert(BM10BN10ThreadClusterBM10Xs::Size() == 2 &&
BM10BN10ThreadClusterBN10Xs::Size() == 2,
"wrong!");
}
__device__ static CIndex CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1(index_t thread_id)
{
// lower: [BM0, BM1, BN0, BN1]
// upper: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11]
constexpr auto adaptor0 =
MakeCBlockAdaptor_BM0_BM100_BM101_BM11_BN0_BN100_BN101_BN11_To_BM0_BM1_BN0_BN1();
// lower: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11]
// upper: [Tid, BM0, BM11, BN0, BN11]
constexpr auto adaptor1 = make_single_stage_tensor_adaptor(
make_tuple(make_merge_transform(make_tuple(BM100, BN100, BM101, BN101)),
make_pass_through_transform(BM0),
make_pass_through_transform(BM11),
make_pass_through_transform(BN0),
make_pass_through_transform(BN11)),
make_tuple(
Sequence<1, 5, 2, 6>{}, Sequence<0>{}, Sequence<3>{}, Sequence<4>{}, Sequence<7>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
constexpr auto adaptor = chain_tensor_adaptors(adaptor0, adaptor1);
return adaptor.CalculateBottomIndex(make_multi_index(thread_id, 0, 0, 0, 0));
}
__device__ AIndex CalculateAThreadOriginOnBlock_BK0_BM0_BM1_BK1()
{
const auto offsetBM0 = c_thread_origin_data_idx_[I0];
// If sharing matrix A, we need a separate BM1 offset for each thread in lane group.
const auto offsetBM1 = ShareA ? c_thread_origin_data_idx_[I1] +
dpp8::get_thread_idx_in_lane_group() * BM1PerThread
: c_thread_origin_data_idx_[I1];
return make_tuple(0, offsetBM0, offsetBM1, 0);
}
__device__ BIndex CalculateBThreadOriginOnBlock_BK0_BN0_BN1_BK1()
{
const auto offsetBN0 = c_thread_origin_data_idx_[I2];
// If sharing matrix B, we need a separate BN1 offset for each thread in lane group.
const auto offsetBN1 = ShareB ? c_thread_origin_data_idx_[I3] +
dpp8::get_thread_idx_in_lane_group() * BN1PerThread
: c_thread_origin_data_idx_[I3];
return make_tuple(0, offsetBN0, offsetBN1, 0);
}
template <typename CThreadDesc_BM0_BM11_BN0_BN11,
typename ABlockBuffer,
typename BBlockBuffer,
typename CThreadBuffer>
__device__ void Run(const CThreadDesc_BM0_BM11_BN0_BN11&,
const ABlockBuffer& a_block_buf,
const BBlockBuffer& b_block_buf,
CThreadBuffer& c_thread_buf) const
{
static_assert(CThreadDesc_BM0_BM11_BN0_BN11::IsKnownAtCompileTime(),
"wrong! Desc should be known at compile-time");
auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatA>(
a_thread_desc_bk0_bm0_bm1_bk1_.GetElementSpaceSize());
auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatB>(
b_thread_desc_bk0_bn0_bn1_bk1_.GetElementSpaceSize());
constexpr auto threadwise_contraction =
ThreadwiseContractionDlDpp8_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1<
FloatA,
FloatB,
FloatC,
decltype(a_thread_desc_bk0_bm0_bm1_bk1_),
decltype(b_thread_desc_bk0_bn0_bn1_bk1_),
CThreadDesc_BM0_BM11_BN0_BN11,
Sequence<BK0PerThread, BK1>,
Sequence<1, BM1PerThreadBM11>,
Sequence<1, BN1PerThreadBN11>,
ShareA>{};
static_for<0, BN0, 1>{}([&](auto bn0) {
static_for<0, BM0, 1>{}([&](auto bm0) {
a_thread_copy_.Run(a_block_desc_bk0_bm0_bm1_bk1_,
make_tuple(I0, bm0, I0, I0),
a_block_buf,
a_thread_desc_bk0_bm0_bm1_bk1_,
make_tuple(I0, I0, I0, I0),
a_thread_buf);
b_thread_copy_.Run(b_block_desc_bk0_bn0_bn1_bk1_,
make_tuple(I0, bn0, I0, I0),
b_block_buf,
b_thread_desc_bk0_bn0_bn1_bk1_,
make_tuple(I0, I0, I0, I0),
b_thread_buf);
threadwise_contraction.Run(a_thread_buf,
make_tuple(I0, I0, I0, I0),
b_thread_buf,
make_tuple(I0, I0, I0, I0),
c_thread_buf,
make_tuple(bm0, I0, bn0, I0));
static_for<BK0PerThread, BK0, BK0PerThread>{}([&](auto bk0) {
a_thread_copy_.Run(a_block_desc_bk0_bm0_bm1_bk1_,
make_tuple(bk0, bm0, I0, I0),
a_block_buf,
a_thread_desc_bk0_bm0_bm1_bk1_,
make_tuple(I0, I0, I0, I0),
a_thread_buf);
b_thread_copy_.Run(b_block_desc_bk0_bn0_bn1_bk1_,
make_tuple(bk0, bn0, I0, I0),
b_block_buf,
b_thread_desc_bk0_bn0_bn1_bk1_,
make_tuple(I0, I0, I0, I0),
b_thread_buf);
threadwise_contraction.Run(a_thread_buf,
make_tuple(I0, I0, I0, I0),
b_thread_buf,
make_tuple(I0, I0, I0, I0),
c_thread_buf,
make_tuple(bm0, I0, bn0, I0));
});
});
});
}
private:
// A[BK0, BM0, BM1, BK1]
static constexpr auto a_thread_desc_bk0_bm0_bm1_bk1_ = make_naive_tensor_descriptor_packed(
make_tuple(Number<BK0PerThread>{}, Number<BM0>{}, Number<BM1PerThread>{}, Number<BK1>{}));
// B[BK0, BN0, BN1, BK1]
static constexpr auto b_thread_desc_bk0_bn0_bn1_bk1_ = make_naive_tensor_descriptor_packed(
make_tuple(Number<BK0PerThread>{}, Number<BN0>{}, Number<BN1PerThread>{}, Number<BK1>{}));
using AThreadCopy = ThreadwiseTensorSliceTransfer_v4r1<
FloatA,
FloatA,
decltype(a_block_desc_bk0_bm0_bm1_bk1_),
decltype(a_thread_desc_bk0_bm0_bm1_bk1_),
Sequence<BK0PerThread, 1, BM1PerThread, BK1>, // SliceLengths
Sequence<0, 1, 2, 3>, // DimAccessOrder
Sequence<1, 1, BM1PerThread, BK1>, // SrcVectorTensorLengths
Sequence<0, 1, 2, 3>>; // SrcVectorTensorContiguousDimOrder
using BThreadCopy = ThreadwiseTensorSliceTransfer_v4r1<
FloatB,
FloatB,
decltype(b_block_desc_bk0_bn0_bn1_bk1_),
decltype(b_thread_desc_bk0_bn0_bn1_bk1_),
Sequence<BK0PerThread, 1, BN1PerThread, BK1>, // SliceLengths
Sequence<0, 1, 2, 3>, // DimAccessOrder
Sequence<1, 1, BN1PerThread, BK1>, // SrcVectorTensorLengths
Sequence<0, 1, 2, 3>>; // SrcVectorTensorContiguousDimOrder
CIndex c_thread_origin_data_idx_;
AThreadCopy a_thread_copy_;
BThreadCopy b_thread_copy_;
};
} // namespace ck
@@ -11,7 +11,7 @@
namespace ck {
// C[BM0, BM1, BN0, BN1] += transpose(A[K, BM0, BM1]) * B[K, BN0, BN1]
// A and B are visable to the whole block, C is distributed among each thread
// A and B are visible to the whole block, C is distributed among each thread
// Assume:
// 1. A:
// 1. ABlockDesc_BK0_BM_BK1 is known at compile-time
......
@@ -35,8 +35,8 @@ struct BlockwiseSoftmax
static constexpr index_t MRepeat = ThreadSliceDesc_M_K{}.GetLength(I0);
static constexpr index_t KRepeat = ThreadSliceDesc_M_K{}.GetLength(I1);
using ThreadSliceDesc_M = decltype(
make_naive_tensor_descriptor_packed(make_tuple(ThreadSliceDesc_M_K{}.GetLength(I0))));
using ThreadSliceDesc_M = decltype(make_naive_tensor_descriptor_packed(
make_tuple(ThreadSliceDesc_M_K{}.GetLength(I0))));
using ThreadwiseMaxReduce = typename conditional<
IgnoreNaN,
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include "ck/tensor_operation/gpu/device/device_base.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <index_t NDimSpatial,
typename DOutDataType,
typename DInDataType,
typename DOutLayout,
typename DInLayout>
struct DeviceAvgPoolBwd : public BaseOperator
{
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_dout,
void* p_din,
std::vector<ck::index_t> dout_n_k_wos_lengths,
std::vector<ck::index_t> dout_n_k_wos_strides,
std::vector<ck::index_t> din_n_k_wos_lengths,
std::vector<ck::index_t> din_n_k_wos_strides,
std::vector<ck::index_t> window_k_c_xs_lengths,
std::vector<ck::index_t> window_strides,
std::vector<ck::index_t> window_dilations,
std::vector<ck::index_t> input_left_pads,
std::vector<ck::index_t> input_right_pads) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
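A minimal call sketch against this interface, reusing the NDHWC instance and shapes from the example at the top of this commit; the stride vectors, the Do/Ho/Wo values, and the invoker Run call follow common CK usage and are assumptions here, not guarantees of this header:
auto pool    = DevicePoolBwdInstance{};
auto arg     = pool.MakeArgumentPointer(p_dout, p_din,
                                        {N, C, Do, Ho, Wo}, dout_strides, // dout lengths/strides
                                        {N, C, Di, Hi, Wi}, din_strides,  // din lengths/strides
                                        window_lengths, window_strides, window_dilations,
                                        dinput_left_pads, dinput_right_pads);
auto invoker = pool.MakeInvokerPointer();
invoker->Run(arg.get(), StreamConfig{});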
@@ -27,15 +27,12 @@ struct DeviceGroupedConvBwdWeight : public BaseOperator
MakeArgumentPointer(const void* p_in,
void* p_wei,
const void* p_out,
const ck::index_t G,
const ck::index_t N,
const ck::index_t K,
const ck::index_t C,
const std::array<ck::index_t, NDimSpatial>& input_spatial_lengths,
const std::array<ck::index_t, NDimSpatial>& filter_spatial_lengths,
const std::array<ck::index_t, NDimSpatial>& output_spatial_lengths,
const std::array<ck::index_t, NDimSpatial + 3>& input_strides,
const std::array<ck::index_t, NDimSpatial + 3>& output_strides,
const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths, // input
const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths, // weight
const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,
const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_lengths, // output
const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_strides,
const std::array<ck::index_t, NDimSpatial>& conv_filter_strides,
const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations,
const std::array<ck::index_t, NDimSpatial>& input_left_pads,
......
@@ -17,6 +17,8 @@ template <index_t InOutRank,
typename InDataType,
typename OutDataType,
typename IndexDataType,
typename InLayout,
typename OutLayout,
ReduceTensorOp ReduceOpId,
bool OutputIndex>
struct DevicePoolFwd : public BaseOperator
@@ -25,13 +27,14 @@ struct DevicePoolFwd : public BaseOperator
MakeArgumentPointer(const void* p_in_dev,
void* p_out_dev,
void* p_out_indices_dev,
std::vector<ck::index_t> input_lengths,
std::vector<ck::index_t> window_lengths,
std::vector<ck::index_t> output_lengths,
std::vector<ck::index_t> input_stride,
std::vector<ck::index_t> output_stride,
std::vector<ck::index_t> indices_stride,
std::vector<ck::index_t> window_strides,
std::vector<ck::index_t> input_n_c_wis_lengths,
std::vector<ck::index_t> window_xs_lengths,
std::vector<ck::index_t> output_n_c_wos_lengths,
std::vector<ck::index_t> input_n_c_wis_stride,
std::vector<ck::index_t> output_n_c_wis_stride,
std::vector<ck::index_t> indices_n_c_wis_stride,
std::vector<ck::index_t> window_xs_strides,
std::vector<ck::index_t> window_xs_dilations,
std::vector<ck::index_t> input_left_pads,
std::vector<ck::index_t> input_right_pads,
std::vector<ck::index_t> pooling_dims) = 0;
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "pool_fwd_instance_common.hpp"
#pragma once
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
void add_device_pool2d_fwd_nhwc_f16_instances(
    std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F16, F16, I32, ReduceOpId, false>>>& instances)
{
    add_device_operation_instances(
        instances, device_pool2d_fwd_nhwc_instances<F16, F16, I32, F32, ReduceOpId, false>{});
}

enum struct GemmDlAlgorithm
{
    Default, // Uses DOT vector instructions
    Dpp8,    // Uses DOT vector instructions with DPP8 SEL modifier to reduce data loads from LDS
};
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
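For context, an algorithm tag like GemmDlAlgorithm is typically consumed through compile-time dispatch; a hypothetical sketch, not CK's actual instance-selection code:
template <GemmDlAlgorithm Algo>
void run_gemm_dl()
{
    if constexpr(Algo == GemmDlAlgorithm::Dpp8)
    {
        // select the DPP8 blockwise GEMM pipeline (lane groups share LDS loads)
    }
    else
    {
        // select the default DOT-based pipeline
    }
}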
@@ -588,14 +588,18 @@ struct DeviceBatchedContractionMultipleD_Xdl_CShuffle
LoopSched>;
// desc for blockwise copy
using AGridDesc_AK0_M_AK1 = remove_cvref_t<decltype(
GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(AGridDesc_M_K{}))>;
using BGridDesc_BK0_N_BK1 = remove_cvref_t<decltype(
GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(BGridDesc_N_K{}))>;
using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<decltype(
GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(DsGridDesc_M_N{}))>;
using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<decltype(
GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(EGridDesc_M_N{}))>;
using AGridDesc_AK0_M_AK1 =
remove_cvref_t<decltype(GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(
AGridDesc_M_K{}))>;
using BGridDesc_BK0_N_BK1 =
remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(
BGridDesc_N_K{}))>;
using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<
decltype(GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
DsGridDesc_M_N{}))>;
using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
remove_cvref_t<decltype(GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
EGridDesc_M_N{}))>;
// block-to-e-tile map
using Block2ETileMap =
......
@@ -378,13 +378,16 @@ struct DeviceBatchedGemmEPermuteXdl : public DeviceBatchedGemmEPermute<ALayout,
CDEBlockTransferScalarPerVector_NPerBlock,
LoopSched>;
using AGridDesc_AK0_M_AK1 = remove_cvref_t<decltype(
GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(AGridDesc_M_K{}))>;
using BGridDesc_BK0_N_BK1 = remove_cvref_t<decltype(
GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(BGridDesc_N_K{}))>;
using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = decltype(
GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(EGridDesc_M_N{}));
using AGridDesc_AK0_M_AK1 =
remove_cvref_t<decltype(GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(
AGridDesc_M_K{}))>;
using BGridDesc_BK0_N_BK1 =
remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(
BGridDesc_N_K{}))>;
using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
decltype(GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
EGridDesc_M_N{}));
using Block2ETileMap = typename GridwiseGemm::DefaultBlock2ETileMap;
// Argument
......
@@ -368,14 +368,18 @@ struct DeviceBatchedGemmMultiD_Xdl : public DeviceBatchedGemmMultiD<ALayout,
LoopSched>;
// desc for blockwise copy
using AGridDesc_AK0_M_AK1 = remove_cvref_t<decltype(
GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(AGridDesc_M_K{}))>;
using BGridDesc_BK0_N_BK1 = remove_cvref_t<decltype(
GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(BGridDesc_N_K{}))>;
using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<decltype(
GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(DsGridDesc_M_N{}))>;
using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<decltype(
GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(EGridDesc_M_N{}))>;
using AGridDesc_AK0_M_AK1 =
remove_cvref_t<decltype(GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(
AGridDesc_M_K{}))>;
using BGridDesc_BK0_N_BK1 =
remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(
BGridDesc_N_K{}))>;
using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<
decltype(GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
DsGridDesc_M_N{}))>;
using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
remove_cvref_t<decltype(GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
EGridDesc_M_N{}))>;
// block-to-e-tile map
using Block2ETileMap =
......
@@ -510,12 +510,15 @@ struct DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
CDE1ShuffleBlockTransferScalarPerVector_NPerBlock,
LoopSched>;
using A0GridDesc_AK0_M_AK1 = remove_cvref_t<decltype(
GridwiseGemm::MakeDefaultA0GridDescriptor_AK0_M_AK1(A0GridDesc_M_K{}))>;
using B0GridDesc_BK0_N_BK1 = remove_cvref_t<decltype(
GridwiseGemm::MakeDefaultB0GridDescriptor_BK0_N_BK1(B0GridDesc_N_K{}))>;
using B1GridDesc_BK0_N_BK1 = remove_cvref_t<decltype(
GridwiseGemm::MakeDefaultB1GridDescriptor_BK0_N_BK1(B1GridDesc_N_K{}))>;
using A0GridDesc_AK0_M_AK1 =
remove_cvref_t<decltype(GridwiseGemm::MakeDefaultA0GridDescriptor_AK0_M_AK1(
A0GridDesc_M_K{}))>;
using B0GridDesc_BK0_N_BK1 =
remove_cvref_t<decltype(GridwiseGemm::MakeDefaultB0GridDescriptor_BK0_N_BK1(
B0GridDesc_N_K{}))>;
using B1GridDesc_BK0_N_BK1 =
remove_cvref_t<decltype(GridwiseGemm::MakeDefaultB1GridDescriptor_BK0_N_BK1(
B1GridDesc_N_K{}))>;
// Argument
struct Argument : public BaseArgument
......
@@ -123,7 +123,8 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
ALayout,
BLayout,
CLayout,
ADataType, // TODO: distinguish A/B datatype
ADataType,
BDataType,
GemmAccDataType,
CShuffleDataType,
CDataType,
@@ -284,8 +285,11 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
{
const auto kernel =
kernel_gemm_xdl_cshuffle_v1<GridwiseGemm, ADataType, CDataType, true>;
const auto kernel = kernel_gemm_xdl_cshuffle_v1<GridwiseGemm,
ADataType,
BDataType,
CDataType,
true>;
ave_time += launch_and_time_kernel(stream_config,
kernel,
@@ -357,8 +361,11 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
}
else
{
const auto kernel =
kernel_gemm_xdl_cshuffle_v1<GridwiseGemm, ADataType, CDataType, false>;
const auto kernel = kernel_gemm_xdl_cshuffle_v1<GridwiseGemm,
ADataType,
BDataType,
CDataType,
false>;
ave_time += launch_and_time_kernel(stream_config,
kernel,
......
@@ -532,11 +532,12 @@ struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
float ave_time = 0;
const auto Run = [&](const auto& kernel) {
hipGetErrorString(hipMemset(
hipGetErrorString(hipMemsetAsync(
arg.p_c_grid_,
0,
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() *
sizeof(CDataType)));
sizeof(CDataType),
stream_config.stream_id_));
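// Note: issuing the zero-fill with hipMemsetAsync on stream_config.stream_id_ keeps it ordered
// before the kernel launched on that same stream below, without the implicit default-stream
// serialization that a plain hipMemset would incur.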
ave_time =
launch_and_time_kernel(stream_config,
......