add fp8 support

56863b9a · Jing Zhang · 54df59bf · d4c84256 · 56863b9a · 56863b9a
Commit 56863b9a authored Aug 16, 2023 by Jing Zhang
20 changed files
--- a/include/ck/tensor_operation/gpu/device/device_pool_fwd.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_pool_fwd.hpp
@@ -17,6 +17,8 @@ template <index_t InOutRank,
          typename InDataType,
          typename OutDataType,
          typename IndexDataType,
+          typename InLayout,
+          typename OutLayout,
          ReduceTensorOp ReduceOpId,
          bool OutputIndex>
 struct DevicePoolFwd : public BaseOperator
@@ -25,13 +27,14 @@ struct DevicePoolFwd : public BaseOperator
    MakeArgumentPointer(const void* p_in_dev,
                        void* p_out_dev,
                        void* p_out_indices_dev,
-                        std::vector<ck::index_t> input_lengths,
-                        std::vector<ck::index_t> window_lengths,
-                        std::vector<ck::index_t> output_lengths,
-                        std::vector<ck::index_t> input_stride,
-                        std::vector<ck::index_t> output_stride,
-                        std::vector<ck::index_t> indices_stride,
-                        std::vector<ck::index_t> window_strides,
+                        std::vector<ck::index_t> input_n_c_wis_lengths,
+                        std::vector<ck::index_t> window_xs_lengths,
+                        std::vector<ck::index_t> output_n_c_wos_lengths,
+                        std::vector<ck::index_t> input_n_c_wis_stride,
+                        std::vector<ck::index_t> output_n_c_wis_stride,
+                        std::vector<ck::index_t> indices_n_c_wis_stride,
+                        std::vector<ck::index_t> window_xs_strides,
+                        std::vector<ck::index_t> window_xs_dilations,
                        std::vector<ck::index_t> input_left_pads,
                        std::vector<ck::index_t> input_right_pads,
                        std::vector<ck::index_t> pooling_dims) = 0;

--- a/include/ck/tensor_operation/gpu/device/device_put_element.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_put_element.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f16_instance.cpp
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

-#include "pool_fwd_instance_common.hpp"
+#pragma once

 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace instance {

-static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
-
-void add_device_pool2d_fwd_nhwc_f16_instances(
-    std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F16, F16, I32, ReduceOpId, false>>>& instances)
+enum struct GemmDlAlgorithm // compile-time selector passed as the GemmDlAlg template argument of the DL GEMM device op
 {
-    add_device_operation_instances(
-        instances, device_pool2d_fwd_nhwc_instances<F16, F16, I32, F32, ReduceOpId, false>{});
-}
+    Default, // Uses DOT vector instructions
+    Dpp8,    // Uses DOT vector instructions with DPP8 SEL modifier to reduce data loads from LDS
+};

-} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
--- a/include/ck/tensor_operation/gpu/device/impl/device_avgpool3d_bwd_ndhwc_ndhwc.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_avgpool3d_bwd_ndhwc_ndhwc.hpp
--- a/include/ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp
@@ -123,7 +123,8 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
        ALayout,
        BLayout,
        CLayout,
-        ADataType, // TODO: distinguish A/B datatype
+        ADataType,
+        BDataType,
        GemmAccDataType,
        CShuffleDataType,
        CDataType,
@@ -284,8 +285,11 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle

            if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
            {
-                const auto kernel =
-                    kernel_gemm_xdl_cshuffle_v1<GridwiseGemm, ADataType, CDataType, true>;
+                const auto kernel = kernel_gemm_xdl_cshuffle_v1<GridwiseGemm,
+                                                                ADataType,
+                                                                BDataType,
+                                                                CDataType,
+                                                                true>;

                ave_time += launch_and_time_kernel(stream_config,
                                                   kernel,
@@ -357,8 +361,11 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
            }
            else
            {
-                const auto kernel =
-                    kernel_gemm_xdl_cshuffle_v1<GridwiseGemm, ADataType, CDataType, false>;
+                const auto kernel = kernel_gemm_xdl_cshuffle_v1<GridwiseGemm,
+                                                                ADataType,
+                                                                BDataType,
+                                                                CDataType,
+                                                                false>;

                ave_time += launch_and_time_kernel(stream_config,
                                                   kernel,

--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp
@@ -11,6 +11,7 @@
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_gemm.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_dl_algorithm.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_dl_v1r3.hpp"
 #include "ck/host_utility/device_prop.hpp"
@@ -59,6 +60,7 @@ template <
    typename CThreadTransferSrcDstAccessOrder,
    index_t CThreadTransferSrcDstVectorDim,
    index_t CThreadTransferDstScalarPerVector,
+    GemmDlAlgorithm GemmDlAlg = GemmDlAlgorithm::Default,
    enable_if_t<
        is_same_v<AElementwiseOperation, ck::tensor_operation::element_wise::PassThrough> &&
            is_same_v<BElementwiseOperation, ck::tensor_operation::element_wise::PassThrough> &&
@@ -236,7 +238,8 @@ struct DeviceGemmDl : public DeviceGemm<ALayout,
                                     BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1,
                                     CThreadTransferSrcDstAccessOrder,
                                     CThreadTransferSrcDstVectorDim,
-                                     CThreadTransferDstScalarPerVector>;
+                                     CThreadTransferDstScalarPerVector,
+                                     GemmDlAlg>;

    using AGridDesc_K0_M0_M1_K1 =
        decltype(GridwiseGemm::MakeAGridDescriptor_K0_M0_M1_K1(AGridDesc_K0_M_K1{}));
@@ -372,7 +375,8 @@ struct DeviceGemmDl : public DeviceGemm<ALayout,
                                        remove_reference_t<CGridDesc_M0_M10_M11_N0_N10_N11>,
                                        remove_reference_t<DefaultBlock2CTileMap>,
                                        true,
-                                        true>;
+                                        true,
+                                        GemmDlAlg>;

                ave_time = launch_and_time_kernel(stream_config,
                                                  kernel,
@@ -398,7 +402,8 @@ struct DeviceGemmDl : public DeviceGemm<ALayout,
                                        remove_reference_t<CGridDesc_M0_M10_M11_N0_N10_N11>,
                                        remove_reference_t<DefaultBlock2CTileMap>,
                                        true,
-                                        false>;
+                                        false,
+                                        GemmDlAlg>;

                ave_time = launch_and_time_kernel(stream_config,
                                                  kernel,
@@ -424,7 +429,8 @@ struct DeviceGemmDl : public DeviceGemm<ALayout,
                                        remove_reference_t<CGridDesc_M0_M10_M11_N0_N10_N11>,
                                        remove_reference_t<DefaultBlock2CTileMap>,
                                        false,
-                                        true>;
+                                        true,
+                                        GemmDlAlg>;

                ave_time = launch_and_time_kernel(stream_config,
                                                  kernel,
@@ -450,7 +456,8 @@ struct DeviceGemmDl : public DeviceGemm<ALayout,
                                        remove_reference_t<CGridDesc_M0_M10_M11_N0_N10_N11>,
                                        remove_reference_t<DefaultBlock2CTileMap>,
                                        false,
-                                        false>;
+                                        false,
+                                        GemmDlAlg>;

                ave_time = launch_and_time_kernel(stream_config,
                                                  kernel,
@@ -485,6 +492,16 @@ struct DeviceGemmDl : public DeviceGemm<ALayout,

    static bool IsSupportedArgument(const Argument& arg)
    {
+        if constexpr(GemmDlAlg == GemmDlAlgorithm::Dpp8)
+        {
+            if(ck::get_device_name() == "gfx1030")
+            {
+                return GridwiseGemm::CheckValidity(
+                    arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, arg.c_grid_desc_m_n_);
+            }
+            return false;
+        }
+
        if(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx1030" ||
           ck::get_device_name() == "gfx1100" || ck::get_device_name() == "gfx1101" ||
           ck::get_device_name() == "gfx1102")
@@ -492,11 +509,8 @@ struct DeviceGemmDl : public DeviceGemm<ALayout,
            return GridwiseGemm::CheckValidity(
                arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, arg.c_grid_desc_m_n_);
        }
-        else
-        {
        return false;
    }
-    }

    // polymorphic
    bool IsSupportedArgument(const BaseArgument* p_arg) override
@@ -572,7 +586,7 @@ struct DeviceGemmDl : public DeviceGemm<ALayout,
    }

    // polymorphic
-    std::string GetTypeString() const override
+    virtual std::string GetTypeString() const override
    {
        auto str = std::stringstream();


--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl_dpp8.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl_dpp8.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_dl_algorithm.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_dl_v1r3.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+/// DeviceGemmDl with GemmDlAlg pinned to GemmDlAlgorithm::Dpp8; every other template argument is forwarded to the base unchanged.
+template <
+    typename ADataType,
+    typename BDataType,
+    typename CDataType,
+    typename AccDataType,
+    typename ALayout,
+    typename BLayout,
+    typename CLayout,
+    typename AElementwiseOperation,
+    typename BElementwiseOperation,
+    typename CElementwiseOperation,
+    GemmSpecialization GemmSpec,
+    index_t BlockSize,
+    index_t MPerBlock,
+    index_t NPerBlock,
+    index_t K0PerBlock,
+    index_t K1,
+    index_t M1PerThread,
+    index_t N1PerThread,
+    index_t KPerThread,
+    typename M1N1ThreadClusterM1Xs,
+    typename M1N1ThreadClusterN1Xs,
+    typename ABlockTransferThreadSliceLengths_K0_M0_M1_K1,
+    typename ABlockTransferThreadClusterLengths_K0_M0_M1_K1,
+    typename ABlockTransferThreadClusterArrangeOrder,
+    typename ABlockTransferSrcAccessOrder,
+    typename ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1,
+    typename ABlockTransferSrcVectorTensorContiguousDimOrder,
+    typename ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1,
+    typename BBlockTransferThreadSliceLengths_K0_N0_N1_K1,
+    typename BBlockTransferThreadClusterLengths_K0_N0_N1_K1,
+    typename BBlockTransferThreadClusterArrangeOrder,
+    typename BBlockTransferSrcAccessOrder,
+    typename BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1,
+    typename BBlockTransferSrcVectorTensorContiguousDimOrder,
+    typename BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1,
+    typename CThreadTransferSrcDstAccessOrder,
+    index_t CThreadTransferSrcDstVectorDim,
+    index_t CThreadTransferDstScalarPerVector,
+    enable_if_t< // SFINAE guard: only instantiable when the A, B and C element-wise ops are all PassThrough
+        is_same_v<AElementwiseOperation, ck::tensor_operation::element_wise::PassThrough> &&
+            is_same_v<BElementwiseOperation, ck::tensor_operation::element_wise::PassThrough> &&
+            is_same_v<CElementwiseOperation, ck::tensor_operation::element_wise::PassThrough>,
+        bool> = false>
+struct DeviceGemmDlDpp8 : public DeviceGemmDl<ADataType,
+                                              BDataType,
+                                              CDataType,
+                                              AccDataType,
+                                              ALayout,
+                                              BLayout,
+                                              CLayout,
+                                              AElementwiseOperation,
+                                              BElementwiseOperation,
+                                              CElementwiseOperation,
+                                              GemmSpec,
+                                              BlockSize,
+                                              MPerBlock,
+                                              NPerBlock,
+                                              K0PerBlock,
+                                              K1,
+                                              M1PerThread,
+                                              N1PerThread,
+                                              KPerThread,
+                                              M1N1ThreadClusterM1Xs,
+                                              M1N1ThreadClusterN1Xs,
+                                              ABlockTransferThreadSliceLengths_K0_M0_M1_K1,
+                                              ABlockTransferThreadClusterLengths_K0_M0_M1_K1,
+                                              ABlockTransferThreadClusterArrangeOrder,
+                                              ABlockTransferSrcAccessOrder,
+                                              ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1,
+                                              ABlockTransferSrcVectorTensorContiguousDimOrder,
+                                              ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1,
+                                              BBlockTransferThreadSliceLengths_K0_N0_N1_K1,
+                                              BBlockTransferThreadClusterLengths_K0_N0_N1_K1,
+                                              BBlockTransferThreadClusterArrangeOrder,
+                                              BBlockTransferSrcAccessOrder,
+                                              BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1,
+                                              BBlockTransferSrcVectorTensorContiguousDimOrder,
+                                              BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1,
+                                              CThreadTransferSrcDstAccessOrder,
+                                              CThreadTransferSrcDstVectorDim,
+                                              CThreadTransferDstScalarPerVector,
+                                              GemmDlAlgorithm::Dpp8> // overrides the base's GemmDlAlgorithm::Default
+
+{
+    std::string GetTypeString() const override // reports "DeviceGemmDlDpp8<BlockSize, MPerBlock, ..., KPerThread>"
+    {
+        auto str = std::stringstream();
+
+        // clang-format off
+        str << "DeviceGemmDlDpp8"
+            << "<"
+            << BlockSize << ", "
+            << MPerBlock << ", "
+            << NPerBlock << ", "
+            << K0PerBlock << ", "
+            << K1 << ", "
+            << M1PerThread << ", "
+            << N1PerThread << ", "
+            << KPerThread
+            << ">";
+        // clang-format on
+
+        return str.str();
+    }
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp
@@ -65,7 +65,8 @@ template <typename ALayout,
          typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
          index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
          LoopScheduler LoopSched     = make_default_loop_scheduler(),
-          PipelineVersion PipelineVer = PipelineVersion::v1>
+          PipelineVersion PipelineVer = PipelineVersion::v1,
+          typename ComputeType        = CDataType>
 struct DeviceGemm_Xdl_CShuffle : public DeviceGemm<ALayout,
                                                   BLayout,
                                                   CLayout,
@@ -87,7 +88,8 @@ struct DeviceGemm_Xdl_CShuffle : public DeviceGemm<ALayout,
        ALayout,
        BLayout,
        CLayout,
-        ADataType, // TODO: distinguish A/B datatype
+        ADataType,
+        BDataType,
        GemmAccDataType,
        CShuffleDataType,
        CDataType,
@@ -128,7 +130,8 @@ struct DeviceGemm_Xdl_CShuffle : public DeviceGemm<ALayout,
        CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
        CShuffleBlockTransferScalarPerVector_NPerBlock,
        LoopSched,
-        PipelineVer>;
+        PipelineVer,
+        ComputeType>;

    using Argument = typename GridwiseGemm::Argument;


--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_dl.hpp
@@ -784,15 +784,12 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Dl
        Argument(const InDataType* p_in_grid,
                 WeiDataType* p_wei_grid,
                 const OutDataType* p_out_grid,
-                 const ck::index_t G,
-                 const ck::index_t N,
-                 const ck::index_t K,
-                 const ck::index_t C,
-                 const std::array<ck::index_t, NDimSpatial>& input_spatial_lengths,
-                 const std::array<ck::index_t, NDimSpatial>& filter_spatial_lengths,
-                 const std::array<ck::index_t, NDimSpatial>& output_spatial_lengths,
-                 const std::array<ck::index_t, NDimSpatial + 3>& /*input_strides*/,
-                 const std::array<ck::index_t, NDimSpatial + 3>& /*output_strides*/,
+                 const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths, // input
+                 const std::array<index_t, NDimSpatial + 3>& /*a_g_n_c_wis_strides*/,
+                 const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths, // weight
+                 const std::array<index_t, NDimSpatial + 3>& /*b_g_k_c_xs_strides*/,
+                 const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_lengths, // output
+                 const std::array<index_t, NDimSpatial + 3>& /*e_g_n_k_wos_strides*/,
                 const std::array<ck::index_t, NDimSpatial>& conv_filter_strides,
                 const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations,
                 const std::array<ck::index_t, NDimSpatial>& input_left_pads,
@@ -812,27 +809,38 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Dl
              a_element_op_{out_element_op},
              b_element_op_{wei_element_op},
              c_element_op_{in_element_op},
-              Conv_G_{G},
-              Conv_N_{N},
-              Conv_K_{K},
-              Conv_C_{C},
-              input_spatial_lengths_{input_spatial_lengths},
-              filter_spatial_lengths_{filter_spatial_lengths},
-              output_spatial_lengths_{output_spatial_lengths},
+              Conv_G_{a_g_n_c_wis_lengths[0]},
+              Conv_N_{a_g_n_c_wis_lengths[1]},
+              Conv_K_{b_g_k_c_xs_lengths[1]},
+              Conv_C_{a_g_n_c_wis_lengths[2]},
+              input_spatial_lengths_{},
+              filter_spatial_lengths_{},
+              output_spatial_lengths_{},
              conv_filter_strides_{conv_filter_strides},
              conv_filter_dilations_{conv_filter_dilations},
              input_left_pads_{input_left_pads},
              input_right_pads_{input_right_pads},
              k_batch_{split_k}
        {
+            constexpr index_t spatial_offset = 3;
+            std::copy(begin(a_g_n_c_wis_lengths) + spatial_offset,
+                      end(a_g_n_c_wis_lengths),
+                      begin(input_spatial_lengths_));
+            std::copy(begin(b_g_k_c_xs_lengths) + spatial_offset,
+                      end(b_g_k_c_xs_lengths),
+                      begin(filter_spatial_lengths_));
+            std::copy(begin(e_g_n_k_wos_lengths) + spatial_offset,
+                      end(e_g_n_k_wos_lengths),
+                      begin(output_spatial_lengths_));
+
            const auto descs =
                DeviceOp::MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N<NDimSpatial>(
-                    N,
-                    K,
-                    C,
-                    input_spatial_lengths,
-                    filter_spatial_lengths,
-                    output_spatial_lengths,
+                    Conv_N_,
+                    Conv_K_,
+                    Conv_C_,
+                    input_spatial_lengths_,
+                    filter_spatial_lengths_,
+                    output_spatial_lengths_,
                    conv_filter_strides,
                    conv_filter_dilations,
                    input_left_pads,
@@ -856,21 +864,21 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Dl

            // A/B/C Batch Stride
            compute_ptr_offset_of_batch_.BatchStrideA_ =
-                N * K *
-                std::accumulate(begin(output_spatial_lengths),
-                                end(output_spatial_lengths),
+                Conv_N_ * Conv_K_ *
+                std::accumulate(begin(output_spatial_lengths_),
+                                end(output_spatial_lengths_),
                                index_t{1},
                                std::multiplies<>{});
            compute_ptr_offset_of_batch_.BatchStrideB_ =
-                N * C *
-                std::accumulate(begin(input_spatial_lengths),
-                                end(input_spatial_lengths),
+                Conv_N_ * Conv_C_ *
+                std::accumulate(begin(input_spatial_lengths_),
+                                end(input_spatial_lengths_),
                                index_t{1},
                                std::multiplies<>{});
            compute_ptr_offset_of_batch_.BatchStrideC_ =
-                K * C *
-                std::accumulate(begin(filter_spatial_lengths),
-                                end(filter_spatial_lengths),
+                Conv_K_ * Conv_C_ *
+                std::accumulate(begin(filter_spatial_lengths_),
+                                end(filter_spatial_lengths_),
                                index_t{1},
                                std::multiplies<>{});
        }
@@ -904,9 +912,9 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Dl
        const index_t Conv_K_;
        const index_t Conv_C_;

-        const std::array<ck::index_t, NDimSpatial>& input_spatial_lengths_;
-        const std::array<ck::index_t, NDimSpatial>& filter_spatial_lengths_;
-        const std::array<ck::index_t, NDimSpatial>& output_spatial_lengths_;
+        std::array<ck::index_t, NDimSpatial> input_spatial_lengths_;
+        std::array<ck::index_t, NDimSpatial> filter_spatial_lengths_;
+        std::array<ck::index_t, NDimSpatial> output_spatial_lengths_;
        const std::array<ck::index_t, NDimSpatial>& conv_filter_strides_;
        const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations_;
        const std::array<ck::index_t, NDimSpatial>& input_left_pads_;
@@ -1110,18 +1118,16 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Dl
        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
    }

-    static auto MakeArgument(const InDataType* p_in_grid,
+    static auto
+    MakeArgument(const InDataType* p_in_grid,
                 WeiDataType* p_wei_grid,
                 const OutDataType* p_out_grid,
-                             const ck::index_t G,
-                             const ck::index_t N,
-                             const ck::index_t K,
-                             const ck::index_t C,
-                             const std::array<ck::index_t, NDimSpatial>& input_spatial_lengths,
-                             const std::array<ck::index_t, NDimSpatial>& filter_spatial_lengths,
-                             const std::array<ck::index_t, NDimSpatial>& output_spatial_lengths,
-                             const std::array<ck::index_t, NDimSpatial + 3>& input_strides,
-                             const std::array<ck::index_t, NDimSpatial + 3>& output_strides,
+                 const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths, // input
+                 const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
+                 const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths, // weight
+                 const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,
+                 const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_lengths, // output
+                 const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_strides,
                 const std::array<ck::index_t, NDimSpatial>& conv_filter_strides,
                 const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations,
                 const std::array<ck::index_t, NDimSpatial>& input_left_pads,
@@ -1134,15 +1140,12 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Dl
        return Argument{p_in_grid,
                        p_wei_grid,
                        p_out_grid,
-                        G,
-                        N,
-                        K,
-                        C,
-                        input_spatial_lengths,
-                        filter_spatial_lengths,
-                        output_spatial_lengths,
-                        input_strides,
-                        output_strides,
+                        a_g_n_c_wis_lengths, // input
+                        a_g_n_c_wis_strides,
+                        b_g_k_c_xs_lengths, // weight
+                        b_g_k_c_xs_strides,
+                        e_g_n_k_wos_lengths, // output
+                        e_g_n_k_wos_strides,
                        conv_filter_strides,
                        conv_filter_dilations,
                        input_left_pads,
@@ -1159,15 +1162,12 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Dl
    MakeArgumentPointer(const void* p_in_grid,
                        void* p_wei_grid,
                        const void* p_out_grid,
-                        const ck::index_t G,
-                        const ck::index_t N,
-                        const ck::index_t K,
-                        const ck::index_t C,
-                        const std::array<ck::index_t, NDimSpatial>& input_spatial_lengths,
-                        const std::array<ck::index_t, NDimSpatial>& filter_spatial_lengths,
-                        const std::array<ck::index_t, NDimSpatial>& output_spatial_lengths,
-                        const std::array<ck::index_t, NDimSpatial + 3>& input_strides,
-                        const std::array<ck::index_t, NDimSpatial + 3>& output_strides,
+                        const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths, // input
+                        const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
+                        const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths, // weight
+                        const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,
+                        const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_lengths, // output
+                        const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_strides,
                        const std::array<ck::index_t, NDimSpatial>& conv_filter_strides,
                        const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations,
                        const std::array<ck::index_t, NDimSpatial>& input_left_pads,
@@ -1180,15 +1180,12 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Dl
        return std::make_unique<Argument>(static_cast<const InDataType*>(p_in_grid),
                                          static_cast<WeiDataType*>(p_wei_grid),
                                          static_cast<const OutDataType*>(p_out_grid),
-                                          G,
-                                          N,
-                                          K,
-                                          C,
-                                          input_spatial_lengths,
-                                          filter_spatial_lengths,
-                                          output_spatial_lengths,
-                                          input_strides,
-                                          output_strides,
+                                          a_g_n_c_wis_lengths, // input
+                                          a_g_n_c_wis_strides,
+                                          b_g_k_c_xs_lengths, // weight
+                                          b_g_k_c_xs_strides,
+                                          e_g_n_k_wos_lengths, // output
+                                          e_g_n_k_wos_strides,
                                          conv_filter_strides,
                                          conv_filter_dilations,
                                          input_left_pads,

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_fixed_nk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_fixed_nk.hpp
@@ -214,13 +214,17 @@ struct DeviceGroupedGemm_Xdl_Fixed_NK : public DeviceGroupedGemmFixedNK<ALayout,
    static constexpr auto I1 = Number<1>{};
    static constexpr auto I2 = Number<2>{};

+    using ComputeType = EDataType;
+
    // GridwiseGemm
    using GridwiseGemm = GridwiseGemmMultipleD_xdl_splitk_cshuffle<
        ADataType, // TODO: distinguish A/B datatype
+        BDataType,
        AccDataType,
        CShuffleDataType,
        DsDataType,
        EDataType,
+        ComputeType,
        AElementwiseOperation,
        BElementwiseOperation,
        CDEElementwiseOperation,

--- a/include/ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp
--- a/include/ck/tensor_operation/gpu/device/impl/device_pool3d_fwd_ndhwc_ndhwc.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_pool3d_fwd_ndhwc_ndhwc.hpp
--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -75,6 +75,12 @@ struct PassThrough
        y = x;
    }

+    template <> // explicit specialization: half_t output from an int8_t input
+    __host__ __device__ void operator()<half_t, int8_t>(half_t& y, const int8_t& x) const
+    {
+        y = type_convert<half_t>(x); // value is routed through type_convert rather than assigned directly
+    }
+
    template <>
    __host__ __device__ void operator()<int8_t, int32_t>(int8_t& y, const int32_t& x) const
    {

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_v1r3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_v1r3.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_splitk_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_splitk_cshuffle.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_put_element_1d.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_put_element_1d.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_1st.hpp
+++ b/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_1st.hpp
@@ -78,17 +78,18 @@ struct GridwiseNormalizationSplitK1st
    static constexpr auto ThreadBufferNumber = Number<KThreadSliceSize / XSrcVectorSize>{};

    __device__ static int
-    GetKPerThread(int kRaw, int kGridSize, int block_k_cluster_id, int thread_k_cluster_id)
+    GetKPerThread(int k, int kRaw, int kGridSize, int block_k_cluster_id, int thread_k_cluster_id)
    {
        bool is_rightmost_block = block_k_cluster_id == kGridSize - 1;

        if(is_rightmost_block)
        {
-            int left_kPerBlock = math::integer_divide_ceil(kRaw, kGridSize);
-            int kPerBlock      = kRaw % kGridSize == 0 ? left_kPerBlock : kRaw % left_kPerBlock;
-            int kPerThread =
-                kPerBlock < K_BlockTileSize ? 0 : KThreadSliceSize * (kPerBlock / K_BlockTileSize);
-            int kPerBlockTail = kPerBlock - kPerThread * KThreadClusterSize;
+            int left_kPerBlock  = math::integer_divide_ceil(k, kGridSize);
+            int kRightmostBlock = kRaw - left_kPerBlock * (kGridSize - 1);
+            int kPerThread      = kRightmostBlock < K_BlockTileSize
+                                 ? 0
+                                 : KThreadSliceSize * (kRightmostBlock / K_BlockTileSize);
+            int kPerBlockTail = kRightmostBlock - kPerThread * KThreadClusterSize;

            if(kPerBlockTail > 0)
            {
@@ -105,7 +106,7 @@ struct GridwiseNormalizationSplitK1st
        }
        else
        {
-            int kPerBlock = math::integer_divide_ceil(kRaw, kGridSize);
+            int kPerBlock = math::integer_divide_ceil(k, kGridSize);
            return KThreadSliceSize * (kPerBlock / K_BlockTileSize);
        }
    }
@@ -195,8 +196,11 @@ struct GridwiseNormalizationSplitK1st

        auto threadwise_welford       = ThreadwiseWelford();
        int kRaw                      = x_grid_desc_m_k.GetTransforms()[I2].GetUpperLengths()[I0];
-        threadwise_welford.max_count_ =
-            GetKPerThread(kRaw, k_grid_size, block_k_cluster_id, thread_k_cluster_id);
+        threadwise_welford.max_count_ = GetKPerThread(x_grid_desc_m_k.GetLength(I1),
+                                                      kRaw,
+                                                      k_grid_size,
+                                                      block_k_cluster_id,
+                                                      thread_k_cluster_id);

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            mean_thread_buf(I) = type_convert<ComputeDataType>(0.0f);

--- a/include/ck/tensor_operation/gpu/thread/threadwise_contraction_dl_dpp8.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_contraction_dl_dpp8.hpp