Merge branch 'develop' into aosewski/ggemm_dl_instances

3f299c33 · Adam Osewski · GitHub · 507d793a · 091570f5 · 3f299c33
Unverified Commit 3f299c33 authored Mar 30, 2023 by Adam Osewski Committed by GitHub Mar 30, 2023
20 changed files
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp
@@ -76,4 +76,8 @@ using DeviceGroupedConvNDFwdInstance =
 #include "run_conv2d_fwd_perchannel_quantization_example.inc"
-int main() { run_conv2d_fwd_perchannel_quantization_example(); }
+int main()
+{
+    const auto out_element_op = OutElementOp{ActivationOp{}};
+    run_conv2d_fwd_perchannel_quantization_example(out_element_op);
+}
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp
@@ -71,4 +71,9 @@ using DeviceGroupedConvNDFwdInstance =
 #include "run_conv2d_fwd_perlayer_quantization_example.inc"
-int main() { run_conv2d_fwd_perlayer_quantization_example(); }
+int main()
+{
+    float requant_scale       = 0.5f;
+    const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}};
+    run_conv2d_fwd_perlayer_quantization_example(out_element_op);
+}
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp
@@ -80,6 +80,10 @@ using DeviceGroupedConvNDFwdInstance =
        S<1, 64, 1, 4>,
        8>;
-#include "run_conv2d_fwd_bias_relu_perchannel_quantization_example.inc"
+#include "run_conv2d_fwd_bias_perchannel_quantization_example.inc"
-int main() { run_conv2d_fwd_bias_relu_perchannel_quantization_example(); };
+int main()
+{
+    const auto out_element_op = OutElementOp{ActivationOp{}};
+    run_conv2d_fwd_bias_perchannel_quantization_example(out_element_op);
+};
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp
@@ -78,6 +78,11 @@ using DeviceGroupedConvNDFwdInstance =
        S<1, 64, 1, 4>,
        8>;
-#include "run_conv2d_fwd_bias_relu_perlayer_quantization_example.inc"
+#include "run_conv2d_fwd_bias_perlayer_quantization_example.inc"
-int main() { run_conv2d_fwd_bias_relu_perlayer_quantization_example(); }
+int main()
+{
+    float requant_scale       = 0.5f;
+    const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}};
+    run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op);
+}
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp
@@ -80,4 +80,8 @@ using DeviceGroupedConvNDFwdInstance =
 #include "run_conv2d_fwd_perchannel_quantization_example.inc"
-int main() { run_conv2d_fwd_perchannel_quantization_example(); }
+int main()
+{
+    const auto out_element_op = OutElementOp{ActivationOp{}};
+    run_conv2d_fwd_perchannel_quantization_example(out_element_op);
+}
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp
@@ -75,4 +75,9 @@ using DeviceGroupedConvNDFwdInstance =
 #include "run_conv2d_fwd_perlayer_quantization_example.inc"
-int main() { run_conv2d_fwd_perlayer_quantization_example(); }
+int main()
+{
+    float requant_scale       = 0.5f;
+    const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}};
+    run_conv2d_fwd_perlayer_quantization_example(out_element_op);
+}
--- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_relu_perchannel_quantization_example.inc
+++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_relu_perchannel_quantization_example.inc
@@ -167,7 +167,7 @@ bool run_grouped_conv_fwd(bool do_verification,
    return (pass ? 0 : 1);
 }
-int run_conv2d_fwd_bias_relu_perchannel_quantization_example()
+int run_conv2d_fwd_bias_perchannel_quantization_example(const OutElementOp& out_element_op)
 {
    bool do_verification           = true;
    bool time_kernel               = true;
@@ -189,7 +189,6 @@ int run_conv2d_fwd_bias_relu_perchannel_quantization_example()
    const auto in_element_op  = InElementOp{};
    const auto wei_element_op = WeiElementOp{};
-    const auto out_element_op = OutElementOp{ActivationOp{}};
    using InLayout           = ck::tensor_layout::convolution::GNHWC;
    using WeiLayout          = ck::tensor_layout::convolution::GKYXC;

--- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_relu_perlayer_quantization_example.inc
+++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_relu_perlayer_quantization_example.inc
@@ -155,7 +155,7 @@ bool run_grouped_conv_fwd(bool do_verification,
    return (pass ? 0 : 1);
 }
-int run_conv2d_fwd_bias_relu_perlayer_quantization_example()
+int run_conv2d_fwd_bias_perlayer_quantization_example(const OutElementOp& out_element_op)
 {
    bool do_verification           = true;
    bool time_kernel               = true;
@@ -177,7 +177,6 @@ int run_conv2d_fwd_bias_relu_perlayer_quantization_example()
    const auto in_element_op  = InElementOp{};
    const auto wei_element_op = WeiElementOp{};
-    const auto out_element_op = OutElementOp{0.5f, ActivationOp{}};
    using InLayout   = ck::tensor_layout::convolution::GNHWC;
    using WeiLayout  = ck::tensor_layout::convolution::GKYXC;

--- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc
+++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc
@@ -157,7 +157,7 @@ bool run_grouped_conv_fwd(bool do_verification,
    return (pass ? 0 : 1);
 }
-int run_conv2d_fwd_perchannel_quantization_example()
+int run_conv2d_fwd_perchannel_quantization_example(const OutElementOp& out_element_op)
 {
    bool do_verification           = true;
    bool time_kernel               = true;
@@ -179,7 +179,6 @@ int run_conv2d_fwd_perchannel_quantization_example()
    const auto in_element_op  = InElementOp{};
    const auto wei_element_op = WeiElementOp{};
-    const auto out_element_op = OutElementOp{ActivationOp{}};
    using InLayout           = ck::tensor_layout::convolution::GNHWC;
    using WeiLayout          = ck::tensor_layout::convolution::GKYXC;

--- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc
+++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc
@@ -139,7 +139,7 @@ bool run_grouped_conv_fwd(bool do_verification,
    return (pass ? 0 : 1);
 }
-int run_conv2d_fwd_perlayer_quantization_example()
+int run_conv2d_fwd_perlayer_quantization_example(const OutElementOp& out_element_op)
 {
    bool do_verification           = true;
    bool time_kernel               = false;
@@ -161,7 +161,6 @@ int run_conv2d_fwd_perlayer_quantization_example()
    const auto in_element_op  = InElementOp{};
    const auto wei_element_op = WeiElementOp{};
-    const auto out_element_op = OutElementOp{0.5f, ActivationOp{}};
    using InLayout  = ck::tensor_layout::convolution::GNHWC;
    using WeiLayout = ck::tensor_layout::convolution::GKYXC;

--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -36,7 +36,7 @@
 #elif defined(__gfx1030__) // for GPU code
 #define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
 #elif defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) // for GPU code
-#define CK_BUFFER_RESOURCE_3RD_DWORD 0x10020000
+#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31004000
 #endif
 // FMA instruction

--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once

--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once

--- a/include/ck/tensor_operation/gpu/element/quantization_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/quantization_operation.hpp
@@ -7,10 +7,30 @@ namespace ck {
 namespace tensor_operation {
 namespace element_wise {
+// Y = Sy * Qy
+// W = Sw * Qw
+// X = Sx * Qx
+// B = Sb * Qb = Sw * Sx * Qb
+// Where X, W, Y are float32, Qx, Qw, Qy are int8
+// Sx, Sw, Sy are the scales of x, w, y (float32), which are calculated from the quantization range
+// Qb is int32; the scale of B is chosen to be Sw * Sx for convenience
+// Y = W @ X, where @ is convolution or matrix multiplication
+// Sy * Qy = Sw * Qw @ Sx * Qx
+// Qy = [(Sw*Sx)/Sy] * Qw @ Qx
 // For Activation function which is piecewise linear function, such as relu, leaky relu ...etc
+// Activation(Sy * Qy) = Sy * Activation(Qy)
 template <typename Activation>
 struct Activation_Mul_Clamp
 {
+    // Convolution + Activation (piecewise linear function)
+    // If an activation is piecewise linear function, then Activation(Sy * Qy) = Sy * Activation(Qy)
+    // Z = Activation(Y) = Activation(W @ X)
+    // Sz * Qz = Activation(Sy * Qy)
+    // Qz = Sy / Sz * Activation(Qy) = (Sw * Sx / Sz) * Activation(Qw @ Qx)
+    // requantScale_ = Sw * Sx / Sz
    Activation_Mul_Clamp(float requantScale, Activation activationOp)
        : requantScale_(requantScale), activationOp_(activationOp)
    {
@@ -45,8 +65,39 @@ struct Activation_Mul_Clamp
    Activation activationOp_;
 };
+// For Activation function which is non piecewise linear function, such as TanH, Sigmoid ...etc
+// If an activation is not piecewise linear function
+// then Activation(Sy * Qy) != Sy * Activation(Qy)
+template <typename Activation>
+struct Mul_Activation_Mul_Clamp
+{
+    // Convolution + Activation (non piecewise linear function)
+    // Z = Activation(Y) = Activation(W @ X)
+    // Sz * Qz = Activation(Sy * Qy)
+    // Qz = S1 * Activation[Sacc * (Qw @ Qx)]
+    // Where S1 = 1 / Sz, Sacc = Sw * Sx
+    Mul_Activation_Mul_Clamp(float scale_z_inv, float scaleAcc, Activation activationOp)
+        : scale_z_inv_(scale_z_inv), scaleAcc_(scaleAcc), activationOp_(activationOp)
+    {
+    }
+    __host__ __device__ constexpr void operator()(int8_t& y, const int32_t& x) const
+    {
+        float y_fp32 = ck::type_convert<float>(x);
+        y_fp32       = scaleAcc_ * y_fp32;
+        activationOp_(y_fp32, y_fp32);
+        y_fp32 = math::clamp(scale_z_inv_ * y_fp32, -128.f, 127.f);
+        y      = ck::type_convert<int8_t>(y_fp32);
+    }
+    float scale_z_inv_;
+    float scaleAcc_;
+    Activation activationOp_;
+};
 // Conv Perchannel quantization + Activation function which is piecewise linear function, such as
 // relu, leaky relu ...etc
+// Activation(Sy * Qy) = Sy * Activation(Qy)
 template <typename Activation>
 struct Activation_Mul2_Clamp
 {
@@ -76,9 +127,20 @@ struct Activation_Mul2_Clamp
 };
 // For Activation function which is piecewise linear function, such as relu, leaky relu ...etc
+// Activation(Sy * Qy) = Sy * Activation(Qy)
 template <typename Activation>
 struct Add_Activation_Mul_Clamp
 {
+    // Convolution + bias
+    // Let Bias = B = Sw * Sx * Qb
+    // Where Qb is int32
+    // Y = W @ X + B
+    // Sy * Qy = Sw * Qw @ Sx * Qx + Sw * Sx * Qb
+    // Qy = [(Sw*Sx)/Sy] * (Qw @ Qx + Qb)
+    // For activation, Z = Activation(Y)
+    // Sz * Qz = Activation(Sy * Qy)
+    // Qz = Sy / Sz * Activation(Qy) = [(Sw*Sx)/Sz] * Activation(Qw @ Qx + Qb)
    Add_Activation_Mul_Clamp(float requantScale, Activation activationOp)
        : requantScale_(requantScale), activationOp_(activationOp)
    {
@@ -139,11 +201,18 @@ struct Add_Activation_Mul2_Clamp
 };
 // For Activation function which is non piecewise linear function, such as TanH, Sigmoid ...etc
+// If an activation is not piecewise linear function
+// then Activation(Sy * Qy) != Sy * Activation(Qy)
 template <typename Activation>
 struct Add_Mul_Activation_Mul_Clamp
 {
-    Add_Mul_Activation_Mul_Clamp(float requantScale1, float requantScale2, Activation activationOp)
+    // Convolution + Activation (non piecewise linear function)
-        : requantScale1_(requantScale1), requantScale2_(requantScale2), activationOp_(activationOp)
+    // Z = Activation(Y) = Activation(W @ X + B)
+    // Sz * Qz = Activation(Sy * Qy)
+    // Qz = S1 * Activation[Sacc * (Qw @ Qx + Qb)]
+    // Where S1 = 1 / Sz, Sacc = Sw * Sx
+    Add_Mul_Activation_Mul_Clamp(float scale_z_inv, float scaleAcc, Activation activationOp)
+        : scale_z_inv_(scale_z_inv), scaleAcc_(scaleAcc), activationOp_(activationOp)
    {
    }
@@ -151,14 +220,64 @@ struct Add_Mul_Activation_Mul_Clamp
    operator()(int8_t& y, const int32_t& x, const int32_t& bias) const
    {
        float y_fp32 = ck::type_convert<float>(x + bias);
-        y_fp32       = requantScale1_ * y_fp32;
+        y_fp32       = scaleAcc_ * y_fp32;
+        activationOp_(y_fp32, y_fp32);
+        y_fp32 = math::clamp(scale_z_inv_ * y_fp32, -128.f, 127.f);
+        y      = ck::type_convert<int8_t>(y_fp32);
+    }
+    __host__ __device__ constexpr void
+    operator()(int32_t& y, const int32_t& x, const int32_t& bias) const
+    {
+        // CAUTION - We might type_convert to int8 in threadwise copy
+        // eg. GridwiseGemmDlMultipleD_km_kn_mn
+        float y_fp32 = ck::type_convert<float>(x + bias);
+        y_fp32       = scaleAcc_ * y_fp32;
        activationOp_(y_fp32, y_fp32);
-        y_fp32 = math::clamp(requantScale2_ * y_fp32, -128.f, 127.f);
+        y_fp32 = math::clamp(scale_z_inv_ * y_fp32, -128.f, 127.f);
+        y      = ck::type_convert<int32_t>(y_fp32);
+    }
+    float scale_z_inv_;
+    float scaleAcc_;
+    Activation activationOp_;
+};
+// Conv Perchannel quantization + Activation function which is non piecewise linear function,
+// such as TanH, Sigmoid ...etc
+// If an activation is not piecewise linear function
+// then Activation(Sy * Qy) != Sy * Activation(Qy)
+template <typename Activation>
+struct Add_Mul2_Activation_Mul_Clamp
+{
+    Add_Mul2_Activation_Mul_Clamp(float scale_z_inv, Activation activationOp)
+        : scale_z_inv_(scale_z_inv), activationOp_(activationOp)
+    {
+    }
+    __host__ __device__ constexpr void
+    operator()(int8_t& y, const int32_t& x, const int32_t& bias, const float& scaleAcc) const
+    {
+        float y_fp32 = ck::type_convert<float>(x + bias);
+        y_fp32       = scaleAcc * y_fp32;
+        activationOp_(y_fp32, y_fp32);
+        y_fp32 = math::clamp(scale_z_inv_ * y_fp32, -128.f, 127.f);
        y      = ck::type_convert<int8_t>(y_fp32);
    }
-    float requantScale1_;
+    __host__ __device__ constexpr void
-    float requantScale2_;
+    operator()(int32_t& y, const int32_t& x, const int32_t& bias, const float& scaleAcc) const
+    {
+        // CAUTION - We might type_convert to int8 in threadwise copy
+        // eg. GridwiseGemmDlMultipleD_km_kn_mn
+        float y_fp32 = ck::type_convert<float>(x + bias);
+        y_fp32       = scaleAcc * y_fp32;
+        activationOp_(y_fp32, y_fp32);
+        y_fp32 = math::clamp(scale_z_inv_ * y_fp32, -128.f, 127.f);
+        y      = ck::type_convert<int32_t>(y_fp32);
+    }
+    float scale_z_inv_;
    Activation activationOp_;
 };

--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -320,6 +320,19 @@ struct Sigmoid
    int32_t divider_ = 1;
 };
+struct TanH
+{
+    template <typename T>
+    __host__ __device__ void operator()(T& y, const T& x) const
+    {
+        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+                          is_same<T, ck::half_t>::value,
+                      "Data type is not supported by this operation!");
+        y = ck::math::tanh(x);
+    };
+};
 } // namespace element_wise
 } // namespace tensor_operation
 } // namespace ck
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp
@@ -431,6 +431,9 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle
        constexpr auto b_block_desc_k0perblock_nperblock_k1 =
            GetBBlockDescriptor_K0PerBlock_NPerBlock_K1();
+        constexpr auto cshuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat =
+            GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat();
        constexpr auto max_lds_align = K1;
        constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
@@ -439,8 +442,13 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle
        constexpr auto b_block_space_size_aligned = math::integer_least_multiple(
            b_block_desc_k0perblock_nperblock_k1.GetElementSpaceSize(), max_lds_align);
-        return (a_block_space_size_aligned * sizeof(ADataType) +
+        constexpr auto c_block_space_size_aligned = math::integer_least_multiple(
-                b_block_space_size_aligned * sizeof(BDataType));
+            cshuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat.GetElementSpaceSize(),
+            max_lds_align);
+        return math::max((a_block_space_size_aligned * sizeof(ADataType) +
+                          b_block_space_size_aligned * sizeof(BDataType)),
+                         c_block_space_size_aligned * sizeof(CShuffleDataType));
    }
    // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
@@ -92,6 +92,17 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
    using GridwiseGemmPipe = remove_cvref_t<decltype(
        GridwiseGemmPipeline_Selector<PipelineVer, NumGemmKPrefetchStage, LoopSched>())>;
+    // denorm test fix, required to work around fp16 mfma issue
+    // we convert fp16->fp32->bf16 and execute bf16 mfma instruction
+    // when mfma is fixed, remove this section and update
+    // ABDataTypeAdjusted -> ABDataType throughout this file
+#if defined(__gfx90a__)
+    using ABDataTypeAdjusted =
+        conditional_t<is_same_v<ABDataType, ck::half_t>, ck::bhalf_t, ABDataType>;
+#else
+    using ABDataTypeAdjusted = ABDataType;
+#endif
    __host__ __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
    {
        // A matrix in LDS memory, dst of blockwise copy
@@ -397,7 +408,7 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
                                                ABlockTransferThreadClusterLengths_AK0_M_AK1,
                                                ABlockTransferThreadClusterArrangeOrder,
                                                ABDataType,
-                                                ABDataType,
+                                                ABDataTypeAdjusted,
                                                decltype(a_grid_desc_ak0_m_ak1),
                                                decltype(a_block_desc_ak0_m_ak1),
                                                ABlockTransferSrcAccessOrder,
@@ -428,7 +439,7 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
                                                BBlockTransferThreadClusterLengths_BK0_N_BK1,
                                                BBlockTransferThreadClusterArrangeOrder,
                                                ABDataType,
-                                                ABDataType,
+                                                ABDataTypeAdjusted,
                                                decltype(b_grid_desc_bk0_n_bk1),
                                                decltype(b_block_desc_bk0_n_bk1),
                                                BBlockTransferSrcAccessOrder,
@@ -458,11 +469,11 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
        // sanity check
        constexpr index_t KPack =
            math::max(math::lcm(AK1, BK1),
-                      MfmaSelector<ABDataType, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
+                      MfmaSelector<ABDataTypeAdjusted, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
        auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
            BlockSize,
-            ABDataType,
+            ABDataTypeAdjusted,
            AccDataType,
            decltype(a_block_desc_ak0_m_ak1),
            decltype(b_block_desc_bk0_n_bk1),
@@ -480,10 +491,11 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
            a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);
        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<ABDataType*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
+            static_cast<ABDataTypeAdjusted*>(p_shared),
+            a_block_desc_ak0_m_ak1.GetElementSpaceSize());
        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<ABDataType*>(p_shared) + a_block_space_size_aligned,
+            static_cast<ABDataTypeAdjusted*>(p_shared) + a_block_space_size_aligned,
            b_block_desc_bk0_n_bk1.GetElementSpaceSize());
        constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1, 0, 0);

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
@@ -166,15 +166,12 @@ __global__ void
                                      const CBlockClusterAdaptor c_block_cluster_adaptor)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
-    constexpr index_t shared_block_size =
+    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-        GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
-    __shared__ FloatAB p_shared_block[shared_block_size];
    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
                                                  p_b_grid,
                                                  p_c_grid,
-                                                  p_shared_block,
+                                                  p_shared,
                                                  a_b_k0_m_k1_grid_desc,
                                                  b_b_k0_n_k1_grid_desc,
                                                  c_grid_desc_mblock_mperblock_nblock_nperblock,
@@ -264,6 +261,16 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
    using GridwiseGemmPipe = remove_cvref_t<decltype(
        GridwiseGemmPipeline_Selector<PipelineVer, NumGemmKPrefetchStage>())>;
+    // denorm test fix, required to work around fp16 mfma issue
+    // we convert fp16->fp32->bf16 and execute bf16 mfma instruction
+    // when mfma is fixed, remove this section and update
+    // FloatABAdjusted -> FloatAB throughout this file
+#if defined(__gfx90a__)
+    using FloatABAdjusted = conditional_t<is_same_v<FloatAB, ck::half_t>, ck::bhalf_t, FloatAB>;
+#else
+    using FloatABAdjusted = FloatAB;
+#endif
    // M0/M1/M1Padding
    static constexpr auto M1PerBlock = Number<ABlockLdsM1PerBlock>{};
    static constexpr auto M0PerBlock = Number<ABlockLdsM0PerBlock>{};
@@ -605,7 +612,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
    __device__ static void Run(const FloatAB* __restrict__ p_a_grid,
                               const FloatAB* __restrict__ p_b_grid,
                               FloatC* __restrict__ p_c_grid,
-                               FloatAB* __restrict__ p_shared_block,
+                               void* __restrict__ p_shared,
                               const AGridDesc_B_K0_M_K1& a_b_k0_m_k1_grid_desc,
                               const BGridDesc_B_K0_N_K1& b_b_k0_n_k1_grid_desc,
                               const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock&
@@ -666,7 +673,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
                                                ABlockTransferThreadClusterLengths_K0_M_K1,
                                                ABlockTransferThreadClusterArrangeOrder,
                                                FloatAB,
-                                                FloatAB,
+                                                FloatABAdjusted,
                                                decltype(a_b_k0_m_k1_grid_desc),
                                                decltype(a_b_k0_m_k1_block_desc),
                                                ABlockTransferSrcAccessOrder,
@@ -696,7 +703,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
                                                BBlockTransferThreadClusterLengths_K0_N_K1,
                                                BBlockTransferThreadClusterArrangeOrder,
                                                FloatAB,
-                                                FloatAB,
+                                                FloatABAdjusted,
                                                decltype(b_b_k0_n_k1_grid_desc),
                                                decltype(b_b_k0_n_k1_block_desc),
                                                BBlockTransferSrcAccessOrder,
@@ -725,11 +732,11 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
        // sanity check
        constexpr index_t KPack =
-            math::max(K1, MfmaSelector<FloatAB, MPerXDL, NPerXDL>::selected_mfma.k_per_blk);
+            math::max(K1, MfmaSelector<FloatABAdjusted, MPerXDL, NPerXDL>::selected_mfma.k_per_blk);
        auto blockwise_gemm =
            BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
-                                                                FloatAB,
+                                                                FloatABAdjusted,
                                                                FloatAcc,
                                                                decltype(a_k0_m_k1_block_desc),
                                                                decltype(b_k0_n_k1_block_desc),
@@ -745,16 +752,15 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
        constexpr auto a_block_space_size =
            math::integer_least_multiple(a_k0_m_k1_block_desc.GetElementSpaceSize(), max_lds_align);
-        FloatAB* p_a_block = p_shared_block;
-        FloatAB* p_b_block = p_shared_block + a_block_space_size;
        constexpr auto a_block_slice_copy_step = make_multi_index(0, K0PerBlock, 0, 0);
        constexpr auto b_block_slice_copy_step = make_multi_index(0, K0PerBlock, 0, 0);
        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            p_a_block, a_k0_m_k1_block_desc.GetElementSpaceSize());
+            static_cast<FloatABAdjusted*>(p_shared), a_k0_m_k1_block_desc.GetElementSpaceSize());
        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            p_b_block, b_k0_n_k1_block_desc.GetElementSpaceSize());
+            static_cast<FloatABAdjusted*>(p_shared) + a_block_space_size,
+            b_k0_n_k1_block_desc.GetElementSpaceSize());
        // gridwise GEMM pipeline
        const index_t K0BlockMainLoop = __builtin_amdgcn_readfirstlane(K0 / K0PerBlock);
@@ -798,8 +804,6 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
            constexpr auto c_block_desc_mblock_mperblock_nblock_nperblock =
                GetCBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock();
-            void* p_shared = static_cast<void*>(p_shared_block);
            auto c_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
                static_cast<FloatC*>(p_shared),
                c_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());