Commit e6bb1dd7 authored by Po Yen Chen, committed by GitHub

Merge branch 'develop' into feature/check-window-lengths

parents 9d6a3704 ab250afd
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
@@ -14,11 +14,93 @@
namespace ck {
namespace tensor_operation {
// function usable on device; emulates std::accumulate with multiplication
template <typename T, typename ForwardIterator, typename Size>
__host__ __device__ auto mult_accumulate_n(ForwardIterator first, Size count, T init)
{
for(ForwardIterator x = first; x != first + count; x++)
{
init *= *x;
}
return init;
}
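// Illustrative usage (assumed shapes): with lengths = {G, N, C, Hi, Wi} and
// NDimSpatial = 2, mult_accumulate_n<index_t>(lengths.begin() + 3, 2, 1)
// returns Hi * Wi.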
template <index_t NDimSpatial,
device::ConvolutionForwardSpecialization ConvForwardSpecialization,
index_t NumGroupsToMerge = 1>
struct TransformConvFwdToGemm
{
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
static constexpr auto I4 = Number<4>{};
static constexpr auto I5 = Number<5>{};
static long_index_t
calculate_element_space_size_impl(const std::array<index_t, NDimSpatial + 3>& lengths,
const std::array<index_t, NDimSpatial + 3>& strides,
index_t i)
{
long_index_t acc = 1;
for(; i < (NDimSpatial + 3); i++)
{
acc +=
static_cast<long_index_t>(lengths[i] - I1) * static_cast<long_index_t>(strides[i]);
}
return acc;
}
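// The loop above computes 1 + sum_i (lengths[i] - 1) * strides[i] over the
// trailing dimensions, i.e. the offset of the last addressable element plus
// one: the minimum element space the tensor view can occupy.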
template <typename ADataType, typename CDataType>
static index_t GetSplitedNSize(const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths,
const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_strides)
{
const long_index_t a_element_space_size =
calculate_element_space_size_impl(a_g_n_c_wis_lengths, a_g_n_c_wis_strides, I1);
const long_index_t c_element_space_size =
calculate_element_space_size_impl(c_g_n_k_wos_lengths, c_g_n_k_wos_strides, I1);
const long_index_t element_space_size = math::max(a_element_space_size * sizeof(ADataType),
c_element_space_size * sizeof(CDataType));
constexpr long_index_t TwoGB = (long_index_t{1} << 31);
const index_t N = a_g_n_c_wis_lengths[I1];
if(element_space_size > TwoGB)
{
// Minimum divisor of N to not exceed 2GB
const auto divisor = math::integer_divide_ceil(element_space_size, TwoGB);
if(divisor <= static_cast<double>(N))
{
// Find the least divisor of N that is >= element_space_size / TwoGB.
// Iterate candidates only up to sqrt(N); if none of them divides N,
// fall back to processing one convolution N per block (below).
for(index_t least_divisor = divisor; least_divisor * least_divisor <= N;
least_divisor++)
{
if(N % least_divisor == 0)
{
return N / least_divisor;
}
}
// Not found, process one Convolution N per block
return 1;
}
else
{
// Not supportable even after splitting N: the tensor is too large.
return N;
}
}
else
{
// Split N is not needed.
return N;
}
}
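// Illustrative example (assumed sizes): for N = 64 and a 3 GB A/C footprint,
// divisor = ceil(3 GB / 2 GB) = 2; the least divisor of 64 that is >= 2 is 2,
// so the function returns N / 2 = 32 and each split stays under 2 GB.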
// TODO: implement ck::tensor_layout::convolution that describes packed/strided dimensions as
// properties
@@ -38,7 +120,1076 @@ struct TransformConvFwdToGemm
const std::array<index_t, NDimSpatial>& conv_filter_strides,
const std::array<index_t, NDimSpatial>& conv_filter_dilations,
const std::array<index_t, NDimSpatial>& input_left_pads,
const std::array<index_t, NDimSpatial>& input_right_pads,
const index_t N)
{
const index_t C = a_g_n_c_wis_lengths[I2];
const index_t Wi = a_g_n_c_wis_lengths[I3];
const index_t Wo = c_g_n_k_wos_lengths[I3];
const index_t ConvStrideW = conv_filter_strides[I0];
const index_t GStride = a_g_n_c_wis_strides[I0];
const index_t NStride = a_g_n_c_wis_strides[I1];
const auto CStride = a_g_n_c_wis_strides[I2];
const index_t WiStride = a_g_n_c_wis_strides[I3];
if constexpr(ConvForwardSpecialization ==
device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0)
{
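// With a 1x1 filter, unit stride and zero padding, each output point reads
// exactly one input point (Wo == Wi), so the input can be viewed directly as
// an (N * Wo) x C GEMM matrix without pad or embed transforms.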
const index_t NHoWo =
N * ck::accumulate_n<index_t>(
c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>());
if constexpr(NumGroupsToMerge == 1)
{
return make_naive_tensor_descriptor(make_tuple(NHoWo, C),
make_tuple(WiStride, CStride));
}
else
{
const auto in_gemmm_groups_gemmk_desc = make_naive_tensor_descriptor(
make_tuple(NHoWo, NumGroupsToMerge, C), make_tuple(WiStride, GStride, CStride));
return transform_tensor_descriptor(
in_gemmm_groups_gemmk_desc,
make_tuple(make_merge_transform(make_tuple(NHoWo, NumGroupsToMerge)),
make_pass_through_transform(C)),
make_tuple(Sequence<0, 1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
}
else if constexpr(ConvForwardSpecialization ==
device::ConvolutionForwardSpecialization::Filter3x3)
{
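// In the Filter3x3 specialization the filter width is the compile-time
// constant Number<3>{}, so the embed and merge transforms below carry a
// static extent instead of a runtime X.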
const index_t ConvDilationW = conv_filter_dilations[0];
const index_t InLeftPadW = input_left_pads[0];
const index_t InRightPadW = input_right_pads[0];
if constexpr(NumGroupsToMerge == 1)
{
const auto in_n_wi_c_desc =
make_naive_tensor_descriptor(make_tuple(N, Wi), make_tuple(NStride, WiStride));
const auto in_n_wip_c_desc = transform_tensor_descriptor(
in_n_wi_c_desc,
make_tuple(make_pass_through_transform(N),
make_pad_transform(Wi, InLeftPadW, InRightPadW)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
const auto in_n_x_wo_c_desc = transform_tensor_descriptor(
in_n_wip_c_desc,
make_tuple(make_pass_through_transform(N),
make_embed_transform(make_tuple(Number<3>{}, Wo),
make_tuple(ConvDilationW, ConvStrideW))),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}));
return transform_tensor_descriptor(
in_n_x_wo_c_desc,
make_tuple(make_merge_transform(make_tuple(N, Wo)),
make_pass_through_transform(Number<3>{})),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
else
{
const auto in_n_wi_c_desc = make_naive_tensor_descriptor(
make_tuple(N, Wi, NumGroupsToMerge), make_tuple(NStride, WiStride, GStride));
const auto in_n_wip_c_desc = transform_tensor_descriptor(
in_n_wi_c_desc,
make_tuple(make_pass_through_transform(N),
make_pad_transform(Wi, InLeftPadW, InRightPadW),
make_pass_through_transform(NumGroupsToMerge)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
const auto in_n_x_wo_c_desc = transform_tensor_descriptor(
in_n_wip_c_desc,
make_tuple(make_pass_through_transform(N),
make_embed_transform(make_tuple(Number<3>{}, Wo),
make_tuple(ConvDilationW, ConvStrideW)),
make_pass_through_transform(NumGroupsToMerge)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));
return transform_tensor_descriptor(
in_n_x_wo_c_desc,
make_tuple(make_merge_transform(make_tuple(N, Wo, NumGroupsToMerge)),
make_pass_through_transform(Number<3>{})),
make_tuple(Sequence<0, 2, 3>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
}
else if constexpr(ConvForwardSpecialization ==
device::ConvolutionForwardSpecialization::Filter1x1Pad0)
{
if constexpr(NumGroupsToMerge == 1)
{
const auto in_n_wi_c_desc = make_naive_tensor_descriptor(
make_tuple(N, Wi, C), make_tuple(NStride, WiStride, CStride));
const auto in_n_wo_c_desc = transform_tensor_descriptor(
in_n_wi_c_desc,
make_tuple(make_pass_through_transform(N),
make_embed_transform(make_tuple(Wo), make_tuple(ConvStrideW)),
make_pass_through_transform(C)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
return transform_tensor_descriptor(
in_n_wo_c_desc,
make_tuple(make_merge_transform(make_tuple(N, Wo)),
make_pass_through_transform(C)),
make_tuple(Sequence<0, 1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
else
{
const auto in_n_wi_c_desc =
make_naive_tensor_descriptor(make_tuple(N, Wi, NumGroupsToMerge, C),
make_tuple(NStride, WiStride, GStride, CStride));
const auto in_n_wo_c_desc = transform_tensor_descriptor(
in_n_wi_c_desc,
make_tuple(make_pass_through_transform(N),
make_embed_transform(make_tuple(Wo), make_tuple(ConvStrideW)),
make_pass_through_transform(NumGroupsToMerge),
make_pass_through_transform(C)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
return transform_tensor_descriptor(
in_n_wo_c_desc,
make_tuple(make_merge_transform(make_tuple(N, Wo, NumGroupsToMerge)),
make_pass_through_transform(C)),
make_tuple(Sequence<0, 1, 2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
}
else
{
const index_t X = b_g_k_c_xs_lengths[3];
const index_t ConvDilationW = conv_filter_dilations[0];
const index_t InLeftPadW = input_left_pads[0];
const index_t InRightPadW = input_right_pads[0];
if constexpr(NumGroupsToMerge == 1)
{
const auto in_n_wi_c_desc = make_naive_tensor_descriptor(
make_tuple(N, Wi, C), make_tuple(NStride, WiStride, CStride));
const auto in_n_wip_c_desc = transform_tensor_descriptor(
in_n_wi_c_desc,
make_tuple(make_pass_through_transform(N),
make_pad_transform(Wi, InLeftPadW, InRightPadW),
make_pass_through_transform(C)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
const auto in_n_x_wo_c_desc = transform_tensor_descriptor(
in_n_wip_c_desc,
make_tuple(make_pass_through_transform(N),
make_embed_transform(make_tuple(X, Wo),
make_tuple(ConvDilationW, ConvStrideW)),
make_pass_through_transform(C)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));
return transform_tensor_descriptor(
in_n_x_wo_c_desc,
make_tuple(make_merge_transform(make_tuple(N, Wo)),
make_merge_transform(make_tuple(X, C))),
make_tuple(Sequence<0, 2>{}, Sequence<1, 3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
else
{
const auto in_n_wi_c_desc =
make_naive_tensor_descriptor(make_tuple(N, Wi, NumGroupsToMerge, C),
make_tuple(NStride, WiStride, GStride, CStride));
const auto in_n_wip_c_desc = transform_tensor_descriptor(
in_n_wi_c_desc,
make_tuple(make_pass_through_transform(N),
make_pad_transform(Wi, InLeftPadW, InRightPadW),
make_pass_through_transform(NumGroupsToMerge),
make_pass_through_transform(C)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
const auto in_n_x_wo_c_desc = transform_tensor_descriptor(
in_n_wip_c_desc,
make_tuple(make_pass_through_transform(N),
make_embed_transform(make_tuple(X, Wo),
make_tuple(ConvDilationW, ConvStrideW)),
make_pass_through_transform(NumGroupsToMerge),
make_pass_through_transform(C)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}, Sequence<4>{}));
return transform_tensor_descriptor(
in_n_x_wo_c_desc,
make_tuple(make_merge_transform(make_tuple(N, Wo, NumGroupsToMerge)),
make_merge_transform(make_tuple(X, C))),
make_tuple(Sequence<0, 2, 3>{}, Sequence<1, 4>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
}
}
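// Every branch above yields the implicit-GEMM (im2col) view of the input:
// GemmM = N * Wo (times NumGroupsToMerge when groups are merged) and
// GemmK = X * C (just C for the 1x1 specializations). The 2D and 3D
// overloads below follow the same pattern with Ho/Wo and Do/Ho/Wo.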
template <typename ALayout,
typename std::enable_if<
NDimSpatial == 2 && (is_same_v<ALayout, tensor_layout::convolution::G_NHW_C> ||
is_same_v<ALayout, tensor_layout::convolution::NHWGC> ||
is_same_v<ALayout, tensor_layout::convolution::GNHWC>),
bool>::type = false>
static auto
MakeADescriptor_M_K(const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
const std::array<index_t, NDimSpatial + 3>& /* b_g_k_c_xs_strides */,
const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths,
const std::array<index_t, NDimSpatial + 3>& /* c_g_n_k_wos_strides */,
const std::array<index_t, NDimSpatial>& conv_filter_strides,
const std::array<index_t, NDimSpatial>& conv_filter_dilations,
const std::array<index_t, NDimSpatial>& input_left_pads,
const std::array<index_t, NDimSpatial>& input_right_pads,
const index_t N)
{
const index_t C = a_g_n_c_wis_lengths[2];
const index_t Hi = a_g_n_c_wis_lengths[3];
const index_t Wi = a_g_n_c_wis_lengths[4];
const index_t Ho = c_g_n_k_wos_lengths[3];
const index_t Wo = c_g_n_k_wos_lengths[4];
const index_t ConvStrideH = conv_filter_strides[0];
const index_t ConvStrideW = conv_filter_strides[1];
const index_t GStride = a_g_n_c_wis_strides[I0];
const index_t NStride = a_g_n_c_wis_strides[I1];
const index_t CStride = a_g_n_c_wis_strides[I2];
const index_t HiStride = a_g_n_c_wis_strides[I3];
const index_t WiStride = a_g_n_c_wis_strides[I4];
if constexpr(ConvForwardSpecialization ==
device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0)
{
const index_t NHoWo =
N * ck::accumulate_n<index_t>(
c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>());
if constexpr(NumGroupsToMerge == 1)
{
return make_naive_tensor_descriptor(make_tuple(NHoWo, C),
make_tuple(WiStride, CStride));
}
else
{
const auto in_gemmm_groups_gemmk_desc = make_naive_tensor_descriptor(
make_tuple(NHoWo, NumGroupsToMerge, C), make_tuple(WiStride, GStride, CStride));
return transform_tensor_descriptor(
in_gemmm_groups_gemmk_desc,
make_tuple(make_merge_transform(make_tuple(NHoWo, NumGroupsToMerge)),
make_pass_through_transform(C)),
make_tuple(Sequence<0, 1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
}
else if constexpr(ConvForwardSpecialization ==
device::ConvolutionForwardSpecialization::Filter3x3)
{
const index_t ConvDilationH = conv_filter_dilations[0];
const index_t ConvDilationW = conv_filter_dilations[1];
const index_t InLeftPadH = input_left_pads[0];
const index_t InLeftPadW = input_left_pads[1];
const index_t InRightPadH = input_right_pads[0];
const index_t InRightPadW = input_right_pads[1];
if constexpr(NumGroupsToMerge == 1)
{
const auto in_n_hi_wi_c_desc = make_naive_tensor_descriptor(
make_tuple(N, Hi, Wi), make_tuple(NStride, HiStride, WiStride));
const auto in_n_hip_wip_c_desc = transform_tensor_descriptor(
in_n_hi_wi_c_desc,
make_tuple(make_pass_through_transform(N),
make_pad_transform(Hi, InLeftPadH, InRightPadH),
make_pad_transform(Wi, InLeftPadW, InRightPadW)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
const auto in_n_y_ho_x_wo_c_desc = transform_tensor_descriptor(
in_n_hip_wip_c_desc,
make_tuple(make_pass_through_transform(N),
make_embed_transform(make_tuple(Number<3>{}, Ho),
make_tuple(ConvDilationH, ConvStrideH)),
make_embed_transform(make_tuple(Number<3>{}, Wo),
make_tuple(ConvDilationW, ConvStrideW))),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}));
return transform_tensor_descriptor(
in_n_y_ho_x_wo_c_desc,
make_tuple(make_merge_transform(make_tuple(N, Ho, Wo)),
make_merge_transform(make_tuple(Number<3>{}, Number<3>{}))),
make_tuple(Sequence<0, 2, 4>{}, Sequence<1, 3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
else
{
const auto in_n_hi_wi_groups_c_desc =
make_naive_tensor_descriptor(make_tuple(N, Hi, Wi, NumGroupsToMerge),
make_tuple(NStride, HiStride, WiStride, GStride));
const auto in_n_hip_wip_groups_c_desc = transform_tensor_descriptor(
in_n_hi_wi_groups_c_desc,
make_tuple(make_pass_through_transform(N),
make_pad_transform(Hi, InLeftPadH, InRightPadH),
make_pad_transform(Wi, InLeftPadW, InRightPadW),
make_pass_through_transform(NumGroupsToMerge)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
const auto in_n_y_ho_x_wo_groups_c_desc = transform_tensor_descriptor(
in_n_hip_wip_groups_c_desc,
make_tuple(make_pass_through_transform(N),
make_embed_transform(make_tuple(Number<3>{}, Ho),
make_tuple(ConvDilationH, ConvStrideH)),
make_embed_transform(make_tuple(Number<3>{}, Wo),
make_tuple(ConvDilationW, ConvStrideW)),
make_pass_through_transform(NumGroupsToMerge)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
return transform_tensor_descriptor(
in_n_y_ho_x_wo_groups_c_desc,
make_tuple(make_merge_transform(make_tuple(N, Ho, Wo, NumGroupsToMerge)),
make_merge_transform(make_tuple(Number<3>{}, Number<3>{}))),
make_tuple(Sequence<0, 2, 4, 5>{}, Sequence<1, 3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
}
else if constexpr(ConvForwardSpecialization ==
device::ConvolutionForwardSpecialization::Filter1x1Pad0)
{
if constexpr(NumGroupsToMerge == 1)
{
const auto in_n_hi_wi_c_desc = make_naive_tensor_descriptor(
make_tuple(N, Hi, Wi, C), make_tuple(NStride, HiStride, WiStride, CStride));
const auto in_n_ho_wo_c_desc = transform_tensor_descriptor(
in_n_hi_wi_c_desc,
make_tuple(make_pass_through_transform(N),
make_embed_transform(make_tuple(Ho), make_tuple(ConvStrideH)),
make_embed_transform(make_tuple(Wo), make_tuple(ConvStrideW)),
make_pass_through_transform(C)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
return transform_tensor_descriptor(
in_n_ho_wo_c_desc,
make_tuple(make_merge_transform(make_tuple(N, Ho, Wo)),
make_pass_through_transform(C)),
make_tuple(Sequence<0, 1, 2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
else
{
const auto in_n_hi_wi_groups_c_desc = make_naive_tensor_descriptor(
make_tuple(N, Hi, Wi, NumGroupsToMerge, C),
make_tuple(NStride, HiStride, WiStride, GStride, CStride));
const auto in_n_ho_wo_groups_c_desc = transform_tensor_descriptor(
in_n_hi_wi_groups_c_desc,
make_tuple(make_pass_through_transform(N),
make_embed_transform(make_tuple(Ho), make_tuple(ConvStrideH)),
make_embed_transform(make_tuple(Wo), make_tuple(ConvStrideW)),
make_pass_through_transform(NumGroupsToMerge),
make_pass_through_transform(C)),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
return transform_tensor_descriptor(
in_n_ho_wo_groups_c_desc,
make_tuple(make_merge_transform(make_tuple(N, Ho, Wo, NumGroupsToMerge)),
make_pass_through_transform(C)),
make_tuple(Sequence<0, 1, 2, 3>{}, Sequence<4>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
}
else
{
const index_t Y = b_g_k_c_xs_lengths[3];
const index_t X = b_g_k_c_xs_lengths[4];
const index_t ConvDilationH = conv_filter_dilations[0];
const index_t ConvDilationW = conv_filter_dilations[1];
const index_t InLeftPadH = input_left_pads[0];
const index_t InLeftPadW = input_left_pads[1];
const index_t InRightPadH = input_right_pads[0];
const index_t InRightPadW = input_right_pads[1];
if constexpr(NumGroupsToMerge == 1)
{
const auto in_n_hi_wi_c_desc = make_naive_tensor_descriptor(
make_tuple(N, Hi, Wi, C), make_tuple(NStride, HiStride, WiStride, CStride));
const auto in_n_hip_wip_c_desc = transform_tensor_descriptor(
in_n_hi_wi_c_desc,
make_tuple(make_pass_through_transform(N),
make_pad_transform(Hi, InLeftPadH, InRightPadH),
make_pad_transform(Wi, InLeftPadW, InRightPadW),
make_pass_through_transform(C)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
const auto in_n_y_ho_x_wo_c_desc = transform_tensor_descriptor(
in_n_hip_wip_c_desc,
make_tuple(make_pass_through_transform(N),
make_embed_transform(make_tuple(Y, Ho),
make_tuple(ConvDilationH, ConvStrideH)),
make_embed_transform(make_tuple(X, Wo),
make_tuple(ConvDilationW, ConvStrideW)),
make_pass_through_transform(C)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
return transform_tensor_descriptor(
in_n_y_ho_x_wo_c_desc,
make_tuple(make_merge_transform(make_tuple(N, Ho, Wo)),
make_merge_transform(make_tuple(Y, X, C))),
make_tuple(Sequence<0, 2, 4>{}, Sequence<1, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
else
{
const auto in_n_hi_wi_groups_c_desc = make_naive_tensor_descriptor(
make_tuple(N, Hi, Wi, NumGroupsToMerge, C),
make_tuple(NStride, HiStride, WiStride, GStride, CStride));
const auto in_n_hip_wip_groups_c_desc = transform_tensor_descriptor(
in_n_hi_wi_groups_c_desc,
make_tuple(make_pass_through_transform(N),
make_pad_transform(Hi, InLeftPadH, InRightPadH),
make_pad_transform(Wi, InLeftPadW, InRightPadW),
make_pass_through_transform(NumGroupsToMerge),
make_pass_through_transform(C)),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
const auto in_n_y_ho_x_wo_groups_c_desc = transform_tensor_descriptor(
in_n_hip_wip_groups_c_desc,
make_tuple(make_pass_through_transform(N),
make_embed_transform(make_tuple(Y, Ho),
make_tuple(ConvDilationH, ConvStrideH)),
make_embed_transform(make_tuple(X, Wo),
make_tuple(ConvDilationW, ConvStrideW)),
make_pass_through_transform(NumGroupsToMerge),
make_pass_through_transform(C)),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
make_tuple(Sequence<0>{},
Sequence<1, 2>{},
Sequence<3, 4>{},
Sequence<5>{},
Sequence<6>{}));
return transform_tensor_descriptor(
in_n_y_ho_x_wo_groups_c_desc,
make_tuple(make_merge_transform(make_tuple(N, Ho, Wo, NumGroupsToMerge)),
make_merge_transform(make_tuple(Y, X, C))),
make_tuple(Sequence<0, 2, 4, 5>{}, Sequence<1, 3, 6>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
}
}
template <typename ALayout,
typename std::enable_if<
NDimSpatial == 3 && (is_same_v<ALayout, tensor_layout::convolution::G_NDHW_C> ||
is_same_v<ALayout, tensor_layout::convolution::NDHWGC> ||
is_same_v<ALayout, tensor_layout::convolution::GNDHWC>),
bool>::type = false>
static auto
MakeADescriptor_M_K(const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
const std::array<index_t, NDimSpatial + 3>& /* b_g_k_c_xs_strides */,
const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths,
const std::array<index_t, NDimSpatial + 3>& /* c_g_n_k_wos_strides*/,
const std::array<index_t, NDimSpatial>& conv_filter_strides,
const std::array<index_t, NDimSpatial>& conv_filter_dilations,
const std::array<index_t, NDimSpatial>& input_left_pads,
const std::array<index_t, NDimSpatial>& input_right_pads,
const index_t N)
{
const index_t C = a_g_n_c_wis_lengths[2];
const index_t Di = a_g_n_c_wis_lengths[3];
const index_t Hi = a_g_n_c_wis_lengths[4];
const index_t Wi = a_g_n_c_wis_lengths[5];
const index_t Do = c_g_n_k_wos_lengths[3];
const index_t Ho = c_g_n_k_wos_lengths[4];
const index_t Wo = c_g_n_k_wos_lengths[5];
const index_t ConvStrideD = conv_filter_strides[0];
const index_t ConvStrideH = conv_filter_strides[1];
const index_t ConvStrideW = conv_filter_strides[2];
const index_t GStride = a_g_n_c_wis_strides[I0];
const index_t NStride = a_g_n_c_wis_strides[I1];
const index_t CStride = a_g_n_c_wis_strides[I2];
const index_t DiStride = a_g_n_c_wis_strides[I3];
const index_t HiStride = a_g_n_c_wis_strides[I4];
const index_t WiStride = a_g_n_c_wis_strides[I5];
if constexpr(ConvForwardSpecialization ==
device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0)
{
const index_t NDoHoWo =
N * ck::accumulate_n<index_t>(
c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>());
if constexpr(NumGroupsToMerge == 1)
{
return make_naive_tensor_descriptor(make_tuple(NDoHoWo, C),
make_tuple(WiStride, CStride));
}
else
{
const auto in_gemmm_groups_gemmk_desc =
make_naive_tensor_descriptor(make_tuple(NDoHoWo, NumGroupsToMerge, C),
make_tuple(WiStride, GStride, CStride));
return transform_tensor_descriptor(
in_gemmm_groups_gemmk_desc,
make_tuple(make_merge_transform(make_tuple(NDoHoWo, NumGroupsToMerge)),
make_pass_through_transform(C)),
make_tuple(Sequence<0, 1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
}
else if constexpr(ConvForwardSpecialization ==
device::ConvolutionForwardSpecialization::Filter3x3)
{
const index_t ConvDilationD = conv_filter_dilations[0];
const index_t ConvDilationH = conv_filter_dilations[1];
const index_t ConvDilationW = conv_filter_dilations[2];
const index_t InLeftPadD = input_left_pads[0];
const index_t InLeftPadH = input_left_pads[1];
const index_t InLeftPadW = input_left_pads[2];
const index_t InRightPadD = input_right_pads[0];
const index_t InRightPadH = input_right_pads[1];
const index_t InRightPadW = input_right_pads[2];
if constexpr(NumGroupsToMerge == 1)
{
const auto in_n_di_hi_wi_c_desc = make_naive_tensor_descriptor(
make_tuple(N, Di, Hi, Wi), make_tuple(NStride, DiStride, HiStride, WiStride));
const auto in_n_hip_wip_c_desc = transform_tensor_descriptor(
in_n_di_hi_wi_c_desc,
make_tuple(make_pass_through_transform(N),
make_pad_transform(Di, InLeftPadD, InRightPadD),
make_pad_transform(Hi, InLeftPadH, InRightPadH),
make_pad_transform(Wi, InLeftPadW, InRightPadW)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
const auto in_n_z_do_y_ho_x_wo_c_desc = transform_tensor_descriptor(
in_n_hip_wip_c_desc,
make_tuple(make_pass_through_transform(N),
make_embed_transform(make_tuple(Number<3>{}, Do),
make_tuple(ConvDilationD, ConvStrideD)),
make_embed_transform(make_tuple(Number<3>{}, Ho),
make_tuple(ConvDilationH, ConvStrideH)),
make_embed_transform(make_tuple(Number<3>{}, Wo),
make_tuple(ConvDilationW, ConvStrideW))),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(
Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5, 6>{}));
return transform_tensor_descriptor(
in_n_z_do_y_ho_x_wo_c_desc,
make_tuple(
make_merge_transform(make_tuple(N, Do, Ho, Wo)),
make_merge_transform(make_tuple(Number<3>{}, Number<3>{}, Number<3>{}))),
make_tuple(Sequence<0, 2, 4, 6>{}, Sequence<1, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
else
{
const auto in_n_di_hi_wi_c_desc = make_naive_tensor_descriptor(
make_tuple(N, Di, Hi, Wi, NumGroupsToMerge),
make_tuple(NStride, DiStride, HiStride, WiStride, GStride));
const auto in_n_hip_wip_c_desc = transform_tensor_descriptor(
in_n_di_hi_wi_c_desc,
make_tuple(make_pass_through_transform(N),
make_pad_transform(Di, InLeftPadD, InRightPadD),
make_pad_transform(Hi, InLeftPadH, InRightPadH),
make_pad_transform(Wi, InLeftPadW, InRightPadW),
make_pass_through_transform(NumGroupsToMerge)),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
const auto in_n_z_do_y_ho_x_wo_c_desc = transform_tensor_descriptor(
in_n_hip_wip_c_desc,
make_tuple(make_pass_through_transform(N),
make_embed_transform(make_tuple(Number<3>{}, Do),
make_tuple(ConvDilationD, ConvStrideD)),
make_embed_transform(make_tuple(Number<3>{}, Ho),
make_tuple(ConvDilationH, ConvStrideH)),
make_embed_transform(make_tuple(Number<3>{}, Wo),
make_tuple(ConvDilationW, ConvStrideW)),
make_pass_through_transform(NumGroupsToMerge)),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
make_tuple(Sequence<0>{},
Sequence<1, 2>{},
Sequence<3, 4>{},
Sequence<5, 6>{},
Sequence<7>{}));
return transform_tensor_descriptor(
in_n_z_do_y_ho_x_wo_c_desc,
make_tuple(
make_merge_transform(make_tuple(N, Do, Ho, Wo, NumGroupsToMerge)),
make_merge_transform(make_tuple(Number<3>{}, Number<3>{}, Number<3>{}))),
make_tuple(Sequence<0, 2, 4, 6, 7>{}, Sequence<1, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
}
else if constexpr(ConvForwardSpecialization ==
device::ConvolutionForwardSpecialization::Filter1x1Pad0)
{
if constexpr(NumGroupsToMerge == 1)
{
const auto in_n_di_hi_wi_c_desc = make_naive_tensor_descriptor(
make_tuple(N, Di, Hi, Wi, C),
make_tuple(NStride, DiStride, HiStride, WiStride, CStride));
const auto in_n_do_ho_wo_c_desc = transform_tensor_descriptor(
in_n_di_hi_wi_c_desc,
make_tuple(make_pass_through_transform(N),
make_embed_transform(make_tuple(Do), make_tuple(ConvStrideD)),
make_embed_transform(make_tuple(Ho), make_tuple(ConvStrideH)),
make_embed_transform(make_tuple(Wo), make_tuple(ConvStrideW)),
make_pass_through_transform(C)),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
return transform_tensor_descriptor(
in_n_do_ho_wo_c_desc,
make_tuple(make_merge_transform(make_tuple(N, Do, Ho, Wo)),
make_pass_through_transform(C)),
make_tuple(Sequence<0, 1, 2, 3>{}, Sequence<4>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
else
{
const auto in_n_di_hi_wi_c_desc = make_naive_tensor_descriptor(
make_tuple(N, Di, Hi, Wi, NumGroupsToMerge, C),
make_tuple(NStride, DiStride, HiStride, WiStride, GStride, CStride));
const auto in_n_do_ho_wo_c_desc = transform_tensor_descriptor(
in_n_di_hi_wi_c_desc,
make_tuple(make_pass_through_transform(N),
make_embed_transform(make_tuple(Do), make_tuple(ConvStrideD)),
make_embed_transform(make_tuple(Ho), make_tuple(ConvStrideH)),
make_embed_transform(make_tuple(Wo), make_tuple(ConvStrideW)),
make_pass_through_transform(NumGroupsToMerge),
make_pass_through_transform(C)),
make_tuple(Sequence<0>{},
Sequence<1>{},
Sequence<2>{},
Sequence<3>{},
Sequence<4>{},
Sequence<5>{}),
make_tuple(Sequence<0>{},
Sequence<1>{},
Sequence<2>{},
Sequence<3>{},
Sequence<4>{},
Sequence<5>{}));
return transform_tensor_descriptor(
in_n_do_ho_wo_c_desc,
make_tuple(make_merge_transform(make_tuple(N, Do, Ho, Wo, NumGroupsToMerge)),
make_pass_through_transform(C)),
make_tuple(Sequence<0, 1, 2, 3, 4>{}, Sequence<5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
}
else
{
const index_t Z = b_g_k_c_xs_lengths[3];
const index_t Y = b_g_k_c_xs_lengths[4];
const index_t X = b_g_k_c_xs_lengths[5];
const index_t ConvDilationD = conv_filter_dilations[0];
const index_t ConvDilationH = conv_filter_dilations[1];
const index_t ConvDilationW = conv_filter_dilations[2];
const index_t InLeftPadD = input_left_pads[0];
const index_t InLeftPadH = input_left_pads[1];
const index_t InLeftPadW = input_left_pads[2];
const index_t InRightPadD = input_right_pads[0];
const index_t InRightPadH = input_right_pads[1];
const index_t InRightPadW = input_right_pads[2];
if constexpr(NumGroupsToMerge == 1)
{
const auto in_n_di_hi_wi_c_desc = make_naive_tensor_descriptor(
make_tuple(N, Di, Hi, Wi, C),
make_tuple(NStride, DiStride, HiStride, WiStride, CStride));
const auto in_n_hip_wip_c_desc = transform_tensor_descriptor(
in_n_di_hi_wi_c_desc,
make_tuple(make_pass_through_transform(N),
make_pad_transform(Di, InLeftPadD, InRightPadD),
make_pad_transform(Hi, InLeftPadH, InRightPadH),
make_pad_transform(Wi, InLeftPadW, InRightPadW),
make_pass_through_transform(C)),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
const auto in_n_z_do_y_ho_x_wo_c_desc = transform_tensor_descriptor(
in_n_hip_wip_c_desc,
make_tuple(make_pass_through_transform(N),
make_embed_transform(make_tuple(Z, Do),
make_tuple(ConvDilationD, ConvStrideD)),
make_embed_transform(make_tuple(Y, Ho),
make_tuple(ConvDilationH, ConvStrideH)),
make_embed_transform(make_tuple(X, Wo),
make_tuple(ConvDilationW, ConvStrideW)),
make_pass_through_transform(C)),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
make_tuple(Sequence<0>{},
Sequence<1, 2>{},
Sequence<3, 4>{},
Sequence<5, 6>{},
Sequence<7>{}));
return transform_tensor_descriptor(
in_n_z_do_y_ho_x_wo_c_desc,
make_tuple(make_merge_transform(make_tuple(N, Do, Ho, Wo)),
make_merge_transform(make_tuple(Z, Y, X, C))),
make_tuple(Sequence<0, 2, 4, 6>{}, Sequence<1, 3, 5, 7>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
else
{
const auto in_n_di_hi_wi_c_desc = make_naive_tensor_descriptor(
make_tuple(N, Di, Hi, Wi, NumGroupsToMerge, C),
make_tuple(NStride, DiStride, HiStride, WiStride, GStride, CStride));
const auto in_n_hip_wip_c_desc = transform_tensor_descriptor(
in_n_di_hi_wi_c_desc,
make_tuple(make_pass_through_transform(N),
make_pad_transform(Di, InLeftPadD, InRightPadD),
make_pad_transform(Hi, InLeftPadH, InRightPadH),
make_pad_transform(Wi, InLeftPadW, InRightPadW),
make_pass_through_transform(NumGroupsToMerge),
make_pass_through_transform(C)),
make_tuple(Sequence<0>{},
Sequence<1>{},
Sequence<2>{},
Sequence<3>{},
Sequence<4>{},
Sequence<5>{}),
make_tuple(Sequence<0>{},
Sequence<1>{},
Sequence<2>{},
Sequence<3>{},
Sequence<4>{},
Sequence<5>{}));
const auto in_n_z_do_y_ho_x_wo_c_desc = transform_tensor_descriptor(
in_n_hip_wip_c_desc,
make_tuple(make_pass_through_transform(N),
make_embed_transform(make_tuple(Z, Do),
make_tuple(ConvDilationD, ConvStrideD)),
make_embed_transform(make_tuple(Y, Ho),
make_tuple(ConvDilationH, ConvStrideH)),
make_embed_transform(make_tuple(X, Wo),
make_tuple(ConvDilationW, ConvStrideW)),
make_pass_through_transform(NumGroupsToMerge),
make_pass_through_transform(C)),
make_tuple(Sequence<0>{},
Sequence<1>{},
Sequence<2>{},
Sequence<3>{},
Sequence<4>{},
Sequence<5>{}),
make_tuple(Sequence<0>{},
Sequence<1, 2>{},
Sequence<3, 4>{},
Sequence<5, 6>{},
Sequence<7>{},
Sequence<8>{}));
return transform_tensor_descriptor(
in_n_z_do_y_ho_x_wo_c_desc,
make_tuple(make_merge_transform(make_tuple(N, Do, Ho, Wo, NumGroupsToMerge)),
make_merge_transform(make_tuple(Z, Y, X, C))),
make_tuple(Sequence<0, 2, 4, 6, 7>{}, Sequence<1, 3, 5, 8>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
}
}
template <typename BLayout,
typename std::enable_if<is_same_v<BLayout, tensor_layout::convolution::GKXC> ||
is_same_v<BLayout, tensor_layout::convolution::GKYXC> ||
is_same_v<BLayout, tensor_layout::convolution::GKZYXC>,
bool>::type = false>
static auto MakeBDescriptor_N_K(const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides)
{
const index_t K = b_g_k_c_xs_lengths[1];
const index_t C = b_g_k_c_xs_lengths[2];
const index_t YX = ck::accumulate_n<index_t>(
b_g_k_c_xs_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>());
const index_t GStride = b_g_k_c_xs_strides[I0];
const index_t KStride = b_g_k_c_xs_strides[I1];
const index_t CStride = b_g_k_c_xs_strides[I2];
if constexpr(ConvForwardSpecialization ==
device::ConvolutionForwardSpecialization::Filter3x3)
{
using FilterSizeNumType =
std::conditional_t<NDimSpatial == 1,
Number<3>,
std::conditional_t<NDimSpatial == 2, Number<9>, Number<27>>>;
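// FilterSizeNumType is the compile-time filter volume 3^NDimSpatial:
// 3 for 1D, 9 for 2D and 27 for 3D filters.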
if constexpr(NumGroupsToMerge == 1)
{
return make_naive_tensor_descriptor_packed(make_tuple(K, FilterSizeNumType{}));
}
else
{
const auto wei_gemmn_groups_gemmk_desc = make_naive_tensor_descriptor(
make_tuple(K, NumGroupsToMerge, FilterSizeNumType{}),
make_tuple(KStride, GStride, CStride));
return transform_tensor_descriptor(
wei_gemmn_groups_gemmk_desc,
make_tuple(make_merge_transform(make_tuple(K, NumGroupsToMerge)),
make_pass_through_transform(FilterSizeNumType{})),
make_tuple(Sequence<0, 1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
}
else
{
if constexpr(NumGroupsToMerge == 1)
{
return make_naive_tensor_descriptor_packed(make_tuple(K, YX * C));
}
else
{
const auto wei_gemmn_groups_gemmk_desc = make_naive_tensor_descriptor(
make_tuple(K, NumGroupsToMerge, YX * C), make_tuple(KStride, GStride, CStride));
return transform_tensor_descriptor(
wei_gemmn_groups_gemmk_desc,
make_tuple(make_merge_transform(make_tuple(K, NumGroupsToMerge)),
make_pass_through_transform(YX * C)),
make_tuple(Sequence<0, 1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
}
}
template <
typename BLayout,
typename std::enable_if<is_same_v<BLayout, tensor_layout::convolution::G_K_X_C> ||
is_same_v<BLayout, tensor_layout::convolution::G_K_YX_C> ||
is_same_v<BLayout, tensor_layout::convolution::G_K_ZYX_C> ||
is_same_v<BLayout, tensor_layout::convolution::KXGC> ||
is_same_v<BLayout, tensor_layout::convolution::KYXGC> ||
is_same_v<BLayout, tensor_layout::convolution::KZYXGC>,
bool>::type = false>
static auto MakeBDescriptor_N_K(const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides)
{
const index_t K = b_g_k_c_xs_lengths[1];
const index_t C = b_g_k_c_xs_lengths[2];
const index_t YX = ck::accumulate_n<index_t>(
b_g_k_c_xs_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>());
const index_t KStride = b_g_k_c_xs_strides[1];
const index_t XStride = b_g_k_c_xs_strides[2 + NDimSpatial];
const auto CStride = I1;
const auto wei_k_yx_c_desc = make_naive_tensor_descriptor(
make_tuple(K, YX, C), make_tuple(KStride, XStride, CStride));
const auto wei_gemmn_gemmk_desc = transform_tensor_descriptor(
wei_k_yx_c_desc,
make_tuple(make_pass_through_transform(K), make_merge_transform(make_tuple(YX, C))),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
return wei_gemmn_gemmk_desc;
}
template <typename CLayout,
typename std::enable_if<is_same_v<CLayout, tensor_layout::convolution::GNWK> ||
is_same_v<CLayout, tensor_layout::convolution::GNHWK> ||
is_same_v<CLayout, tensor_layout::convolution::GNDHWK>,
bool>::type = false>
static auto
MakeCDescriptor_M_N(const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths,
const std::array<index_t, NDimSpatial + 3>& /* c_g_n_k_wos_strides */,
const index_t N)
{
const index_t K = c_g_n_k_wos_lengths[2];
const index_t NHoWo =
N * ck::accumulate_n<index_t>(
c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>());
const auto out_gemmm_gemmn_desc = make_naive_tensor_descriptor_packed(make_tuple(NHoWo, K));
return out_gemmm_gemmn_desc;
}
template <
typename CLayout,
typename std::enable_if<is_same_v<CLayout, tensor_layout::convolution::G_NW_K> ||
is_same_v<CLayout, tensor_layout::convolution::G_NHW_K> ||
is_same_v<CLayout, tensor_layout::convolution::G_NDHW_K> ||
is_same_v<CLayout, tensor_layout::convolution::NWGK> ||
is_same_v<CLayout, tensor_layout::convolution::NHWGK> ||
is_same_v<CLayout, tensor_layout::convolution::NDHWGK>,
bool>::type = false>
static auto MakeCDescriptor_M_N(const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths,
const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_strides,
const index_t N)
{
const index_t K = c_g_n_k_wos_lengths[2];
const index_t KStride = I1;
const index_t WoStride = c_g_n_k_wos_strides[NDimSpatial + 2];
const index_t GStride = c_g_n_k_wos_strides[0];
const index_t NHoWo =
N * ck::accumulate_n<index_t>(
c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>());
if constexpr(NumGroupsToMerge == 1)
{
return make_naive_tensor_descriptor(make_tuple(NHoWo, K),
make_tuple(WoStride, KStride));
}
else
{
const auto nhwo_groups_k_1_desc =
make_naive_tensor_descriptor(make_tuple(NHoWo, NumGroupsToMerge, K, 1),
make_tuple(WoStride, GStride, KStride, GStride));
// Pad 1 to NumGroupsToMerge
const auto padded_desc = transform_tensor_descriptor(
nhwo_groups_k_1_desc,
make_tuple(make_pass_through_transform(NHoWo),
make_pass_through_transform(NumGroupsToMerge),
make_pass_through_transform(K),
make_pad_transform(1, 0, NumGroupsToMerge - 1)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
// We only need the matrices on the diagonal. Xor returns 0 for equal values,
// so any matrix that is not on the diagonal is stored in the padding.
// To avoid a modulo after the xor, NumGroupsToMerge must be a power of 2.
static_assert(NumGroupsToMerge == 1 || NumGroupsToMerge == 2 || NumGroupsToMerge == 4 ||
NumGroupsToMerge == 8 || NumGroupsToMerge == 16 ||
NumGroupsToMerge == 32 || NumGroupsToMerge == 64);
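// Illustrative walk-through (NumGroupsToMerge = 2): the merged indices
// decompose as m = (nhwo, gm) and n = (k, gn). The xor transform maps
// (gm, gn) to (gm, gm ^ gn); since gm ^ gn == 0 exactly when gm == gn, only
// diagonal blocks address real memory, while off-diagonal blocks land in the
// padded coordinate added above and are treated as invalid elements.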
const auto unmerged_padded_desc = transform_tensor_descriptor(
padded_desc,
make_tuple(make_pass_through_transform(NHoWo),
make_xor_transform(make_tuple(NumGroupsToMerge, NumGroupsToMerge)),
make_pass_through_transform(K)),
make_tuple(Sequence<0>{}, Sequence<1, 3>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1, 3>{}, Sequence<2>{}));
// Merge To M, N
return transform_tensor_descriptor(
unmerged_padded_desc,
make_tuple(make_merge_transform(make_tuple(NHoWo, NumGroupsToMerge)),
make_merge_transform(make_tuple(K, NumGroupsToMerge))),
make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
}
// for output bias
template <typename CLayout,
typename std::enable_if<is_same_v<CLayout, tensor_layout::convolution::G_K>,
bool>::type = false>
static auto MakeCDescriptor_M_N(const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths,
const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_strides,
const index_t N)
{
const index_t K = c_g_n_k_wos_lengths[2];
const index_t KStride = c_g_n_k_wos_strides[2];
const index_t NHoWo =
N * ck::accumulate_n<index_t>(
c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>());
const auto out_gemmm_gemmn_desc =
make_naive_tensor_descriptor(make_tuple(NHoWo, K), make_tuple(I0, KStride));
return out_gemmm_gemmn_desc;
}
// Overloaded functions for hipRTC purposes
template <typename ALayout,
typename std::enable_if<NDimSpatial == 1 &&
(is_same_v<ALayout, tensor_layout::convolution::G_NW_C> ||
is_same_v<ALayout, tensor_layout::convolution::NWGC> ||
is_same_v<ALayout, tensor_layout::convolution::GNWC>),
bool>::type = false>
__host__ __device__ static auto
MakeADescriptor_M_K(const ck::Array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
const ck::Array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
const ck::Array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
const ck::Array<index_t, NDimSpatial + 3>& /* b_g_k_c_xs_strides */,
const ck::Array<index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths,
const ck::Array<index_t, NDimSpatial + 3>& /* c_g_n_k_wos_strides */,
const ck::Array<index_t, NDimSpatial>& conv_filter_strides,
const ck::Array<index_t, NDimSpatial>& conv_filter_dilations,
const ck::Array<index_t, NDimSpatial>& input_left_pads,
const ck::Array<index_t, NDimSpatial>& input_right_pads)
{
const index_t N = a_g_n_c_wis_lengths[1];
const index_t C = a_g_n_c_wis_lengths[2];
@@ -141,17 +1292,17 @@ struct TransformConvFwdToGemm
is_same_v<ALayout, tensor_layout::convolution::NHWGC> ||
is_same_v<ALayout, tensor_layout::convolution::GNHWC>),
bool>::type = false>
__host__ __device__ static auto
MakeADescriptor_M_K(const ck::Array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
const ck::Array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
const ck::Array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
const ck::Array<index_t, NDimSpatial + 3>& /* b_g_k_c_xs_strides */,
const ck::Array<index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths,
const ck::Array<index_t, NDimSpatial + 3>& /* c_g_n_k_wos_strides */,
const ck::Array<index_t, NDimSpatial>& conv_filter_strides,
const ck::Array<index_t, NDimSpatial>& conv_filter_dilations,
const ck::Array<index_t, NDimSpatial>& input_left_pads,
const ck::Array<index_t, NDimSpatial>& input_right_pads)
{
const index_t N = a_g_n_c_wis_lengths[1];
const index_t C = a_g_n_c_wis_lengths[2];
@@ -271,16 +1422,16 @@ struct TransformConvFwdToGemm
is_same_v<ALayout, tensor_layout::convolution::GNDHWC>),
bool>::type = false>
static auto
MakeADescriptor_M_K(const ck::Array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
const ck::Array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
const ck::Array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
const ck::Array<index_t, NDimSpatial + 3>& /* b_g_k_c_xs_strides */,
const ck::Array<index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths,
const ck::Array<index_t, NDimSpatial + 3>& /* c_g_n_k_wos_strides */,
const ck::Array<index_t, NDimSpatial>& conv_filter_strides,
const ck::Array<index_t, NDimSpatial>& conv_filter_dilations,
const ck::Array<index_t, NDimSpatial>& input_left_pads,
const ck::Array<index_t, NDimSpatial>& input_right_pads)
{
const index_t N = a_g_n_c_wis_lengths[1];
const index_t C = a_g_n_c_wis_lengths[2];
@@ -421,15 +1572,15 @@ struct TransformConvFwdToGemm
is_same_v<BLayout, tensor_layout::convolution::GKYXC> ||
is_same_v<BLayout, tensor_layout::convolution::GKZYXC>,
bool>::type = false>
__host__ __device__ static auto
MakeBDescriptor_N_K(const ck::Array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
const ck::Array<index_t, NDimSpatial + 3>& /* b_g_k_c_xs_strides */)
{
const index_t K = b_g_k_c_xs_lengths[1];
const index_t C = b_g_k_c_xs_lengths[2];
const index_t YX =
mult_accumulate_n<index_t>(b_g_k_c_xs_lengths.begin() + 3, NDimSpatial, 1);
const auto wei_gemmn_gemmk_desc =
make_naive_tensor_descriptor_packed(make_tuple(K, YX * C));
@@ -446,14 +1597,15 @@ struct TransformConvFwdToGemm
is_same_v<BLayout, tensor_layout::convolution::KYXGC> ||
is_same_v<BLayout, tensor_layout::convolution::KZYXGC>,
bool>::type = false>
__host__ __device__ static auto
MakeBDescriptor_N_K(const ck::Array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
const ck::Array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides)
{
const index_t K = b_g_k_c_xs_lengths[1];
const index_t C = b_g_k_c_xs_lengths[2];
const index_t YX =
mult_accumulate_n<index_t>(b_g_k_c_xs_lengths.begin() + 3, NDimSpatial, 1);
const index_t KStride = b_g_k_c_xs_strides[1];
const index_t XStride = b_g_k_c_xs_strides[2 + NDimSpatial];
@@ -476,16 +1628,15 @@ struct TransformConvFwdToGemm
is_same_v<CLayout, tensor_layout::convolution::GNHWK> ||
is_same_v<CLayout, tensor_layout::convolution::GNDHWK>,
bool>::type = false>
__host__ __device__ static auto
MakeCDescriptor_M_N(const ck::Array<index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths,
const ck::Array<index_t, NDimSpatial + 3>& /* c_g_n_k_wos_strides */)
{
const index_t N = c_g_n_k_wos_lengths[1];
const index_t K = c_g_n_k_wos_lengths[2];
const index_t NHoWo =
N * mult_accumulate_n<index_t>(c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1);
const auto out_gemmm_gemmn_desc = make_naive_tensor_descriptor_packed(make_tuple(NHoWo, K));
@@ -501,8 +1652,9 @@ struct TransformConvFwdToGemm
is_same_v<CLayout, tensor_layout::convolution::NHWGK> ||
is_same_v<CLayout, tensor_layout::convolution::NDHWGK>,
bool>::type = false>
__host__ __device__ static auto
MakeCDescriptor_M_N(const ck::Array<index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths,
const ck::Array<index_t, NDimSpatial + 3>& c_g_n_k_wos_strides)
{
const index_t N = c_g_n_k_wos_lengths[1];
const index_t K = c_g_n_k_wos_lengths[2];
@@ -511,8 +1663,7 @@ struct TransformConvFwdToGemm
const index_t WoStride = c_g_n_k_wos_strides[NDimSpatial + 2];
const index_t NHoWo =
N * mult_accumulate_n<index_t>(c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1);
const auto out_gemmm_gemmn_desc =
make_naive_tensor_descriptor(make_tuple(NHoWo, K), make_tuple(WoStride, KStride));
@@ -524,16 +1675,16 @@ struct TransformConvFwdToGemm
template <typename CLayout,
typename std::enable_if<is_same_v<CLayout, tensor_layout::convolution::G_K>,
bool>::type = false>
__host__ __device__ static auto
MakeCDescriptor_M_N(const ck::Array<index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths,
const ck::Array<index_t, NDimSpatial + 3>& c_g_n_k_wos_strides)
{
const index_t N = c_g_n_k_wos_lengths[1];
const index_t K = c_g_n_k_wos_lengths[2];
const index_t KStride = c_g_n_k_wos_strides[2];
const index_t NHoWo =
N * ck::accumulate_n<index_t>(
c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>());
N * mult_accumulate_n<index_t>(c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1);
const auto out_gemmm_gemmn_desc =
make_naive_tensor_descriptor(make_tuple(NHoWo, K), make_tuple(I0, KStride));
@@ -542,5 +1693,38 @@ struct TransformConvFwdToGemm
}
};
// wrapper class to call member functions on the TransformConvFwdToGemm struct at runtime
// TODO: figure out a way to properly pass in layout as an argument
struct TransformConv
{
TransformConv() {}
template <index_t NDimSpatial,
device::ConvolutionForwardSpecialization ConvForwardSpecialization>
auto
transform_func(ck::Array<index_t, NDimSpatial + 3> out_lengths,
ck::Array<index_t, NDimSpatial + 3> out_strides,
TransformConvFwdToGemm<NDimSpatial, ConvForwardSpecialization> conv_fwd_to_gemm)
{
if(NDimSpatial == 2)
{
return conv_fwd_to_gemm
.template MakeCDescriptor_M_N<ck::tensor_layout::convolution::NHWGK>(out_lengths,
out_strides);
}
else if(NDimSpatial == 3)
{
return conv_fwd_to_gemm
.template MakeCDescriptor_M_N<tensor_layout::convolution::NDHWGK>(out_lengths,
out_strides);
}
else if(NDimSpatial == 1)
{
return conv_fwd_to_gemm.template MakeCDescriptor_M_N<tensor_layout::convolution::NWGK>(
out_lengths, out_strides);
}
}
};
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "data_type.hpp"
@@ -297,6 +297,17 @@ enum struct AmdBufferCoherenceEnum
GLC = 1,
SLC = 2,
GLC_SLC = 3,
// gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system
// NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse
WAVE_NT0 = 0,
WAVE_NT1 = 2,
GROUP_NT0 = 1,
GROUP_NT1 = 3,
DEVICE_NT0 = 8,
DEVICE_NT1 = 10,
SYSTEM_NT0 = 9,
SYSTEM_NT1 = 11,
};
template <index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
@@ -980,7 +991,8 @@ __device__ void amd_direct_load_global_to_lds(const T* global_base_ptr,
asm volatile("s_mov_b32 m0, %0; \n\t"
"buffer_load_dword %1, %2, 0 offen lds;\n\t" ::"s"(lds_ptr_sgpr),
"v"(global_offset_bytes),
"s"(src_resource));
"s"(src_resource)
: "memory");
#else
// LDS pointer must be attributed with the LDS address space.
__attribute__((address_space(3))) uint32_t* lds_ptr =
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/ck.hpp"
#pragma once
namespace ck {
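// Note (our reading of the builtin signatures; consult the ISA docs): smfmac
// is the sparse MFMA family on gfx94x. reg_a carries the compressed
// structured-sparse A operand, reg_b the dense B operand, and reg_idx the
// sparsity-index register selecting which A elements are present.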
template <index_t MPerWave, index_t NPerWave>
struct intrin_smfmac_f32_16x16x32f16;
template <>
struct intrin_smfmac_f32_16x16x32f16<16, 16>
{
template <class FloatC>
__device__ static void
Run(const half4_t& reg_a, const half8_t& reg_b, const int32_t& reg_idx, FloatC& reg_c)
{
#if defined(__gfx94__)
reg_c.template AsType<float4_t>()(Number<0>{}) = __builtin_amdgcn_smfmac_f32_16x16x32_f16(
reg_a, reg_b, reg_c.template AsType<float4_t>()[Number<0>{}], reg_idx, 0, 0);
#else
ignore = reg_a;
ignore = reg_b;
ignore = reg_c;
ignore = reg_idx;
#endif
}
};
template <index_t MPerWave, index_t NPerWave>
struct intrin_smfmac_f32_16x16x32bf16;
template <>
struct intrin_smfmac_f32_16x16x32bf16<16, 16>
{
template <class FloatC>
__device__ static void
Run(const bhalf4_t& reg_a, const bhalf8_t& reg_b, const int32_t& reg_idx, FloatC& reg_c)
{
#if defined(__gfx94__)
reg_c.template AsType<float4_t>()(Number<0>{}) = __builtin_amdgcn_smfmac_f32_16x16x32_bf16(
reg_a, reg_b, reg_c.template AsType<float4_t>()[Number<0>{}], reg_idx, 0, 0);
#else
ignore = reg_a;
ignore = reg_b;
ignore = reg_c;
ignore = reg_idx;
#endif
}
};
template <index_t MPerWave, index_t NPerWave>
struct intrin_smfmac_f32_32x32x16f16;
template <>
struct intrin_smfmac_f32_32x32x16f16<32, 32>
{
template <class FloatC>
__device__ static void
Run(const half4_t& reg_a, const half8_t& reg_b, const int32_t& reg_idx, FloatC& reg_c)
{
#if defined(__gfx94__)
reg_c.template AsType<float16_t>()(Number<0>{}) = __builtin_amdgcn_smfmac_f32_32x32x16_f16(
reg_a, reg_b, reg_c.template AsType<float16_t>()[Number<0>{}], reg_idx, 0, 0);
#else
ignore = reg_a;
ignore = reg_b;
ignore = reg_c;
ignore = reg_idx;
#endif
}
};
template <index_t MPerWave, index_t NPerWave>
struct intrin_smfmac_f32_32x32x16bf16;
template <>
struct intrin_smfmac_f32_32x32x16bf16<32, 32>
{
template <class FloatC>
__device__ static void
Run(const bhalf4_t& reg_a, const bhalf8_t& reg_b, const int32_t& reg_idx, FloatC& reg_c)
{
#if defined(__gfx94__)
reg_c.template AsType<float16_t>()(Number<0>{}) = __builtin_amdgcn_smfmac_f32_32x32x16_bf16(
reg_a, reg_b, reg_c.template AsType<float16_t>()[Number<0>{}], reg_idx, 0, 0);
#else
ignore = reg_a;
ignore = reg_b;
ignore = reg_c;
ignore = reg_idx;
#endif
}
};
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
@@ -95,11 +95,33 @@ using get_carrier_t = typename get_carrier<SizeInBytes>::type;
} // namespace detail
__device__ inline uint32_t amd_wave_read_first_lane(uint32_t value)
{
return __builtin_amdgcn_readfirstlane(value);
}
__device__ inline int32_t amd_wave_read_first_lane(int32_t value)
{
return __builtin_amdgcn_readfirstlane(value);
}
__device__ inline int64_t amd_wave_read_first_lane(int64_t value)
{
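// __builtin_amdgcn_readfirstlane operates on 32-bit values, so the 64-bit
// input is split into two 32-bit halves, each half is broadcast from the
// first active lane, and the result is reassembled.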
constexpr unsigned object_size = sizeof(int64_t);
constexpr unsigned second_part_offset = object_size / 2;
auto* const from_obj = reinterpret_cast<const std::byte*>(&value);
alignas(int64_t) std::byte to_obj[object_size];
using Sgpr = uint32_t;
*reinterpret_cast<Sgpr*>(to_obj) =
amd_wave_read_first_lane(*reinterpret_cast<const Sgpr*>(from_obj));
*reinterpret_cast<Sgpr*>(to_obj + second_part_offset) =
amd_wave_read_first_lane(*reinterpret_cast<const Sgpr*>(from_obj + second_part_offset));
return *reinterpret_cast<int64_t*>(to_obj);
}
template <
typename Object,
typename = std::enable_if_t<std::is_class_v<Object> && std::is_trivially_copyable_v<Object>>>
@@ -257,5 +257,87 @@ struct intrin_wmma_i32_16x16x16_iu8_w64<16, 16, neg_a, neg_b, clamp>
}
};
// gfx12
/********************************WAVE32 MODE***********************************************/
#if defined(__gfx1200__) || defined(__gfx1201__)
#define __gfx12__
#endif
// src: fp16, dst: fp32
template <index_t MPerWave, index_t NPerWave>
struct intrin_wmma_f32_16x16x16_f16_w32_gfx12;
template <>
struct intrin_wmma_f32_16x16x16_f16_w32_gfx12<16, 16>
{
template <class FloatC>
__device__ static void Run(const half8_t& reg_a, const half8_t& reg_b, FloatC& reg_c)
{
// * Inline assembly would be needed to eliminate the duplicated data loads;
// the compiler will not delete them for you.
// amd_assembly_wmma_f32_16x16x16_f16_w32(
//     reg_a, reg_b, reg_c.template AsType<float8_t>()(Number<0>{}));
#if defined(__gfx12__)
reg_c.template AsType<float8_t>()(Number<0>{}) =
__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(
reg_a, reg_b, reg_c.template AsType<float8_t>()[Number<0>{}]);
#else
ignore = reg_a;
ignore = reg_b;
ignore = reg_c;
#endif
}
};
// src: bf16, dst: fp32
template <index_t MPerWave, index_t NPerWave>
struct intrin_wmma_f32_16x16x16_bf16_w32_gfx12;
template <>
struct intrin_wmma_f32_16x16x16_bf16_w32_gfx12<16, 16>
{
template <class FloatC>
__device__ static void Run(const bhalf8_t& reg_a, const bhalf8_t& reg_b, FloatC& reg_c)
{
#if defined(__gfx12__)
reg_c.template AsType<float8_t>()(Number<0>{}) =
__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(
reg_a, reg_b, reg_c.template AsType<float8_t>()[Number<0>{}]);
#else
ignore = reg_a;
ignore = reg_b;
ignore = reg_c;
#endif
}
};
// src: iu8, dst: i32
template <index_t MPerWave, index_t NPerWave, bool neg_a, bool neg_b, bool clamp>
struct intrin_wmma_i32_16x16x16_iu8_w32_gfx12;
template <bool neg_a, bool neg_b, bool clamp>
struct intrin_wmma_i32_16x16x16_iu8_w32_gfx12<16, 16, neg_a, neg_b, clamp>
{
template <class FloatC>
__device__ static void Run(const int8x8_t& reg_a, const int8x8_t& reg_b, FloatC& reg_c)
{
#if defined(__gfx12__)
reg_c.template AsType<int32x8_t>()(Number<0>{}) =
__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
neg_a,
bit_cast<int32x2_t>(reg_a),
neg_b,
bit_cast<int32x2_t>(reg_b),
reg_c.template AsType<int32x8_t>()[Number<0>{}],
clamp);
#else
ignore = reg_a;
ignore = reg_b;
ignore = reg_c;
#endif
}
};
} // namespace ck
#endif
......@@ -4,7 +4,7 @@
#pragma once
namespace ck {
// Define the common macro for MI300 models
// Define the common macro for gfx94x models
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
#define __gfx94__
#endif
......
......@@ -36,6 +36,8 @@ struct Array
return *this;
}
__host__ __device__ constexpr const TData* begin() const { return &mData[0]; }
__host__ __device__ constexpr const TData* end() const { return &mData[NSize]; }
};
// empty Array
......
......@@ -203,7 +203,7 @@ struct vector_type<T, 1>
}
};
int static err = 0;
__device__ int static err = 0;
template <typename T>
struct vector_type<T, 2>
{
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstdlib>
#include <cstring>
#include <string>
#include <string_view>
namespace ck {
namespace internal {
template <typename T>
struct ParseEnvVal
{
};
template <>
struct ParseEnvVal<bool>
{
static bool parse_env_var_value(const char* vp)
{
std::string value_env_str{vp};
for(auto& c : value_env_str)
{
if(std::isalpha(c) != 0)
{
c = std::tolower(static_cast<unsigned char>(c));
}
}
if(value_env_str == "disable" || value_env_str == "disabled" || value_env_str == "0" ||
value_env_str == "no" || value_env_str == "off" || value_env_str == "false")
{
return false;
}
else if(value_env_str == "enable" || value_env_str == "enabled" || value_env_str == "1" ||
value_env_str == "yes" || value_env_str == "on" || value_env_str == "true")
{
return true;
}
else
{
throw std::runtime_error("Invalid value for env variable");
}
return false; // shouldn't reach here
}
};
// Supports hexadecimal (leading "0x"), octal (leading "0") and decimal (default) formats.
// Returns 0 if the environment variable is in the wrong format (strtoull fails to parse the
// string).
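// e.g. "26" -> 26 (decimal), "0x1a" -> 26 (hex), "032" -> 26 (octal).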
template <>
struct ParseEnvVal<uint64_t>
{
static uint64_t parse_env_var_value(const char* vp) { return std::strtoull(vp, nullptr, 0); }
};
template <>
struct ParseEnvVal<std::string>
{
static std::string parse_env_var_value(const char* vp) { return std::string{vp}; }
};
template <typename T>
struct EnvVar
{
private:
T value{};
bool is_unset = true;
public:
const T& GetValue() const { return value; }
bool IsUnset() const { return is_unset; }
void Unset() { is_unset = true; }
void UpdateValue(const T& val)
{
is_unset = false;
value = val;
}
explicit EnvVar(const char* const name, const T& def_val)
{
// NOLINTNEXTLINE (concurrency-mt-unsafe)
const char* vp = std::getenv(name);
if(vp != nullptr) // a value was provided
{
is_unset = false;
value = ParseEnvVal<T>::parse_env_var_value(vp);
}
else // no value provided, use default value
{
value = def_val;
}
}
};
} // end namespace internal
// a function-local static hides the variable and provides
// thread-safe initialization/locking
// Used in the global namespace
#define CK_DECLARE_ENV_VAR(name, type, default_val) \
namespace ck::env { \
struct name \
{ \
static_assert(std::is_same_v<name, ::ck::env::name>, \
"CK_DECLARE_ENV* must be used in the global namespace"); \
using value_type = type; \
static ck::internal::EnvVar<type>& Ref() \
{ \
static ck::internal::EnvVar<type> var{#name, default_val}; \
return var; \
} \
}; \
}
#define CK_DECLARE_ENV_VAR_BOOL(name) CK_DECLARE_ENV_VAR(name, bool, false)
#define CK_DECLARE_ENV_VAR_UINT64(name) CK_DECLARE_ENV_VAR(name, uint64_t, 0)
#define CK_DECLARE_ENV_VAR_STR(name) CK_DECLARE_ENV_VAR(name, std::string, "")
#define CK_ENV(name) \
ck::env::name {}
template <class EnvVar>
inline const std::string& EnvGetString(EnvVar)
{
static_assert(std::is_same_v<typename EnvVar::value_type, std::string>);
return EnvVar::Ref().GetValue();
}
template <class EnvVar>
inline bool EnvIsEnabled(EnvVar)
{
static_assert(std::is_same_v<typename EnvVar::value_type, bool>);
return !EnvVar::Ref().IsUnset() && EnvVar::Ref().GetValue();
}
template <class EnvVar>
inline bool EnvIsDisabled(EnvVar)
{
static_assert(std::is_same_v<typename EnvVar::value_type, bool>);
return !EnvVar::Ref().IsUnset() && !EnvVar::Ref().GetValue();
}
template <class EnvVar>
inline uint64_t EnvValue(EnvVar)
{
static_assert(std::is_same_v<typename EnvVar::value_type, uint64_t>);
return EnvVar::Ref().GetValue();
}
template <class EnvVar>
inline bool EnvIsUnset(EnvVar)
{
return EnvVar::Ref().IsUnset();
}
template <class EnvVar>
void EnvUnset(EnvVar)
{
EnvVar::Ref().Unset();
}
/// updates the cached value of an environment variable
template <typename EnvVar, typename ValueType>
void UpdateEnvVar(EnvVar, const ValueType& val)
{
static_assert(std::is_same_v<typename EnvVar::value_type, ValueType>);
EnvVar::Ref().UpdateValue(val);
}
template <typename EnvVar>
void UpdateEnvVar(EnvVar, const std::string_view& val)
{
EnvVar::Ref().UpdateValue(
ck::internal::ParseEnvVal<typename EnvVar::value_type>::parse_env_var_value(val.data()));
}
} // namespace ck
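// Illustrative usage (the variable name CK_EXAMPLE_LOGGING is hypothetical):
//
//   CK_DECLARE_ENV_VAR_BOOL(CK_EXAMPLE_LOGGING)   // at global scope
//
//   if(ck::EnvIsEnabled(CK_ENV(CK_EXAMPLE_LOGGING)))
//   {
//       // reached when e.g. `export CK_EXAMPLE_LOGGING=1` is set
//   }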
......@@ -839,7 +839,7 @@ inline __device__ T rcp(T x)
template <typename T>
inline __device__ T exp(T x)
{
return ck::type_convert<T>(__expf(ck::type_convert<float>(x)));
return ck::type_convert<T>(__ocml_exp_f32(ck::type_convert<float>(x)));
};
template <>
......@@ -851,7 +851,7 @@ inline __device__ half_t exp<half_t>(half_t x)
template <>
inline __device__ float exp<float>(float x)
{
return __expf(x);
return __ocml_exp_f32(x);
};
template <>
......
......@@ -10,12 +10,20 @@ namespace ck {
__device__ void block_sync_lds()
{
#if CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
#ifdef __gfx12__
asm volatile("\
s_wait_dscnt 0x0 \n \
s_barrier_signal -1 \n \
s_barrier_wait -1 \
" ::);
#else
// asm volatile("\
// s_waitcnt lgkmcnt(0) \n \
// s_barrier \
// " ::);
__builtin_amdgcn_s_waitcnt(0xc07f);
__builtin_amdgcn_s_barrier();
#endif
#else
__syncthreads();
#endif
......@@ -23,11 +31,20 @@ __device__ void block_sync_lds()
__device__ void block_sync_lds_direct_load()
{
#ifdef __gfx12__
asm volatile("\
s_wait_vmcnt 0x0 \n \
s_wait_dscnt 0x0 \n \
s_barrier_signal -1 \n \
s_barrier_wait -1 \
" ::);
#else
asm volatile("\
s_waitcnt vmcnt(0) \n \
s_waitcnt lgkmcnt(0) \n \
s_barrier \
" ::);
#endif
}
__device__ void s_nop()
......
......@@ -8,7 +8,7 @@
#include "ck/utility/random_gen.hpp"
namespace ck {
// Define the common macro for MI300 models
// Define the common macro for gfx94x models
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
#define __gfx94__
#endif
......
......@@ -8,6 +8,7 @@
#include "ck_tile/core/algorithm/space_filling_curve.hpp"
#include "ck_tile/core/arch/amd_buffer_addressing.hpp"
#include "ck_tile/core/arch/arch.hpp"
#include "ck_tile/core/arch/generic_memory_space_atomic.hpp"
#include "ck_tile/core/arch/utility.hpp"
#include "ck_tile/core/config.hpp"
#include "ck_tile/core/container/array.hpp"
......@@ -26,6 +27,7 @@
#include "ck_tile/core/numeric/integer.hpp"
#include "ck_tile/core/numeric/integral_constant.hpp"
#include "ck_tile/core/numeric/math.hpp"
#include "ck_tile/core/numeric/null_type.hpp"
#include "ck_tile/core/numeric/numeric.hpp"
#include "ck_tile/core/numeric/type_convert.hpp"
#include "ck_tile/core/numeric/vector_type.hpp"
......@@ -47,10 +49,12 @@
#include "ck_tile/core/tensor/tile_distribution_encoding.hpp"
#include "ck_tile/core/tensor/tile_elementwise.hpp"
#include "ck_tile/core/tensor/tile_window.hpp"
#include "ck_tile/core/tensor/update_tile.hpp"
#include "ck_tile/core/utility/bit_cast.hpp"
#include "ck_tile/core/utility/functional.hpp"
#include "ck_tile/core/utility/ignore.hpp"
#include "ck_tile/core/utility/magic_div.hpp"
#include "ck_tile/core/utility/philox_rand.hpp"
#include "ck_tile/core/utility/random.hpp"
#include "ck_tile/core/utility/to_sequence.hpp"
#include "ck_tile/core/utility/transpose_vectors.hpp"
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......@@ -26,237 +26,346 @@ struct __attribute__((packed)) buffer_resource
CK_TILE_DEVICE int32x4_t make_wave_buffer_resource(const void* ptr, uint32_t size = 0xffffffff)
{
buffer_resource res{ptr, size, CK_TILE_BUFFER_RESOURCE_3RD_DWORD};
return __builtin_bit_cast(int32x4_t, res);
int32x4_t r = __builtin_bit_cast(int32x4_t, res);
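// readfirstlane on each dword marks the descriptor as wave-uniform, so the
// compiler can keep it in SGPRs as buffer instructions require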
r.x = __builtin_amdgcn_readfirstlane(r.x);
r.y = __builtin_amdgcn_readfirstlane(r.y);
r.z = __builtin_amdgcn_readfirstlane(r.z);
r.w = __builtin_amdgcn_readfirstlane(r.w);
return r;
}
namespace impl {
// the types below indicate the payload data type used for buffer-load inline asm
// clang-format off
template<index_t N, typename T> struct buffer_load_trait;
template<typename T> struct buffer_load_trait<16, T> { using payload_t = fp32x4_t; };
template<typename T> struct buffer_load_trait<8 , T> { using payload_t = fp32x2_t; };
template<typename T> struct buffer_load_trait<4 , T> { using payload_t = float; };
template<typename T> struct buffer_load_trait<2 , T> { using payload_t = float; };
template<typename T> struct buffer_load_trait<1 , T> { using payload_t = float; };
#if CK_TILE_BUFFER_LOAD_RAW_BF16_WA
template<> struct buffer_load_trait<16, thread_buffer<bf16_t, 8>> { using payload_t = bf16x8_t; };
template<> struct buffer_load_trait<8 , thread_buffer<bf16_t, 4>> { using payload_t = bf16x4_t; };
template<> struct buffer_load_trait<4 , thread_buffer<bf16_t, 2>> { using payload_t = bf16x2_t; };
#endif
// clang-format on
} // namespace impl
// TODO: glc/slc/...
template <index_t bytes>
template <index_t bytes, bool pre_nop = false>
struct buffer_load;
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wundefined-reinterpret-cast"
// TODO: the strict-aliasing rule seems to fail when reinterpret_cast-ing between
// vector types (exp_vector_type(xxx))
template <>
struct buffer_load<16>
template <bool pre_nop>
struct buffer_load<16, pre_nop>
{
template <typename T>
CK_TILE_DEVICE void operator()(T& value,
int32x4_t res /*buffer resource*/,
index_t v_offset,
index_t s_offset,
index_t /*s_offset*/,
index_t i_offset /*max 0xFFF*/,
index_t /*flag*/ = 0)
index_t /*flag*/ = 0,
bool_constant<pre_nop> = {})
{
static_assert(sizeof(T) == 16);
using mbuf_t = fp32x4_t;
asm volatile("buffer_load_dwordx4 %0, %1, %2, %3 offen offset:%4"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "s"(s_offset), "n"(i_offset)
: "memory");
using mbuf_t = typename impl::buffer_load_trait<16, T>::payload_t;
if constexpr(pre_nop)
asm volatile("s_nop 4\n"
"buffer_load_dwordx4 %0, %1, %2, 0 offen offset:%3"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
else
asm volatile("buffer_load_dwordx4 %0, %1, %2, 0 offen offset:%3"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
}
};
template <>
struct buffer_load<8>
template <bool pre_nop>
struct buffer_load<8, pre_nop>
{
template <typename T>
CK_TILE_DEVICE void operator()(T& value,
int32x4_t res /*buffer resource*/,
index_t v_offset,
index_t s_offset,
index_t /*s_offset*/,
index_t i_offset /*max 0xFFF*/,
index_t /*flag*/ = 0)
index_t /*flag*/ = 0,
bool_constant<pre_nop> = {})
{
static_assert(sizeof(T) == 8);
using mbuf_t = fp32x2_t;
asm volatile("buffer_load_dwordx2 %0, %1, %2, %3 offen offset:%4"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "s"(s_offset), "n"(i_offset)
: "memory");
using mbuf_t = typename impl::buffer_load_trait<8, T>::payload_t;
if constexpr(pre_nop)
asm volatile("s_nop 4\n"
"buffer_load_dwordx2 %0, %1, %2, 0 offen offset:%3"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
else
asm volatile("buffer_load_dwordx2 %0, %1, %2, 0 offen offset:%3"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
}
};
template <>
struct buffer_load<4>
template <bool pre_nop>
struct buffer_load<4, pre_nop>
{
template <typename T>
CK_TILE_DEVICE void operator()(T& value,
int32x4_t res /*buffer resource*/,
index_t v_offset,
index_t s_offset,
index_t /*s_offset*/,
index_t i_offset /*max 0xFFF*/,
index_t /*flag*/ = 0)
index_t /*flag*/ = 0,
bool_constant<pre_nop> = {})
{
static_assert(sizeof(T) == 4);
using mbuf_t = float;
asm volatile("buffer_load_dword %0, %1, %2, %3 offen offset:%4"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "s"(s_offset), "n"(i_offset)
: "memory");
using mbuf_t = typename impl::buffer_load_trait<4, T>::payload_t;
if constexpr(pre_nop)
asm volatile("s_nop 4\n"
"buffer_load_dword %0, %1, %2, 0 offen offset:%3"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
else
asm volatile("buffer_load_dword %0, %1, %2, 0 offen offset:%3"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
}
};
template <>
struct buffer_load<2>
template <bool pre_nop>
struct buffer_load<2, pre_nop>
{
template <typename T>
CK_TILE_DEVICE void operator()(T& value,
int32x4_t res /*buffer resource*/,
index_t v_offset,
index_t s_offset,
index_t /*s_offset*/,
index_t i_offset /*max 0xFFF*/,
index_t /*flag*/ = 0)
index_t /*flag*/ = 0,
bool_constant<pre_nop> = {})
{
static_assert(sizeof(T) == 4); // sub-dword loads are buggy; use a dword buffer and convert manually
using mbuf_t = float;
asm volatile("buffer_load_ushort %0, %1, %2, %3 offen offset:%4"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "s"(s_offset), "n"(i_offset)
: "memory");
using mbuf_t = typename impl::buffer_load_trait<2, T>::payload_t;
if constexpr(pre_nop)
asm volatile("s_nop 4\n"
"buffer_load_ushort %0, %1, %2, 0 offen offset:%3"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
else
asm volatile("buffer_load_ushort %0, %1, %2, 0 offen offset:%3"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
}
};
template <>
struct buffer_load<1>
template <bool pre_nop>
struct buffer_load<1, pre_nop>
{
template <typename T>
CK_TILE_DEVICE void operator()(T& value,
int32x4_t res /*buffer resource*/,
index_t v_offset,
index_t s_offset,
index_t /*s_offset*/,
index_t i_offset /*max 0xFFF*/,
index_t /*flag*/ = 0)
index_t /*flag*/ = 0,
bool_constant<pre_nop> = {})
{
static_assert(sizeof(T) == 4);
using mbuf_t = float;
asm volatile("buffer_load_ubyte %0, %1, %2, %3 offen offset:%4"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "s"(s_offset), "n"(i_offset)
: "memory");
using mbuf_t = typename impl::buffer_load_trait<1, T>::payload_t;
if constexpr(pre_nop)
asm volatile("s_nop 4\n"
"buffer_load_ubyte %0, %1, %2, 0 offen offset:%3"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
else
asm volatile("buffer_load_ubyte %0, %1, %2, 0 offen offset:%3"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
}
};
template <index_t bytes>
template <index_t bytes, bool pre_nop = false>
struct buffer_load_if;
template <>
struct buffer_load_if<16>
template <bool pre_nop>
struct buffer_load_if<16, pre_nop>
{
template <typename T>
CK_TILE_DEVICE void operator()(T& value,
int32x4_t res /*buffer resource*/,
index_t v_offset,
index_t s_offset,
index_t /*s_offset*/,
index_t i_offset /*max 0xFFF*/,
index_t flag = 0)
index_t flag = 0,
bool_constant<pre_nop> = {})
{
static_assert(sizeof(T) == 16);
auto saved_exec = __builtin_amdgcn_read_exec();
using mbuf_t = fp32x4_t;
using mbuf_t = typename impl::buffer_load_trait<16, T>::payload_t;
static_assert(sizeof(mbuf_t) == sizeof(T));
asm volatile(
"v_cmpx_le_u32 exec, 1, %5\n"
"buffer_load_dwordx4 %0, %1, %2, %3 offen offset:%4\n"
"s_mov_b64 exec %6"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "s"(s_offset), "n"(i_offset), "v"(flag), "s"(saved_exec)
: "memory");
if constexpr(pre_nop)
asm volatile("s_nop 4\n"
"v_cmpx_le_u32 exec, 1, %4\n"
"buffer_load_dwordx4 %0, %1, %2, 0 offen offset:%3\n"
"s_mov_b64 exec %5"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
: "memory");
else
asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
"buffer_load_dwordx4 %0, %1, %2, 0 offen offset:%3\n"
"s_mov_b64 exec %5"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
: "memory");
}
};
template <>
struct buffer_load_if<8>
template <bool pre_nop>
struct buffer_load_if<8, pre_nop>
{
template <typename T>
CK_TILE_DEVICE void operator()(T& value,
int32x4_t res /*buffer resource*/,
index_t v_offset,
index_t s_offset,
index_t /*s_offset*/,
index_t i_offset /*max 0xFFF*/,
index_t flag = 0)
index_t flag = 0,
bool_constant<pre_nop> = {})
{
static_assert(sizeof(T) == 8);
auto saved_exec = __builtin_amdgcn_read_exec();
using mbuf_t = fp32x2_t;
asm volatile(
"v_cmpx_le_u32 exec, 1, %5\n"
"buffer_load_dwordx2 %0, %1, %2, %3 offen offset:%4\n"
"s_mov_b64 exec %6"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "s"(s_offset), "n"(i_offset), "v"(flag), "s"(saved_exec)
: "memory");
using mbuf_t = typename impl::buffer_load_trait<8, T>::payload_t;
if constexpr(pre_nop)
asm volatile("s_nop 4\n"
"v_cmpx_le_u32 exec, 1, %4\n"
"buffer_load_dwordx2 %0, %1, %2, 0 offen offset:%3\n"
"s_mov_b64 exec %5"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
: "memory");
else
asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
"buffer_load_dwordx2 %0, %1, %2, 0 offen offset:%3\n"
"s_mov_b64 exec %5"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
: "memory");
}
};
template <>
struct buffer_load_if<4>
template <bool pre_nop>
struct buffer_load_if<4, pre_nop>
{
template <typename T>
CK_TILE_DEVICE void operator()(T& value,
int32x4_t res /*buffer resource*/,
index_t v_offset,
index_t s_offset,
index_t /*s_offset*/,
index_t i_offset /*max 0xFFF*/,
index_t flag = 0)
index_t flag = 0,
bool_constant<pre_nop> = {})
{
static_assert(sizeof(T) == 4);
auto saved_exec = __builtin_amdgcn_read_exec();
using mbuf_t = float;
asm volatile(
"v_cmpx_le_u32 exec, 1, %5\n"
"buffer_load_dword %0, %1, %2, %3 offen offset:%4\n"
"s_mov_b64 exec %6"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "s"(s_offset), "n"(i_offset), "v"(flag), "s"(saved_exec)
: "memory");
using mbuf_t = typename impl::buffer_load_trait<4, T>::payload_t;
if constexpr(pre_nop)
asm volatile("s_nop 4\n"
"v_cmpx_le_u32 exec, 1, %4\n"
"buffer_load_dword %0, %1, %2, 0 offen offset:%3\n"
"s_mov_b64 exec %5"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
: "memory");
else
asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
"buffer_load_dword %0, %1, %2, 0 offen offset:%3\n"
"s_mov_b64 exec %5"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
: "memory");
}
};
template <>
struct buffer_load_if<2>
template <bool pre_nop>
struct buffer_load_if<2, pre_nop>
{
template <typename T>
CK_TILE_DEVICE void operator()(T& value,
int32x4_t res /*buffer resource*/,
index_t v_offset,
index_t s_offset,
index_t /*s_offset*/,
index_t i_offset /*max 0xFFF*/,
index_t flag = 0)
index_t flag = 0,
bool_constant<pre_nop> = {})
{
static_assert(sizeof(T) == 4);
auto saved_exec = __builtin_amdgcn_read_exec();
using mbuf_t = float;
asm volatile(
"v_cmpx_le_u32 exec, 1, %5\n"
"buffer_load_ushort %0, %1, %2, %3 offen offset:%4\n"
"s_mov_b64 exec %6"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "s"(s_offset), "n"(i_offset), "v"(flag), "s"(saved_exec)
: "memory");
using mbuf_t = typename impl::buffer_load_trait<2, T>::payload_t;
if constexpr(pre_nop)
asm volatile("s_nop 4\n"
"v_cmpx_le_u32 exec, 1, %4\n"
"buffer_load_ushort %0, %1, %2, 0 offen offset:%3\n"
"s_mov_b64 exec %5"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
: "memory");
else
asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
"buffer_load_ushort %0, %1, %2, 0 offen offset:%3\n"
"s_mov_b64 exec %5"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
: "memory");
}
};
template <>
struct buffer_load_if<1>
template <bool pre_nop>
struct buffer_load_if<1, pre_nop>
{
template <typename T>
CK_TILE_DEVICE void operator()(T& value,
int32x4_t res /*buffer resource*/,
index_t v_offset,
index_t s_offset,
index_t /*s_offset*/,
index_t i_offset /*max 0xFFF*/,
index_t flag = 0)
index_t flag = 0,
bool_constant<pre_nop> = {})
{
static_assert(sizeof(T) == 4);
auto saved_exec = __builtin_amdgcn_read_exec();
using mbuf_t = float;
asm volatile(
"v_cmpx_le_u32 exec, 1, %5\n"
"buffer_load_ubyte %0, %1, %2, %3 offen offset:%4\n"
"s_mov_b64 exec %6"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "s"(s_offset), "n"(i_offset), "v"(flag), "s"(saved_exec)
: "memory");
using mbuf_t = typename impl::buffer_load_trait<1, T>::payload_t;
if constexpr(pre_nop)
asm volatile("s_nop 4\n"
"v_cmpx_le_u32 exec, 1, %4\n"
"buffer_load_ubyte %0, %1, %2, 0 offen offset:%3\n"
"s_mov_b64 exec %5"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
: "memory");
else
asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
"buffer_load_ubyte %0, %1, %2, 0 offen offset:%3\n"
"s_mov_b64 exec %5"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
: "memory");
}
};
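// The *_if variants above use an exec-mask trick: v_cmpx_le_u32 disables the
// lanes whose `flag` is 0, the buffer load then executes only on the enabled
// lanes, and s_mov_b64 restores the saved exec mask afterwards.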
#pragma clang diagnostic pop // "-Wundefined-reinterpret-cast"
......@@ -270,17 +379,16 @@ struct buffer_store<16>
CK_TILE_DEVICE void operator()(const T& value,
int32x4_t res /*buffer resource*/,
index_t v_offset,
index_t s_offset,
index_t /*s_offset*/,
index_t i_offset /*max 0xFFF*/,
index_t /*flag*/ = 1)
{
static_assert(sizeof(T) == 16);
using mbuf_t = fp32x4_t;
asm volatile(
"buffer_store_dwordx4 %0, %1, %2, %3 offen offset:%4"
:
: "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "s"(s_offset), "n"(i_offset)
: "memory");
asm volatile("buffer_store_dwordx4 %0, %1, %2, 0 offen offset:%3"
:
: "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
}
};
......@@ -291,17 +399,16 @@ struct buffer_store<8>
CK_TILE_DEVICE void operator()(const T& value,
int32x4_t res /*buffer resource*/,
index_t v_offset,
index_t s_offset,
index_t /*s_offset*/,
index_t i_offset /*max 0xFFF*/,
index_t /*flag*/ = 1)
{
static_assert(sizeof(T) == 8);
using mbuf_t = fp32x2_t;
asm volatile(
"buffer_store_dwordx2 %0, %1, %2, %3 offen offset:%4"
:
: "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "s"(s_offset), "n"(i_offset)
: "memory");
asm volatile("buffer_store_dwordx2 %0, %1, %2, 0 offen offset:%3"
:
: "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
}
};
......@@ -312,17 +419,16 @@ struct buffer_store<4>
CK_TILE_DEVICE void operator()(const T& value,
int32x4_t res /*buffer resource*/,
index_t v_offset,
index_t s_offset,
index_t /*s_offset*/,
index_t i_offset /*max 0xFFF*/,
index_t /*flag*/ = 1)
{
static_assert(sizeof(T) == 4);
using mbuf_t = float;
asm volatile(
"buffer_store_dword %0, %1, %2, %3 offen offset:%4"
:
: "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "s"(s_offset), "n"(i_offset)
: "memory");
asm volatile("buffer_store_dword %0, %1, %2, 0 offen offset:%3"
:
: "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
}
};
......@@ -333,17 +439,16 @@ struct buffer_store<2>
CK_TILE_DEVICE void operator()(const T& value,
int32x4_t res /*buffer resource*/,
index_t v_offset,
index_t s_offset,
index_t /*s_offset*/,
index_t i_offset /*max 0xFFF*/,
index_t /*flag*/ = 1)
{
static_assert(sizeof(T) == 2);
using mbuf_t = short;
asm volatile(
"buffer_store_short %0, %1, %2, %3 offen offset:%4"
:
: "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "s"(s_offset), "n"(i_offset)
: "memory");
asm volatile("buffer_store_short %0, %1, %2, 0 offen offset:%3"
:
: "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
}
};
......@@ -354,17 +459,16 @@ struct buffer_store<1>
CK_TILE_DEVICE void operator()(const T& value,
int32x4_t res /*buffer resource*/,
index_t v_offset,
index_t s_offset,
index_t /*s_offset*/,
index_t i_offset /*max 0xFFF*/,
index_t /*flag*/ = 1)
{
static_assert(sizeof(T) == 4);
using mbuf_t = float;
asm volatile(
"buffer_store_byte %0, %1, %2, %3 offen offset:%4"
:
: "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "s"(s_offset), "n"(i_offset)
: "memory");
asm volatile("buffer_store_byte %0, %1, %2, 0 offen offset:%3"
:
: "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
}
};
......@@ -378,21 +482,20 @@ struct buffer_store_if<16>
CK_TILE_DEVICE void operator()(const T& value,
int32x4_t res /*buffer resource*/,
index_t v_offset,
index_t s_offset,
index_t /*s_offset*/,
index_t i_offset /*max 0xFFF*/,
index_t flag = 1)
{
static_assert(sizeof(T) == 16);
auto save_exec = __builtin_amdgcn_read_exec();
using mbuf_t = fp32x4_t;
asm volatile("v_cmpx_le_u32 exec, 1, %5\n"
"buffer_store_dwordx4 %0, %1, %2, %3 offen offset:%4\n"
"s_mov_b64 exec %6"
asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
"buffer_store_dwordx4 %0, %1, %2, 0 offen offset:%3\n"
"s_mov_b64 exec %5"
:
: "v"(bit_cast<mbuf_t>(value)),
"v"(v_offset),
"s"(res),
"s"(s_offset),
"n"(i_offset),
"v"(flag),
"s"(save_exec)
......@@ -407,7 +510,7 @@ struct buffer_store_if<8>
CK_TILE_DEVICE void operator()(const T& value,
int32x4_t res /*buffer resource*/,
index_t v_offset,
index_t s_offset,
index_t /*s_offset*/,
index_t i_offset /*max 0xFFF*/,
index_t flag = 1)
{
......@@ -415,14 +518,13 @@ struct buffer_store_if<8>
auto save_exec = __builtin_amdgcn_read_exec();
// TODO: ugly. rocm-6.0/6.1 seems to need a bit_cast to the same base type to avoid scratch
using mbuf_t = ext_vector_t<typename T::value_type, T::size()>;
asm volatile("v_cmpx_le_u32 exec, 1, %5\n"
"buffer_store_dwordx2 %0, %1, %2, %3 offen offset:%4\n"
"s_mov_b64 exec %6"
asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
"buffer_store_dwordx2 %0, %1, %2, 0 offen offset:%3\n"
"s_mov_b64 exec %5"
:
: "v"(bit_cast<mbuf_t>(value)),
"v"(v_offset),
"s"(res),
"s"(s_offset),
"n"(i_offset),
"v"(flag),
"s"(save_exec)
......@@ -437,21 +539,20 @@ struct buffer_store_if<4>
CK_TILE_DEVICE void operator()(const T& value,
int32x4_t res /*buffer resource*/,
index_t v_offset,
index_t s_offset,
index_t /*s_offset*/,
index_t i_offset /*max 0xFFF*/,
index_t flag = 1)
{
static_assert(sizeof(T) == 4);
auto save_exec = __builtin_amdgcn_read_exec();
using mbuf_t = float;
asm volatile("v_cmpx_le_u32 exec, 1, %5\n"
"buffer_store_dword %0, %1, %2, %3 offen offset:%4\n"
"s_mov_b64 exec %6"
asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
"buffer_store_dword %0, %1, %2, 0 offen offset:%3\n"
"s_mov_b64 exec %5"
:
: "v"(bit_cast<mbuf_t>(value)),
"v"(v_offset),
"s"(res),
"s"(s_offset),
"n"(i_offset),
"v"(flag),
"s"(save_exec)
......@@ -466,21 +567,20 @@ struct buffer_store_if<2>
CK_TILE_DEVICE void operator()(const T& value,
int32x4_t res /*buffer resource*/,
index_t v_offset,
index_t s_offset,
index_t /*s_offset*/,
index_t i_offset /*max 0xFFF*/,
index_t flag = 1)
{
static_assert(sizeof(T) == 2);
auto save_exec = __builtin_amdgcn_read_exec();
using mbuf_t = short;
asm volatile("v_cmpx_le_u32 exec, 1, %5\n"
"buffer_store_short %0, %1, %2, %3 offen offset:%4\n"
"s_mov_b64 exec %6"
asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
"buffer_store_short %0, %1, %2, 0 offen offset:%3\n"
"s_mov_b64 exec %5"
:
: "v"(bit_cast<mbuf_t>(value)),
"v"(v_offset),
"s"(res),
"s"(s_offset),
"n"(i_offset),
"v"(flag),
"s"(save_exec)
......@@ -495,21 +595,20 @@ struct buffer_store_if<1>
CK_TILE_DEVICE void operator()(const T& value,
int32x4_t res /*buffer resource*/,
index_t v_offset,
index_t s_offset,
index_t /*s_offset*/,
index_t i_offset /*max 0xFFF*/,
index_t flag = 1)
{
static_assert(sizeof(T) == 4);
auto save_exec = __builtin_amdgcn_read_exec();
using mbuf_t = float;
asm volatile("v_cmpx_le_u32 exec, 1, %5\n"
"buffer_store_byte %0, %1, %2, %3 offen offset:%4\n"
"s_mov_b64 exec %6"
asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
"buffer_store_byte %0, %1, %2, 0 offen offset:%3\n"
"s_mov_b64 exec %5"
:
: "v"(bit_cast<mbuf_t>(value)),
"v"(v_offset),
"s"(res),
"s"(s_offset),
"n"(i_offset),
"v"(flag),
"s"(save_exec)
......@@ -533,8 +632,9 @@ namespace impl{
template<index_t N>
CK_TILE_DEVICE void insert_dummy_dep_per_dword(array<float, N>& b)
{
static_for<0, b.size(), 1>{}([&](auto i){
asm volatile(" " : : "v"(b.get(i)) : "memory");
constexpr auto kSize = remove_cvref_t<decltype(b)>::size();
static_for<0, kSize, 1>{}([&](auto i){
asm volatile(" " : : "v"(b.get(number<i>{})) : "memory");
});
}
#if 1
......@@ -764,6 +864,28 @@ llvm_amdgcn_raw_buffer_store_i32(int32_t vdata,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i32");
// buffer store ui16
CK_TILE_DEVICE_EXTERN void
llvm_amdgcn_raw_buffer_store_ui16(uint16_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16");
CK_TILE_DEVICE_EXTERN void
llvm_amdgcn_raw_buffer_store_ui16x2(uint16x2_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16");
CK_TILE_DEVICE_EXTERN void
llvm_amdgcn_raw_buffer_store_ui16x4(uint16x4_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16");
CK_TILE_DEVICE_EXTERN void
llvm_amdgcn_raw_buffer_store_i32x2(int32x2_t vdata,
int32x4_t rsrc,
......@@ -854,17 +976,26 @@ llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata,
int soffset, // dst_wave_addr_offset
int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64");
CK_TILE_DEVICE void async_buffer_load_dword(void* smem,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t ioffset /*max 0xFFF*/,
index_t /*flag*/ = 0)
template <bool pre_nop = false>
CK_TILE_DEVICE void async_buffer_load_dword_v(void* smem,
int32x4_t rsrc,
index_t voffset,
index_t /*soffset*/,
index_t ioffset /*max 0xFFF*/,
index_t /*flag*/ = 0,
bool_constant<pre_nop> = {})
{
asm volatile("buffer_load_dword %1, %2, %3 offen offset:%4 lds"
: "=r"(smem) /*dummy dependency for smem*/
: "v"(voffset), "s"(rsrc), "s"(soffset), "n"(ioffset)
: "memory");
if constexpr(pre_nop)
asm volatile("s_nop 4\n"
"buffer_load_dword %1, %2, 0 offen offset:%3 lds"
: "=r"(smem) /*dummy dependency for smem*/
: "v"(voffset), "s"(rsrc), "n"(ioffset)
: "memory");
else
asm volatile("buffer_load_dword %1, %2, 0 offen offset:%3 lds"
: "=r"(smem) /*dummy dependency for smem*/
: "v"(voffset), "s"(rsrc), "n"(ioffset)
: "memory");
}
CK_TILE_DEVICE void async_buffer_load_fence(index_t cnt = 0)
......@@ -1176,12 +1307,14 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
template <typename T,
index_t N,
amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default,
bool oob_conditional_check = true>
bool oob_conditional_check = true,
bool pre_nop = false>
CK_TILE_DEVICE void amd_buffer_load_raw_impl(thread_buffer<T, N>& dst,
int32x4_t src_wave_buffer_resource,
index_t src_thread_addr_offset,
index_t src_wave_addr_offset,
index_t flag = 0)
index_t flag = 0,
bool_constant<pre_nop> = {})
{
constexpr index_t bytes = sizeof(T) * N;
static_assert(bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8 || bytes == 16,
......@@ -1190,32 +1323,46 @@ CK_TILE_DEVICE void amd_buffer_load_raw_impl(thread_buffer<T, N>& dst,
using type = thread_buffer<T, N>;
if constexpr(oob_conditional_check)
{
buffer_load_if<sizeof(type)>{}(
dst, src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0, flag);
buffer_load_if<sizeof(type), pre_nop>{}(dst,
src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
0,
flag,
bool_constant<pre_nop>{});
}
else
{
buffer_load<sizeof(type)>{}(
dst, src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0, flag);
buffer_load<sizeof(type), pre_nop>{}(dst,
src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
0,
flag,
bool_constant<pre_nop>{});
}
}
template <typename T,
index_t N,
amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default,
bool pre_nop = false>
CK_TILE_DEVICE void amd_async_buffer_load_impl(T* smem,
int32x4_t src_wave_buffer_resource,
index_t src_thread_addr_offset,
index_t src_wave_addr_offset,
index_t src_immediate_addr_offset = 0)
index_t src_immediate_addr_offset = 0,
bool_constant<pre_nop> = {})
{
static_assert(sizeof(T) * N == 4, "wrong! not implemented vector size");
async_buffer_load_dword(smem,
src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
src_immediate_addr_offset);
async_buffer_load_dword_v(smem,
src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
src_immediate_addr_offset,
0,
bool_constant<pre_nop>{});
}
template <index_t N,
......@@ -1334,7 +1481,10 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
(N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
(std::is_same<T, fp8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
(std::is_same<T, bf8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
(std::is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
(std::is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
(std::is_same<T, uint16_t>::value &&
(N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
(std::is_same<T, uint8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
"wrong! not implemented");
if constexpr(std::is_same<T, float>::value) // fp32
......@@ -1473,6 +1623,49 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
static_cast<index_t>(coherence));
}
}
else if constexpr(std::is_same<T, uint16_t>::value)
{
if constexpr(N == 1)
{
llvm_amdgcn_raw_buffer_store_ui16(bit_cast<uint16_t>(src_thread_data),
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
}
else if constexpr(N == 2)
{
llvm_amdgcn_raw_buffer_store_ui16x2(bit_cast<uint16x2_t>(src_thread_data),
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
}
else if constexpr(N == 4)
{
llvm_amdgcn_raw_buffer_store_ui16x4(bit_cast<uint16x4_t>(src_thread_data),
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
}
else if constexpr(N == 8)
{
llvm_amdgcn_raw_buffer_store_ui16x4(
src_thread_data.template get_as<uint16x4_t>()[number<0>{}],
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
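// second half of the 8-element vector, stored 8 bytes past the first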
llvm_amdgcn_raw_buffer_store_ui16x4(
src_thread_data.template get_as<uint16x4_t>()[number<1>{}],
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset + 4 * sizeof(uint16_t),
static_cast<index_t>(coherence));
}
}
else
{
using r_t = thread_buffer<int8_t, sizeof(T) * N>;
......@@ -1590,7 +1783,7 @@ CK_TILE_DEVICE void amd_buffer_atomic_add_impl(const thread_buffer<T, N>& src_th
{
if constexpr(N == 2)
{
llvm_amdgcn_raw_buffer_atomic_add_fp16x2(bit_cast<fp16_t>(src_thread_data),
llvm_amdgcn_raw_buffer_atomic_add_fp16x2(bit_cast<fp16x2_t>(src_thread_data),
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
......@@ -1816,20 +2009,50 @@ amd_buffer_load_invalid_element_return_customized_value(const T* p_src_wave,
template <typename T,
index_t N,
amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default,
bool oob_conditional_check = true>
bool oob_conditional_check = true,
bool pre_nop = false>
CK_TILE_DEVICE void amd_buffer_load_raw(thread_buffer<T, N>& dst,
const T* p_src_wave,
index_t src_thread_element_offset,
index_t src_element_space_size,
index_t is_valid_element = 0)
index_t is_valid_element = 0,
bool_constant<pre_nop> = {})
{
const int32x4_t src_wave_buffer_resource =
make_wave_buffer_resource(p_src_wave, src_element_space_size * sizeof(T));
index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
amd_buffer_load_raw_impl<T, N, coherence, oob_conditional_check>(
dst, src_wave_buffer_resource, src_thread_addr_offset, 0, is_valid_element);
amd_buffer_load_raw_impl<T, N, coherence, oob_conditional_check, pre_nop>(
dst,
src_wave_buffer_resource,
src_thread_addr_offset,
0,
is_valid_element,
bool_constant<pre_nop>{});
}
// This version supports a buffer resource as the input argument
template <typename T,
index_t N,
amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default,
bool oob_conditional_check = true,
bool pre_nop = false>
CK_TILE_DEVICE void amd_buffer_load_raw(thread_buffer<T, N>& dst,
const int32x4_t src_wave_buffer_resource,
index_t src_thread_element_offset,
index_t is_valid_element = 0,
bool_constant<pre_nop> = {})
{
index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
amd_buffer_load_raw_impl<T, N, coherence, oob_conditional_check, pre_nop>(
dst,
src_wave_buffer_resource,
src_thread_addr_offset,
0,
is_valid_element,
bool_constant<pre_nop>{});
}
// unfortunately, async copy cannot guarantee that invalid data is zero inside LDS
......@@ -1838,11 +2061,13 @@ CK_TILE_DEVICE void amd_buffer_load_raw(thread_buffer<T, N>& dst,
// buffer_load OOB handling still works.
template <typename T,
index_t N,
amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
CK_TILE_DEVICE void amd_async_buffer_load_with_oob(T* smem,
const T* p_src_wave,
index_t src_thread_element_offset,
index_t src_element_space_size)
amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default,
bool pre_nop = false>
CK_TILE_DEVICE void amd_async_buffer_load_with_oob_raw(T* smem,
const T* p_src_wave,
index_t src_thread_element_offset,
index_t src_element_space_size,
bool_constant<pre_nop> = {})
{
const int32x4_t src_wave_buffer_resource =
make_wave_buffer_resource(p_src_wave, src_element_space_size * sizeof(T));
......@@ -1850,7 +2075,23 @@ CK_TILE_DEVICE void amd_async_buffer_load_with_oob(T* smem,
index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
amd_async_buffer_load_impl<T, N, coherence>(
smem, src_wave_buffer_resource, src_thread_addr_offset, 0, 0);
smem, src_wave_buffer_resource, src_thread_addr_offset, 0, 0, bool_constant<pre_nop>{});
}
// This version supports a buffer resource as the input argument
template <typename T,
index_t N,
amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default,
bool pre_nop = false>
CK_TILE_DEVICE void amd_async_buffer_load_with_oob_raw(T* smem,
const int32x4_t src_wave_buffer_resource,
index_t src_thread_element_offset,
bool_constant<pre_nop> = {})
{
index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
amd_async_buffer_load_impl<T, N, coherence>(
smem, src_wave_buffer_resource, src_thread_addr_offset, 0, 0, bool_constant<pre_nop>{});
}
// buffer_store requires:
......@@ -2016,7 +2257,8 @@ CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr,
asm volatile("s_mov_b32 m0, %0; \n\t"
"buffer_load_dword %1, %2, 0 offen lds;\n\t" ::"s"(lds_ptr_sgpr),
"v"(global_offset_bytes),
"s"(src_resource));
"s"(src_resource)
: "memory");
#else
// LDS pointer must be attributed with the LDS address space.
__attribute__((address_space(3))) uint32_t* lds_ptr =
......
......@@ -61,10 +61,13 @@ CK_TILE_DEVICE index_t get_block_id() { return blockIdx.x; }
CK_TILE_DEVICE void block_sync_lds()
{
#if CK_TILE_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
asm volatile("\
s_waitcnt lgkmcnt(0) \n \
s_barrier \
" ::);
// asm volatile("\
// s_waitcnt lgkmcnt(0) \n \
// s_barrier \
// " ::);
__builtin_amdgcn_s_waitcnt(0xc07f);
__builtin_amdgcn_s_barrier();
#else
__syncthreads();
#endif
......@@ -79,14 +82,12 @@ CK_TILE_DEVICE void block_sync_lds_direct_load()
" ::);
}
CK_TILE_DEVICE void s_nop()
CK_TILE_DEVICE void s_nop(index_t cnt = 0)
{
#if 1
asm volatile("\
s_nop 0 \n \
" ::);
asm volatile("s_nop %0" : : "n"(cnt) :);
#else
__builtin_amdgcn_sched_barrier(0);
__builtin_amdgcn_sched_barrier(cnt);
#endif
}
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core/numeric/vector_type.hpp"
#include "ck_tile/core/numeric/type_convert.hpp"
#include "ck_tile/core/container/thread_buffer.hpp"
namespace ck_tile {
CK_TILE_HOST_DEVICE bf16_t add_bf16_t(const bf16_t& a, const bf16_t& b)
{
return type_convert<bf16_t>(type_convert<float>(a) + type_convert<float>(b));
}
CK_TILE_HOST_DEVICE bf16x2_t add_bf16x2_t(const bf16x2_t& a, const bf16x2_t& b)
{
bf16x2_t rtn;
rtn[0] = add_bf16_t(a[0], b[0]);
rtn[1] = add_bf16_t(a[1], b[1]);
return rtn;
}
// Caution: DO NOT REMOVE
// intentionally has only a declaration but no definition, to cause a compilation failure when
// trying to instantiate this template. The purpose is to make the implementation of atomic_add
// explicit for each data type.
template <typename X>
CK_TILE_DEVICE void atomic_add(X* p_dst, const X& x);
template <>
CK_TILE_DEVICE void atomic_add<bf16x2_t>(bf16x2_t* p_dst, const bf16x2_t& x)
{
union U32BF162_ADDR
{
uint32_t* u32_a;
bf16x2_t* bf162_a;
};
union U32BF162
{
uint32_t u32;
bf16x2_t bf162;
};
U32BF162_ADDR dword_addr;
U32BF162 cur_v;
U32BF162 new_;
uint32_t old_v, new_v;
dword_addr.bf162_a = p_dst;
cur_v.u32 = *dword_addr.u32_a;
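// emulate the missing packed-bf16 atomic with a CAS loop: add in registers,
// try to publish with atomicCAS, and retry if another thread updated the
// dword in the meantime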
do
{
old_v = cur_v.u32;
new_.bf162 = add_bf16x2_t(cur_v.bf162, x);
new_v = new_.u32;
cur_v.u32 = atomicCAS(dword_addr.u32_a, old_v, new_v);
} while(cur_v.u32 != old_v);
}
template <typename T, index_t N>
CK_TILE_DEVICE void atomic_add_g(T* p_dst, const thread_buffer<T, N>& x)
{
static_assert((std::is_same<T, int32_t>::value && (N == 1)) ||
(std::is_same<T, uint32_t>::value && (N == 1)) ||
(std::is_same<T, float>::value && (N == 1 || N == 2)) ||
(std::is_same<T, double>::value && (N == 1 || N == 2)) ||
(std::is_same<T, bf16_t>::value && (N == 2 || N == 4)),
"wrong! not implemented");
constexpr auto I0 = number<0>{};
constexpr auto I1 = number<1>{};
if constexpr(std::is_same<T, float>::value)
{
if constexpr(N == 1)
{
atomicAdd(p_dst, bit_cast<float>(x));
}
else if constexpr(N == 2)
{
atomicAdd(c_style_pointer_cast<float*>(p_dst), x.template get_as<float>()[I0]);
atomicAdd(c_style_pointer_cast<float*>(p_dst) + 1, x.template get_as<float>()[I1]);
}
}
else if constexpr(std::is_same<T, double>::value)
{
if constexpr(N == 1)
{
return atomicAdd(p_dst, bit_cast<double>(x));
}
else if constexpr(N == 2)
{
atomicAdd(c_style_pointer_cast<double*>(p_dst), x.template get_as<double>()[I0]);
atomicAdd(c_style_pointer_cast<double*>(p_dst) + 1, x.template get_as<double>()[I1]);
}
}
else if constexpr(std::is_same<T, int32_t>::value)
{
if constexpr(N == 1)
{
atomicAdd(p_dst, bit_cast<int32_t>(x));
}
}
else if constexpr(std::is_same<T, uint32_t>::value)
{
if constexpr(N == 1)
{
atomicAdd(p_dst, bit_cast<uint32_t>(x));
}
}
else if constexpr(std::is_same<T, bf16_t>::value)
{
if constexpr(N == 2)
{
atomic_add(c_style_pointer_cast<bf16x2_t*>(p_dst), bit_cast<bf16x2_t>(x));
}
else if constexpr(N == 4)
{
atomic_add(c_style_pointer_cast<bf16x2_t*>(p_dst), x.template get_as<bf16x2_t>()[I0]);
atomic_add(c_style_pointer_cast<bf16x2_t*>(p_dst) + 1,
x.template get_as<bf16x2_t>()[I1]);
}
}
}
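// Illustrative usage (buffer contents are hypothetical): accumulate two
// packed floats per thread into global memory; this dispatches to two
// scalar atomicAdd operations on consecutive float addresses:
//
//   thread_buffer<float, 2> acc = /* per-thread partial sums */;
//   atomic_add_g(p_partial_sums, acc);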
template <typename T, index_t N>
CK_TILE_DEVICE void atomic_max_g(T* p_dst, const thread_buffer<T, N>& x)
{
static_assert((std::is_same<T, int32_t>::value && (N == 1)) ||
(std::is_same<T, uint32_t>::value && (N == 1)) ||
(std::is_same<T, float>::value && (N == 1 || N == 2)) ||
(std::is_same<T, double>::value && (N == 1)),
"wrong! not implemented");
constexpr auto I0 = number<0>{};
constexpr auto I1 = number<1>{};
if constexpr(std::is_same<T, float>::value)
{
if constexpr(N == 1)
{
atomicMax(p_dst, bit_cast<float>(x));
}
else if constexpr(N == 2)
{
atomicMax(c_style_pointer_cast<float*>(p_dst), x.template get_as<float>()[I0]);
atomicMax(c_style_pointer_cast<float*>(p_dst) + 1, x.template get_as<float>()[I1]);
}
}
else if constexpr(std::is_same<T, double>::value)
{
if constexpr(N == 1)
{
atomicMax(p_dst, bit_cast<double>(x));
}
}
else if constexpr(std::is_same<T, int32_t>::value)
{
if constexpr(N == 1)
{
atomicMax(p_dst, bit_cast<int32_t>(x));
}
}
else if constexpr(std::is_same<T, uint32_t>::value)
{
if constexpr(N == 1)
{
atomicMax(p_dst, bit_cast<uint32_t>(x));
}
}
}
} // namespace ck_tile
......@@ -3,6 +3,25 @@
#pragma once
#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
defined(__gfx942__)
#define __gfx9__
#endif
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
#define __gfx94__
#endif
#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || \
defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__)
#define __gfx103__
#endif
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__)
#define __gfx11__
#endif
#if defined(__gfx1200__) || defined(__gfx1201__)
#define __gfx12__
#endif
#include "hip/hip_version.h"
#ifndef CK_TILE_DONT_USE_HIP_RUNTIME_HEADERS
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
......@@ -109,15 +128,13 @@
// buffer atomic add: floating point
#ifndef __HIP_DEVICE_COMPILE__ // for host code
#define CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1
#elif defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
defined(__gfx942__) // for GPU code
#elif defined(__gfx9__) // for GPU code
#define CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1
#else // for GPU code
#define CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 0
#endif
#if(defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
defined(__gfx942__)) // for GPU code
#if(defined(__gfx90a__) || defined(__gfx94__)) // for GPU code
#define CK_TILE_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 1
#else
#define CK_TILE_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 0
......@@ -131,19 +148,26 @@
#define CK_TILE_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE 1
#endif
#ifndef CK_TILE_WORKAROUND_ROCM_6_1_SCRATCH_MEMORY_ISSUE
#if HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR == 1 && HIP_VERSION_PATCH >= 40091
#define CK_TILE_WORKAROUND_ROCM_6_1_SCRATCH_MEMORY_ISSUE 1
#else
#define CK_TILE_WORKAROUND_ROCM_6_1_SCRATCH_MEMORY_ISSUE 0
#endif
#endif
#ifndef CK_TILE_DEBUG_LOG
#define CK_TILE_DEBUG_LOG 0
#endif
#ifndef __HIP_DEVICE_COMPILE__ // for host code
#define CK_TILE_BUFFER_RESOURCE_3RD_DWORD 0xffffffff
#elif defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || defined(__gfx908__) || \
defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
defined(__gfx942__) // for GPU code
#elif defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || \
defined(__gfx9__) // for GPU code
#define CK_TILE_BUFFER_RESOURCE_3RD_DWORD 0x00020000
#elif defined(__gfx1030__) // for GPU code
#elif defined(__gfx103__) // for GPU code
#define CK_TILE_BUFFER_RESOURCE_3RD_DWORD 0x31014000
#elif defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) // for GPU code
#elif defined(__gfx11__) || defined(__gfx12__) // for GPU code
#define CK_TILE_BUFFER_RESOURCE_3RD_DWORD 0x31004000
#endif
......@@ -154,3 +178,16 @@
#ifndef CK_TILE_USE_SUBDWORD_TILE_CAST
#define CK_TILE_USE_SUBDWORD_TILE_CAST 0
#endif
#ifndef CK_TILE_USE_PK_FP16_TILE_CAST
#define CK_TILE_USE_PK_FP16_TILE_CAST 0
#endif
// TODO: better solve this inside compiler
#ifndef CK_TILE_FMHA_FWD_FAST_EXP2
#define CK_TILE_FMHA_FWD_FAST_EXP2 0
#endif
#ifndef CK_TILE_BUFFER_LOAD_RAW_BF16_WA
#define CK_TILE_BUFFER_LOAD_RAW_BF16_WA 1
#endif
......@@ -331,7 +331,10 @@ bfloat16_t sqrt(bfloat16_t x)
};
CK_TILE_DEVICE
bfloat16_t exp(bfloat16_t x) { return static_cast<bfloat16_t>(__expf(static_cast<float>(x))); };
bfloat16_t exp(bfloat16_t x)
{
return static_cast<bfloat16_t>(__ocml_exp_f32(static_cast<float>(x)));
};
CK_TILE_DEVICE
bfloat16_t exp2(bfloat16_t x) { return static_cast<bfloat16_t>(exp2f(static_cast<float>(x))); };
......
......@@ -55,7 +55,7 @@ struct alignas(1) float8_e4m3_t
{
static constexpr int exponent = 4;
static constexpr int mantissa = 3;
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
#if defined(__gfx94__)
static constexpr int bias = 1 << (exponent - 1); // NANOO
#else
static constexpr int bias = (1 << (exponent - 1)) - 1; // IEEE
......@@ -113,7 +113,7 @@ struct alignas(1) float8_e5m2_t
{
static constexpr int exponent = 5;
static constexpr int mantissa = 2;
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
#if defined(__gfx94__)
static constexpr int bias = 1 << (exponent - 1); // NANOO
#else
static constexpr int bias = (1 << (exponent - 1)) - 1; // IEEE
......@@ -470,7 +470,7 @@ CK_TILE_HOST_DEVICE fp8_raw_t float_to_fp8_sr_raw(float x)
{
constexpr int seed = 42;
uint32_t rng = prand_generator_t<float, seed>{}(reinterpret_cast<uintptr_t>(&x), x);
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
#if defined(__gfx94__)
float max_fp8 = 240.0f;
x = x > max_fp8 ? max_fp8 : (x < -max_fp8 ? -max_fp8 : x);
union
......@@ -500,7 +500,7 @@ CK_TILE_HOST_DEVICE bf8_raw_t float_to_bf8_sr_raw(float x)
{
constexpr int seed = 42;
uint32_t rng = prand_generator_t<float, seed>{}(reinterpret_cast<uintptr_t>(&x), x);
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
#if defined(__gfx94__)
union
{
float fval;
......@@ -526,7 +526,7 @@ CK_TILE_HOST_DEVICE bf8_raw_t float_to_bf8_sr_raw(float x)
CK_TILE_HOST_DEVICE fp8_raw_t float_to_fp8_rtn_raw(float x)
{
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
#if defined(__gfx94__)
float max_fp8 = 240.0f;
x = x > max_fp8 ? max_fp8 : (x < -max_fp8 ? -max_fp8 : x);
union
......@@ -554,7 +554,7 @@ CK_TILE_HOST_DEVICE fp8_raw_t float_to_fp8_rtn_raw(float x)
}
CK_TILE_HOST_DEVICE bf8_raw_t float_to_bf8_rtn_raw(float x)
{
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
#if defined(__gfx94__)
union
{
float fval;
......@@ -598,7 +598,7 @@ CK_TILE_HOST_DEVICE bf8_raw_t float_to_bf8_raw(float x, constant<rounding>)
CK_TILE_HOST_DEVICE float fp8_to_float_raw(fp8_raw_t x)
{
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
#if defined(__gfx94__)
float fval;
uint32_t i32val = static_cast<uint32_t>(x);
fval = __builtin_amdgcn_cvt_f32_fp8(i32val, 0);
......@@ -612,7 +612,7 @@ CK_TILE_HOST_DEVICE float fp8_to_float_raw(fp8_raw_t x)
CK_TILE_HOST_DEVICE float bf8_to_float_raw(bf8_raw_t x)
{
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
#if defined(__gfx94__)
float fval;
uint32_t i32val = static_cast<uint32_t>(x);
fval = __builtin_amdgcn_cvt_f32_bf8(i32val, 0);
......@@ -656,7 +656,7 @@ struct numeric_traits<fp8_t>
{
static constexpr int exp = 4;
static constexpr int mant = 3;
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
#if defined(__gfx94__)
static constexpr int bias = 8;
#else
static constexpr int bias = 7;
......@@ -668,7 +668,7 @@ struct numeric_traits<bf8_t>
{
static constexpr int exp = 5;
static constexpr int mant = 2;
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
#if defined(__gfx94__)
static constexpr int bias = 16;
#else
static constexpr int bias = 15; // IEEE
......@@ -835,7 +835,7 @@ CK_TILE_DEVICE
fp8_t sqrt(fp8_t x) { return static_cast<fp8_t>(__builtin_amdgcn_sqrtf(static_cast<float>(x))); };
CK_TILE_DEVICE
fp8_t exp(fp8_t x) { return static_cast<fp8_t>(__expf(static_cast<float>(x))); };
fp8_t exp(fp8_t x) { return static_cast<fp8_t>(__ocml_exp_f32(static_cast<float>(x))); };
CK_TILE_DEVICE
fp8_t exp2(fp8_t x) { return static_cast<fp8_t>(exp2f(static_cast<float>(x))); };
......@@ -860,7 +860,7 @@ CK_TILE_DEVICE
bf8_t sqrt(bf8_t x) { return static_cast<bf8_t>(__builtin_amdgcn_sqrtf(static_cast<float>(x))); };
CK_TILE_DEVICE
bf8_t exp(bf8_t x) { return static_cast<bf8_t>(__expf(static_cast<float>(x))); };
bf8_t exp(bf8_t x) { return static_cast<bf8_t>(__ocml_exp_f32(static_cast<float>(x))); };
CK_TILE_DEVICE
bf8_t exp2(bf8_t x) { return static_cast<bf8_t>(exp2f(static_cast<float>(x))); };
......
......@@ -129,8 +129,8 @@ constexpr double fp16_to_double_hip(const fp16_hip_t& x)
CK_TILE_HOST_DEVICE
constexpr fp16_hip_t float_to_fp16_hip(const float& x)
{
return __float2half(x);
// return static_cast<fp16_hip_t>(x);
// return __float2half(x);
return static_cast<fp16_hip_t>(x);
}
CK_TILE_HOST_DEVICE
......@@ -374,7 +374,7 @@ half_t sqrt(half_t x)
};
CK_TILE_DEVICE
half_t exp(half_t x) { return static_cast<half_t>(__expf(static_cast<float>(x))); };
half_t exp(half_t x) { return static_cast<half_t>(__ocml_exp_f32(static_cast<float>(x))); };
CK_TILE_DEVICE
half_t exp2(half_t x) { return static_cast<half_t>(exp2f(static_cast<float>(x))); };
......