Commit 0eb75e21 authored by carlushuang

Merge remote-tracking branch 'origin/develop' into ck_tile/moe

parents 1b4b640b c8b6b642
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......@@ -258,7 +258,7 @@ struct GridwiseMultipleReduction_mk_to_m_multiblock
if(thread_k_cluster_id == 0)
{
if(block_group_size == 0 && !float_equal_zero{}(beta_values[iR]))
if(!float_equal_zero{}(beta_values[iR]))
{
StaticBuffer<AddressSpaceEnum::Vgpr, OutDataType, MThreadSliceSize, true>
priorDstValueBuf;
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......@@ -244,7 +244,7 @@ struct GridwiseReduction_mk_to_m_multiblock
if(thread_k_cluster_id == 0)
{
if(block_group_size == 0 && !float_equal_zero{}(beta))
if(!float_equal_zero{}(beta))
{
StaticBuffer<AddressSpaceEnum::Vgpr, OutDataType, MThreadSliceSize, true>
priorDstValueBuf;
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......@@ -111,6 +111,15 @@ struct GridwiseGemmDlMultipleD_km_kn_mn
const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1,
const CGridDesc_M_N& c_grid_desc_m_n)
{
constexpr long_index_t TwoGB = (long_index_t{1} << 31);
if(!(a_grid_desc_k0_m_k1.GetElementSpaceSize() * sizeof(FloatAB) <= TwoGB &&
b_grid_desc_k0_n_k1.GetElementSpaceSize() * sizeof(FloatAB) <= TwoGB &&
c_grid_desc_m_n.GetElementSpaceSize() * sizeof(FloatC) <= TwoGB))
{
return false;
}
const auto M = a_grid_desc_k0_m_k1.GetLength(I1);
const auto N = b_grid_desc_k0_n_k1.GetLength(I1);
const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0);
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......@@ -649,6 +649,15 @@ struct GridwiseGemmDl_bkm_bkn_mn_v1r3
const BGridDesc_B_K0_N_K1& b_grid_desc_b_k0_n_k1,
const CGridDesc_M_N& c_grid_desc_m_n)
{
constexpr long_index_t TwoGB = (long_index_t{1} << 31);
if(!(a_grid_desc_b_k0_m_k1.GetElementSpaceSize() * sizeof(FloatAB) <= TwoGB &&
b_grid_desc_b_k0_n_k1.GetElementSpaceSize() * sizeof(FloatAB) <= TwoGB &&
c_grid_desc_m_n.GetElementSpaceSize() * sizeof(FloatC) <= TwoGB))
{
return false;
}
const auto M = a_grid_desc_b_k0_m_k1.GetLength(I2);
const auto N = b_grid_desc_b_k0_n_k1.GetLength(I2);
const auto K0 = a_grid_desc_b_k0_m_k1.GetLength(I1);
......
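Both gridwise DL GEMM hunks above add the same early-out: AMD buffer addressing works with signed 32-bit byte offsets, so a tensor whose byte footprint exceeds 2 GB cannot be addressed and the validity check must return false. A minimal standalone sketch of the same guard (fits_in_2gb is an illustrative helper, not part of the library):

#include <cstdint>

// True only when element_count elements of element_size bytes fit inside the
// 2^31-byte window addressable with a signed 32-bit byte offset.
inline bool fits_in_2gb(std::int64_t element_count, std::int64_t element_size)
{
    constexpr std::int64_t two_gb = std::int64_t{1} << 31;
    return element_count * element_size <= two_gb;
}

// Example: 128M floats (512 MiB) pass, 768M floats (3 GiB) do not.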
......@@ -417,6 +417,13 @@ struct GridwiseGemm_xdl_cshuffle_v3
}
}();
// pad M and N
return transform_tensor_descriptor(c_grid_desc_mraw_nraw,
make_tuple(make_right_pad_transform(M, MPad - M),
make_right_pad_transform(N, NPad - N)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
#if 0
using GemmSpecialization = tensor_operation::device::GemmSpecialization;
if constexpr(GemmSpec == GemmSpecialization::MNPadding ||
......@@ -454,6 +461,7 @@ struct GridwiseGemm_xdl_cshuffle_v3
// not pad M or N
return c_grid_desc_mraw_nraw;
}
#endif
}
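The hunk above makes the C m-by-n descriptor construction pad M and N unconditionally with make_right_pad_transform, compiling the old GemmSpecialization branch out via #if 0. The padded extents follow the usual round-up-to-tile arithmetic; a hedged sketch (pad_to_tile is illustrative, not a library helper):

#include <cstdint>

// Round a raw GEMM dimension up to the next multiple of the block tile; this is
// the MPad/NPad that make_right_pad_transform(M, MPad - M) exposes to the grid.
inline std::int64_t pad_to_tile(std::int64_t raw, std::int64_t per_block)
{
    return ((raw + per_block - 1) / per_block) * per_block;
}

// Example: M = 1000, MPerBlock = 256 -> MPad = 1024 (4 full block tiles);
// only the last 24 rows are padding.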
struct Problem
......@@ -953,7 +961,8 @@ struct GridwiseGemm_xdl_cshuffle_v3
if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding ||
GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding ||
GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding))
GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding) &&
!(is_same<tensor_layout::gemm::RowMajor, ALayout>::value))
{
if(!(karg.M % MPerBlock == 0))
{
......@@ -970,7 +979,8 @@ struct GridwiseGemm_xdl_cshuffle_v3
if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::NPadding ||
GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding ||
GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding))
GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding) &&
(is_same<tensor_layout::gemm::RowMajor, BLayout>::value))
{
if(!(karg.N % NPerBlock == 0))
{
......@@ -1105,7 +1115,9 @@ struct GridwiseGemm_xdl_cshuffle_v3
}
if constexpr(!(is_same<remove_cvref_t<CDataType>, half_t>::value ||
is_same<remove_cvref_t<CDataType>, float>::value))
is_same<remove_cvref_t<CDataType>, float>::value ||
is_same<remove_cvref_t<CDataType>, bhalf_t>::value ||
is_same<remove_cvref_t<CDataType>, int32_t>::value))
{
if(!karg.IsReduceAdd())
{
......
......@@ -36,10 +36,9 @@ __global__ void
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
#endif
// __attribute__((amdgpu_waves_per_eu(1, 1)))
kernel_gemm_xdl_cshuffle_v3(typename GridwiseGemm::Argument karg)
kernel_gemm_xdl_cshuffle_v3_multi_d(typename GridwiseGemm::Argument karg)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg);
......@@ -56,7 +55,7 @@ __global__ void
karg.c_element_op);
#else
ignore = karg;
#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
#endif // end of if (defined(__gfx9__))
}
template <typename GridwiseGemm,
......@@ -69,10 +68,9 @@ __global__ void
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
#endif
// __attribute__((amdgpu_waves_per_eu(1, 1)))
kernel_gemm_xdl_cshuffle_v3_2lds(typename GridwiseGemm::Argument karg)
kernel_gemm_xdl_cshuffle_v3_multi_d_2lds(typename GridwiseGemm::Argument karg)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
// Passing two lds pointers is the key to telling the compiler that ds_read/write
// operate on different lds chunks at the same time without order dependency
__shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
......@@ -93,7 +91,7 @@ __global__ void
karg.c_element_op);
#else
ignore = karg;
#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
#endif // end of if (defined(__gfx9__))
}
template <typename ALayout,
......@@ -454,6 +452,13 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
}
}();
// pad M and N
return transform_tensor_descriptor(c_grid_desc_mraw_nraw,
make_tuple(make_right_pad_transform(M, MPad - M),
make_right_pad_transform(N, NPad - N)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
#if 0
using GemmSpecialization = tensor_operation::device::GemmSpecialization;
if constexpr(GemmSpec == GemmSpecialization::MNPadding ||
......@@ -491,6 +496,7 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
// not pad M or N
return c_grid_desc_mraw_nraw;
}
#endif
}
__host__ __device__ static auto MakeDsGridDescriptor_M_N(
......@@ -1016,7 +1022,8 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding ||
GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding ||
GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding))
GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding) &&
!(is_same<tensor_layout::gemm::RowMajor, ALayout>::value))
{
if(!(karg.M % MPerBlock == 0))
{
......@@ -1033,7 +1040,8 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::NPadding ||
GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding ||
GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding))
GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding) &&
(is_same<tensor_layout::gemm::RowMajor, BLayout>::value))
{
if(!(karg.N % NPerBlock == 0))
{
......
......@@ -562,6 +562,34 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
dst_wave_addr_offset);
}
template <typename T, index_t N>
__device__ void amd_global_atomic_add_impl(const typename vector_type<T, N>::type src_thread_data,
T* addr)
{
static_assert((is_same<T, bhalf_t>::value && (N == 2 || N == 4 || N == 8)) ||
(is_same<T, half_t>::value && (N == 2 || N == 4 || N == 8)),
"wrong! not implemented");
if constexpr(is_same<T, half_t>::value)
{
vector_type<half_t, N> tmp{src_thread_data};
static_for<0, N / 2, 1>{}([&](auto i) {
__builtin_amdgcn_global_atomic_fadd_v2f16(bit_cast<half2_t*>(addr) + i,
tmp.template AsType<half2_t>()[i]);
});
}
#if defined(__gfx942__)
else if constexpr(is_same<T, bhalf_t>::value)
{
vector_type<bhalf_t, N> tmp{src_thread_data};
static_for<0, N / 2, 1>{}([&](auto i) {
__builtin_amdgcn_global_atomic_fadd_v2bf16(bit_cast<bhalf2_t*>(addr) + i,
tmp.template AsType<bhalf2_t>()[i]);
});
}
#endif
}
template <typename T, index_t N>
__device__ void amd_buffer_atomic_add_impl(const typename vector_type<T, N>::type src_thread_data,
int32x4_t dst_wave_buffer_resource,
......@@ -907,6 +935,16 @@ amd_buffer_atomic_add(const typename vector_type_maker<T, N>::type::type src_thr
using scalar_t = typename scalar_type<vector_t>::type;
constexpr index_t vector_size = scalar_type<vector_t>::vector_size;
if constexpr(is_same<T, bhalf_t>::value)
{
if(dst_thread_element_valid)
{
amd_global_atomic_add_impl<scalar_t, vector_size>(
src_thread_data, p_dst_wave + dst_thread_element_offset);
}
}
else
{
#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK
uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
......@@ -919,6 +957,7 @@ amd_buffer_atomic_add(const typename vector_type_maker<T, N>::type::type src_thr
src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
}
#endif
}
}
// buffer_atomic_max requires:
......
......@@ -358,13 +358,15 @@ struct DynamicBuffer
bool constexpr use_amd_buffer_addressing =
is_same_v<remove_cvref_t<scalar_t>, int32_t> ||
is_same_v<remove_cvref_t<scalar_t>, float> ||
(is_same_v<remove_cvref_t<scalar_t>, half_t> && scalar_per_x_vector % 2 == 0);
(is_same_v<remove_cvref_t<scalar_t>, half_t> && scalar_per_x_vector % 2 == 0) ||
(is_same_v<remove_cvref_t<scalar_t>, bhalf_t> && scalar_per_x_vector % 2 == 0);
#elif CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && (!CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT)
bool constexpr use_amd_buffer_addressing = is_same_v<remove_cvref_t<scalar_t>, int32_t>;
#elif(!CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER) && CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT
bool constexpr use_amd_buffer_addressing =
is_same_v<remove_cvref_t<scalar_t>, float> ||
(is_same_v<remove_cvref_t<scalar_t>, half_t> && scalar_per_x_vector % 2 == 0);
(is_same_v<remove_cvref_t<scalar_t>, half_t> && scalar_per_x_vector % 2 == 0) ||
(is_same_v<remove_cvref_t<scalar_t>, bhalf_t> && scalar_per_x_vector % 2 == 0);
#else
bool constexpr use_amd_buffer_addressing = false;
#endif
......
......@@ -44,7 +44,7 @@ __host__ __device__ Y run_cast_to_f8(X x, uint32_t rng)
// convert to bitwise
using T_bitwise = typename NumericUtils<X>::bitwise_type;
T_bitwise x_bitwise = *(reinterpret_cast<T_bitwise*>(&x));
T_bitwise x_bitwise = bit_cast<T_bitwise>(x);
// unpack the input, depends on datatype
head = x_bitwise & NumericUtils<X>::head_mask;
......@@ -165,7 +165,7 @@ In this case, the fp16 mantissa should be shift left by 1 */
if(out_exponent > max_exp)
{
if(clip)
if constexpr(clip)
{
mantissa = (1 << out_mant) - 1;
out_exponent = max_exp;
......@@ -196,7 +196,6 @@ __host__ __device__ Y run_cast_from_f8(X x)
// prepare the codes
constexpr X nan_code = 0x80;
Y Inf, NegInf, NaN, Neg0;
using T_bitwise = typename NumericUtils<Y>::bitwise_type;
constexpr T_bitwise Inf_bitwise = NumericUtils<Y>::Inf;
......@@ -204,10 +203,10 @@ __host__ __device__ Y run_cast_from_f8(X x)
constexpr T_bitwise NaN_bitwise = NumericUtils<Y>::NaN;
constexpr T_bitwise Neg0_bitwise = NumericUtils<Y>::Neg0;
Inf = *(reinterpret_cast<const Y*>(&Inf_bitwise));
NegInf = *(reinterpret_cast<const Y*>(&NegInf_bitwise));
NaN = *(reinterpret_cast<const Y*>(&NaN_bitwise));
Neg0 = *(reinterpret_cast<const Y*>(&Neg0_bitwise));
constexpr Y Inf = bit_cast<Y>(Inf_bitwise);
constexpr Y NegInf = bit_cast<Y>(NegInf_bitwise);
constexpr Y NaN = bit_cast<Y>(NaN_bitwise);
constexpr Y Neg0 = bit_cast<Y>(Neg0_bitwise);
// check if x is 0.0
if(x == 0)
......@@ -235,11 +234,12 @@ __host__ __device__ Y run_cast_from_f8(X x)
return (mantissa == 0) ? (sign ? NegInf : Inf) : NaN;
}
if((NumericUtils<Y>::mant == 10) && (NumericUtils<X>::mant == 2) && !negative_zero_nan)
if constexpr((NumericUtils<Y>::mant == 10) && (NumericUtils<X>::mant == 2) &&
!negative_zero_nan)
{
retval = x;
retval <<= 8;
return *(reinterpret_cast<const Y*>(&retval));
return bit_cast<Y>(retval);
}
// subnormal input
......@@ -263,7 +263,7 @@ __host__ __device__ Y run_cast_from_f8(X x)
}
retval = (sign << (out_exp + out_mant)) | (exponent << out_mant) | mantissa;
return *(reinterpret_cast<const Y*>(&retval));
return bit_cast<Y>(retval);
}
} // namespace
......
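Throughout the f8 conversion helpers above, *(reinterpret_cast<T*>(&x)) punning is replaced by bit_cast, which is well defined for trivially copyable types and lets the Inf/NegInf/NaN/Neg0 constants become constexpr. The snippet below illustrates the same idea with C++20 std::bit_cast (the library uses its own ck::bit_cast, assumed equivalent here):

#include <bit>
#include <cstdint>

int main()
{
    float x = 1.0f;
    // Reinterpret the float's bits as a 32-bit integer without breaking
    // strict aliasing: 1.0f has the bit pattern 0x3f800000.
    std::uint32_t bits = std::bit_cast<std::uint32_t>(x);
    // Round-trip back; the value is unchanged.
    float y = std::bit_cast<float>(bits);
    return (bits == 0x3f800000u && y == 1.0f) ? 0 : 1;
}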
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/f8_utils.hpp"
#include "ck/utility/random_gen.hpp"
#include "ck/utility/array.hpp"
namespace ck {
// Define the common macro for gfx94x models
......@@ -500,6 +501,25 @@ inline __host__ __device__ half_t type_convert<half_t, bf8_t>(bf8_t x)
#endif
}
template <typename Y, typename X, std::size_t NumElems>
inline __host__ __device__ void array_convert(std::array<Y, NumElems>& y,
const std::array<X, NumElems>& x)
{
for(std::size_t i = 0; i < NumElems; i++)
{
y[i] = type_convert<Y>(x[i]);
}
}
template <typename Y, typename X, index_t NumElems>
inline __host__ __device__ void array_convert(Array<Y, NumElems>& y, const Array<X, NumElems>& x)
{
for(std::size_t i = 0; i < NumElems; i++)
{
y[i] = type_convert<Y>(x[i]);
}
}
// Declare a template function for bf16 conversion using RTN
template <typename Y, typename X>
__host__ __device__ constexpr Y bf16_convert_rtn(X x);
......
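The new array_convert overloads above apply type_convert element-wise to std::array and ck::Array. A hedged usage sketch (the include path is an assumption; everything else follows the signatures shown in the hunk):

#include <array>
#include "ck/utility/type_convert.hpp" // assumed location of array_convert/type_convert

void downcast_example()
{
    std::array<float, 4> src{0.5f, 1.0f, 2.0f, 4.0f};
    std::array<ck::half_t, 4> dst{};
    // Element-wise: dst[i] = type_convert<ck::half_t>(src[i]).
    ck::array_convert(dst, src);
}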
......@@ -1342,7 +1342,7 @@ struct modulo : public base_transform<1, 1>
};
// 2D XOR, NOTE: "xor" is a keyword
template <typename LowLengths, typename RightShift>
template <typename LowLengths>
struct xor_t : public base_transform<2, 2>
{
static constexpr auto type_enum = coord_transform_enum::xor_t;
......@@ -1353,15 +1353,10 @@ struct xor_t : public base_transform<2, 2>
using UpLengths = LowLengths;
UpLengths up_lengths_;
RightShift right_shift_;
CK_TILE_HOST_DEVICE constexpr xor_t() : up_lengths_{}, right_shift_{} {}
CK_TILE_HOST_DEVICE constexpr xor_t() : up_lengths_{} {}
CK_TILE_HOST_DEVICE constexpr xor_t(const LowLengths& low_lengths,
const RightShift& right_shift)
: up_lengths_{low_lengths}, right_shift_{right_shift}
{
}
CK_TILE_HOST_DEVICE constexpr xor_t(const LowLengths& low_lengths) : up_lengths_{low_lengths} {}
CK_TILE_HOST_DEVICE static constexpr auto get_type_enum()
{
......@@ -1379,13 +1374,8 @@ struct xor_t : public base_transform<2, 2>
idx_low(number<0>{}) = idx_up[number<0>{}];
const auto idx_low_1_tmp =
(idx_up[number<1>{}] - idx_up[number<0>{}] * right_shift_) % up_lengths_[number<1>{}];
const auto idx_low_1 =
(idx_low_1_tmp >= 0) ? idx_low_1_tmp : up_lengths_[number<1>{}] + idx_low_1_tmp;
idx_low(number<1>{}) = idx_low_1;
idx_low(number<1>{}) =
idx_up[number<1>{}] ^ (idx_up[number<0>{}] % up_lengths_[number<1>{}]);
}
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
......@@ -1420,8 +1410,7 @@ struct xor_t : public base_transform<2, 2>
CK_TILE_HOST_DEVICE static constexpr bool is_known_at_compile_time()
{
return ck_tile::is_known_at_compile_time<UpLengths>::value &&
ck_tile::is_known_at_compile_time<RightShift>::value;
return ck_tile::is_known_at_compile_time<UpLengths>::value;
}
// MUST be static function
......@@ -1433,14 +1422,6 @@ struct xor_t : public base_transform<2, 2>
array<index_t, 2> up_vector_lengths = low_vector_lengths;
array<index_t, 2> up_vector_strides = low_vector_strides;
if constexpr(ck_tile::is_known_at_compile_time<RightShift>::value)
{
if(low_vector_lengths[1] != -1)
{
up_vector_lengths(1) = gcd(low_vector_lengths[1], abs(right_shift_));
}
}
return make_tuple(up_vector_lengths, up_vector_strides);
}
......@@ -1453,10 +1434,6 @@ struct xor_t : public base_transform<2, 2>
print(up_lengths_);
printf(", ");
//
printf("right_shift_: ");
print(right_shift_);
printf("}");
}
};
......@@ -1834,11 +1811,10 @@ CK_TILE_HOST_DEVICE constexpr auto make_modulo_transform(const Modulus& modulus,
return modulo<Modulus, UpLength>{modulus, up_length};
}
template <typename LowLengths, typename RightShift>
CK_TILE_HOST_DEVICE constexpr auto make_xor_transform(const LowLengths& low_lengths,
const RightShift& right_shift)
template <typename LowLengths>
CK_TILE_HOST_DEVICE constexpr auto make_xor_transform(const LowLengths& low_lengths)
{
return xor_t<LowLengths, RightShift>{low_lengths, right_shift};
return xor_t<LowLengths>{low_lengths};
}
template <typename LowLength, typename OffsetLength>
......
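In the ck_tile coordinate-transform hunks above, xor_t loses its RightShift parameter and the lower index becomes the plain swizzle idx_low[1] = idx_up[1] ^ (idx_up[0] % L1). For a power-of-two L1 this is a permutation of [0, L1) in every row and is its own inverse, which is what makes it suitable for bank-conflict-avoiding LDS layouts. A small standalone check of both properties (illustrative code, not the library API):

#include <cassert>
#include <cstdint>

int main()
{
    constexpr std::uint32_t L1 = 8; // power-of-two low length, as in typical LDS swizzles
    for(std::uint32_t row = 0; row < 16; ++row)
    {
        for(std::uint32_t col = 0; col < L1; ++col)
        {
            const std::uint32_t swizzled = col ^ (row % L1);
            assert(swizzled < L1);                  // still a valid column index
            assert((swizzled ^ (row % L1)) == col); // applying the swizzle twice is the identity
        }
    }
    return 0;
}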
......@@ -117,6 +117,15 @@ using int32x16_t = int32_t __attribute__((ext_vector_type(16)));
using int32x32_t = int32_t __attribute__((ext_vector_type(32)));
using int32x64_t = int32_t __attribute__((ext_vector_type(64)));
// u32
// using uint32_t = ...
using uint32x2_t = uint32_t __attribute__((ext_vector_type(2)));
using uint32x4_t = uint32_t __attribute__((ext_vector_type(4)));
using uint32x8_t = uint32_t __attribute__((ext_vector_type(8)));
using uint32x16_t = uint32_t __attribute__((ext_vector_type(16)));
using uint32x32_t = uint32_t __attribute__((ext_vector_type(32)));
using uint32x64_t = uint32_t __attribute__((ext_vector_type(64)));
// i16
// using int16_t = ...
using int16x2_t = int16_t __attribute__((ext_vector_type(2)));
......
......@@ -763,7 +763,8 @@ CK_TILE_HOST_DEVICE constexpr auto slice_distribution_from_x(
return make_tuple(
make_static_tile_distribution(
tile_distribution_encoding<typename Encoding::RsLengths,
decltype(sliced_h_lengths), // only need to change the
remove_cvref_t<decltype(sliced_h_lengths)>, // only need to
// change the
// h_lengths type
typename Encoding::Ps2RHssMajor,
typename Encoding::Ps2RHssMinor,
......
......@@ -53,6 +53,39 @@ class philox
out_tmp[3] = tmp_ph.w;
}
CK_TILE_HOST_DEVICE void get_random_8x8(uint8_t* out,
const unsigned long long subsequence,
const index_t start_idx) const
{
uint4 tmp_ph;
tmp_ph = get_philox_4x32(subsequence);
uint32x4_t tmp;
tmp[0] = tmp_ph.x;
tmp[1] = tmp_ph.y;
tmp[2] = tmp_ph.z;
tmp[3] = tmp_ph.w;
uint32_t* out_tmp = reinterpret_cast<uint32_t*>(&out[0]);
out_tmp[0] = tmp[start_idx];
out_tmp[1] = tmp[start_idx + 2];
}
CK_TILE_HOST_DEVICE void get_random_4x8(uint8_t* out,
const unsigned long long subsequence,
const index_t start_idx) const
{
uint4 tmp_ph;
tmp_ph = get_philox_4x32(subsequence);
uint32x4_t tmp;
tmp[0] = tmp_ph.x;
tmp[1] = tmp_ph.y;
tmp[2] = tmp_ph.z;
tmp[3] = tmp_ph.w;
uint32_t* out_tmp = reinterpret_cast<uint32_t*>(&out[0]);
out_tmp[0] = tmp[start_idx];
}
private:
struct ull2
{
......
......@@ -8,21 +8,16 @@
#include "ck_tile/ops/fmha/block/block_masking.hpp"
#include "ck_tile/ops/fmha/block/block_position_encoding.hpp"
#include "ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp"
#include "ck_tile/ops/fmha/kernel/fmha_bwd_tile_partitioner.hpp"
#include "ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp"
#include "ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp"
#include "ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_tile_partitioner.hpp"
#include "ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp"
#include "ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_tile_partitioner.hpp"
#include "ck_tile/ops/fmha/kernel/fmha_fwd_tile_partitioner.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_convert_dq.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_dot_do_o.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_dot_do_o_default_policy.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_ks_kts_vr.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_ks_kts_vr_default_policy.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_ks_vr.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_ks_vr_default_policy.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_qs_ks_vr_dos.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_qs_ks_vr_dos_default_policy.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr_iglp.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_default_policy.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_enum.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_problem.hpp"
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
namespace ck_tile {
template <typename BlockFmhaShape_>
struct FmhaBwdTilePartitioner
{
using BlockFmhaShape = ck_tile::remove_cvref_t<BlockFmhaShape_>;
static constexpr ck_tile::index_t kN0 = BlockFmhaShape::kN0;
CK_TILE_HOST static constexpr auto
GridSize(ck_tile::index_t batch_size_, ck_tile::index_t nhead_, ck_tile::index_t seqlen_k_)
{
// TODO: this may need tuning
return dim3(ck_tile::integer_divide_ceil(seqlen_k_, kN0), nhead_, batch_size_);
}
CK_TILE_DEVICE auto operator()(ck_tile::index_t /*seqlen_k*/)
{
const index_t i_block = blockIdx.x;
const index_t i_nhead = blockIdx.y;
const index_t i_batch = blockIdx.z;
return ck_tile::make_tuple(i_block, i_nhead, i_batch);
}
};
template <ck_tile::index_t kBlockSize>
struct FmhaBwdOGradDotOTilePartitioner
{
CK_TILE_HOST static constexpr auto
GridSize(ck_tile::index_t batch_size_, ck_tile::index_t nhead_, ck_tile::index_t seqlen_q_)
{
// TODO: this may need tuning
return dim3(ck_tile::integer_divide_ceil(seqlen_q_, kBlockSize), nhead_, batch_size_);
}
CK_TILE_DEVICE auto operator()(ck_tile::index_t /*seqlen_q*/)
{
const index_t i_block = blockIdx.x;
const index_t i_nhead = blockIdx.y;
const index_t i_batch = blockIdx.z;
return ck_tile::make_tuple(i_block, i_nhead, i_batch);
}
};
} // namespace ck_tile
......@@ -387,7 +387,6 @@ struct FmhaFwdKernel
ck_tile::index_t nhead_stride_randval,
ck_tile::index_t nhead_stride_lse,
ck_tile::index_t nhead_stride_o,
ck_tile::index_t batch_stride_lse,
ck_tile::index_t window_size_left,
ck_tile::index_t window_size_right,
ck_tile::index_t mask_type,
......@@ -448,7 +447,6 @@ struct FmhaFwdKernel
{
kargs.lse_ptr = lse_ptr;
kargs.nhead_stride_lse = nhead_stride_lse;
kargs.batch_stride_lse = batch_stride_lse;
}
if constexpr(kDoFp8StaticQuant)
{
......@@ -524,7 +522,7 @@ struct FmhaFwdKernel
}
if constexpr(kStoreLSE)
{
batch_offset_lse = static_cast<long_index_t>(i_batch) * kargs.batch_stride_lse;
batch_offset_lse = query_start;
}
if constexpr(kHasDropout)
{
......