Merge branch 'junhzhan/fa-ifu-mqa' of...

Merge branch 'junhzhan/fa-ifu-mqa' of https://github.com/ROCmSoftwarePlatform/composable_kernel into junhzhan/fa-ifu-mqa

Merge branch 'junhzhan/fa-ifu-mqa' of...
Merge branch 'junhzhan/fa-ifu-mqa' of https://github.com/ROCmSoftwarePlatform/composable_kernel into junhzhan/fa-ifu-mqa
9bb4ab41 · Junhao · 980b8835 · 5ff2d646 · 9bb4ab41 · 9bb4ab41
Commit 9bb4ab41 authored Oct 30, 2023 by Junhao
12 changed files
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_infer_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_infer_xdl_cshuffle.hpp
--- a/include/ck/tensor_operation/gpu/device/matrix_padder.hpp
+++ b/include/ck/tensor_operation/gpu/device/matrix_padder.hpp
@@ -119,6 +119,15 @@ struct GemmGemmPadder
            c_desc_mraw_nraw, make_tuple(MPerTile_, OPerTile_), Sequence<PadM, PadO>{});
    }

+    // C[M, Gemm1N] = C[M, N]
+    template <typename C0Desc_MRaw_NRaw>
+    __host__ __device__ constexpr auto
+    PadC0Descriptor_M_N(const C0Desc_MRaw_NRaw& c_desc_mraw_nraw) const
+    {
+        return PadTensorDescriptor(
+            c_desc_mraw_nraw, make_tuple(MPerTile_, NPerTile_), Sequence<PadM, PadN>{});
+    }
+
    MPerTileType MPerTile_;
    NPerTileType NPerTile_;
    KPerTileType KPerTile_;

--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_light_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_light_v1.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_light_v2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_light_v2.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v1.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v2.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_fwd_xdl_cshuffle_v2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_fwd_xdl_cshuffle_v2.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_infer_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_infer_xdl_cshuffle.hpp
--- a/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp
+++ b/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp
--- a/include/ck/utility/type_convert.hpp
+++ b/include/ck/utility/type_convert.hpp
@@ -31,7 +31,7 @@ inline __host__ __device__ constexpr float type_convert<float, bhalf_t>(bhalf_t
    return u.fp32;
 }

-#if FLASH_ATTENTION_INTERNAL_USE_RTN
+#ifdef USE_RTN_BF16_CONVERT
 // Convert fp32 to bf16 with RTN if higher precision is needed
 template <>
 inline __host__ __device__ constexpr bhalf_t type_convert<bhalf_t, float>(float x)

--- a/library/include/ck/library/utility/host_common_util.hpp
+++ b/library/include/ck/library/utility/host_common_util.hpp
--- a/library/include/ck/library/utility/host_tensor_generator.hpp
+++ b/library/include/ck/library/utility/host_tensor_generator.hpp