Merge branch 'develop' into amd-develop

7e8230da · Jun Liu · 56c72035 · bd09b5c5 · 7e8230da · 7e8230da
Commit 7e8230da authored Oct 02, 2023 by Jun Liu
20 changed files
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle.hpp
@@ -588,6 +588,7 @@ struct GridwiseGemmSplitKMultipleD_xdl_cshuffle
        auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
            BlockSize,
            ABDataType,
+            ABDataType,
            AccDataType,
            decltype(a_block_desc_ak0_m_ak1),
            decltype(b_block_desc_bk0_n_bk1),
@@ -1012,6 +1013,7 @@ struct GridwiseGemmSplitKMultipleD_xdl_cshuffle
        auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
            BlockSize,
            ABDataType,
+            ABDataType,
            AccDataType,
            decltype(a_block_desc_ak0_m_ak1),
            decltype(b_block_desc_bk0_n_bk1),

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
@@ -108,7 +108,8 @@ template <typename ALayout,
          index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
          LoopScheduler LoopSched,
          PipelineVersion PipelineVer = PipelineVersion::v1,
-          typename ComputeType        = FloatC>
+          typename ComputeTypeA       = FloatC,
+          typename ComputeTypeB       = ComputeTypeA>
 struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
 {
    static constexpr auto I0 = Number<0>{};
@@ -547,8 +548,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
        constexpr auto c_block_size =
            c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize();
-        return math::max((a_block_space_size_aligned * sizeof(ComputeType) +
+        return math::max((a_block_space_size_aligned * sizeof(ComputeTypeA) +
-                          b_block_space_size_aligned * sizeof(ComputeType)),
+                          b_block_space_size_aligned * sizeof(ComputeTypeB)),
                         c_block_size * sizeof(FloatCShuffle));
    }
@@ -750,7 +751,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
                                                ABlockTransferThreadClusterLengths_AK0_M_AK1,
                                                ABlockTransferThreadClusterArrangeOrder,
                                                FloatA,
-                                                ComputeType,
+                                                ComputeTypeA,
                                                decltype(a_grid_desc_ak0_m_ak1),
                                                decltype(a_block_desc_ak0_m_ak1),
                                                ABlockTransferSrcAccessOrder,
@@ -781,7 +782,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
                                                BBlockTransferThreadClusterLengths_BK0_N_BK1,
                                                BBlockTransferThreadClusterArrangeOrder,
                                                FloatB,
-                                                ComputeType,
+                                                ComputeTypeB,
                                                decltype(b_grid_desc_bk0_n_bk1),
                                                decltype(b_block_desc_bk0_n_bk1),
                                                BBlockTransferSrcAccessOrder,
@@ -809,13 +810,14 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
        //     c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
        //       register
        // sanity check
-        constexpr index_t KPack =
+        constexpr index_t KPack = math::max(
-            math::max(math::lcm(AK1Number, BK1Number),
+            math::lcm(AK1Number, BK1Number),
-                      MfmaSelector<ComputeType, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
+            MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl, ComputeTypeB>::selected_mfma.k_per_blk);
        auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
            BlockSize,
-            ComputeType,
+            ComputeTypeA,
+            ComputeTypeB,
            FloatGemmAcc,
            decltype(a_block_desc_ak0_m_ak1),
            decltype(b_block_desc_bk0_n_bk1),
@@ -833,10 +835,10 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
            a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);
        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<ComputeType*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
+            static_cast<ComputeTypeA*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>( // B tile view into the same p_shared allocation as a_block_buf
-            static_cast<ComputeType*>(p_shared) + a_block_space_size_aligned,
+            static_cast<ComputeTypeB*>(p_shared) + a_block_space_size_aligned, // NOTE(review): offset is counted in ComputeTypeB elements, but the LDS size computation sizes the A region as a_block_space_size_aligned * sizeof(ComputeTypeA)
            b_block_desc_bk0_n_bk1.GetElementSpaceSize()); // NOTE(review): A and B regions only line up when sizeof(ComputeTypeA) == sizeof(ComputeTypeB) - confirm, or offset in bytes
        constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1Number, 0, 0);

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp
@@ -495,6 +495,7 @@ struct GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
        auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
            BlockSize,
            FloatAB,
+            FloatAB,
            FloatGemmAcc,
            decltype(a_block_desc_ak0_m_ak1),
            decltype(b_block_desc_bk0_n_bk1),

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle.hpp
@@ -494,6 +494,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_waveletmodel_cshuffle
            auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<
                TileMathThreadGroupSize,
                ABDataType,
+                ABDataType,
                FloatGemmAcc,
                decltype(a_block_desc_ak0_m_ak1),
                decltype(b_block_desc_bk0_n_bk1),

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp
@@ -737,6 +737,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
        auto blockwise_gemm =
            BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
+                                                                FloatABAdjusted,
                                                                FloatABAdjusted,
                                                                FloatAcc,
                                                                decltype(a_k0_m_k1_block_desc),

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_streamk.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_streamk.hpp
@@ -490,6 +490,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_streamk
        auto blockwise_gemm =
            BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
+                                                                FloatAB,
                                                                FloatAB,
                                                                FloatAcc,
                                                                decltype(a_block_desc_k0_m_k1),

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
@@ -424,6 +424,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
        using BlockwiseGemm =
            BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
+                                                                FloatABAdjusted,
                                                                FloatABAdjusted,
                                                                FloatAcc,
                                                                decltype(a_block_desc_k0_m_k1),
@@ -569,6 +570,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
        auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
            BlockSize,
            FloatABAdjusted,
+            FloatABAdjusted,
            FloatAcc,
            decltype(a_block_desc_k0_m_k1),
            decltype(b_block_desc_k0_n_k1),

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
@@ -762,6 +762,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
        auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
            BlockSize,
            ComputeType,
+            ComputeType,
            FloatAcc,
            decltype(a_k0_m_k1_block_desc),
            decltype(b_k0_n_k1_block_desc),

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp
@@ -451,6 +451,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
        auto blockwise_gemm =
            BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
+                                                                FloatAB,
                                                                FloatAB,
                                                                FloatAcc,
                                                                decltype(a_block_desc_ak0_m_ak1),

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp
@@ -471,6 +471,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r2
        auto blockwise_gemm =
            BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
+                                                                FloatAB,
                                                                FloatAB,
                                                                FloatAcc,
                                                                decltype(a_block_desc_k0_m_k1),

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp
@@ -489,6 +489,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3
        auto blockwise_gemm =
            BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
+                                                                FloatAB,
                                                                FloatAB,
                                                                FloatAcc,
                                                                decltype(a_block_desc_k0_m_k1),

--- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
@@ -31,7 +31,9 @@ enum struct MfmaInstr
    mfma_i32_16x16x32i8,
    mfma_f64_16x16x4f64,
    mfma_f32_32x32x16f8f8,
-    mfma_f32_16x16x32f8f8
+    mfma_f32_16x16x32f8f8,
+    mfma_f32_32x32x16f8bf8,
+    mfma_f32_16x16x32f8bf8
 };
 template <MfmaInstr instr>
@@ -502,10 +504,62 @@ struct mfma_type<MfmaInstr::mfma_f32_16x16x32f8f8>
 };
 #endif
-template <typename base_type, index_t MPerXdlops, index_t NPerXdlops>
+#if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
+template <> // Explicit specialization: per-instruction constants and dispatch for mfma_f32_32x32x16f8bf8
+struct mfma_type<MfmaInstr::mfma_f32_32x32x16f8bf8> // MfmaSelector::GetMfma<f8_t, 32, 32, bf8_t>() returns this enumerator
+{
+    static constexpr index_t group_size          = 4;
+    static constexpr index_t num_groups_per_blk  = 4;
+    static constexpr index_t num_regs_per_blk    = 16; // equals group_size * num_groups_per_blk (4 * 4)
+    static constexpr index_t num_threads_per_blk = 32;
+    static constexpr index_t wave_size           = 64; // equals num_threads_per_blk * num_input_blks (32 * 2)
+    static constexpr index_t num_input_blks      = 2;
+    static constexpr index_t num_output_blks     = 1;
+    static constexpr index_t m_per_blk           = 32;
+    static constexpr index_t n_per_blk           = 32;
+    static constexpr index_t k_per_blk           = 8; // XdlopsGemm::Run issues KPack / k_per_blk calls; NOTE(review): k_per_blk * num_input_blks == 16, presumably the K in the instruction name - confirm
+    static constexpr bool is_k_reduction         = true;
+    template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const // operand types are unconstrained template parameters here
+    {
+        intrin_mfma_f32_32x32x16f8bf8<MPerXdlops, NPerXdlops>::Run(a, b, reg_c); // forwards unchanged to the intrinsic wrapper (declared outside this hunk)
+    }
+};
+template <> // Explicit specialization: per-instruction constants and dispatch for mfma_f32_16x16x32f8bf8
+struct mfma_type<MfmaInstr::mfma_f32_16x16x32f8bf8> // MfmaSelector::GetMfma<f8_t, 16, 16, bf8_t>() returns this enumerator
+{
+    static constexpr index_t group_size          = 4;
+    static constexpr index_t num_groups_per_blk  = 1;
+    static constexpr index_t num_regs_per_blk    = 4; // equals group_size * num_groups_per_blk (4 * 1)
+    static constexpr index_t num_threads_per_blk = 16;
+    static constexpr index_t wave_size           = 64; // equals num_threads_per_blk * num_input_blks (16 * 4)
+    static constexpr index_t num_input_blks      = 4;
+    static constexpr index_t num_output_blks     = 1;
+    static constexpr index_t m_per_blk           = 16;
+    static constexpr index_t n_per_blk           = 16;
+    static constexpr index_t k_per_blk           = 8; // XdlopsGemm::Run issues KPack / k_per_blk calls; NOTE(review): k_per_blk * num_input_blks == 32, presumably the K in the instruction name - confirm
+    static constexpr bool is_k_reduction         = true;
+    template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const // operand types are unconstrained template parameters here
+    {
+        intrin_mfma_f32_16x16x32f8bf8<MPerXdlops, NPerXdlops>::Run(a, b, reg_c); // forwards unchanged to the intrinsic wrapper (declared outside this hunk)
+    }
+};
+#endif
+template <typename base_type,
+          index_t MPerXdlops,
+          index_t NPerXdlops,
+          typename additional_type = base_type>
 struct MfmaSelector
 {
-    template <typename base_type_, index_t MPerXdlops_, index_t NPerXdlops_>
+    template <typename base_type_,
+              index_t MPerXdlops_,
+              index_t NPerXdlops_,
+              typename additional_type_ = base_type_>
    static constexpr auto GetMfma();
    template <>
@@ -656,7 +710,22 @@ struct MfmaSelector
    }
 #endif
-    static constexpr auto selected_mfma = mfma_type<GetMfma<base_type, MPerXdlops, NPerXdlops>()>{};
+#if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
+    template <> // Selector entry for base_type f8_t with additional_type bf8_t at a 32x32 tile
+    static constexpr auto GetMfma<f8_t, 32, 32, bf8_t>()
+    {
+        return MfmaInstr::mfma_f32_32x32x16f8bf8; // selected_mfma instantiates mfma_type<> with this enumerator
+    }
+    template <> // Selector entry for base_type f8_t with additional_type bf8_t at a 16x16 tile
+    static constexpr auto GetMfma<f8_t, 16, 16, bf8_t>()
+    {
+        return MfmaInstr::mfma_f32_16x16x32f8bf8; // selected_mfma instantiates mfma_type<> with this enumerator
+    }
+#endif
+    static constexpr auto selected_mfma =
+        mfma_type<GetMfma<base_type, MPerXdlops, NPerXdlops, additional_type>()>{};
    __host__ __device__ constexpr MfmaSelector()
    {
@@ -703,6 +772,7 @@ template <typename base_type,
          index_t MPerXdlops,
          index_t NPerXdlops,
          index_t KPack,
+          typename additional_type = base_type,
          bool TransposeC          = false>
 struct XdlopsGemm
 {
@@ -854,14 +924,18 @@ struct XdlopsGemm
    template <class FloatA, class FloatB, class FloatC>
    __device__ void Run(const FloatA& p_a_wave, const FloatB& p_b_wave, FloatC& p_c_thread) const
    {
-        static_assert(is_same<base_type, double>::value || is_same<base_type, float>::value ||
+        static_assert(
+            is_same<base_type, double>::value || is_same<base_type, float>::value ||
                is_same<base_type, half_t>::value || is_same<base_type, bhalf_t>::value ||
                is_same<base_type, int8_t>::value
 #if defined CK_ENABLE_FP8
                || is_same<base_type, f8_t>::value
+#endif
+#if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
+                || (is_same<base_type, f8_t>::value && is_same<additional_type, bf8_t>::value)
 #endif
                ,
-                      "base base_type must be double, float, half, bfloat16, and int8_t!");
+            "base base_type must be double, float, half, bfloat16, int8_t, f8_t or bf8_t!");
        static_for<0, KPack / mfma_instr.k_per_blk, 1>{}([&](auto k) {
            if constexpr(!TransposeC)
@@ -957,7 +1031,7 @@ struct XdlopsGemm
        return TransposeC ? CIndex4D{blk_td, I0, blk_id, I0} : CIndex4D{I0, blk_id, I0, blk_td};
    }
-    static constexpr auto mfma = MfmaSelector<base_type, MPerXdlops, NPerXdlops>{};
+    static constexpr auto mfma = MfmaSelector<base_type, MPerXdlops, NPerXdlops, additional_type>{};
    static constexpr auto mfma_instr = mfma.selected_mfma;

--- a/include/ck/utility/amd_buffer_addressing.hpp
+++ b/include/ck/utility/amd_buffer_addressing.hpp
--- a/include/ck/utility/amd_xdlops.hpp
+++ b/include/ck/utility/amd_xdlops.hpp
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_contraction.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_contraction.hpp
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
--- a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
+++ b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
@@ -29,6 +29,8 @@ using BF8 = ck::bf8_t;
 using Empty_Tuple = ck::Tuple<>;
+using BF16_Tuple = ck::Tuple<BF16>;
 using F16_Tuple     = ck::Tuple<F16>;
 using F16_F16_Tuple = ck::Tuple<F16, F16>;

--- a/library/include/ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp
--- a/library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp
--- a/library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp