Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/composable_kernel into lwpck-977

de1afb7b · Rostyslav Geyyer · ce562aa6 · f7331c60 · de1afb7b · de1afb7b
Commit de1afb7b authored Oct 19, 2023 by Rostyslav Geyyer
20 changed files
--- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
@@ -462,7 +462,6 @@ struct mfma_type<MfmaInstr::mfma_f64_16x16x4f64>
    }
 };
-#if defined CK_ENABLE_FP8
 template <>
 struct mfma_type<MfmaInstr::mfma_f32_32x32x16f8f8>
 {
@@ -506,9 +505,7 @@ struct mfma_type<MfmaInstr::mfma_f32_16x16x32f8f8>
        intrin_mfma_f32_16x16x32f8f8<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
    }
 };
-#endif
-#if defined CK_ENABLE_BF8
 template <>
 struct mfma_type<MfmaInstr::mfma_f32_32x32x16bf8bf8>
 {
@@ -552,9 +549,7 @@ struct mfma_type<MfmaInstr::mfma_f32_16x16x32bf8bf8>
        intrin_mfma_f32_16x16x32bf8bf8<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
    }
 };
-#endif
-#if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
 template <>
 struct mfma_type<MfmaInstr::mfma_f32_32x32x16f8bf8>
 {
@@ -598,9 +593,7 @@ struct mfma_type<MfmaInstr::mfma_f32_16x16x32f8bf8>
        intrin_mfma_f32_16x16x32f8bf8<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
    }
 };
-#endif
-#if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
 template <>
 struct mfma_type<MfmaInstr::mfma_f32_32x32x16bf8f8>
 {
@@ -644,7 +637,6 @@ struct mfma_type<MfmaInstr::mfma_f32_16x16x32bf8f8>
        intrin_mfma_f32_16x16x32bf8f8<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
    }
 };
-#endif
 template <typename base_type,
          index_t MPerXdlops,
@@ -792,7 +784,6 @@ struct MfmaSelector
    }
 #endif
-#if defined CK_ENABLE_FP8
    template <>
    static constexpr auto GetMfma<f8_t, 32, 32>()
    {
@@ -804,9 +795,7 @@ struct MfmaSelector
    {
        return MfmaInstr::mfma_f32_16x16x32f8f8;
    }
-#endif
-#if defined CK_ENABLE_BF8
    template <>
    static constexpr auto GetMfma<bf8_t, 32, 32>()
    {
@@ -818,9 +807,7 @@ struct MfmaSelector
    {
        return MfmaInstr::mfma_f32_16x16x32bf8bf8;
    }
-#endif
-#if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
    template <>
    static constexpr auto GetMfma<f8_t, 32, 32, bf8_t>()
    {
@@ -832,9 +819,7 @@ struct MfmaSelector
    {
        return MfmaInstr::mfma_f32_16x16x32f8bf8;
    }
-#endif
-#if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
    template <>
    static constexpr auto GetMfma<bf8_t, 32, 32, f8_t>()
    {
@@ -846,7 +831,6 @@ struct MfmaSelector
    {
        return MfmaInstr::mfma_f32_16x16x32bf8f8;
    }
-#endif
    static constexpr auto selected_mfma =
        mfma_type<GetMfma<base_type, MPerXdlops, NPerXdlops, additional_type>()>{};
@@ -1051,18 +1035,10 @@ struct XdlopsGemm
        static_assert(
            is_same<base_type, double>::value || is_same<base_type, float>::value ||
                is_same<base_type, half_t>::value || is_same<base_type, bhalf_t>::value ||
-                is_same<base_type, int8_t>::value
+                is_same<base_type, int8_t>::value || is_same<base_type, f8_t>::value ||
-#if defined CK_ENABLE_FP8
+                is_same<base_type, bf8_t>::value ||
-                || is_same<base_type, f8_t>::value
+                (is_same<base_type, f8_t>::value && is_same<additional_type, bf8_t>::value) ||
-#endif
+                (is_same<base_type, bf8_t>::value && is_same<additional_type, f8_t>::value),
-#if defined CK_ENABLE_BF8
-                || is_same<base_type, bf8_t>::value
-#endif
-#if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
-                || (is_same<base_type, f8_t>::value && is_same<additional_type, bf8_t>::value) ||
-                (is_same<base_type, bf8_t>::value && is_same<additional_type, f8_t>::value)
-#endif
-                ,
            "base base_type must be double, float, half, bfloat16, int8_t, f8_t or bf8_t!");
        static_for<0, KPack / mfma_instr.k_per_blk, 1>{}([&](auto k) {

--- a/include/ck/utility/amd_xdlops.hpp
+++ b/include/ck/utility/amd_xdlops.hpp
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-#ifndef CK_AMD_XDLOPS_HPP
+#pragma once
-#define CK_AMD_XDLOPS_HPP
-#include "data_type.hpp"
 namespace ck {
@@ -355,7 +352,6 @@ struct intrin_mfma_f64_16x16x4f64<16, 16>
    }
 };
-#if defined CK_ENABLE_FP8
 template <index_t MPerWave, index_t NPerWave>
 struct intrin_mfma_f32_32x32x16f8f8;
@@ -418,9 +414,7 @@ struct intrin_mfma_f32_16x16x32f8f8<16, 16>
 #endif
    }
 };
-#endif
-#if defined CK_ENABLE_BF8
 template <index_t MPerWave, index_t NPerWave>
 struct intrin_mfma_f32_32x32x16bf8bf8;
@@ -483,9 +477,7 @@ struct intrin_mfma_f32_16x16x32bf8bf8<16, 16>
 #endif
    }
 };
-#endif
-#if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
 template <index_t MPerWave, index_t NPerWave>
 struct intrin_mfma_f32_32x32x16f8bf8;
@@ -548,9 +540,7 @@ struct intrin_mfma_f32_16x16x32f8bf8<16, 16>
 #endif
    }
 };
-#endif
-#if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
 template <index_t MPerWave, index_t NPerWave>
 struct intrin_mfma_f32_32x32x16bf8f8;
@@ -613,6 +603,5 @@ struct intrin_mfma_f32_16x16x32bf8f8<16, 16>
 #endif
    }
 };
-#endif
 } // namespace ck
-#endif
--- a/include/ck/utility/data_type.hpp
+++ b/include/ck/utility/data_type.hpp
@@ -9,9 +9,7 @@ namespace ck {
 using bhalf_t = ushort;
 using half_t  = _Float16;
-#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
 using int4_t  = _BitInt(4);
-#endif
 using f8_t    = _BitInt(8);
 using bf8_t   = unsigned _BitInt(8);
@@ -1123,5 +1121,4 @@ struct NumericUtils<bf8_t>
    static constexpr int bias = 16; // negative zero nan mode
    // static constexpr int bias = 15; // ieee mode
 };
 } // namespace ck
--- a/include/ck/utility/inner_product.hpp
+++ b/include/ck/utility/inner_product.hpp
@@ -192,6 +192,8 @@ inner_product<int8x4_t, int8x4_t, int32_t>(const int8x4_t& a, const int8x4_t& b,
 #else
    c = __builtin_amdgcn_sdot4(bit_cast<int32_t>(a), bit_cast<int32_t>(b), c, false);
 #endif
+#elif defined(CK_USE_AMD_V_DOT4_I32_I8_GFX11)
+    c = __builtin_amdgcn_sudot4(true, bit_cast<int32_t>(a), true, bit_cast<int32_t>(b), c, false);
 #else
    const vector_type<int8_t, 4> a_vector{a};
    const vector_type<int8_t, 4> b_vector{b};

--- a/include/ck/utility/math.hpp
+++ b/include/ck/utility/math.hpp
--- a/include/ck/utility/math_v2.hpp
+++ b/include/ck/utility/math_v2.hpp
--- a/include/ck/utility/statically_indexed_array_multi_index.hpp
+++ b/include/ck/utility/statically_indexed_array_multi_index.hpp
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp
--- a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
+++ b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
--- a/library/include/ck/library/tensor_operation_instance/gpu/convolution_backward_data.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/convolution_backward_data.hpp
--- a/library/include/ck/library/tensor_operation_instance/gpu/convolution_forward.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/convolution_forward.hpp
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp
--- a/library/include/ck/library/tensor_operation_instance/gpu/normalization.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/normalization.hpp
--- a/library/include/ck/library/tensor_operation_instance/gpu/normalization_swish.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/normalization_swish.hpp
--- a/library/include/ck/library/utility/check_err.hpp
+++ b/library/include/ck/library/utility/check_err.hpp
--- a/library/include/ck/library/utility/host_common_util.hpp
+++ b/library/include/ck/library/utility/host_common_util.hpp
--- a/library/include/ck/library/utility/host_tensor_generator.hpp
+++ b/library/include/ck/library/utility/host_tensor_generator.hpp