merge from develop and revision for pr#881

29448ffd · Harisankar Sadasivan · 9223a5e2 · 8f84a012 · 29448ffd · 29448ffd
Commit 29448ffd authored Sep 08, 2023 by Harisankar Sadasivan
20 changed files
--- a/include/ck/utility/get_id.hpp
+++ b/include/ck/utility/get_id.hpp
--- a/include/ck/utility/get_shift.hpp
+++ b/include/ck/utility/get_shift.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+namespace ck {
+/**
+ * Compile-time shift amount for N: get_shift<1>() is 0 and every halving of N adds one,
+ * i.e. the result is floor(log2(N)) (exactly log2(N) when N is a power of two).
+ *
+ * N must be positive. Without the check, N <= 0 would end up in get_shift<0>(), whose body
+ * calls get_shift<0>() again and therefore never terminates.
+ */
+template <index_t N>
+static constexpr __device__ index_t get_shift()
+{
+    static_assert(N > 0, "get_shift<N> requires N > 0.");
+    return (get_shift<N / 2>() + 1);
+}
+/// Recursion anchor: a value of 1 needs no shift.
+template <>
+constexpr __device__ index_t get_shift<1>()
+{
+    return (0);
+}
+} // namespace ck
--- a/include/ck/utility/ignore.hpp
+++ b/include/ck/utility/ignore.hpp
--- a/include/ck/utility/inner_product.hpp
+++ b/include/ck/utility/inner_product.hpp
@@ -3,6 +3,7 @@
 #pragma once
 #include "data_type.hpp"
+#include "type_convert.hpp"
 namespace ck {
@@ -18,13 +19,13 @@ __device__ void inner_product<half_t, half_t, float>(const half_t& a, const half
 template <>
 __device__ void inner_product<float, float, float>(const float& a, const float& b, float& c)
 {
-#if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM && defined(CK_USE_AMD_V_MAC_F32)
+#if CK_USE_AMD_V_MAC_INLINE_ASM && defined(CK_USE_AMD_V_MAC_F32)
    asm volatile("\n \
            v_mac_f32 %0, %1, %2 \n \
            "
                 : "=v"(c)
                 : "v"(a), "v"(b), "0"(c));
-#elif CK_USE_AMD_INNER_PRODUCT_INLINE_ASM && defined(CK_USE_AMD_V_FMAC_F32)
+#elif CK_USE_AMD_V_MAC_INLINE_ASM && defined(CK_USE_AMD_V_FMAC_F32)
    asm volatile("\n \
            v_fmac_f32 %0, %1, %2 \n \
            "
@@ -81,22 +82,26 @@ template <>
 __device__ void inner_product<half2_t, half2_t, float>(const half2_t& a, const half2_t& b, float& c)
 {
 #if defined(CK_USE_AMD_V_DOT2_F32_F16)
-#if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM
+#if CK_USE_AMD_V_DOT_INLINE_ASM
+    // Use 3 x s_nop to avoid hazard (mi200 cdna2 isa page 47
+    // https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf
+    // ) s_nop with parameter 2 is equal to 3 x s_nop
    asm volatile("\n \
            v_dot2_f32_f16 %0, %1, %2, %0\n \
+            s_nop 2 \n \
            "
                 : "=v"(c)
                 : "v"(a), "v"(b), "0"(c));
 #else
-    c = __builtin_amdgcn_sdot2(a, b, c, false);
+    c = __builtin_amdgcn_fdot2(a, b, c, false);
 #endif
 #else
    const vector_type<half_t, 2> a_vector{a};
    const vector_type<half_t, 2> b_vector{b};
    static_for<0, 2, 1>{}([&](auto i) {
-        c += type_convert<int32_t>(a_vector.AsType<half_t>()[i]) *
+        c += type_convert<float>(a_vector.AsType<half_t>()[i]) *
-             type_convert<int32_t>(b_vector.AsType<half_t>()[i]);
+             type_convert<float>(b_vector.AsType<half_t>()[i]);
    });
 #endif
 }
@@ -168,9 +173,13 @@ __device__ void
 inner_product<int8x4_t, int8x4_t, int32_t>(const int8x4_t& a, const int8x4_t& b, int32_t& c)
 {
 #if defined(CK_USE_AMD_V_DOT4_I32_I8)
-#if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM
+#if CK_USE_AMD_V_DOT_INLINE_ASM
+    // Use 3 x s_nop to avoid hazard (mi200 cdna2 isa page 47
+    // https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf
+    // ) s_nop with parameter 2 is equal to 3 x s_nop
    asm volatile("\n \
            v_dot4_i32_i8 %0, %1, %2, %0\n \
+            s_nop 2 \n \
            "
                 : "=v"(c)
                 : "v"(bit_cast<int32_t>(a)), "v"(bit_cast<int32_t>(b)), "0"(c));

--- a/include/ck/utility/inner_product_dpp8.hpp
+++ b/include/ck/utility/inner_product_dpp8.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "amd_gemm_dpp.hpp"
+#include "data_type.hpp"
+#include "type_convert.hpp"
+namespace ck {
+namespace dpp8 {
+/// Number of lanes that can share data using DPP8 modifiers.
+constexpr index_t lane_group_size = 8;
+template <int SrcLaneIdx> // declaration only; explicitly specialized below for SrcLaneIdx 0..7
+__device__ void inline_v_dot2c_dpp8_instr(const half2_t& a, const half2_t& b, float& c);
+// clang-format off
+template <>
+__device__ void inline_v_dot2c_dpp8_instr<0>(const half2_t& a, const half2_t& b, float& c){ // each specialization emits one v_dot2c_f32_f16_dpp whose dpp8 selector repeats its SrcLaneIdx eight times
+    asm volatile("\n v_dot2c_f32_f16_dpp %0, %1, %2 dpp8:[0, 0, 0, 0, 0, 0, 0, 0]" : "=v"(c) : "v"(a), "v"(b), "0"(c)); // the "0"(c) input shares register %0 with the output, so c is both read and overwritten
+}
+template <>
+__device__ void inline_v_dot2c_dpp8_instr<1>(const half2_t& a, const half2_t& b, float& c){
+    asm volatile("\n v_dot2c_f32_f16_dpp %0, %1, %2 dpp8:[1, 1, 1, 1, 1, 1, 1, 1]" : "=v"(c) : "v"(a), "v"(b), "0"(c));
+}
+template <>
+__device__ void inline_v_dot2c_dpp8_instr<2>(const half2_t& a, const half2_t& b, float& c){
+    asm volatile("\n v_dot2c_f32_f16_dpp %0, %1, %2 dpp8:[2, 2, 2, 2, 2, 2, 2, 2]" : "=v"(c) : "v"(a), "v"(b), "0"(c));
+}
+template <>
+__device__ void inline_v_dot2c_dpp8_instr<3>(const half2_t& a, const half2_t& b, float& c){
+    asm volatile("\n v_dot2c_f32_f16_dpp %0, %1, %2 dpp8:[3, 3, 3, 3, 3, 3, 3, 3]" : "=v"(c) : "v"(a), "v"(b), "0"(c));
+}
+template <>
+__device__ void inline_v_dot2c_dpp8_instr<4>(const half2_t& a, const half2_t& b, float& c){
+    asm volatile("\n v_dot2c_f32_f16_dpp %0, %1, %2 dpp8:[4, 4, 4, 4, 4, 4, 4, 4]" : "=v"(c) : "v"(a), "v"(b), "0"(c));
+}
+template <>
+__device__ void inline_v_dot2c_dpp8_instr<5>(const half2_t& a, const half2_t& b, float& c){
+    asm volatile("\n v_dot2c_f32_f16_dpp %0, %1, %2 dpp8:[5, 5, 5, 5, 5, 5, 5, 5]" : "=v"(c) : "v"(a), "v"(b), "0"(c));
+}
+template <>
+__device__ void inline_v_dot2c_dpp8_instr<6>(const half2_t& a, const half2_t& b, float& c){
+    asm volatile("\n v_dot2c_f32_f16_dpp %0, %1, %2 dpp8:[6, 6, 6, 6, 6, 6, 6, 6]" : "=v"(c) : "v"(a), "v"(b), "0"(c));
+}
+template <>
+__device__ void inline_v_dot2c_dpp8_instr<7>(const half2_t& a, const half2_t& b, float& c){
+    asm volatile("\n v_dot2c_f32_f16_dpp %0, %1, %2 dpp8:[7, 7, 7, 7, 7, 7, 7, 7]" : "=v"(c) : "v"(a), "v"(b), "0"(c));
+}
+// clang-format on
+/**
+ * Inline-assembly flavour of the DPP8 dot product: forwards to the `v_dot2c_f32_f16_dpp`
+ * wrapper specialized for `SrcLaneIdx`.
+ *
+ * With `ShareA` set, `a` is handed over as the first operand and `b` as the second; otherwise
+ * the two are swapped. `c` is always forwarded as the accumulator reference.
+ */
+template <int SrcLaneIdx, bool ShareA>
+__device__ void inline_v_dot2c_dpp8(const half2_t& a, const half2_t& b, float& c)
+{
+    static_assert(SrcLaneIdx >= 0 && SrcLaneIdx < dpp8::lane_group_size,
+                  "DPP8 src broadcast lane out of range <0, 7>.");
+    // Pick the operand order once; both names are plain references, nothing is copied.
+    const half2_t& first_operand  = ShareA ? a : b;
+    const half2_t& second_operand = ShareA ? b : a;
+    inline_v_dot2c_dpp8_instr<SrcLaneIdx>(first_operand, second_operand, c);
+}
+/**
+ * DPP8 intrinsics expect an integer mask, so the integers for the specific broadcast
+ * patterns are hardcoded here.
+ *
+ * Entry `i` is the lane index `i` repeated in eight consecutive 3-bit fields
+ * (entry 1 is binary 001 repeated eight times = 2396745, entry 7 is 2^24 - 1 = 16777215).
+ *
+ * NOTE(review): this uses `std::array`, but `<array>` is not among the includes visible in
+ * this header -- confirm it is provided transitively.
+ */
+constexpr std::array<int, dpp8::lane_group_size> IntrinsicMaskDpp8 = {
+    0,        // 0, 0, 0, 0, 0, 0, 0, 0
+    2396745,  // 1, 1, 1, 1, 1, 1, 1, 1
+    4793490,  // 2, 2, 2, 2, 2, 2, 2, 2
+    7190235,  // 3, 3, 3, 3, 3, 3, 3, 3
+    9586980,  // 4, 4, 4, 4, 4, 4, 4, 4
+    11983725, // 5, 5, 5, 5, 5, 5, 5, 5
+    14380470, // 6, 6, 6, 6, 6, 6, 6, 6
+    16777215, // 7, 7, 7, 7, 7, 7, 7, 7
+};
+/**
+ * Looks up the hardcoded integer DPP8 selector that broadcasts lane `SrcLaneIdx`, in the form
+ * the intrinsic instruction requires. A `SrcLaneIdx` outside <0, 7> is rejected at compile time.
+ */
+template <int SrcLaneIdx>
+constexpr int get_dpp_sel_mask_broadcast()
+{
+    static_assert(SrcLaneIdx >= 0 && SrcLaneIdx < dpp8::lane_group_size,
+                  "DPP8 src broadcast lane out of range <0, 7>.");
+    const int broadcast_mask = IntrinsicMaskDpp8[SrcLaneIdx];
+    return broadcast_mask;
+}
+/**
+ * Intrinsic-based building block: passes the bits of `a` through `__builtin_amdgcn_mov_dpp8`
+ * with the broadcast selector for `SrcLaneIdx`, then accumulates the dot product of that
+ * value with `b` into `c` via `__builtin_amdgcn_fdot2` (last argument `false`).
+ */
+template <int SrcLaneIdx>
+__device__ void intrinsic_fdot2_impl(const half2_t& a, const half2_t& b, float& c)
+{
+    // The selector has to be a compile-time constant for the builtin.
+    constexpr int sel_mask = get_dpp_sel_mask_broadcast<SrcLaneIdx>();
+    const auto moved_bits  = __builtin_amdgcn_mov_dpp8(bit_cast<int>(a), sel_mask);
+    c = __builtin_amdgcn_fdot2(bit_cast<half2_t>(moved_bits), b, c, false);
+}
+/**
+ * Intrinsic flavour of the DPP8 dot product.
+ *
+ * With `ShareA` set, `a` is the operand routed through the DPP8 move and `b` is used as is;
+ * otherwise the roles of `a` and `b` are swapped. The result is accumulated into `c`.
+ */
+template <int SrcLaneIdx, bool ShareA>
+__device__ void intrinsic_fdot2(const half2_t& a, const half2_t& b, float& c)
+{
+    // Pick the operand order once; both names are plain references, nothing is copied.
+    const half2_t& shared_operand = ShareA ? a : b;
+    const half2_t& local_operand  = ShareA ? b : a;
+    intrinsic_fdot2_impl<SrcLaneIdx>(shared_operand, local_operand, c);
+}
+/**
+ * Entry point for the DPP-assisted dot product of `a` and `b`, accumulated into `c`.
+ *
+ * The DPP modifier lets the lanes of one lane group (8 lanes in DPP8) share one of the two
+ * vectors in a broadcast pattern: every lane of the group uses the vector held by lane
+ * `SrcLaneIdx`. `ShareA` selects which vector is shared: `a` when set, `b` otherwise.
+ *
+ * `SrcLaneIdx` must be in range from 0 to 7.
+ *
+ * The implementation is chosen at preprocessing time: inline assembly when
+ * `CK_USE_AMD_V_DOT_DPP8_INLINE_ASM` is non-zero, compiler intrinsics otherwise.
+ */
+template <typename TA, typename TB, typename TC, int SrcLaneIdx, bool ShareA>
+__device__ void inner_product_dpp(const TA& a, const TB& b, TC& c)
+{
+#if !CK_USE_AMD_V_DOT_DPP8_INLINE_ASM
+    intrinsic_fdot2<SrcLaneIdx, ShareA>(a, b, c);
+#else
+    inline_v_dot2c_dpp8<SrcLaneIdx, ShareA>(a, b, c);
+#endif
+}
+} // namespace dpp8
+} // namespace ck
--- a/include/ck/utility/integral_constant.hpp
+++ b/include/ck/utility/integral_constant.hpp
--- a/include/ck/utility/is_known_at_compile_time.hpp
+++ b/include/ck/utility/is_known_at_compile_time.hpp
--- a/include/ck/utility/loop_scheduler.hpp
+++ b/include/ck/utility/loop_scheduler.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_adaptor.hpp"
+namespace ck {
+/// Loop scheduling variants.
+enum struct LoopScheduler
+{
+    Default,
+    Interwave,
+};
+/// Returns `Interwave` when CK_EXPERIMENTAL_DEFAULT_TO_INTER_WAVE_SCHEDULING is non-zero,
+/// `Default` otherwise.
+constexpr LoopScheduler make_default_loop_scheduler()
+{
+#if !CK_EXPERIMENTAL_DEFAULT_TO_INTER_WAVE_SCHEDULING
+    return LoopScheduler::Default;
+#else
+    return LoopScheduler::Interwave;
+#endif // if !CK_EXPERIMENTAL_DEFAULT_TO_INTER_WAVE_SCHEDULING
+}
+} // namespace ck
--- a/include/ck/utility/magic_division.hpp
+++ b/include/ck/utility/magic_division.hpp
@@ -157,4 +157,76 @@ struct MagicDivision
    }
 };
+/// A divisor stored together with the magic multiplier/shift pair that
+/// MagicDivision::CalculateMagicNumbers derives from it.
+struct MDiv
+{
+    // 1 dword -> 3 dword storage
+    uint32_t divisor;
+    uint32_t multiplier;
+    uint32_t shift; // TODO: 8 bit is enough
+    // prefer construct on host
+    __host__ __device__ MDiv(uint32_t divisor_) : divisor(divisor_)
+    {
+        auto magic = MagicDivision::CalculateMagicNumbers(divisor_);
+        multiplier = magic[Number<0>{}];
+        shift      = magic[Number<1>{}];
+    }
+    /// Default state: all three fields are zero.
+    __host__ __device__ MDiv() : divisor(0), multiplier(0), shift(0) {}
+    /// Replaces the divisor and recomputes the magic numbers for it.
+    __host__ __device__ void update(uint32_t divisor_) { *this = MDiv(divisor_); }
+    /// Quotient of dividend_ as computed by MagicDivision::DoMagicDivision.
+    __host__ __device__ uint32_t div(uint32_t dividend_) const
+    {
+        return MagicDivision::DoMagicDivision(dividend_, multiplier, shift);
+    }
+    /// Writes the quotient first, then derives the remainder from it.
+    __host__ __device__ void
+    divmod(uint32_t dividend_, uint32_t& quotient_, uint32_t& remainder_) const
+    {
+        quotient_  = MagicDivision::DoMagicDivision(dividend_, multiplier, shift);
+        remainder_ = dividend_ - quotient_ * divisor;
+    }
+    /// The stored divisor.
+    __host__ __device__ uint32_t get() const { return divisor; }
+};
+/// Magic multiplier/shift pair only; the divisor itself is not stored.
+struct MDiv2
+{
+    // 1 dword -> 2 dword storage; the divisor has to be supplied at runtime
+    uint32_t multiplier;
+    uint32_t shift; // TODO: 8 bit is enough
+    // prefer construct on host
+    __host__ __device__ MDiv2(uint32_t divisor_)
+    {
+        auto magic = MagicDivision::CalculateMagicNumbers(divisor_);
+        multiplier = magic[Number<0>{}];
+        shift      = magic[Number<1>{}];
+    }
+    /// Default state: both fields are zero.
+    __host__ __device__ MDiv2() : multiplier(0), shift(0) {}
+    /// Quotient of dividend_ as computed by MagicDivision::DoMagicDivision.
+    __host__ __device__ uint32_t div(uint32_t dividend_) const
+    {
+        return MagicDivision::DoMagicDivision(dividend_, multiplier, shift);
+    }
+    /// Writes the quotient first, then derives the remainder using the caller-supplied divisor_.
+    __host__ __device__ void
+    divmod(uint32_t dividend_, uint32_t divisor_, uint32_t& quotient_, uint32_t& remainder_) const
+    {
+        quotient_  = MagicDivision::DoMagicDivision(dividend_, multiplier, shift);
+        remainder_ = dividend_ - quotient_ * divisor_;
+    }
+};
 } // namespace ck
--- a/include/ck/utility/math.hpp
+++ b/include/ck/utility/math.hpp
@@ -240,5 +240,21 @@ struct less
    __host__ __device__ constexpr bool operator()(T x, T y) const { return x < y; }
 };
+/**
+ * Smallest power of two that is greater than or equal to X, as a compile-time constant
+ * (e.g. 4 -> 4, 5 -> 8).
+ *
+ * X must be in [2, 0x40000000]:
+ *  - for X <= 1 the argument of __builtin_clz would be 0 (undefined result) or negative;
+ *  - for X > 0x40000000 the result would be 2^31, which `1 << 31` cannot represent in an int.
+ * The static_assert turns both cases into a readable compile error.
+ */
+template <index_t X>
+__host__ __device__ constexpr auto next_power_of_two()
+{
+    static_assert(X >= 2 && X <= 0x40000000,
+                  "next_power_of_two<X> requires 2 <= X <= 0x40000000.");
+    // 32 - clz(X - 1) is the number of bits needed to represent X - 1.
+    constexpr index_t Y = 1 << (32 - __builtin_clz(X - 1));
+    return Y;
+}
+/**
+ * Same computation for a `Number<X>` argument; the result is returned as a `Number<>` so it
+ * stays usable as a compile-time value. The same range restriction on X applies.
+ */
+template <index_t X>
+__host__ __device__ constexpr auto next_power_of_two(Number<X>)
+{
+    return Number<next_power_of_two<X>()>{};
+}
 } // namespace math
 } // namespace ck
--- a/include/ck/utility/math_v2.hpp
+++ b/include/ck/utility/math_v2.hpp
--- a/include/ck/utility/multi_index.hpp
+++ b/include/ck/utility/multi_index.hpp
--- a/include/ck/utility/number.hpp
+++ b/include/ck/utility/number.hpp
--- a/include/ck/utility/random_gen.hpp
+++ b/include/ck/utility/random_gen.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+namespace ck {
+// Pseudo random number generator
+// version for fp32
+// Mixes the bit pattern of `val` with `id` and `seed` into a 32-bit value.
+template <typename T, uint32_t seed_t, std::enable_if_t<std::is_same<float, T>{}, bool> = false>
+__host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed = seed_t)
+{
+    // Copy the raw bits out of val. Reading a float through a uint32_t* obtained with
+    // reinterpret_cast violates the strict-aliasing rule; a byte copy is well defined.
+    uint32_t x;
+    __builtin_memcpy(&x, &val, sizeof(x));
+    // Fold the upper 16 bits into the lower 16 bits.
+    uint32_t drop_bits = x & 0xFFFFu;
+    drop_bits ^= x >> 16;
+    // Rotate the 16-bit value right by 5.
+    drop_bits = ((drop_bits & 31) << 11) | (drop_bits >> 5);
+    drop_bits *= 0x7000149;
+    // NOTE: If id is in 64 bit, we are only using lower 32 bit.
+    //       So, it can have an effect of using same id for multiple elements when the id is very
+    //       large!
+    // The multiplication is done in uint32_t so that it wraps instead of overflowing a signed int.
+    uint32_t rng = (drop_bits ^ 0x13371337 ^ (static_cast<uint32_t>(id) * 229791u) ^ seed);
+    return rng;
+}
+// version for fp16
+// Mixes the 16-bit pattern of `val` with `id` and `seed` into a 32-bit value.
+template <typename T, uint32_t seed_t, std::enable_if_t<std::is_same<half_t, T>{}, bool> = false>
+__host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed = seed_t)
+{
+    // Copy the raw bits out of val. Reading a half_t through a uint16_t* obtained with
+    // reinterpret_cast violates the strict-aliasing rule; a byte copy is well defined.
+    uint16_t x;
+    __builtin_memcpy(&x, &val, sizeof(x));
+    uint32_t drop_bits = uint32_t(x) & 0xFFFFu;
+    // Rotate the 16-bit value right by 5.
+    drop_bits = ((drop_bits & 31) << 11) | (drop_bits >> 5);
+    drop_bits *= 0x7000149;
+    // NOTE: If id is in 64 bit, we are only using lower 32 bit.
+    //       So, it can have an effect of using same id for multiple elements when the id is very
+    //       large!
+    // The multiplication is done in uint32_t so that it wraps instead of overflowing a signed int.
+    uint32_t rng = (drop_bits ^ 0x13371337 ^ (static_cast<uint32_t>(id) * 229791u) ^ seed);
+    return rng;
+}
+// Fallback for every T other than float and half_t: nothing is generated, the result is 0.
+template <typename T,
+          uint32_t seed_t,
+          std::enable_if_t<!(std::is_same<float, T>{} || std::is_same<half_t, T>{}), bool> = false>
+__host__ __device__ uint32_t prand_generator(int id, T val, uint32_t seed = seed_t)
+{
+    // None of the arguments take part in the result.
+    static_cast<void>(id);
+    static_cast<void>(val);
+    static_cast<void>(seed);
+    return 0;
+}
+} // namespace ck
--- a/include/ck/utility/reduction_common.hpp
+++ b/include/ck/utility/reduction_common.hpp
@@ -25,16 +25,4 @@ struct float_equal_zero
    };
 };
-template <index_t N>
-static constexpr __device__ index_t get_shift()
-{
-    return (get_shift<N / 2>() + 1);
-};
-template <>
-constexpr __device__ index_t get_shift<1>()
-{
-    return (0);
-}
 } // namespace ck
--- a/include/ck/utility/reduction_enums.hpp
+++ b/include/ck/utility/reduction_enums.hpp
--- a/include/ck/utility/reduction_functions_accumulate.hpp
+++ b/include/ck/utility/reduction_functions_accumulate.hpp
--- a/include/ck/utility/reduction_operator.hpp
+++ b/include/ck/utility/reduction_operator.hpp
@@ -6,6 +6,7 @@
 #include "ck/ck.hpp"
 #include "ck/utility/data_type.hpp"
 #include "ck/utility/type.hpp"
+#include "ck/utility/type_convert.hpp"
 namespace ck {
@@ -115,7 +116,15 @@ struct Max
    template <typename T>
    __host__ __device__ static constexpr T GetIdentityValue()
    {
-        return NumericLimits<T>::Lowest();
+        if constexpr(is_same_v<T, bhalf_t>)
+        {
+            float val = NumericLimits<float>::Lowest();
+            return type_convert<bhalf_t>(val);
+        }
+        else
+        {
+            return NumericLimits<T>::Lowest();
+        }
    };
    __host__ __device__ static constexpr bool
@@ -137,6 +146,15 @@ struct Max
            a = b;
    }
+    // bhalf_t overload: the operands are compared as float; `a` takes `b` when `b` is larger.
+    __host__ __device__ inline constexpr void operator()(bhalf_t& a, bhalf_t b) const
+    {
+        const float lhs = type_convert<float>(a);
+        const float rhs = type_convert<float>(b);
+        if(rhs > lhs)
+        {
+            a = b;
+        }
+    }
    template <typename T>
    __host__ __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
    {
@@ -151,6 +169,18 @@ struct Max
            changed = true;
        }
    }
+    // bhalf_t overload: the operands are compared as float; when `b` is larger, `a` takes `b`
+    // and `changed` is set. `changed` is left untouched otherwise.
+    __host__ __device__ inline constexpr void operator()(bhalf_t& a, bhalf_t b, bool& changed) const
+    {
+        const float lhs = type_convert<float>(a);
+        const float rhs = type_convert<float>(b);
+        if(!(rhs > lhs))
+        {
+            return;
+        }
+        a       = b;
+        changed = true;
+    }
 };
 struct Min
@@ -158,6 +188,15 @@ struct Min
    template <typename T>
    __host__ __device__ static constexpr T GetIdentityValue()
    {
-        return NumericLimits<T>::Max();
+        // bhalf_t: take the float maximum and convert it, as Max does with the float lowest.
+        if constexpr(is_same_v<T, bhalf_t>)
+        {
+            float val = NumericLimits<float>::Max();
+            return type_convert<bhalf_t>(val);
+        }
+        else
+        {
+            return NumericLimits<T>::Max();
+        }
    };
@@ -180,6 +219,15 @@ struct Min
            a = b;
    }
+    // bhalf_t overload: the operands are compared as float; `a` takes `b` when `b` is smaller.
+    __host__ __device__ inline constexpr void operator()(bhalf_t& a, bhalf_t b) const
+    {
+        const float lhs = type_convert<float>(a);
+        const float rhs = type_convert<float>(b);
+        if(rhs < lhs)
+        {
+            a = b;
+        }
+    }
    template <typename T>
    __host__ __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
    {
@@ -194,6 +242,18 @@ struct Min
            changed = true;
        }
    }
+    // bhalf_t overload: the operands are compared as float; when `b` is smaller, `a` takes `b`
+    // and `changed` is set. `changed` is left untouched otherwise.
+    __host__ __device__ inline constexpr void operator()(bhalf_t& a, bhalf_t b, bool& changed) const
+    {
+        const float lhs = type_convert<float>(a);
+        const float rhs = type_convert<float>(b);
+        if(!(rhs < lhs))
+        {
+            return;
+        }
+        a       = b;
+        changed = true;
+    }
 };
 struct AMax

--- a/include/ck/utility/sequence.hpp
+++ b/include/ck/utility/sequence.hpp
--- a/include/ck/utility/sequence_helper.hpp
+++ b/include/ck/utility/sequence_helper.hpp