#ifndef CK_AMD_DLOP_HPP #define CK_AMD_DLOP_HPP #include "data_type.hpp" namespace ck { template __device__ void amd_inner_product_dlop(const TA& a, const TB& b, TC& c); template <> __device__ void amd_inner_product_dlop(const float& a, const float& b, float& c) { #if CK_USE_AMD_DLOP_INLINE_ASM asm volatile("\n \ v_fmac_f32 %0, %1, %2 \n \ " : "=v"(c) : "v"(a), "v"(b), "0"(c)); #else c += a * b; #endif } template <> __device__ void amd_inner_product_dlop(const float2_t& a, const float2_t& b, float& c) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; amd_inner_product_dlop(vector_type{a}.AsType()[I0], vector_type{b}.AsType()[I0], c); amd_inner_product_dlop(vector_type{a}.AsType()[I1], vector_type{b}.AsType()[I1], c); } template <> __device__ void amd_inner_product_dlop(const float4_t& a, const float4_t& b, float& c) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; amd_inner_product_dlop(vector_type{a}.AsType()[I0], vector_type{b}.AsType()[I0], c); amd_inner_product_dlop(vector_type{a}.AsType()[I1], vector_type{b}.AsType()[I1], c); amd_inner_product_dlop(vector_type{a}.AsType()[I2], vector_type{b}.AsType()[I2], c); amd_inner_product_dlop(vector_type{a}.AsType()[I3], vector_type{b}.AsType()[I3], c); } #if CK_USE_AMD_DLOP template <> __device__ void amd_inner_product_dlop(const half2_t& a, const half2_t& b, float& c) { #if CK_USE_AMD_DLOP_INLINE_ASM asm volatile("\n \ v_dot2_f32_f16 %0, %1, %2, %0\n \ " : "=v"(c) : "v"(a), "v"(b), "0"(c)); #else c = __builtin_amdgcn_sdot2(a, b, c, false); #endif } template <> __device__ void amd_inner_product_dlop(const half4_t& a, const half4_t& b, float& c) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; amd_inner_product_dlop(vector_type{a}.AsType()[I0], vector_type{b}.AsType()[I0], c); amd_inner_product_dlop(vector_type{a}.AsType()[I1], vector_type{b}.AsType()[I1], c); } template <> __device__ void amd_inner_product_dlop(const half8_t& a, const half8_t& b, float& c) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; amd_inner_product_dlop(vector_type{a}.AsType()[I0], vector_type{b}.AsType()[I0], c); amd_inner_product_dlop(vector_type{a}.AsType()[I1], vector_type{b}.AsType()[I1], c); amd_inner_product_dlop(vector_type{a}.AsType()[I2], vector_type{b}.AsType()[I2], c); amd_inner_product_dlop(vector_type{a}.AsType()[I3], vector_type{b}.AsType()[I3], c); } template <> __device__ void amd_inner_product_dlop(const int8x4_t& a, const int8x4_t& b, int32_t& c) { #if CK_USE_AMD_DLOP_INLINE_ASM asm volatile("\n \ v_dot4_i32_i8 %0, %1, %2, %0\n \ " : "=v"(c) : "v"(as_type(a)), "v"(as_type(b)), "0"(c)); #else c = __builtin_amdgcn_sdot4(as_type(a), as_type(b), c, false); #endif } template <> __device__ void amd_inner_product_dlop(const int8x8_t& a, const int8x8_t& b, int32_t& c) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; amd_inner_product_dlop(vector_type{a}.AsType()[I0], vector_type{b}.AsType()[I0], c); amd_inner_product_dlop(vector_type{a}.AsType()[I1], vector_type{b}.AsType()[I1], c); } template <> __device__ void amd_inner_product_dlop(const int8x16_t& a, const int8x16_t& b, int32_t& c) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; amd_inner_product_dlop(vector_type{a}.AsType()[I0], vector_type{b}.AsType()[I0], c); amd_inner_product_dlop(vector_type{a}.AsType()[I1], vector_type{b}.AsType()[I1], c); amd_inner_product_dlop(vector_type{a}.AsType()[I2], vector_type{b}.AsType()[I2], c); amd_inner_product_dlop(vector_type{a}.AsType()[I3], vector_type{b}.AsType()[I3], c); } #endif // CK_USE_AMD_DLOP } // namespace ck #endif