Commit 29448ffd authored by Harisankar Sadasivan's avatar Harisankar Sadasivan
Browse files

merge from develop and revisison for pr#881

parents 9223a5e2 8f84a012
File mode changed from 100644 to 100755
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
namespace ck {
template <index_t N>
static constexpr __device__ index_t get_shift()
{
return (get_shift<N / 2>() + 1);
};
template <>
constexpr __device__ index_t get_shift<1>()
{
return (0);
}
} // namespace ck
File mode changed from 100644 to 100755
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
#pragma once #pragma once
#include "data_type.hpp" #include "data_type.hpp"
#include "type_convert.hpp"
namespace ck { namespace ck {
...@@ -18,13 +19,13 @@ __device__ void inner_product<half_t, half_t, float>(const half_t& a, const half ...@@ -18,13 +19,13 @@ __device__ void inner_product<half_t, half_t, float>(const half_t& a, const half
template <> template <>
__device__ void inner_product<float, float, float>(const float& a, const float& b, float& c) __device__ void inner_product<float, float, float>(const float& a, const float& b, float& c)
{ {
#if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM && defined(CK_USE_AMD_V_MAC_F32) #if CK_USE_AMD_V_MAC_INLINE_ASM && defined(CK_USE_AMD_V_MAC_F32)
asm volatile("\n \ asm volatile("\n \
v_mac_f32 %0, %1, %2 \n \ v_mac_f32 %0, %1, %2 \n \
" "
: "=v"(c) : "=v"(c)
: "v"(a), "v"(b), "0"(c)); : "v"(a), "v"(b), "0"(c));
#elif CK_USE_AMD_INNER_PRODUCT_INLINE_ASM && defined(CK_USE_AMD_V_FMAC_F32) #elif CK_USE_AMD_V_MAC_INLINE_ASM && defined(CK_USE_AMD_V_FMAC_F32)
asm volatile("\n \ asm volatile("\n \
v_fmac_f32 %0, %1, %2 \n \ v_fmac_f32 %0, %1, %2 \n \
" "
...@@ -81,22 +82,26 @@ template <> ...@@ -81,22 +82,26 @@ template <>
__device__ void inner_product<half2_t, half2_t, float>(const half2_t& a, const half2_t& b, float& c) __device__ void inner_product<half2_t, half2_t, float>(const half2_t& a, const half2_t& b, float& c)
{ {
#if defined(CK_USE_AMD_V_DOT2_F32_F16) #if defined(CK_USE_AMD_V_DOT2_F32_F16)
#if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM #if CK_USE_AMD_V_DOT_INLINE_ASM
// Use 3 x s_nop to avoid hazard (mi200 cdna2 isa page 47
// https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf
// ) s_nop with parameter 2 is equal to 3 x s_nop
asm volatile("\n \ asm volatile("\n \
v_dot2_f32_f16 %0, %1, %2, %0\n \ v_dot2_f32_f16 %0, %1, %2, %0\n \
s_nop 2 \n \
" "
: "=v"(c) : "=v"(c)
: "v"(a), "v"(b), "0"(c)); : "v"(a), "v"(b), "0"(c));
#else #else
c = __builtin_amdgcn_sdot2(a, b, c, false); c = __builtin_amdgcn_fdot2(a, b, c, false);
#endif #endif
#else #else
const vector_type<half_t, 2> a_vector{a}; const vector_type<half_t, 2> a_vector{a};
const vector_type<half_t, 2> b_vector{b}; const vector_type<half_t, 2> b_vector{b};
static_for<0, 2, 1>{}([&](auto i) { static_for<0, 2, 1>{}([&](auto i) {
c += type_convert<int32_t>(a_vector.AsType<half_t>()[i]) * c += type_convert<float>(a_vector.AsType<half_t>()[i]) *
type_convert<int32_t>(b_vector.AsType<half_t>()[i]); type_convert<float>(b_vector.AsType<half_t>()[i]);
}); });
#endif #endif
} }
...@@ -168,9 +173,13 @@ __device__ void ...@@ -168,9 +173,13 @@ __device__ void
inner_product<int8x4_t, int8x4_t, int32_t>(const int8x4_t& a, const int8x4_t& b, int32_t& c) inner_product<int8x4_t, int8x4_t, int32_t>(const int8x4_t& a, const int8x4_t& b, int32_t& c)
{ {
#if defined(CK_USE_AMD_V_DOT4_I32_I8) #if defined(CK_USE_AMD_V_DOT4_I32_I8)
#if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM #if CK_USE_AMD_V_DOT_INLINE_ASM
// Use 3 x s_nop to avoid hazard (mi200 cdna2 isa page 47
// https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf
// ) s_nop with parameter 2 is equal to 3 x s_nop
asm volatile("\n \ asm volatile("\n \
v_dot4_i32_i8 %0, %1, %2, %0\n \ v_dot4_i32_i8 %0, %1, %2, %0\n \
s_nop 2 \n \
" "
: "=v"(c) : "=v"(c)
: "v"(bit_cast<int32_t>(a)), "v"(bit_cast<int32_t>(b)), "0"(c)); : "v"(bit_cast<int32_t>(a)), "v"(bit_cast<int32_t>(b)), "0"(c));
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "amd_gemm_dpp.hpp"
#include "data_type.hpp"
#include "type_convert.hpp"
namespace ck {
namespace dpp8 {
/// Number of lanes that can share data using DPP8 modifiers.
constexpr index_t lane_group_size = 8;
template <int SrcLaneIdx>
__device__ void inline_v_dot2c_dpp8_instr(const half2_t& a, const half2_t& b, float& c);
// clang-format off
template <>
__device__ void inline_v_dot2c_dpp8_instr<0>(const half2_t& a, const half2_t& b, float& c){
asm volatile("\n v_dot2c_f32_f16_dpp %0, %1, %2 dpp8:[0, 0, 0, 0, 0, 0, 0, 0]" : "=v"(c) : "v"(a), "v"(b), "0"(c));
}
template <>
__device__ void inline_v_dot2c_dpp8_instr<1>(const half2_t& a, const half2_t& b, float& c){
asm volatile("\n v_dot2c_f32_f16_dpp %0, %1, %2 dpp8:[1, 1, 1, 1, 1, 1, 1, 1]" : "=v"(c) : "v"(a), "v"(b), "0"(c));
}
template <>
__device__ void inline_v_dot2c_dpp8_instr<2>(const half2_t& a, const half2_t& b, float& c){
asm volatile("\n v_dot2c_f32_f16_dpp %0, %1, %2 dpp8:[2, 2, 2, 2, 2, 2, 2, 2]" : "=v"(c) : "v"(a), "v"(b), "0"(c));
}
template <>
__device__ void inline_v_dot2c_dpp8_instr<3>(const half2_t& a, const half2_t& b, float& c){
asm volatile("\n v_dot2c_f32_f16_dpp %0, %1, %2 dpp8:[3, 3, 3, 3, 3, 3, 3, 3]" : "=v"(c) : "v"(a), "v"(b), "0"(c));
}
template <>
__device__ void inline_v_dot2c_dpp8_instr<4>(const half2_t& a, const half2_t& b, float& c){
asm volatile("\n v_dot2c_f32_f16_dpp %0, %1, %2 dpp8:[4, 4, 4, 4, 4, 4, 4, 4]" : "=v"(c) : "v"(a), "v"(b), "0"(c));
}
template <>
__device__ void inline_v_dot2c_dpp8_instr<5>(const half2_t& a, const half2_t& b, float& c){
asm volatile("\n v_dot2c_f32_f16_dpp %0, %1, %2 dpp8:[5, 5, 5, 5, 5, 5, 5, 5]" : "=v"(c) : "v"(a), "v"(b), "0"(c));
}
template <>
__device__ void inline_v_dot2c_dpp8_instr<6>(const half2_t& a, const half2_t& b, float& c){
asm volatile("\n v_dot2c_f32_f16_dpp %0, %1, %2 dpp8:[6, 6, 6, 6, 6, 6, 6, 6]" : "=v"(c) : "v"(a), "v"(b), "0"(c));
}
template <>
__device__ void inline_v_dot2c_dpp8_instr<7>(const half2_t& a, const half2_t& b, float& c){
asm volatile("\n v_dot2c_f32_f16_dpp %0, %1, %2 dpp8:[7, 7, 7, 7, 7, 7, 7, 7]" : "=v"(c) : "v"(a), "v"(b), "0"(c));
}
// clang-format on
/**
* Dot product of two vectors using `v_dot` instruction with DPP8 submitted as inline assembly.
*/
template <int SrcLaneIdx, bool ShareA>
__device__ void inline_v_dot2c_dpp8(const half2_t& a, const half2_t& b, float& c)
{
static_assert(SrcLaneIdx >= 0 && SrcLaneIdx < dpp8::lane_group_size,
"DPP8 src broadcast lane out of range <0, 7>.");
if constexpr(ShareA)
{
inline_v_dot2c_dpp8_instr<SrcLaneIdx>(a, b, c);
}
else
{
inline_v_dot2c_dpp8_instr<SrcLaneIdx>(b, a, c);
}
}
/**
* DPP8 instrinsics expects to get an integer mask, hardcoding integers for specific broadcast
* patters.
*/
constexpr std::array<int, dpp8::lane_group_size> IntrinsicMaskDpp8 = {
0, // 0, 0, 0, 0, 0, 0, 0, 0
2396745, // 1, 1, 1, 1, 1, 1, 1, 1
4793490, // 2, 2, 2, 2, 2, 2, 2, 2
7190235, // 3, 3, 3, 3, 3, 3, 3, 3
9586980, // 4, 4, 4, 4, 4, 4, 4, 4
11983725, // 5, 5, 5, 5, 5, 5, 5, 5
14380470, // 6, 6, 6, 6, 6, 6, 6, 6
16777215, // 7, 7, 7, 7, 7, 7, 7, 7
};
/**
* Returns DPP8 sel modifier as an integer required for the intrinsic instruction.
*/
template <int SrcLaneIdx>
constexpr int get_dpp_sel_mask_broadcast()
{
static_assert(SrcLaneIdx >= 0 && SrcLaneIdx < dpp8::lane_group_size,
"DPP8 src broadcast lane out of range <0, 7>.");
return IntrinsicMaskDpp8[SrcLaneIdx];
}
template <int SrcLaneIdx>
__device__ void intrinsic_fdot2_impl(const half2_t& a, const half2_t& b, float& c)
{
constexpr int sel_mask = get_dpp_sel_mask_broadcast<SrcLaneIdx>();
const half2_t val_from_other_lane =
bit_cast<half2_t>(__builtin_amdgcn_mov_dpp8(bit_cast<int>(a), sel_mask));
c = __builtin_amdgcn_fdot2(val_from_other_lane, b, c, false);
}
/**
* Dot product of two vectors using `v_dot` instruction with DPP8 submitted using intrinsics.
*/
template <int SrcLaneIdx, bool ShareA>
__device__ void intrinsic_fdot2(const half2_t& a, const half2_t& b, float& c)
{
if constexpr(ShareA)
{
intrinsic_fdot2_impl<SrcLaneIdx>(a, b, c);
}
else
{
intrinsic_fdot2_impl<SrcLaneIdx>(b, a, c);
}
}
/**
* Dot product of two input vectors `a`, `b` using `v_dot` instructions with DPP modifier.
*
* DPP modifier allows us to share one of the vectors between lanes in a lane group.
* When `ShareA` is set, instruction uses vector `a` from lane `SrcLaneIdx` from the same
* lane group (8 lanes per lane group in DPP8). When `ShareA` is not set, vector `b` is shared.
* Note that all the threads in a lane group uses the same vector - broadcast pattern.
*
* `SrcLaneIdx` must be in range from 0 to 7.
*/
template <typename TA, typename TB, typename TC, int SrcLaneIdx, bool ShareA>
__device__ void inner_product_dpp(const TA& a, const TB& b, TC& c)
{
#if CK_USE_AMD_V_DOT_DPP8_INLINE_ASM
inline_v_dot2c_dpp8<SrcLaneIdx, ShareA>(a, b, c);
#else
intrinsic_fdot2<SrcLaneIdx, ShareA>(a, b, c);
#endif
}
} // namespace dpp8
} // namespace ck
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_adaptor.hpp"
namespace ck {
enum struct LoopScheduler
{
Default,
Interwave,
};
constexpr LoopScheduler make_default_loop_scheduler()
{
#if CK_EXPERIMENTAL_DEFAULT_TO_INTER_WAVE_SCHEDULING
return LoopScheduler::Interwave;
#else
return LoopScheduler::Default;
#endif // if CK_EXPERIMENTAL_DEFAULT_TO_INTER_WAVE_SCHEDULING
}
} // namespace ck
...@@ -157,4 +157,76 @@ struct MagicDivision ...@@ -157,4 +157,76 @@ struct MagicDivision
} }
}; };
struct MDiv
{
// 1 dword -> 3 dword storage
uint32_t divisor;
uint32_t multiplier;
uint32_t shift; // TODO: 8 bit is enough
// prefer construct on host
__host__ __device__ MDiv(uint32_t divisor_) : divisor(divisor_)
{
auto tmp = MagicDivision::CalculateMagicNumbers(divisor_);
multiplier = tmp[Number<0>{}];
shift = tmp[Number<1>{}];
}
__host__ __device__ MDiv() : divisor(0), multiplier(0), shift(0) {}
__host__ __device__ void update(uint32_t divisor_)
{
divisor = divisor_;
auto tmp = MagicDivision::CalculateMagicNumbers(divisor_);
multiplier = tmp[Number<0>{}];
shift = tmp[Number<1>{}];
}
__host__ __device__ uint32_t div(uint32_t dividend_) const
{
return MagicDivision::DoMagicDivision(dividend_, multiplier, shift);
}
__host__ __device__ void
divmod(uint32_t dividend_, uint32_t& quotient_, uint32_t& remainder_) const
{
quotient_ = div(dividend_);
remainder_ = dividend_ - (quotient_ * divisor);
}
__host__ __device__ uint32_t get() const { return divisor; }
};
struct MDiv2
{
// 1 dword -> 2 dword storage, divisor need compute from runtime
uint32_t multiplier;
uint32_t shift; // TODO: 8 bit is enough
// prefer construct on host
__host__ __device__ MDiv2(uint32_t divisor_)
{
auto tmp = MagicDivision::CalculateMagicNumbers(divisor_);
multiplier = tmp[Number<0>{}];
shift = tmp[Number<1>{}];
}
__host__ __device__ MDiv2() : multiplier(0), shift(0) {}
__host__ __device__ uint32_t div(uint32_t dividend_) const
{
return MagicDivision::DoMagicDivision(dividend_, multiplier, shift);
}
__host__ __device__ void
divmod(uint32_t dividend_, uint32_t divisor_, uint32_t& quotient_, uint32_t& remainder_) const
{
quotient_ = div(dividend_);
remainder_ = dividend_ - (quotient_ * divisor_);
}
};
} // namespace ck } // namespace ck
...@@ -240,5 +240,21 @@ struct less ...@@ -240,5 +240,21 @@ struct less
__host__ __device__ constexpr bool operator()(T x, T y) const { return x < y; } __host__ __device__ constexpr bool operator()(T x, T y) const { return x < y; }
}; };
template <index_t X>
__host__ __device__ constexpr auto next_power_of_two()
{
// TODO: X need to be 2 ~ 0x7fffffff. 0, 1, or larger than 0x7fffffff will compile fail
constexpr index_t Y = 1 << (32 - __builtin_clz(X - 1));
return Y;
}
template <index_t X>
__host__ __device__ constexpr auto next_power_of_two(Number<X> x)
{
// TODO: X need to be 2 ~ 0x7fffffff. 0, 1, or larger than 0x7fffffff will compile fail
constexpr index_t Y = 1 << (32 - __builtin_clz(x.value - 1));
return Number<Y>{};
}
} // namespace math } // namespace math
} // namespace ck } // namespace ck
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
namespace ck {
// Pseudo random number generator
// version for fp32
template <typename T, uint32_t seed_t, std::enable_if_t<std::is_same<float, T>{}, bool> = false>
__host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed = seed_t)
{
uint32_t x = *(reinterpret_cast<uint32_t*>(&val));
uint32_t drop_bits = uint32_t(x) & 0xFFFFu;
drop_bits ^= x >> 16;
drop_bits = ((drop_bits & 31) << 11) | (drop_bits >> 5);
drop_bits *= 0x7000149;
// NOTE: If id is in 64 bit, we are only using lower 32 bit.
// So, it can have an effect of using same id for multiple elements when the id is very
// large!
uint32_t rng = (drop_bits ^ 0x13371337 ^ (id * 229791) ^ seed);
return rng;
}
// version for fp16
template <typename T, uint32_t seed_t, std::enable_if_t<std::is_same<half_t, T>{}, bool> = false>
__host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed = seed_t)
{
uint16_t x = *(reinterpret_cast<uint16_t*>(&val));
uint32_t drop_bits = uint32_t(x) & 0xFFFFu;
drop_bits = ((drop_bits & 31) << 11) | (drop_bits >> 5);
drop_bits *= 0x7000149;
// NOTE: If id is in 64 bit, we are only using lower 32 bit.
// So, it can have an effect of using same id for multiple elements when the id is very
// large!
uint32_t rng = (drop_bits ^ 0x13371337 ^ (id * 229791) ^ seed);
return rng;
}
// return 0 if data is not fp16 or fp32
template <typename T,
uint32_t seed_t,
std::enable_if_t<!(std::is_same<float, T>{} || std::is_same<half_t, T>{}), bool> = false>
__host__ __device__ uint32_t prand_generator(int id, T val, uint32_t seed = seed_t)
{
std::ignore = id;
std::ignore = val;
std::ignore = seed;
return 0;
}
} // namespace ck
...@@ -25,16 +25,4 @@ struct float_equal_zero ...@@ -25,16 +25,4 @@ struct float_equal_zero
}; };
}; };
template <index_t N>
static constexpr __device__ index_t get_shift()
{
return (get_shift<N / 2>() + 1);
};
template <>
constexpr __device__ index_t get_shift<1>()
{
return (0);
}
} // namespace ck } // namespace ck
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/utility/data_type.hpp" #include "ck/utility/data_type.hpp"
#include "ck/utility/type.hpp" #include "ck/utility/type.hpp"
#include "ck/utility/type_convert.hpp"
namespace ck { namespace ck {
...@@ -115,7 +116,15 @@ struct Max ...@@ -115,7 +116,15 @@ struct Max
template <typename T> template <typename T>
__host__ __device__ static constexpr T GetIdentityValue() __host__ __device__ static constexpr T GetIdentityValue()
{ {
return NumericLimits<T>::Lowest(); if constexpr(is_same_v<T, bhalf_t>)
{
float val = NumericLimits<float>::Lowest();
return type_convert<bhalf_t>(val);
}
else
{
return NumericLimits<T>::Lowest();
}
}; };
__host__ __device__ static constexpr bool __host__ __device__ static constexpr bool
...@@ -137,6 +146,15 @@ struct Max ...@@ -137,6 +146,15 @@ struct Max
a = b; a = b;
} }
__host__ __device__ inline constexpr void operator()(bhalf_t& a, bhalf_t b) const
{
float a_ = type_convert<float>(a);
float b_ = type_convert<float>(b);
if(a_ < b_)
a = b;
}
template <typename T> template <typename T>
__host__ __device__ inline constexpr void operator()(T& a, T b, bool& changed) const __host__ __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
{ {
...@@ -151,6 +169,18 @@ struct Max ...@@ -151,6 +169,18 @@ struct Max
changed = true; changed = true;
} }
} }
__host__ __device__ inline constexpr void operator()(bhalf_t& a, bhalf_t b, bool& changed) const
{
float a_ = type_convert<float>(a);
float b_ = type_convert<float>(b);
if(a_ < b_)
{
a = b;
changed = true;
}
}
}; };
struct Min struct Min
...@@ -158,6 +188,15 @@ struct Min ...@@ -158,6 +188,15 @@ struct Min
template <typename T> template <typename T>
__host__ __device__ static constexpr T GetIdentityValue() __host__ __device__ static constexpr T GetIdentityValue()
{ {
if constexpr(is_same_v<T, bhalf_t>)
{
float val = NumericLimits<float>::Max();
return type_convert<bhalf_t>(val);
}
else
{
return NumericLimits<T>::Max();
}
return NumericLimits<T>::Max(); return NumericLimits<T>::Max();
}; };
...@@ -180,6 +219,15 @@ struct Min ...@@ -180,6 +219,15 @@ struct Min
a = b; a = b;
} }
__host__ __device__ inline constexpr void operator()(bhalf_t& a, bhalf_t b) const
{
float a_ = type_convert<float>(a);
float b_ = type_convert<float>(b);
if(a_ > b_)
a = b;
}
template <typename T> template <typename T>
__host__ __device__ inline constexpr void operator()(T& a, T b, bool& changed) const __host__ __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
{ {
...@@ -194,6 +242,18 @@ struct Min ...@@ -194,6 +242,18 @@ struct Min
changed = true; changed = true;
} }
} }
__host__ __device__ inline constexpr void operator()(bhalf_t& a, bhalf_t b, bool& changed) const
{
float a_ = type_convert<float>(a);
float b_ = type_convert<float>(b);
if(a_ > b_)
{
a = b;
changed = true;
}
}
}; };
struct AMax struct AMax
......
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment