Commit ae7cef7c authored by illsilin's avatar illsilin
Browse files

replace inline assembly with builtins in FHMA

parent 1ff50e78
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
#include "ck_tile/core/utility/bit_cast.hpp" #include "ck_tile/core/utility/bit_cast.hpp"
#include "ck_tile/core/utility/functional.hpp" #include "ck_tile/core/utility/functional.hpp"
#define LIKELY(x) (__builtin_expect(!!(x), 1))
namespace ck_tile { namespace ck_tile {
// 128 bit SGPRs to supply buffer resource in buffer instructions // 128 bit SGPRs to supply buffer resource in buffer instructions
...@@ -60,6 +62,12 @@ struct buffer_load; ...@@ -60,6 +62,12 @@ struct buffer_load;
#pragma clang diagnostic ignored "-Wundefined-reinterpret-cast" #pragma clang diagnostic ignored "-Wundefined-reinterpret-cast"
// TODO: strict aliasing rule seems fail when reinterpret_cast between vector type // TODO: strict aliasing rule seems fail when reinterpret_cast between vector type
// (exp_vector_type(xxx)) // (exp_vector_type(xxx))
union BR {
int32x4_t res;
__amdgpu_buffer_rsrc_t opaque;
};
template <bool pre_nop> template <bool pre_nop>
struct buffer_load<16, pre_nop> struct buffer_load<16, pre_nop>
{ {
...@@ -67,24 +75,15 @@ struct buffer_load<16, pre_nop> ...@@ -67,24 +75,15 @@ struct buffer_load<16, pre_nop>
CK_TILE_DEVICE void operator()(T& value, CK_TILE_DEVICE void operator()(T& value,
int32x4_t res /*buffer resource*/, int32x4_t res /*buffer resource*/,
index_t v_offset, index_t v_offset,
index_t /*s_offset*/, index_t s_offset,
index_t i_offset /*max 0xFFF*/, index_t /*max 0xFFF*/,
index_t /*flag*/ = 0, index_t /*flag*/ = 0,
bool_constant<pre_nop> = {}) bool_constant<pre_nop> = {})
{ {
static_assert(sizeof(T) == 16); static_assert(sizeof(T) == 16);
using mbuf_t = typename impl::buffer_load_trait<16, T>::payload_t; using mbuf_t = typename impl::buffer_load_trait<16, T>::payload_t;
if constexpr(pre_nop) const BR br{res};
asm volatile("s_nop 4\n" reinterpret_cast<mbuf_t&>(value) = __builtin_amdgcn_raw_buffer_load_b128(br.opaque, v_offset, s_offset, 0);
"buffer_load_dwordx4 %0, %1, %2, 0 offen offset:%3"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
else
asm volatile("buffer_load_dwordx4 %0, %1, %2, 0 offen offset:%3"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
} }
}; };
...@@ -95,24 +94,15 @@ struct buffer_load<8, pre_nop> ...@@ -95,24 +94,15 @@ struct buffer_load<8, pre_nop>
CK_TILE_DEVICE void operator()(T& value, CK_TILE_DEVICE void operator()(T& value,
int32x4_t res /*buffer resource*/, int32x4_t res /*buffer resource*/,
index_t v_offset, index_t v_offset,
index_t /*s_offset*/, index_t s_offset,
index_t i_offset /*max 0xFFF*/, index_t /*max 0xFFF*/,
index_t /*flag*/ = 0, index_t /*flag*/ = 0,
bool_constant<pre_nop> = {}) bool_constant<pre_nop> = {})
{ {
static_assert(sizeof(T) == 8); static_assert(sizeof(T) == 8);
using mbuf_t = typename impl::buffer_load_trait<8, T>::payload_t; using mbuf_t = typename impl::buffer_load_trait<8, T>::payload_t;
if constexpr(pre_nop) const BR br{res};
asm volatile("s_nop 4\n" reinterpret_cast<mbuf_t&>(value) = __builtin_amdgcn_raw_buffer_load_b64(br.opaque, v_offset, s_offset, 0);
"buffer_load_dwordx2 %0, %1, %2, 0 offen offset:%3"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
else
asm volatile("buffer_load_dwordx2 %0, %1, %2, 0 offen offset:%3"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
} }
}; };
...@@ -123,24 +113,15 @@ struct buffer_load<4, pre_nop> ...@@ -123,24 +113,15 @@ struct buffer_load<4, pre_nop>
CK_TILE_DEVICE void operator()(T& value, CK_TILE_DEVICE void operator()(T& value,
int32x4_t res /*buffer resource*/, int32x4_t res /*buffer resource*/,
index_t v_offset, index_t v_offset,
index_t /*s_offset*/, index_t s_offset,
index_t i_offset /*max 0xFFF*/, index_t /*max 0xFFF*/,
index_t /*flag*/ = 0, index_t /*flag*/ = 0,
bool_constant<pre_nop> = {}) bool_constant<pre_nop> = {})
{ {
static_assert(sizeof(T) == 4); static_assert(sizeof(T) == 4);
using mbuf_t = typename impl::buffer_load_trait<4, T>::payload_t; using mbuf_t = typename impl::buffer_load_trait<4, T>::payload_t;
if constexpr(pre_nop) const BR br{res};
asm volatile("s_nop 4\n" reinterpret_cast<mbuf_t&>(value) = __builtin_amdgcn_raw_buffer_load_b32(br.opaque, v_offset, s_offset, 0);
"buffer_load_dword %0, %1, %2, 0 offen offset:%3"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
else
asm volatile("buffer_load_dword %0, %1, %2, 0 offen offset:%3"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
} }
}; };
...@@ -151,24 +132,15 @@ struct buffer_load<2, pre_nop> ...@@ -151,24 +132,15 @@ struct buffer_load<2, pre_nop>
CK_TILE_DEVICE void operator()(T& value, CK_TILE_DEVICE void operator()(T& value,
int32x4_t res /*buffer resource*/, int32x4_t res /*buffer resource*/,
index_t v_offset, index_t v_offset,
index_t /*s_offset*/, index_t s_offset,
index_t i_offset /*max 0xFFF*/, index_t /*max 0xFFF*/,
index_t /*flag*/ = 0, index_t /*flag*/ = 0,
bool_constant<pre_nop> = {}) bool_constant<pre_nop> = {})
{ {
static_assert(sizeof(T) == 4); // subdword is buggy, use dword buf and convert manually static_assert(sizeof(T) == 4); // subdword is buggy, use dword buf and convert manually
using mbuf_t = typename impl::buffer_load_trait<2, T>::payload_t; using mbuf_t = typename impl::buffer_load_trait<2, T>::payload_t;
if constexpr(pre_nop) const BR br{res};
asm volatile("s_nop 4\n" reinterpret_cast<mbuf_t&>(value) = __builtin_amdgcn_raw_buffer_load_b16(br.opaque, v_offset, s_offset, 0);
"buffer_load_ushort %0, %1, %2, 0 offen offset:%3"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
else
asm volatile("buffer_load_ushort %0, %1, %2, 0 offen offset:%3"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
} }
}; };
...@@ -179,195 +151,42 @@ struct buffer_load<1, pre_nop> ...@@ -179,195 +151,42 @@ struct buffer_load<1, pre_nop>
CK_TILE_DEVICE void operator()(T& value, CK_TILE_DEVICE void operator()(T& value,
int32x4_t res /*buffer resource*/, int32x4_t res /*buffer resource*/,
index_t v_offset, index_t v_offset,
index_t /*s_offset*/, index_t s_offset,
index_t i_offset /*max 0xFFF*/, index_t /*max 0xFFF*/,
index_t /*flag*/ = 0, index_t /*flag*/ = 0,
bool_constant<pre_nop> = {}) bool_constant<pre_nop> = {})
{ {
static_assert(sizeof(T) == 4); static_assert(sizeof(T) == 4);
using mbuf_t = typename impl::buffer_load_trait<1, T>::payload_t; using mbuf_t = typename impl::buffer_load_trait<1, T>::payload_t;
if constexpr(pre_nop) const BR br{res};
asm volatile("s_nop 4\n" reinterpret_cast<mbuf_t&>(value) = __builtin_amdgcn_raw_buffer_load_b16(br.opaque, v_offset, s_offset, 0);
"buffer_load_ubyte %0, %1, %2, 0 offen offset:%3"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
else
asm volatile("buffer_load_ubyte %0, %1, %2, 0 offen offset:%3"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
} }
}; };
template <index_t bytes, bool pre_nop = false> template <index_t bytes, bool pre_nop = false>
struct buffer_load_if; struct buffer_load_if {
template <bool pre_nop>
struct buffer_load_if<16, pre_nop>
{
template <typename T> template <typename T>
CK_TILE_DEVICE void operator()(T& value, CK_TILE_DEVICE void operator()(T& value,
int32x4_t res /*buffer resource*/, int32x4_t res /*buffer resource*/,
index_t v_offset, index_t v_offset,
index_t /*s_offset*/, index_t s_offset,
index_t i_offset /*max 0xFFF*/, index_t i_offset /*max 0xFFF*/,
index_t flag = 0, index_t flag = 0,
bool_constant<pre_nop> = {}) bool_constant<pre_nop> = {})
{ {
static_assert(sizeof(T) == 16); static_assert(sizeof(T) == 16);
auto saved_exec = __builtin_amdgcn_read_exec(); if LIKELY(1 <= flag) {
using mbuf_t = typename impl::buffer_load_trait<16, T>::payload_t; buffer_load<bytes, pre_nop>{}(value,
static_assert(sizeof(mbuf_t) == sizeof(T)); res,
if constexpr(pre_nop) v_offset,
asm volatile("s_nop 4\n" s_offset,
"v_cmpx_le_u32 exec, 1, %4\n" i_offset,
"buffer_load_dwordx4 %0, %1, %2, 0 offen offset:%3\n" flag,
"s_mov_b64 exec %5" bool_constant<pre_nop>{});
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
: "memory");
else
asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
"buffer_load_dwordx4 %0, %1, %2, 0 offen offset:%3\n"
"s_mov_b64 exec %5"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
: "memory");
}
};
template <bool pre_nop>
struct buffer_load_if<8, pre_nop>
{
template <typename T>
CK_TILE_DEVICE void operator()(T& value,
int32x4_t res /*buffer resource*/,
index_t v_offset,
index_t /*s_offset*/,
index_t i_offset /*max 0xFFF*/,
index_t flag = 0,
bool_constant<pre_nop> = {})
{
static_assert(sizeof(T) == 8);
auto saved_exec = __builtin_amdgcn_read_exec();
using mbuf_t = typename impl::buffer_load_trait<8, T>::payload_t;
if constexpr(pre_nop)
asm volatile("s_nop 4\n"
"v_cmpx_le_u32 exec, 1, %4\n"
"buffer_load_dwordx2 %0, %1, %2, 0 offen offset:%3\n"
"s_mov_b64 exec %5"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
: "memory");
else
asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
"buffer_load_dwordx2 %0, %1, %2, 0 offen offset:%3\n"
"s_mov_b64 exec %5"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
: "memory");
}
};
template <bool pre_nop>
struct buffer_load_if<4, pre_nop>
{
template <typename T>
CK_TILE_DEVICE void operator()(T& value,
int32x4_t res /*buffer resource*/,
index_t v_offset,
index_t /*s_offset*/,
index_t i_offset /*max 0xFFF*/,
index_t flag = 0,
bool_constant<pre_nop> = {})
{
static_assert(sizeof(T) == 4);
auto saved_exec = __builtin_amdgcn_read_exec();
using mbuf_t = typename impl::buffer_load_trait<4, T>::payload_t;
if constexpr(pre_nop)
asm volatile("s_nop 4\n"
"v_cmpx_le_u32 exec, 1, %4\n"
"buffer_load_dword %0, %1, %2, 0 offen offset:%3\n"
"s_mov_b64 exec %5"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
: "memory");
else
asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
"buffer_load_dword %0, %1, %2, 0 offen offset:%3\n"
"s_mov_b64 exec %5"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
: "memory");
} }
};
template <bool pre_nop>
struct buffer_load_if<2, pre_nop>
{
template <typename T>
CK_TILE_DEVICE void operator()(T& value,
int32x4_t res /*buffer resource*/,
index_t v_offset,
index_t /*s_offset*/,
index_t i_offset /*max 0xFFF*/,
index_t flag = 0,
bool_constant<pre_nop> = {})
{
static_assert(sizeof(T) == 4);
auto saved_exec = __builtin_amdgcn_read_exec();
using mbuf_t = typename impl::buffer_load_trait<2, T>::payload_t;
if constexpr(pre_nop)
asm volatile("s_nop 4\n"
"v_cmpx_le_u32 exec, 1, %4\n"
"buffer_load_ushort %0, %1, %2, 0 offen offset:%3\n"
"s_mov_b64 exec %5"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
: "memory");
else
asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
"buffer_load_ushort %0, %1, %2, 0 offen offset:%3\n"
"s_mov_b64 exec %5"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
: "memory");
} }
}; };
template <bool pre_nop>
struct buffer_load_if<1, pre_nop>
{
template <typename T>
CK_TILE_DEVICE void operator()(T& value,
int32x4_t res /*buffer resource*/,
index_t v_offset,
index_t /*s_offset*/,
index_t i_offset /*max 0xFFF*/,
index_t flag = 0,
bool_constant<pre_nop> = {})
{
static_assert(sizeof(T) == 4);
auto saved_exec = __builtin_amdgcn_read_exec();
using mbuf_t = typename impl::buffer_load_trait<1, T>::payload_t;
if constexpr(pre_nop)
asm volatile("s_nop 4\n"
"v_cmpx_le_u32 exec, 1, %4\n"
"buffer_load_ubyte %0, %1, %2, 0 offen offset:%3\n"
"s_mov_b64 exec %5"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
: "memory");
else
asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
"buffer_load_ubyte %0, %1, %2, 0 offen offset:%3\n"
"s_mov_b64 exec %5"
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
: "memory");
}
};
#pragma clang diagnostic pop // "-Wundefined-reinterpret-cast" #pragma clang diagnostic pop // "-Wundefined-reinterpret-cast"
template <index_t bytes> template <index_t bytes>
struct buffer_store; struct buffer_store;
...@@ -379,16 +198,14 @@ struct buffer_store<16> ...@@ -379,16 +198,14 @@ struct buffer_store<16>
CK_TILE_DEVICE void operator()(const T& value, CK_TILE_DEVICE void operator()(const T& value,
int32x4_t res /*buffer resource*/, int32x4_t res /*buffer resource*/,
index_t v_offset, index_t v_offset,
index_t /*s_offset*/, index_t s_offset,
index_t i_offset /*max 0xFFF*/, index_t /*max 0xFFF*/,
index_t /*flag*/ = 1) index_t /*flag*/ = 1)
{ {
static_assert(sizeof(T) == 16); static_assert(sizeof(T) == 16);
using mbuf_t = fp32x4_t; using mbuf_t = fp32x4_t;
asm volatile("buffer_store_dwordx4 %0, %1, %2, 0 offen offset:%3" const BR br{res};
: __builtin_amdgcn_raw_buffer_store_b128(static_cast<mbuf_t>(value), br.opaque, v_offset, s_offset, 0);
: "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
} }
}; };
...@@ -399,16 +216,14 @@ struct buffer_store<8> ...@@ -399,16 +216,14 @@ struct buffer_store<8>
CK_TILE_DEVICE void operator()(const T& value, CK_TILE_DEVICE void operator()(const T& value,
int32x4_t res /*buffer resource*/, int32x4_t res /*buffer resource*/,
index_t v_offset, index_t v_offset,
index_t /*s_offset*/, index_t s_offset,
index_t i_offset /*max 0xFFF*/, index_t /*max 0xFFF*/,
index_t /*flag*/ = 1) index_t /*flag*/ = 1)
{ {
static_assert(sizeof(T) == 8); static_assert(sizeof(T) == 8);
using mbuf_t = fp32x2_t; using mbuf_t = fp32x2_t;
asm volatile("buffer_store_dwordx2 %0, %1, %2, 0 offen offset:%3" const BR br{res};
: __builtin_amdgcn_raw_buffer_store_b64(__builtin_bit_cast(mbuf_t, value), br.opaque, v_offset, s_offset, 0);
: "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
} }
}; };
...@@ -419,16 +234,14 @@ struct buffer_store<4> ...@@ -419,16 +234,14 @@ struct buffer_store<4>
CK_TILE_DEVICE void operator()(const T& value, CK_TILE_DEVICE void operator()(const T& value,
int32x4_t res /*buffer resource*/, int32x4_t res /*buffer resource*/,
index_t v_offset, index_t v_offset,
index_t /*s_offset*/, index_t s_offset,
index_t i_offset /*max 0xFFF*/, index_t /*max 0xFFF*/,
index_t /*flag*/ = 1) index_t /*flag*/ = 1)
{ {
static_assert(sizeof(T) == 4); static_assert(sizeof(T) == 4);
using mbuf_t = float; using mbuf_t = float;
asm volatile("buffer_store_dword %0, %1, %2, 0 offen offset:%3" const BR br{res};
: __builtin_amdgcn_raw_buffer_store_b32(static_cast<mbuf_t>(value), br.opaque, v_offset, s_offset, 0);
: "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
} }
}; };
...@@ -439,16 +252,14 @@ struct buffer_store<2> ...@@ -439,16 +252,14 @@ struct buffer_store<2>
CK_TILE_DEVICE void operator()(const T& value, CK_TILE_DEVICE void operator()(const T& value,
int32x4_t res /*buffer resource*/, int32x4_t res /*buffer resource*/,
index_t v_offset, index_t v_offset,
index_t /*s_offset*/, index_t s_offset,
index_t i_offset /*max 0xFFF*/, index_t /*max 0xFFF*/,
index_t /*flag*/ = 1) index_t /*flag*/ = 1)
{ {
static_assert(sizeof(T) == 2); static_assert(sizeof(T) == 2);
using mbuf_t = short; using mbuf_t = short;
asm volatile("buffer_store_short %0, %1, %2, 0 offen offset:%3" const BR br{res};
: __builtin_amdgcn_raw_buffer_store_b16(__builtin_bit_cast(mbuf_t, value), br.opaque, v_offset, s_offset, 0);
: "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
} }
}; };
...@@ -459,160 +270,35 @@ struct buffer_store<1> ...@@ -459,160 +270,35 @@ struct buffer_store<1>
CK_TILE_DEVICE void operator()(const T& value, CK_TILE_DEVICE void operator()(const T& value,
int32x4_t res /*buffer resource*/, int32x4_t res /*buffer resource*/,
index_t v_offset, index_t v_offset,
index_t /*s_offset*/, index_t s_offset,
index_t i_offset /*max 0xFFF*/, index_t /*max 0xFFF*/,
index_t /*flag*/ = 1) index_t /*flag*/ = 1)
{ {
static_assert(sizeof(T) == 4); static_assert(sizeof(T) == 4);
using mbuf_t = float; using mbuf_t = float;
asm volatile("buffer_store_byte %0, %1, %2, 0 offen offset:%3" const BR br{res};
: __builtin_amdgcn_raw_buffer_store_b8(static_cast<mbuf_t>(value), br.opaque, v_offset, s_offset, 0);
: "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
} }
}; };
template <index_t bytes> template <index_t bytes>
struct buffer_store_if; struct buffer_store_if
template <>
struct buffer_store_if<16>
{
template <typename T>
CK_TILE_DEVICE void operator()(const T& value,
int32x4_t res /*buffer resource*/,
index_t v_offset,
index_t /*s_offset*/,
index_t i_offset /*max 0xFFF*/,
index_t flag = 1)
{
static_assert(sizeof(T) == 16);
auto save_exec = __builtin_amdgcn_read_exec();
using mbuf_t = fp32x4_t;
asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
"buffer_store_dwordx4 %0, %1, %2, 0 offen offset:%3\n"
"s_mov_b64 exec %5"
:
: "v"(bit_cast<mbuf_t>(value)),
"v"(v_offset),
"s"(res),
"n"(i_offset),
"v"(flag),
"s"(save_exec)
: "memory");
}
};
template <>
struct buffer_store_if<8>
{
template <typename T>
CK_TILE_DEVICE void operator()(const T& value,
int32x4_t res /*buffer resource*/,
index_t v_offset,
index_t /*s_offset*/,
index_t i_offset /*max 0xFFF*/,
index_t flag = 1)
{
static_assert(sizeof(T) == 8);
auto save_exec = __builtin_amdgcn_read_exec();
// TODO: ugly. rocm-6.0/6.1 seems neet bit_cast to same base type to avoid scratch
using mbuf_t = ext_vector_t<typename T::value_type, T::size()>;
asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
"buffer_store_dwordx2 %0, %1, %2, 0 offen offset:%3\n"
"s_mov_b64 exec %5"
:
: "v"(bit_cast<mbuf_t>(value)),
"v"(v_offset),
"s"(res),
"n"(i_offset),
"v"(flag),
"s"(save_exec)
: "memory");
}
};
template <>
struct buffer_store_if<4>
{
template <typename T>
CK_TILE_DEVICE void operator()(const T& value,
int32x4_t res /*buffer resource*/,
index_t v_offset,
index_t /*s_offset*/,
index_t i_offset /*max 0xFFF*/,
index_t flag = 1)
{
static_assert(sizeof(T) == 4);
auto save_exec = __builtin_amdgcn_read_exec();
using mbuf_t = float;
asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
"buffer_store_dword %0, %1, %2, 0 offen offset:%3\n"
"s_mov_b64 exec %5"
:
: "v"(bit_cast<mbuf_t>(value)),
"v"(v_offset),
"s"(res),
"n"(i_offset),
"v"(flag),
"s"(save_exec)
: "memory");
}
};
template <>
struct buffer_store_if<2>
{ {
template <typename T> template <typename T>
CK_TILE_DEVICE void operator()(const T& value, CK_TILE_DEVICE void operator()(const T& value,
int32x4_t res /*buffer resource*/, int32x4_t res /*buffer resource*/,
index_t v_offset, index_t v_offset,
index_t /*s_offset*/, index_t s_offset,
index_t i_offset /*max 0xFFF*/, index_t i_offset /*max 0xFFF*/,
index_t flag = 1) index_t flag = 1)
{ {
static_assert(sizeof(T) == 2); if LIKELY(1 <= flag) {
auto save_exec = __builtin_amdgcn_read_exec(); buffer_store<bytes>{}(value,
using mbuf_t = short; res,
asm volatile("v_cmpx_le_u32 exec, 1, %4\n" v_offset,
"buffer_store_short %0, %1, %2, 0 offen offset:%3\n" s_offset,
"s_mov_b64 exec %5" i_offset);
:
: "v"(bit_cast<mbuf_t>(value)),
"v"(v_offset),
"s"(res),
"n"(i_offset),
"v"(flag),
"s"(save_exec)
: "memory");
} }
};
template <>
struct buffer_store_if<1>
{
template <typename T>
CK_TILE_DEVICE void operator()(const T& value,
int32x4_t res /*buffer resource*/,
index_t v_offset,
index_t /*s_offset*/,
index_t i_offset /*max 0xFFF*/,
index_t flag = 1)
{
static_assert(sizeof(T) == 4);
auto save_exec = __builtin_amdgcn_read_exec();
using mbuf_t = float;
asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
"buffer_store_byte %0, %1, %2, 0 offen offset:%3\n"
"s_mov_b64 exec %5"
:
: "v"(bit_cast<mbuf_t>(value)),
"v"(v_offset),
"s"(res),
"n"(i_offset),
"v"(flag),
"s"(save_exec)
: "memory");
} }
}; };
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment