Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
ae7cef7c
Commit
ae7cef7c
authored
Jan 16, 2025
by
illsilin
Browse files
replace inline assembly with builtins in FHMA
parent
1ff50e78
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
68 additions
and
382 deletions
+68
-382
include/ck_tile/core/arch/amd_buffer_addressing.hpp
include/ck_tile/core/arch/amd_buffer_addressing.hpp
+68
-382
No files found.
include/ck_tile/core/arch/amd_buffer_addressing.hpp
View file @
ae7cef7c
...
@@ -12,6 +12,8 @@
...
@@ -12,6 +12,8 @@
#include "ck_tile/core/utility/bit_cast.hpp"
#include "ck_tile/core/utility/bit_cast.hpp"
#include "ck_tile/core/utility/functional.hpp"
#include "ck_tile/core/utility/functional.hpp"
#define LIKELY(x) (__builtin_expect(!!(x), 1))
namespace
ck_tile
{
namespace
ck_tile
{
// 128 bit SGPRs to supply buffer resource in buffer instructions
// 128 bit SGPRs to supply buffer resource in buffer instructions
...
@@ -60,6 +62,12 @@ struct buffer_load;
...
@@ -60,6 +62,12 @@ struct buffer_load;
#pragma clang diagnostic ignored "-Wundefined-reinterpret-cast"
#pragma clang diagnostic ignored "-Wundefined-reinterpret-cast"
// TODO: strict aliasing rule seems fail when reinterpret_cast between vector type
// TODO: strict aliasing rule seems fail when reinterpret_cast between vector type
// (exp_vector_type(xxx))
// (exp_vector_type(xxx))
union
BR
{
int32x4_t
res
;
__amdgpu_buffer_rsrc_t
opaque
;
};
template
<
bool
pre_nop
>
template
<
bool
pre_nop
>
struct
buffer_load
<
16
,
pre_nop
>
struct
buffer_load
<
16
,
pre_nop
>
{
{
...
@@ -67,24 +75,15 @@ struct buffer_load<16, pre_nop>
...
@@ -67,24 +75,15 @@ struct buffer_load<16, pre_nop>
CK_TILE_DEVICE
void
operator
()(
T
&
value
,
CK_TILE_DEVICE
void
operator
()(
T
&
value
,
int32x4_t
res
/*buffer resource*/
,
int32x4_t
res
/*buffer resource*/
,
index_t
v_offset
,
index_t
v_offset
,
index_t
/*
s_offset
*/
,
index_t
s_offset
,
index_t
i_offset
/*max 0xFFF*/
,
index_t
/*max 0xFFF*/
,
index_t
/*flag*/
=
0
,
index_t
/*flag*/
=
0
,
bool_constant
<
pre_nop
>
=
{})
bool_constant
<
pre_nop
>
=
{})
{
{
static_assert
(
sizeof
(
T
)
==
16
);
static_assert
(
sizeof
(
T
)
==
16
);
using
mbuf_t
=
typename
impl
::
buffer_load_trait
<
16
,
T
>::
payload_t
;
using
mbuf_t
=
typename
impl
::
buffer_load_trait
<
16
,
T
>::
payload_t
;
if
constexpr
(
pre_nop
)
const
BR
br
{
res
};
asm
volatile
(
"s_nop 4
\n
"
reinterpret_cast
<
mbuf_t
&>
(
value
)
=
__builtin_amdgcn_raw_buffer_load_b128
(
br
.
opaque
,
v_offset
,
s_offset
,
0
);
"buffer_load_dwordx4 %0, %1, %2, 0 offen offset:%3"
:
"+v"
(
reinterpret_cast
<
mbuf_t
&>
(
value
))
:
"v"
(
v_offset
),
"s"
(
res
),
"n"
(
i_offset
)
:
"memory"
);
else
asm
volatile
(
"buffer_load_dwordx4 %0, %1, %2, 0 offen offset:%3"
:
"+v"
(
reinterpret_cast
<
mbuf_t
&>
(
value
))
:
"v"
(
v_offset
),
"s"
(
res
),
"n"
(
i_offset
)
:
"memory"
);
}
}
};
};
...
@@ -95,24 +94,15 @@ struct buffer_load<8, pre_nop>
...
@@ -95,24 +94,15 @@ struct buffer_load<8, pre_nop>
CK_TILE_DEVICE
void
operator
()(
T
&
value
,
CK_TILE_DEVICE
void
operator
()(
T
&
value
,
int32x4_t
res
/*buffer resource*/
,
int32x4_t
res
/*buffer resource*/
,
index_t
v_offset
,
index_t
v_offset
,
index_t
/*
s_offset
*/
,
index_t
s_offset
,
index_t
i_offset
/*max 0xFFF*/
,
index_t
/*max 0xFFF*/
,
index_t
/*flag*/
=
0
,
index_t
/*flag*/
=
0
,
bool_constant
<
pre_nop
>
=
{})
bool_constant
<
pre_nop
>
=
{})
{
{
static_assert
(
sizeof
(
T
)
==
8
);
static_assert
(
sizeof
(
T
)
==
8
);
using
mbuf_t
=
typename
impl
::
buffer_load_trait
<
8
,
T
>::
payload_t
;
using
mbuf_t
=
typename
impl
::
buffer_load_trait
<
8
,
T
>::
payload_t
;
if
constexpr
(
pre_nop
)
const
BR
br
{
res
};
asm
volatile
(
"s_nop 4
\n
"
reinterpret_cast
<
mbuf_t
&>
(
value
)
=
__builtin_amdgcn_raw_buffer_load_b64
(
br
.
opaque
,
v_offset
,
s_offset
,
0
);
"buffer_load_dwordx2 %0, %1, %2, 0 offen offset:%3"
:
"+v"
(
reinterpret_cast
<
mbuf_t
&>
(
value
))
:
"v"
(
v_offset
),
"s"
(
res
),
"n"
(
i_offset
)
:
"memory"
);
else
asm
volatile
(
"buffer_load_dwordx2 %0, %1, %2, 0 offen offset:%3"
:
"+v"
(
reinterpret_cast
<
mbuf_t
&>
(
value
))
:
"v"
(
v_offset
),
"s"
(
res
),
"n"
(
i_offset
)
:
"memory"
);
}
}
};
};
...
@@ -123,24 +113,15 @@ struct buffer_load<4, pre_nop>
...
@@ -123,24 +113,15 @@ struct buffer_load<4, pre_nop>
CK_TILE_DEVICE
void
operator
()(
T
&
value
,
CK_TILE_DEVICE
void
operator
()(
T
&
value
,
int32x4_t
res
/*buffer resource*/
,
int32x4_t
res
/*buffer resource*/
,
index_t
v_offset
,
index_t
v_offset
,
index_t
/*
s_offset
*/
,
index_t
s_offset
,
index_t
i_offset
/*max 0xFFF*/
,
index_t
/*max 0xFFF*/
,
index_t
/*flag*/
=
0
,
index_t
/*flag*/
=
0
,
bool_constant
<
pre_nop
>
=
{})
bool_constant
<
pre_nop
>
=
{})
{
{
static_assert
(
sizeof
(
T
)
==
4
);
static_assert
(
sizeof
(
T
)
==
4
);
using
mbuf_t
=
typename
impl
::
buffer_load_trait
<
4
,
T
>::
payload_t
;
using
mbuf_t
=
typename
impl
::
buffer_load_trait
<
4
,
T
>::
payload_t
;
if
constexpr
(
pre_nop
)
const
BR
br
{
res
};
asm
volatile
(
"s_nop 4
\n
"
reinterpret_cast
<
mbuf_t
&>
(
value
)
=
__builtin_amdgcn_raw_buffer_load_b32
(
br
.
opaque
,
v_offset
,
s_offset
,
0
);
"buffer_load_dword %0, %1, %2, 0 offen offset:%3"
:
"+v"
(
reinterpret_cast
<
mbuf_t
&>
(
value
))
:
"v"
(
v_offset
),
"s"
(
res
),
"n"
(
i_offset
)
:
"memory"
);
else
asm
volatile
(
"buffer_load_dword %0, %1, %2, 0 offen offset:%3"
:
"+v"
(
reinterpret_cast
<
mbuf_t
&>
(
value
))
:
"v"
(
v_offset
),
"s"
(
res
),
"n"
(
i_offset
)
:
"memory"
);
}
}
};
};
...
@@ -151,24 +132,15 @@ struct buffer_load<2, pre_nop>
...
@@ -151,24 +132,15 @@ struct buffer_load<2, pre_nop>
CK_TILE_DEVICE
void
operator
()(
T
&
value
,
CK_TILE_DEVICE
void
operator
()(
T
&
value
,
int32x4_t
res
/*buffer resource*/
,
int32x4_t
res
/*buffer resource*/
,
index_t
v_offset
,
index_t
v_offset
,
index_t
/*
s_offset
*/
,
index_t
s_offset
,
index_t
i_offset
/*max 0xFFF*/
,
index_t
/*max 0xFFF*/
,
index_t
/*flag*/
=
0
,
index_t
/*flag*/
=
0
,
bool_constant
<
pre_nop
>
=
{})
bool_constant
<
pre_nop
>
=
{})
{
{
static_assert
(
sizeof
(
T
)
==
4
);
// subdword is buggy, use dword buf and convert manually
static_assert
(
sizeof
(
T
)
==
4
);
// subdword is buggy, use dword buf and convert manually
using
mbuf_t
=
typename
impl
::
buffer_load_trait
<
2
,
T
>::
payload_t
;
using
mbuf_t
=
typename
impl
::
buffer_load_trait
<
2
,
T
>::
payload_t
;
if
constexpr
(
pre_nop
)
const
BR
br
{
res
};
asm
volatile
(
"s_nop 4
\n
"
reinterpret_cast
<
mbuf_t
&>
(
value
)
=
__builtin_amdgcn_raw_buffer_load_b16
(
br
.
opaque
,
v_offset
,
s_offset
,
0
);
"buffer_load_ushort %0, %1, %2, 0 offen offset:%3"
:
"+v"
(
reinterpret_cast
<
mbuf_t
&>
(
value
))
:
"v"
(
v_offset
),
"s"
(
res
),
"n"
(
i_offset
)
:
"memory"
);
else
asm
volatile
(
"buffer_load_ushort %0, %1, %2, 0 offen offset:%3"
:
"+v"
(
reinterpret_cast
<
mbuf_t
&>
(
value
))
:
"v"
(
v_offset
),
"s"
(
res
),
"n"
(
i_offset
)
:
"memory"
);
}
}
};
};
...
@@ -179,195 +151,42 @@ struct buffer_load<1, pre_nop>
...
@@ -179,195 +151,42 @@ struct buffer_load<1, pre_nop>
CK_TILE_DEVICE
void
operator
()(
T
&
value
,
CK_TILE_DEVICE
void
operator
()(
T
&
value
,
int32x4_t
res
/*buffer resource*/
,
int32x4_t
res
/*buffer resource*/
,
index_t
v_offset
,
index_t
v_offset
,
index_t
/*
s_offset
*/
,
index_t
s_offset
,
index_t
i_offset
/*max 0xFFF*/
,
index_t
/*max 0xFFF*/
,
index_t
/*flag*/
=
0
,
index_t
/*flag*/
=
0
,
bool_constant
<
pre_nop
>
=
{})
bool_constant
<
pre_nop
>
=
{})
{
{
static_assert
(
sizeof
(
T
)
==
4
);
static_assert
(
sizeof
(
T
)
==
4
);
using
mbuf_t
=
typename
impl
::
buffer_load_trait
<
1
,
T
>::
payload_t
;
using
mbuf_t
=
typename
impl
::
buffer_load_trait
<
1
,
T
>::
payload_t
;
if
constexpr
(
pre_nop
)
const
BR
br
{
res
};
asm
volatile
(
"s_nop 4
\n
"
reinterpret_cast
<
mbuf_t
&>
(
value
)
=
__builtin_amdgcn_raw_buffer_load_b16
(
br
.
opaque
,
v_offset
,
s_offset
,
0
);
"buffer_load_ubyte %0, %1, %2, 0 offen offset:%3"
:
"+v"
(
reinterpret_cast
<
mbuf_t
&>
(
value
))
:
"v"
(
v_offset
),
"s"
(
res
),
"n"
(
i_offset
)
:
"memory"
);
else
asm
volatile
(
"buffer_load_ubyte %0, %1, %2, 0 offen offset:%3"
:
"+v"
(
reinterpret_cast
<
mbuf_t
&>
(
value
))
:
"v"
(
v_offset
),
"s"
(
res
),
"n"
(
i_offset
)
:
"memory"
);
}
}
};
};
template
<
index_t
bytes
,
bool
pre_nop
=
false
>
template
<
index_t
bytes
,
bool
pre_nop
=
false
>
struct
buffer_load_if
;
struct
buffer_load_if
{
template
<
bool
pre_nop
>
struct
buffer_load_if
<
16
,
pre_nop
>
{
template
<
typename
T
>
template
<
typename
T
>
CK_TILE_DEVICE
void
operator
()(
T
&
value
,
CK_TILE_DEVICE
void
operator
()(
T
&
value
,
int32x4_t
res
/*buffer resource*/
,
int32x4_t
res
/*buffer resource*/
,
index_t
v_offset
,
index_t
v_offset
,
index_t
/*
s_offset
*/
,
index_t
s_offset
,
index_t
i_offset
/*max 0xFFF*/
,
index_t
i_offset
/*max 0xFFF*/
,
index_t
flag
=
0
,
index_t
flag
=
0
,
bool_constant
<
pre_nop
>
=
{})
bool_constant
<
pre_nop
>
=
{})
{
{
static_assert
(
sizeof
(
T
)
==
16
);
static_assert
(
sizeof
(
T
)
==
16
);
auto
saved_exec
=
__builtin_amdgcn_read_exec
();
if
LIKELY
(
1
<=
flag
)
{
using
mbuf_t
=
typename
impl
::
buffer_load_trait
<
16
,
T
>::
payload_t
;
buffer_load
<
bytes
,
pre_nop
>
{}(
value
,
static_assert
(
sizeof
(
mbuf_t
)
==
sizeof
(
T
));
res
,
if
constexpr
(
pre_nop
)
v_offset
,
asm
volatile
(
"s_nop 4
\n
"
s_offset
,
"v_cmpx_le_u32 exec, 1, %4
\n
"
i_offset
,
"buffer_load_dwordx4 %0, %1, %2, 0 offen offset:%3
\n
"
flag
,
"s_mov_b64 exec %5"
bool_constant
<
pre_nop
>
{});
:
"+v"
(
reinterpret_cast
<
mbuf_t
&>
(
value
))
:
"v"
(
v_offset
),
"s"
(
res
),
"n"
(
i_offset
),
"v"
(
flag
),
"s"
(
saved_exec
)
:
"memory"
);
else
asm
volatile
(
"v_cmpx_le_u32 exec, 1, %4
\n
"
"buffer_load_dwordx4 %0, %1, %2, 0 offen offset:%3
\n
"
"s_mov_b64 exec %5"
:
"+v"
(
reinterpret_cast
<
mbuf_t
&>
(
value
))
:
"v"
(
v_offset
),
"s"
(
res
),
"n"
(
i_offset
),
"v"
(
flag
),
"s"
(
saved_exec
)
:
"memory"
);
}
};
template
<
bool
pre_nop
>
struct
buffer_load_if
<
8
,
pre_nop
>
{
template
<
typename
T
>
CK_TILE_DEVICE
void
operator
()(
T
&
value
,
int32x4_t
res
/*buffer resource*/
,
index_t
v_offset
,
index_t
/*s_offset*/
,
index_t
i_offset
/*max 0xFFF*/
,
index_t
flag
=
0
,
bool_constant
<
pre_nop
>
=
{})
{
static_assert
(
sizeof
(
T
)
==
8
);
auto
saved_exec
=
__builtin_amdgcn_read_exec
();
using
mbuf_t
=
typename
impl
::
buffer_load_trait
<
8
,
T
>::
payload_t
;
if
constexpr
(
pre_nop
)
asm
volatile
(
"s_nop 4
\n
"
"v_cmpx_le_u32 exec, 1, %4
\n
"
"buffer_load_dwordx2 %0, %1, %2, 0 offen offset:%3
\n
"
"s_mov_b64 exec %5"
:
"+v"
(
reinterpret_cast
<
mbuf_t
&>
(
value
))
:
"v"
(
v_offset
),
"s"
(
res
),
"n"
(
i_offset
),
"v"
(
flag
),
"s"
(
saved_exec
)
:
"memory"
);
else
asm
volatile
(
"v_cmpx_le_u32 exec, 1, %4
\n
"
"buffer_load_dwordx2 %0, %1, %2, 0 offen offset:%3
\n
"
"s_mov_b64 exec %5"
:
"+v"
(
reinterpret_cast
<
mbuf_t
&>
(
value
))
:
"v"
(
v_offset
),
"s"
(
res
),
"n"
(
i_offset
),
"v"
(
flag
),
"s"
(
saved_exec
)
:
"memory"
);
}
};
template
<
bool
pre_nop
>
struct
buffer_load_if
<
4
,
pre_nop
>
{
template
<
typename
T
>
CK_TILE_DEVICE
void
operator
()(
T
&
value
,
int32x4_t
res
/*buffer resource*/
,
index_t
v_offset
,
index_t
/*s_offset*/
,
index_t
i_offset
/*max 0xFFF*/
,
index_t
flag
=
0
,
bool_constant
<
pre_nop
>
=
{})
{
static_assert
(
sizeof
(
T
)
==
4
);
auto
saved_exec
=
__builtin_amdgcn_read_exec
();
using
mbuf_t
=
typename
impl
::
buffer_load_trait
<
4
,
T
>::
payload_t
;
if
constexpr
(
pre_nop
)
asm
volatile
(
"s_nop 4
\n
"
"v_cmpx_le_u32 exec, 1, %4
\n
"
"buffer_load_dword %0, %1, %2, 0 offen offset:%3
\n
"
"s_mov_b64 exec %5"
:
"+v"
(
reinterpret_cast
<
mbuf_t
&>
(
value
))
:
"v"
(
v_offset
),
"s"
(
res
),
"n"
(
i_offset
),
"v"
(
flag
),
"s"
(
saved_exec
)
:
"memory"
);
else
asm
volatile
(
"v_cmpx_le_u32 exec, 1, %4
\n
"
"buffer_load_dword %0, %1, %2, 0 offen offset:%3
\n
"
"s_mov_b64 exec %5"
:
"+v"
(
reinterpret_cast
<
mbuf_t
&>
(
value
))
:
"v"
(
v_offset
),
"s"
(
res
),
"n"
(
i_offset
),
"v"
(
flag
),
"s"
(
saved_exec
)
:
"memory"
);
}
}
};
template
<
bool
pre_nop
>
struct
buffer_load_if
<
2
,
pre_nop
>
{
template
<
typename
T
>
CK_TILE_DEVICE
void
operator
()(
T
&
value
,
int32x4_t
res
/*buffer resource*/
,
index_t
v_offset
,
index_t
/*s_offset*/
,
index_t
i_offset
/*max 0xFFF*/
,
index_t
flag
=
0
,
bool_constant
<
pre_nop
>
=
{})
{
static_assert
(
sizeof
(
T
)
==
4
);
auto
saved_exec
=
__builtin_amdgcn_read_exec
();
using
mbuf_t
=
typename
impl
::
buffer_load_trait
<
2
,
T
>::
payload_t
;
if
constexpr
(
pre_nop
)
asm
volatile
(
"s_nop 4
\n
"
"v_cmpx_le_u32 exec, 1, %4
\n
"
"buffer_load_ushort %0, %1, %2, 0 offen offset:%3
\n
"
"s_mov_b64 exec %5"
:
"+v"
(
reinterpret_cast
<
mbuf_t
&>
(
value
))
:
"v"
(
v_offset
),
"s"
(
res
),
"n"
(
i_offset
),
"v"
(
flag
),
"s"
(
saved_exec
)
:
"memory"
);
else
asm
volatile
(
"v_cmpx_le_u32 exec, 1, %4
\n
"
"buffer_load_ushort %0, %1, %2, 0 offen offset:%3
\n
"
"s_mov_b64 exec %5"
:
"+v"
(
reinterpret_cast
<
mbuf_t
&>
(
value
))
:
"v"
(
v_offset
),
"s"
(
res
),
"n"
(
i_offset
),
"v"
(
flag
),
"s"
(
saved_exec
)
:
"memory"
);
}
}
};
};
template
<
bool
pre_nop
>
struct
buffer_load_if
<
1
,
pre_nop
>
{
template
<
typename
T
>
CK_TILE_DEVICE
void
operator
()(
T
&
value
,
int32x4_t
res
/*buffer resource*/
,
index_t
v_offset
,
index_t
/*s_offset*/
,
index_t
i_offset
/*max 0xFFF*/
,
index_t
flag
=
0
,
bool_constant
<
pre_nop
>
=
{})
{
static_assert
(
sizeof
(
T
)
==
4
);
auto
saved_exec
=
__builtin_amdgcn_read_exec
();
using
mbuf_t
=
typename
impl
::
buffer_load_trait
<
1
,
T
>::
payload_t
;
if
constexpr
(
pre_nop
)
asm
volatile
(
"s_nop 4
\n
"
"v_cmpx_le_u32 exec, 1, %4
\n
"
"buffer_load_ubyte %0, %1, %2, 0 offen offset:%3
\n
"
"s_mov_b64 exec %5"
:
"+v"
(
reinterpret_cast
<
mbuf_t
&>
(
value
))
:
"v"
(
v_offset
),
"s"
(
res
),
"n"
(
i_offset
),
"v"
(
flag
),
"s"
(
saved_exec
)
:
"memory"
);
else
asm
volatile
(
"v_cmpx_le_u32 exec, 1, %4
\n
"
"buffer_load_ubyte %0, %1, %2, 0 offen offset:%3
\n
"
"s_mov_b64 exec %5"
:
"+v"
(
reinterpret_cast
<
mbuf_t
&>
(
value
))
:
"v"
(
v_offset
),
"s"
(
res
),
"n"
(
i_offset
),
"v"
(
flag
),
"s"
(
saved_exec
)
:
"memory"
);
}
};
#pragma clang diagnostic pop // "-Wundefined-reinterpret-cast"
#pragma clang diagnostic pop // "-Wundefined-reinterpret-cast"
template
<
index_t
bytes
>
template
<
index_t
bytes
>
struct
buffer_store
;
struct
buffer_store
;
...
@@ -379,16 +198,14 @@ struct buffer_store<16>
...
@@ -379,16 +198,14 @@ struct buffer_store<16>
CK_TILE_DEVICE
void
operator
()(
const
T
&
value
,
CK_TILE_DEVICE
void
operator
()(
const
T
&
value
,
int32x4_t
res
/*buffer resource*/
,
int32x4_t
res
/*buffer resource*/
,
index_t
v_offset
,
index_t
v_offset
,
index_t
/*
s_offset
*/
,
index_t
s_offset
,
index_t
i_offset
/*max 0xFFF*/
,
index_t
/*max 0xFFF*/
,
index_t
/*flag*/
=
1
)
index_t
/*flag*/
=
1
)
{
{
static_assert
(
sizeof
(
T
)
==
16
);
static_assert
(
sizeof
(
T
)
==
16
);
using
mbuf_t
=
fp32x4_t
;
using
mbuf_t
=
fp32x4_t
;
asm
volatile
(
"buffer_store_dwordx4 %0, %1, %2, 0 offen offset:%3"
const
BR
br
{
res
};
:
__builtin_amdgcn_raw_buffer_store_b128
(
static_cast
<
mbuf_t
>
(
value
),
br
.
opaque
,
v_offset
,
s_offset
,
0
);
:
"v"
(
bit_cast
<
mbuf_t
>
(
value
)),
"v"
(
v_offset
),
"s"
(
res
),
"n"
(
i_offset
)
:
"memory"
);
}
}
};
};
...
@@ -399,16 +216,14 @@ struct buffer_store<8>
...
@@ -399,16 +216,14 @@ struct buffer_store<8>
CK_TILE_DEVICE
void
operator
()(
const
T
&
value
,
CK_TILE_DEVICE
void
operator
()(
const
T
&
value
,
int32x4_t
res
/*buffer resource*/
,
int32x4_t
res
/*buffer resource*/
,
index_t
v_offset
,
index_t
v_offset
,
index_t
/*
s_offset
*/
,
index_t
s_offset
,
index_t
i_offset
/*max 0xFFF*/
,
index_t
/*max 0xFFF*/
,
index_t
/*flag*/
=
1
)
index_t
/*flag*/
=
1
)
{
{
static_assert
(
sizeof
(
T
)
==
8
);
static_assert
(
sizeof
(
T
)
==
8
);
using
mbuf_t
=
fp32x2_t
;
using
mbuf_t
=
fp32x2_t
;
asm
volatile
(
"buffer_store_dwordx2 %0, %1, %2, 0 offen offset:%3"
const
BR
br
{
res
};
:
__builtin_amdgcn_raw_buffer_store_b64
(
__builtin_bit_cast
(
mbuf_t
,
value
),
br
.
opaque
,
v_offset
,
s_offset
,
0
);
:
"v"
(
bit_cast
<
mbuf_t
>
(
value
)),
"v"
(
v_offset
),
"s"
(
res
),
"n"
(
i_offset
)
:
"memory"
);
}
}
};
};
...
@@ -419,16 +234,14 @@ struct buffer_store<4>
...
@@ -419,16 +234,14 @@ struct buffer_store<4>
CK_TILE_DEVICE
void
operator
()(
const
T
&
value
,
CK_TILE_DEVICE
void
operator
()(
const
T
&
value
,
int32x4_t
res
/*buffer resource*/
,
int32x4_t
res
/*buffer resource*/
,
index_t
v_offset
,
index_t
v_offset
,
index_t
/*
s_offset
*/
,
index_t
s_offset
,
index_t
i_offset
/*max 0xFFF*/
,
index_t
/*max 0xFFF*/
,
index_t
/*flag*/
=
1
)
index_t
/*flag*/
=
1
)
{
{
static_assert
(
sizeof
(
T
)
==
4
);
static_assert
(
sizeof
(
T
)
==
4
);
using
mbuf_t
=
float
;
using
mbuf_t
=
float
;
asm
volatile
(
"buffer_store_dword %0, %1, %2, 0 offen offset:%3"
const
BR
br
{
res
};
:
__builtin_amdgcn_raw_buffer_store_b32
(
static_cast
<
mbuf_t
>
(
value
),
br
.
opaque
,
v_offset
,
s_offset
,
0
);
:
"v"
(
bit_cast
<
mbuf_t
>
(
value
)),
"v"
(
v_offset
),
"s"
(
res
),
"n"
(
i_offset
)
:
"memory"
);
}
}
};
};
...
@@ -439,16 +252,14 @@ struct buffer_store<2>
...
@@ -439,16 +252,14 @@ struct buffer_store<2>
CK_TILE_DEVICE
void
operator
()(
const
T
&
value
,
CK_TILE_DEVICE
void
operator
()(
const
T
&
value
,
int32x4_t
res
/*buffer resource*/
,
int32x4_t
res
/*buffer resource*/
,
index_t
v_offset
,
index_t
v_offset
,
index_t
/*
s_offset
*/
,
index_t
s_offset
,
index_t
i_offset
/*max 0xFFF*/
,
index_t
/*max 0xFFF*/
,
index_t
/*flag*/
=
1
)
index_t
/*flag*/
=
1
)
{
{
static_assert
(
sizeof
(
T
)
==
2
);
static_assert
(
sizeof
(
T
)
==
2
);
using
mbuf_t
=
short
;
using
mbuf_t
=
short
;
asm
volatile
(
"buffer_store_short %0, %1, %2, 0 offen offset:%3"
const
BR
br
{
res
};
:
__builtin_amdgcn_raw_buffer_store_b16
(
__builtin_bit_cast
(
mbuf_t
,
value
),
br
.
opaque
,
v_offset
,
s_offset
,
0
);
:
"v"
(
bit_cast
<
mbuf_t
>
(
value
)),
"v"
(
v_offset
),
"s"
(
res
),
"n"
(
i_offset
)
:
"memory"
);
}
}
};
};
...
@@ -459,160 +270,35 @@ struct buffer_store<1>
...
@@ -459,160 +270,35 @@ struct buffer_store<1>
CK_TILE_DEVICE
void
operator
()(
const
T
&
value
,
CK_TILE_DEVICE
void
operator
()(
const
T
&
value
,
int32x4_t
res
/*buffer resource*/
,
int32x4_t
res
/*buffer resource*/
,
index_t
v_offset
,
index_t
v_offset
,
index_t
/*
s_offset
*/
,
index_t
s_offset
,
index_t
i_offset
/*max 0xFFF*/
,
index_t
/*max 0xFFF*/
,
index_t
/*flag*/
=
1
)
index_t
/*flag*/
=
1
)
{
{
static_assert
(
sizeof
(
T
)
==
4
);
static_assert
(
sizeof
(
T
)
==
4
);
using
mbuf_t
=
float
;
using
mbuf_t
=
float
;
asm
volatile
(
"buffer_store_byte %0, %1, %2, 0 offen offset:%3"
const
BR
br
{
res
};
:
__builtin_amdgcn_raw_buffer_store_b8
(
static_cast
<
mbuf_t
>
(
value
),
br
.
opaque
,
v_offset
,
s_offset
,
0
);
:
"v"
(
bit_cast
<
mbuf_t
>
(
value
)),
"v"
(
v_offset
),
"s"
(
res
),
"n"
(
i_offset
)
:
"memory"
);
}
}
};
};
template
<
index_t
bytes
>
template
<
index_t
bytes
>
struct
buffer_store_if
;
struct
buffer_store_if
template
<
>
struct
buffer_store_if
<
16
>
{
template
<
typename
T
>
CK_TILE_DEVICE
void
operator
()(
const
T
&
value
,
int32x4_t
res
/*buffer resource*/
,
index_t
v_offset
,
index_t
/*s_offset*/
,
index_t
i_offset
/*max 0xFFF*/
,
index_t
flag
=
1
)
{
static_assert
(
sizeof
(
T
)
==
16
);
auto
save_exec
=
__builtin_amdgcn_read_exec
();
using
mbuf_t
=
fp32x4_t
;
asm
volatile
(
"v_cmpx_le_u32 exec, 1, %4
\n
"
"buffer_store_dwordx4 %0, %1, %2, 0 offen offset:%3
\n
"
"s_mov_b64 exec %5"
:
:
"v"
(
bit_cast
<
mbuf_t
>
(
value
)),
"v"
(
v_offset
),
"s"
(
res
),
"n"
(
i_offset
),
"v"
(
flag
),
"s"
(
save_exec
)
:
"memory"
);
}
};
template
<
>
struct
buffer_store_if
<
8
>
{
template
<
typename
T
>
CK_TILE_DEVICE
void
operator
()(
const
T
&
value
,
int32x4_t
res
/*buffer resource*/
,
index_t
v_offset
,
index_t
/*s_offset*/
,
index_t
i_offset
/*max 0xFFF*/
,
index_t
flag
=
1
)
{
static_assert
(
sizeof
(
T
)
==
8
);
auto
save_exec
=
__builtin_amdgcn_read_exec
();
// TODO: ugly. rocm-6.0/6.1 seems neet bit_cast to same base type to avoid scratch
using
mbuf_t
=
ext_vector_t
<
typename
T
::
value_type
,
T
::
size
()
>
;
asm
volatile
(
"v_cmpx_le_u32 exec, 1, %4
\n
"
"buffer_store_dwordx2 %0, %1, %2, 0 offen offset:%3
\n
"
"s_mov_b64 exec %5"
:
:
"v"
(
bit_cast
<
mbuf_t
>
(
value
)),
"v"
(
v_offset
),
"s"
(
res
),
"n"
(
i_offset
),
"v"
(
flag
),
"s"
(
save_exec
)
:
"memory"
);
}
};
template
<
>
struct
buffer_store_if
<
4
>
{
template
<
typename
T
>
CK_TILE_DEVICE
void
operator
()(
const
T
&
value
,
int32x4_t
res
/*buffer resource*/
,
index_t
v_offset
,
index_t
/*s_offset*/
,
index_t
i_offset
/*max 0xFFF*/
,
index_t
flag
=
1
)
{
static_assert
(
sizeof
(
T
)
==
4
);
auto
save_exec
=
__builtin_amdgcn_read_exec
();
using
mbuf_t
=
float
;
asm
volatile
(
"v_cmpx_le_u32 exec, 1, %4
\n
"
"buffer_store_dword %0, %1, %2, 0 offen offset:%3
\n
"
"s_mov_b64 exec %5"
:
:
"v"
(
bit_cast
<
mbuf_t
>
(
value
)),
"v"
(
v_offset
),
"s"
(
res
),
"n"
(
i_offset
),
"v"
(
flag
),
"s"
(
save_exec
)
:
"memory"
);
}
};
template
<
>
struct
buffer_store_if
<
2
>
{
{
template
<
typename
T
>
template
<
typename
T
>
CK_TILE_DEVICE
void
operator
()(
const
T
&
value
,
CK_TILE_DEVICE
void
operator
()(
const
T
&
value
,
int32x4_t
res
/*buffer resource*/
,
int32x4_t
res
/*buffer resource*/
,
index_t
v_offset
,
index_t
v_offset
,
index_t
/*
s_offset
*/
,
index_t
s_offset
,
index_t
i_offset
/*max 0xFFF*/
,
index_t
i_offset
/*max 0xFFF*/
,
index_t
flag
=
1
)
index_t
flag
=
1
)
{
{
static_assert
(
sizeof
(
T
)
==
2
);
if
LIKELY
(
1
<=
flag
)
{
auto
save_exec
=
__builtin_amdgcn_read_exec
();
buffer_store
<
bytes
>
{}(
value
,
using
mbuf_t
=
short
;
res
,
asm
volatile
(
"v_cmpx_le_u32 exec, 1, %4
\n
"
v_offset
,
"buffer_store_short %0, %1, %2, 0 offen offset:%3
\n
"
s_offset
,
"s_mov_b64 exec %5"
i_offset
);
:
:
"v"
(
bit_cast
<
mbuf_t
>
(
value
)),
"v"
(
v_offset
),
"s"
(
res
),
"n"
(
i_offset
),
"v"
(
flag
),
"s"
(
save_exec
)
:
"memory"
);
}
}
};
template
<
>
struct
buffer_store_if
<
1
>
{
template
<
typename
T
>
CK_TILE_DEVICE
void
operator
()(
const
T
&
value
,
int32x4_t
res
/*buffer resource*/
,
index_t
v_offset
,
index_t
/*s_offset*/
,
index_t
i_offset
/*max 0xFFF*/
,
index_t
flag
=
1
)
{
static_assert
(
sizeof
(
T
)
==
4
);
auto
save_exec
=
__builtin_amdgcn_read_exec
();
using
mbuf_t
=
float
;
asm
volatile
(
"v_cmpx_le_u32 exec, 1, %4
\n
"
"buffer_store_byte %0, %1, %2, 0 offen offset:%3
\n
"
"s_mov_b64 exec %5"
:
:
"v"
(
bit_cast
<
mbuf_t
>
(
value
)),
"v"
(
v_offset
),
"s"
(
res
),
"n"
(
i_offset
),
"v"
(
flag
),
"s"
(
save_exec
)
:
"memory"
);
}
}
};
};
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment