Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
a75f162b
Commit
a75f162b
authored
Jan 12, 2025
by
coderfeli
Browse files
add files
parent
59c05300
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
1354 additions
and
39 deletions
+1354
-39
example/ck_tile/15_fused_moe/instances/fused_moegemm_api_traits.hpp
..._tile/15_fused_moe/instances/fused_moegemm_api_traits.hpp
+7
-7
example/ck_tile/15_fused_moe/main.cpp
example/ck_tile/15_fused_moe/main.cpp
+8
-1
include/ck_tile/ops/flatmm/block/flatmm_sn_32x128x256_1x4x1_16x16x32_itl.hpp
.../flatmm/block/flatmm_sn_32x128x256_1x4x1_16x16x32_itl.hpp
+511
-0
include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x256_1x4x1_16x16x16_itl.inc
...ck/uk/flatmm_sn_uk_gfx9_32x128x256_1x4x1_16x16x16_itl.inc
+794
-0
include/ck_tile/ops/fused_moe/kernel/fused_moegemm_shape.hpp
include/ck_tile/ops/fused_moe/kernel/fused_moegemm_shape.hpp
+28
-27
include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp
...s/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp
+6
-4
No files found.
example/ck_tile/15_fused_moe/instances/fused_moegemm_api_traits.hpp
View file @
a75f162b
...
...
@@ -33,20 +33,20 @@ struct fmoe_ // traits, ugly name, only used for internal
using
YSmoothScaleDataType
=
ck_tile
::
remove_cvref_t
<
typename
TypeConfig
::
YSmoothScaleDataType
>
;
using
TopkWeightDataType
=
ck_tile
::
remove_cvref_t
<
typename
TypeConfig
::
TopkWeightDataType
>
;
using
IndexDataType
=
ck_tile
::
remove_cvref_t
<
typename
TypeConfig
::
IndexDataType
>
;
// S<32, 1024, 128, 128>, S<1, 4, 1>, S<16, 16, 32>
static
constexpr
ck_tile
::
index_t
BT_
=
BlockTIle_
::
at
(
ck_tile
::
number
<
0
>
{});
// block token
static
constexpr
ck_tile
::
index_t
BI_
=
BlockTIle_
::
at
(
ck_tile
::
number
<
1
>
{});
// block intermediate
static
constexpr
ck_tile
::
index_t
BH_
=
BlockTIle_
::
at
(
ck_tile
::
number
<
2
>
{});
// block hidden
static
constexpr
ck_tile
::
index_t
BD_
=
BlockTIle_
::
at
(
ck_tile
::
number
<
3
>
{});
// block down
using
BlockTile_0
=
ck_tile
::
sequence
<
BT_
,
BI_
/
(
GateOnly_
?
1
:
2
),
BH_
>
;
using
WarpPerBlock_0
=
ck_tile
::
remove_cvref_t
<
WarpPerBlock_
>
;
using
WarpTile_0
=
ck_tile
::
remove_cvref_t
<
WarpTile_
>
;
using
BlockTile_0
=
ck_tile
::
sequence
<
BT_
,
BI_
/
(
GateOnly_
?
1
:
2
),
BH_
>
;
//32, 512, 128
using
WarpPerBlock_0
=
ck_tile
::
remove_cvref_t
<
WarpPerBlock_
>
;
// S<1, 4, 1>
using
WarpTile_0
=
ck_tile
::
remove_cvref_t
<
WarpTile_
>
;
// S<16, 16, 32>
using
BlockTile_1
=
ck_tile
::
sequence
<
BT_
,
BD_
,
BI_
/
(
GateOnly_
?
1
:
2
)
>
;
using
WarpPerBlock_1
=
ck_tile
::
remove_cvref_t
<
WarpPerBlock_
>
;
using
WarpTile_1
=
ck_tile
::
remove_cvref_t
<
WarpTile_
>
;
using
BlockTile_1
=
ck_tile
::
sequence
<
BT_
,
BD_
,
BI_
/
(
GateOnly_
?
1
:
2
)
>
;
// 32, 128, 512
using
WarpPerBlock_1
=
ck_tile
::
remove_cvref_t
<
WarpPerBlock_
>
;
/// S<1, 4, 1>
using
WarpTile_1
=
ck_tile
::
remove_cvref_t
<
WarpTile_
>
;
// S<16, 16, 32>
static
constexpr
ck_tile
::
index_t
GateOnly
=
GateOnly_
;
static
constexpr
ck_tile
::
index_t
FusedQuant
=
FusedQuant_
;
...
...
example/ck_tile/15_fused_moe/main.cpp
View file @
a75f162b
...
...
@@ -285,7 +285,10 @@ bool run(const ck_tile::ArgParser& arg_parser)
}
else
{
topid_unique_gen
<
IndexDataType
>
(
topk_ids_host
.
mData
,
tokens
,
topk
,
experts
,
11913
);
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
topk_ids_host
.
mData
.
size
());
i
++
)
{
topk_ids_host
.
mData
[
i
]
=
i
%
4
;
}
// topid_unique_gen<IndexDataType>(topk_ids_host.mData, tokens, topk, experts, 11913);
}
// leave it here for future debug purpose
...
...
@@ -442,6 +445,10 @@ bool run(const ck_tile::ArgParser& arg_parser)
topk
,
gate_only
);
sorted_token_ids_host
.
savetxt
(
"sorted_token_ids_host.txt"
,
"int"
);
sorted_expert_ids_host
.
savetxt
(
"sorted_expert_ids_host.txt"
,
"int"
);
num_sorted_tiles_host
.
savetxt
(
"num_sorted_tiles_host.txt"
,
"int"
);
auto
o_dev
=
o_buf
.
ToHost
<
ODataType
>
();
// o_dev.savetxt("gpu-out.txt", "float");
auto
[
rtol
,
atol
]
=
get_elimit
<
ADataType
>
();
...
...
include/ck_tile/ops/flatmm/block/flatmm_sn_32x128x256_1x4x1_16x16x32_itl.hpp
0 → 100644
View file @
a75f162b
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/gemm/warp/warp_gemm.hpp"
#include "ck_tile/ops/flatmm/block/flatmm_uk_config.hpp"
#include "ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32.hpp"
namespace
ck_tile
{
// "S"tream update output along "N"
// A in smem, B load from global
// require 4 wave, occupancy=1c
struct
FlatmmSn_32x128x512_1x4x1_16x16x32_BF16_itl
:
public
FlatmmSn_32x128x512_1x4x1_16x16x32_Base
{
using
BDataType
=
bf16_t
;
using
ODataType
=
bf16_t
;
// TODO: need paired with tile_window_linear!
// TODO: need call init_raw() before call this function!
// template <typename AWindow, typename BWindow, typename OWindow, typename ScaleTensor>
template
<
typename
BRes
,
typename
BCoords
,
typename
ORes
,
typename
OCoords
,
typename
OFlags
,
typename
ScaleTensor
>
CK_TILE_DEVICE
auto
operator
()(
const
BRes
&
res_b
,
const
BCoords
&
cached_coords_b
,
const
ORes
&
res_o
,
const
OCoords
&
cached_coords_o
,
const
OFlags
&
o_flags
,
// this should be in sgpr
CK_TILE_LDS_ADDR
void
*
smem
,
index_t
n
,
// loop along n dim
const
ScaleTensor
&
scale_
,
index_t
tile_offset_b
,
// stride b is fixed to blockKr * blockW, but still can adjust
index_t
tile_offset_o
)
{
// static_assert(BCoords::size() == 8); // 8
static_assert
(
BCoords
::
size
()
==
4
);
static_assert
(
OCoords
::
size
()
==
8
);
const
index_t
tile_stride_b_bytes
=
tile_offset_b
*
sizeof
(
BDataType
);
const
index_t
tile_stride_o_bytes
=
tile_offset_o
*
sizeof
(
ODataType
);
static_assert
(
ScaleTensor
::
size
()
==
2
);
float
s0
=
scale_
[
number
<
0
>
{}];
float
s1
=
scale_
[
number
<
1
>
{}];
// index_t loop_cnt = n / Block_N;
register
float
v_c0
asm
(
"v64"
);
register
float
v_c1
asm
(
"v65"
);
register
float
v_c2
asm
(
"v66"
);
register
float
v_c3
asm
(
"v67"
);
register
float
v_c4
asm
(
"v68"
);
register
float
v_c5
asm
(
"v69"
);
register
float
v_c6
asm
(
"v70"
);
register
float
v_c7
asm
(
"v71"
);
register
float
v_c8
asm
(
"v72"
);
register
float
v_c9
asm
(
"v73"
);
register
float
v_c10
asm
(
"v74"
);
register
float
v_c11
asm
(
"v75"
);
register
float
v_c12
asm
(
"v76"
);
register
float
v_c13
asm
(
"v77"
);
register
float
v_c14
asm
(
"v78"
);
register
float
v_c15
asm
(
"v79"
);
register
float
v_c16
asm
(
"v80"
);
register
float
v_c17
asm
(
"v81"
);
register
float
v_c18
asm
(
"v82"
);
register
float
v_c19
asm
(
"v83"
);
register
float
v_c20
asm
(
"v84"
);
register
float
v_c21
asm
(
"v85"
);
register
float
v_c22
asm
(
"v86"
);
register
float
v_c23
asm
(
"v87"
);
register
float
v_c24
asm
(
"v88"
);
register
float
v_c25
asm
(
"v89"
);
register
float
v_c26
asm
(
"v90"
);
register
float
v_c27
asm
(
"v91"
);
register
float
v_c28
asm
(
"v92"
);
register
float
v_c29
asm
(
"v93"
);
register
float
v_c30
asm
(
"v94"
);
register
float
v_c31
asm
(
"v95"
);
int32_t
nan_hi
=
0x7fff0000
;
int32_t
nan_lo
=
0x00007fff
;
// in smem, the layout is M0(2)*K0(128)*M1(16)*K1(4)
// every threads need 8xK in contiguous register
// ... and every wave need the same data
int
lane_id
=
threadIdx
.
x
%
64
;
int
sld_y_os
=
(
lane_id
%
16
)
*
4
+
(
lane_id
/
16
)
*
128
;
sld_y_os
*=
2
;
// y y p p p y
// reg before shfl M0(2)*N0(2)*Nl(4)*Nw(4)*Mw(16)*Nv(4)
// but order is N0*M0*Nv
// in LDS we need store as
// M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16)*Nv(4) + 4)
// y y wave-id lid/16 lid%16 v
// sst(v3) = (v0/16*34 + v0%16 * 2 + wid*136) * 4
int
sfl_sst
=
(
threadIdx
.
x
%
16
*
4
)
+
(
threadIdx
.
x
/
16
)
*
(
64
+
4
);
sfl_sst
*=
2
;
// from LDS we need load as
// M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16) * Nv(4) + 4)
// ( 2 issue) (rem 32-lane) (4 wave*4issue) 2lane*1ussue(pk2)
// sld(v4) = v0/2 *34*4 + v0 % 2 *4 + wid*2 *4
int
sfl_sld
=
(
lane_id
%
2
)
*
2
+
(
lane_id
/
2
)
*
(
64
+
4
)
+
(
threadIdx
.
x
/
64
)
*
4
;
sfl_sld
*=
2
;
// B nr->kr
// clang-format off
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Winline-asm"
asm
volatile
(
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_BF16
#include "uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc"
#undef CK_TILE_FLATMM_UK_MFMA
:
[
smem_
]
"+r"
(
smem
),
// [s_loop_cnt]"+s"(loop_cnt),
[
s_loop_cnt
]
"+s"
(
n
),
[
c0
]
"+v"
(
v_c0
),
[
c1
]
"+v"
(
v_c1
),
[
c2
]
"+v"
(
v_c2
),
[
c3
]
"+v"
(
v_c3
),
[
c4
]
"+v"
(
v_c4
),
[
c5
]
"+v"
(
v_c5
),
[
c6
]
"+v"
(
v_c6
),
[
c7
]
"+v"
(
v_c7
),
[
c8
]
"+v"
(
v_c8
),
[
c9
]
"+v"
(
v_c9
),
[
c10
]
"+v"
(
v_c10
),
[
c11
]
"+v"
(
v_c11
),
[
c12
]
"+v"
(
v_c12
),
[
c13
]
"+v"
(
v_c13
),
[
c14
]
"+v"
(
v_c14
),
[
c15
]
"+v"
(
v_c15
),
[
c16
]
"+v"
(
v_c16
),
[
c17
]
"+v"
(
v_c17
),
[
c18
]
"+v"
(
v_c18
),
[
c19
]
"+v"
(
v_c19
),
[
c20
]
"+v"
(
v_c20
),
[
c21
]
"+v"
(
v_c21
),
[
c22
]
"+v"
(
v_c22
),
[
c23
]
"+v"
(
v_c23
),
[
c24
]
"+v"
(
v_c24
),
[
c25
]
"+v"
(
v_c25
),
[
c26
]
"+v"
(
v_c26
),
[
c27
]
"+v"
(
v_c27
),
[
c28
]
"+v"
(
v_c28
),
[
c29
]
"+v"
(
v_c29
),
[
c30
]
"+v"
(
v_c30
),
[
c31
]
"+v"
(
v_c31
)
:
[
sld_a_base
]
"n"
(
0
),
[
shfl_base
]
"n"
(
0
),
[
v_sld_y_os
]
"v"
(
sld_y_os
),
[
v_sfl_sld
]
"v"
(
sfl_sld
),
[
v_sfl_sst
]
"v"
(
sfl_sst
),
[
s_res_o0
]
"s"
(
res_o
[
0
]),
[
s_res_o1
]
"s"
(
res_o
[
1
]),
//[s_res_o2]"s"(res_o[2]),
//[s_res_o3]"s"(res_o[3]),
[
s_res_b0
]
"s"
(
res_b
[
0
]),
[
s_res_b1
]
"s"
(
res_b
[
1
]),
[
s_res_b2
]
"s"
(
res_b
[
2
]),
[
s_res_b3
]
"s"
(
res_b
[
3
]),
[
v_os_o0
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_o
[
number
<
0
>
{}]
*
sizeof
(
ODataType
))),
[
v_os_o1
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_o
[
number
<
1
>
{}]
*
sizeof
(
ODataType
))),
[
v_os_o2
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_o
[
number
<
2
>
{}]
*
sizeof
(
ODataType
))),
[
v_os_o3
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_o
[
number
<
3
>
{}]
*
sizeof
(
ODataType
))),
[
v_os_o4
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_o
[
number
<
4
>
{}]
*
sizeof
(
ODataType
))),
[
v_os_o5
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_o
[
number
<
5
>
{}]
*
sizeof
(
ODataType
))),
[
v_os_o6
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_o
[
number
<
6
>
{}]
*
sizeof
(
ODataType
))),
[
v_os_o7
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_o
[
number
<
7
>
{}]
*
sizeof
(
ODataType
))),
[
v_os_b0
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
0
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b1
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
1
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b2
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
2
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b3
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
3
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b4
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
4
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b5
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
5
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b6
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
6
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b7
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
7
>
{}]
*
sizeof
(
BDataType
))),
[
s_tile_os_o
]
"s"
(
tile_stride_o_bytes
),
[
s_tile_os_b
]
"s"
(
tile_stride_b_bytes
),
[
scale_0
]
"v"
(
s0
),
[
scale_1
]
"v"
(
s1
),
[
v_nan_lo
]
"v"
(
nan_lo
),
[
v_nan_hi
]
"v"
(
nan_hi
),
[
s_execflag_0
]
"s"
(
o_flags
[
number
<
0
>
{}]),
[
s_execflag_1
]
"s"
(
o_flags
[
number
<
1
>
{}]),
[
s_execflag_2
]
"s"
(
o_flags
[
number
<
2
>
{}]),
[
s_execflag_3
]
"s"
(
o_flags
[
number
<
3
>
{}]),
[
s_execflag_4
]
"s"
(
o_flags
[
number
<
4
>
{}]),
[
s_execflag_5
]
"s"
(
o_flags
[
number
<
5
>
{}]),
[
s_execflag_6
]
"s"
(
o_flags
[
number
<
6
>
{}]),
[
s_execflag_7
]
"s"
(
o_flags
[
number
<
7
>
{}])
:
"memory"
,
"a0"
,
"a1"
,
"a2"
,
"a3"
,
"a4"
,
"a5"
,
"a6"
,
"a7"
,
"a8"
,
"a9"
,
"a10"
,
"a11"
,
"a12"
,
"a13"
,
"a14"
,
"a15"
,
"a16"
,
"a17"
,
"a18"
,
"a19"
,
"a20"
,
"a21"
,
"a22"
,
"a23"
,
"a24"
,
"a25"
,
"a26"
,
"a27"
,
"a28"
,
"a29"
,
"a30"
,
"a31"
,
"a32"
,
"a33"
,
"a34"
,
"a35"
,
"a36"
,
"a37"
,
"a38"
,
"a39"
,
"a40"
,
"a41"
,
"a42"
,
"a43"
,
"a44"
,
"a45"
,
"a46"
,
"a47"
,
"a48"
,
"a49"
,
"a50"
,
"a51"
,
"a52"
,
"a53"
,
"a54"
,
"a55"
,
"a56"
,
"a57"
,
"a58"
,
"a59"
,
"a60"
,
"a61"
,
"a62"
,
"a63"
,
"a64"
,
"a65"
,
"a66"
,
"a67"
,
"a68"
,
"a69"
,
"a70"
,
"a71"
,
"a72"
,
"a73"
,
"a74"
,
"a75"
,
"a76"
,
"a77"
,
"a78"
,
"a79"
,
"a80"
,
"a81"
,
"a82"
,
"a83"
,
"a84"
,
"a85"
,
"a86"
,
"a87"
,
"a88"
,
"a89"
,
"a90"
,
"a91"
,
"a92"
,
"a93"
,
"a94"
,
"a95"
,
"a96"
,
"a97"
,
"a98"
,
"a99"
,
"a100"
,
"a101"
,
"a102"
,
"a103"
,
"a104"
,
"a105"
,
"a106"
,
"a107"
,
"a108"
,
"a109"
,
"a110"
,
"a111"
,
"a112"
,
"a113"
,
"a114"
,
"a115"
,
"a116"
,
"a117"
,
"a118"
,
"a119"
,
"a120"
,
"a121"
,
"a122"
,
"a123"
,
"a124"
,
"a125"
,
"a126"
,
"a127"
,
"a128"
,
"a129"
,
"a130"
,
"a131"
,
"a132"
,
"a133"
,
"a134"
,
"a135"
,
"a136"
,
"a137"
,
"a138"
,
"a139"
,
"a140"
,
"a141"
,
"a142"
,
"a143"
,
"a144"
,
"a145"
,
"a146"
,
"a147"
,
"a148"
,
"a149"
,
"a150"
,
"a151"
,
"a152"
,
"a153"
,
"a154"
,
"a155"
,
"a156"
,
"a157"
,
"a158"
,
"a159"
,
"a160"
,
"a161"
,
"a162"
,
"a163"
,
"a164"
,
"a165"
,
"a166"
,
"a167"
,
"a168"
,
"a169"
,
"a170"
,
"a171"
,
"a172"
,
"a173"
,
"a174"
,
"a175"
,
"a176"
,
"a177"
,
"a178"
,
"a179"
,
"a180"
,
"a181"
,
"a182"
,
"a183"
,
"a184"
,
"a185"
,
"a186"
,
"a187"
,
"a188"
,
"a189"
,
"a190"
,
"a191"
,
"a192"
,
"a193"
,
"a194"
,
"a195"
,
"a196"
,
"a197"
,
"a198"
,
"a199"
,
"a200"
,
"a201"
,
"a202"
,
"a203"
,
"a204"
,
"a205"
,
"a206"
,
"a207"
,
"a208"
,
"a209"
,
"a210"
,
"a211"
,
"a212"
,
"a213"
,
"a214"
,
"a215"
,
"a216"
,
"a217"
,
"a218"
,
"a219"
,
"a220"
,
"a221"
,
"a222"
,
"a223"
,
"a224"
,
"a225"
,
"a226"
,
"a227"
,
"a228"
,
"a229"
,
"a230"
,
"a231"
,
"a232"
,
"a233"
,
"a234"
,
"a235"
,
"a236"
,
"a237"
,
"a238"
,
"a239"
,
"a240"
,
"a241"
,
"a242"
,
"a243"
,
"a244"
,
"a245"
,
"a246"
,
"a247"
,
"a248"
,
"a249"
,
"a250"
,
"a251"
,
"a252"
,
"a253"
,
"a254"
,
"a255"
,
"s8"
,
"s9"
,
"s12"
,
"s13"
,
"s14"
,
"s15"
,
"s38"
,
"s39"
,
"s52"
,
"s86"
,
"s36"
,
"s37"
,
"s59"
,
"s80"
,
"v10"
,
"v11"
,
"v12"
,
"v13"
,
"v14"
,
"v15"
,
"v16"
,
"v17"
,
"v50"
,
"v54"
,
"v55"
,
"v64"
,
"v65"
,
"v66"
,
"v67"
,
"v68"
,
"v69"
,
"v70"
,
"v71"
,
"v72"
,
"v73"
,
"v74"
,
"v75"
,
"v76"
,
"v77"
,
"v78"
,
"v79"
,
"v80"
,
"v81"
,
"v82"
,
"v83"
,
"v84"
,
"v85"
,
"v86"
,
"v87"
,
"v88"
,
"v89"
,
"v90"
,
"v91"
,
"v92"
,
"v93"
,
"v94"
,
"v95"
,
"v128"
,
"v129"
,
"v130"
,
"v131"
,
"v132"
,
"v133"
,
"v134"
,
"v135"
,
"v136"
,
"v137"
,
"v138"
,
"v139"
,
"v140"
,
"v141"
,
"v142"
,
"v143"
,
"v144"
,
"v145"
,
"v146"
,
"v147"
,
"v148"
,
"v149"
,
"v150"
,
"v151"
,
"v152"
,
"v153"
,
"v154"
,
"v155"
,
"v156"
,
"v157"
,
"v158"
,
"v159"
,
"v160"
,
"v161"
,
"v162"
,
"v163"
,
"v164"
,
"v165"
,
"v166"
,
"v167"
,
"v168"
,
"v169"
,
"v170"
,
"v171"
,
"v172"
,
"v173"
,
"v174"
,
"v175"
,
"v176"
,
"v177"
,
"v178"
,
"v179"
,
"v180"
,
"v181"
,
"v182"
,
"v183"
,
"v184"
,
"v185"
,
"v186"
,
"v187"
,
"v188"
,
"v189"
,
"v190"
,
"v191"
,
"v192"
,
"v193"
,
"v194"
,
"v195"
,
"v196"
,
"v197"
,
"v198"
,
"v199"
,
"v200"
,
"v201"
,
"v202"
,
"v203"
,
"v204"
,
"v205"
,
"v206"
,
"v207"
,
"v208"
,
"v209"
,
"v210"
,
"v211"
,
"v212"
,
"v213"
,
"v214"
,
"v215"
,
"v216"
,
"v217"
,
"v218"
,
"v219"
,
"v220"
,
"v221"
,
"v222"
,
"v223"
,
"v224"
,
"v225"
,
"v226"
,
"v227"
,
"v228"
,
"v229"
,
"v230"
,
"v231"
,
"v232"
,
"v233"
,
"v234"
,
"v235"
,
"v236"
,
"v237"
,
"v238"
,
"v239"
,
"v240"
,
"v241"
,
"v242"
,
"v243"
,
"v244"
,
"v245"
,
"v246"
,
"v247"
,
"v248"
,
"v249"
,
"v250"
,
"v251"
,
"v252"
,
"v253"
,
"v254"
,
"v255"
);
#pragma clang diagnostic pop
// clang-format on
}
};
struct
FlatmmSn_32x128x512_1x4x1_16x16x32_FP16_itl
:
public
FlatmmSn_32x128x512_1x4x1_16x16x32_Base
{
using
BDataType
=
bf16_t
;
using
ODataType
=
bf16_t
;
// TODO: need paired with tile_window_linear!
// TODO: need call init_raw() before call this function!
// template <typename AWindow, typename BWindow, typename OWindow, typename ScaleTensor>
template
<
typename
BRes
,
typename
BCoords
,
typename
ORes
,
typename
OCoords
,
typename
OFlags
,
typename
ScaleTensor
>
CK_TILE_DEVICE
auto
operator
()(
const
BRes
&
res_b
,
const
BCoords
&
cached_coords_b
,
const
ORes
&
res_o
,
const
OCoords
&
cached_coords_o
,
const
OFlags
&
o_flags
,
// this should be in sgpr
CK_TILE_LDS_ADDR
void
*
smem
,
index_t
n
,
// loop along n dim
const
ScaleTensor
&
scale_
,
index_t
tile_offset_b
,
// stride b is fixed to blockKr * blockW, but still can adjust
index_t
tile_offset_o
)
{
static_assert
(
BCoords
::
size
()
==
8
);
// 8
static_assert
(
OCoords
::
size
()
==
8
);
const
index_t
tile_stride_b_bytes
=
tile_offset_b
*
sizeof
(
BDataType
);
const
index_t
tile_stride_o_bytes
=
tile_offset_o
*
sizeof
(
ODataType
);
static_assert
(
ScaleTensor
::
size
()
==
2
);
float
s0
=
scale_
[
number
<
0
>
{}];
float
s1
=
scale_
[
number
<
1
>
{}];
// index_t loop_cnt = n / Block_N;
register
float
v_c0
asm
(
"v64"
);
register
float
v_c1
asm
(
"v65"
);
register
float
v_c2
asm
(
"v66"
);
register
float
v_c3
asm
(
"v67"
);
register
float
v_c4
asm
(
"v68"
);
register
float
v_c5
asm
(
"v69"
);
register
float
v_c6
asm
(
"v70"
);
register
float
v_c7
asm
(
"v71"
);
register
float
v_c8
asm
(
"v72"
);
register
float
v_c9
asm
(
"v73"
);
register
float
v_c10
asm
(
"v74"
);
register
float
v_c11
asm
(
"v75"
);
register
float
v_c12
asm
(
"v76"
);
register
float
v_c13
asm
(
"v77"
);
register
float
v_c14
asm
(
"v78"
);
register
float
v_c15
asm
(
"v79"
);
register
float
v_c16
asm
(
"v80"
);
register
float
v_c17
asm
(
"v81"
);
register
float
v_c18
asm
(
"v82"
);
register
float
v_c19
asm
(
"v83"
);
register
float
v_c20
asm
(
"v84"
);
register
float
v_c21
asm
(
"v85"
);
register
float
v_c22
asm
(
"v86"
);
register
float
v_c23
asm
(
"v87"
);
register
float
v_c24
asm
(
"v88"
);
register
float
v_c25
asm
(
"v89"
);
register
float
v_c26
asm
(
"v90"
);
register
float
v_c27
asm
(
"v91"
);
register
float
v_c28
asm
(
"v92"
);
register
float
v_c29
asm
(
"v93"
);
register
float
v_c30
asm
(
"v94"
);
register
float
v_c31
asm
(
"v95"
);
int32_t
nan_hi
=
0x7fff0000
;
int32_t
nan_lo
=
0x00007fff
;
// in smem, the layout is M0(2)*K0(128)*M1(16)*K1(4)
// every threads need 8xK in contiguous register
// ... and every wave need the same data
int
lane_id
=
threadIdx
.
x
%
64
;
int
sld_y_os
=
(
lane_id
%
16
)
*
4
+
(
lane_id
/
16
)
*
128
;
sld_y_os
*=
2
;
// y y p p p y
// reg before shfl M0(2)*N0(2)*Nl(4)*Nw(4)*Mw(16)*Nv(4)
// but order is N0*M0*Nv
// in LDS we need store as
// M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16)*Nv(4) + 4)
// y y wave-id lid/16 lid%16 v
// sst(v3) = (v0/16*34 + v0%16 * 2 + wid*136) * 4
int
sfl_sst
=
(
threadIdx
.
x
%
16
*
4
)
+
(
threadIdx
.
x
/
16
)
*
(
64
+
4
);
sfl_sst
*=
2
;
// from LDS we need load as
// M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16) * Nv(4) + 4)
// ( 2 issue) (rem 32-lane) (4 wave*4issue) 2lane*1ussue(pk2)
// sld(v4) = v0/2 *34*4 + v0 % 2 *4 + wid*2 *4
int
sfl_sld
=
(
lane_id
%
2
)
*
2
+
(
lane_id
/
2
)
*
(
64
+
4
)
+
(
threadIdx
.
x
/
64
)
*
4
;
sfl_sld
*=
2
;
// B nr->kr
// clang-format off
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Winline-asm"
asm
volatile
(
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_FP16
#include "uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc"
#undef CK_TILE_FLATMM_UK_MFMA
:
[
smem_
]
"+r"
(
smem
),
[
s_loop_cnt
]
"+s"
(
n
),
[
c0
]
"+v"
(
v_c0
),
[
c1
]
"+v"
(
v_c1
),
[
c2
]
"+v"
(
v_c2
),
[
c3
]
"+v"
(
v_c3
),
[
c4
]
"+v"
(
v_c4
),
[
c5
]
"+v"
(
v_c5
),
[
c6
]
"+v"
(
v_c6
),
[
c7
]
"+v"
(
v_c7
),
[
c8
]
"+v"
(
v_c8
),
[
c9
]
"+v"
(
v_c9
),
[
c10
]
"+v"
(
v_c10
),
[
c11
]
"+v"
(
v_c11
),
[
c12
]
"+v"
(
v_c12
),
[
c13
]
"+v"
(
v_c13
),
[
c14
]
"+v"
(
v_c14
),
[
c15
]
"+v"
(
v_c15
),
[
c16
]
"+v"
(
v_c16
),
[
c17
]
"+v"
(
v_c17
),
[
c18
]
"+v"
(
v_c18
),
[
c19
]
"+v"
(
v_c19
),
[
c20
]
"+v"
(
v_c20
),
[
c21
]
"+v"
(
v_c21
),
[
c22
]
"+v"
(
v_c22
),
[
c23
]
"+v"
(
v_c23
),
[
c24
]
"+v"
(
v_c24
),
[
c25
]
"+v"
(
v_c25
),
[
c26
]
"+v"
(
v_c26
),
[
c27
]
"+v"
(
v_c27
),
[
c28
]
"+v"
(
v_c28
),
[
c29
]
"+v"
(
v_c29
),
[
c30
]
"+v"
(
v_c30
),
[
c31
]
"+v"
(
v_c31
)
:
[
sld_a_base
]
"n"
(
0
),
[
shfl_base
]
"n"
(
0
),
[
v_sld_y_os
]
"v"
(
sld_y_os
),
[
v_sfl_sld
]
"v"
(
sfl_sld
),
[
v_sfl_sst
]
"v"
(
sfl_sst
),
[
s_res_o0
]
"s"
(
res_o
[
0
]),
[
s_res_o1
]
"s"
(
res_o
[
1
]),
//[s_res_o2]"s"(res_o[2]),
//[s_res_o3]"s"(res_o[3]),
[
s_res_b0
]
"s"
(
res_b
[
0
]),
[
s_res_b1
]
"s"
(
res_b
[
1
]),
[
s_res_b2
]
"s"
(
res_b
[
2
]),
[
s_res_b3
]
"s"
(
res_b
[
3
]),
[
v_os_o0
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_o
[
number
<
0
>
{}]
*
sizeof
(
ODataType
))),
[
v_os_o1
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_o
[
number
<
1
>
{}]
*
sizeof
(
ODataType
))),
[
v_os_o2
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_o
[
number
<
2
>
{}]
*
sizeof
(
ODataType
))),
[
v_os_o3
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_o
[
number
<
3
>
{}]
*
sizeof
(
ODataType
))),
[
v_os_o4
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_o
[
number
<
4
>
{}]
*
sizeof
(
ODataType
))),
[
v_os_o5
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_o
[
number
<
5
>
{}]
*
sizeof
(
ODataType
))),
[
v_os_o6
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_o
[
number
<
6
>
{}]
*
sizeof
(
ODataType
))),
[
v_os_o7
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_o
[
number
<
7
>
{}]
*
sizeof
(
ODataType
))),
[
v_os_b0
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
0
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b1
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
1
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b2
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
2
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b3
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
3
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b4
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
4
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b5
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
5
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b6
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
6
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b7
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
7
>
{}]
*
sizeof
(
BDataType
))),
[
s_tile_os_o
]
"s"
(
tile_stride_o_bytes
),
[
s_tile_os_b
]
"s"
(
tile_stride_b_bytes
),
[
scale_0
]
"v"
(
s0
),
[
scale_1
]
"v"
(
s1
),
[
v_nan_lo
]
"v"
(
nan_lo
),
[
v_nan_hi
]
"v"
(
nan_hi
),
[
s_execflag_0
]
"s"
(
o_flags
[
number
<
0
>
{}]),
[
s_execflag_1
]
"s"
(
o_flags
[
number
<
1
>
{}]),
[
s_execflag_2
]
"s"
(
o_flags
[
number
<
2
>
{}]),
[
s_execflag_3
]
"s"
(
o_flags
[
number
<
3
>
{}]),
[
s_execflag_4
]
"s"
(
o_flags
[
number
<
4
>
{}]),
[
s_execflag_5
]
"s"
(
o_flags
[
number
<
5
>
{}]),
[
s_execflag_6
]
"s"
(
o_flags
[
number
<
6
>
{}]),
[
s_execflag_7
]
"s"
(
o_flags
[
number
<
7
>
{}])
:
"memory"
,
"a0"
,
"a1"
,
"a2"
,
"a3"
,
"a4"
,
"a5"
,
"a6"
,
"a7"
,
"a8"
,
"a9"
,
"a10"
,
"a11"
,
"a12"
,
"a13"
,
"a14"
,
"a15"
,
"a16"
,
"a17"
,
"a18"
,
"a19"
,
"a20"
,
"a21"
,
"a22"
,
"a23"
,
"a24"
,
"a25"
,
"a26"
,
"a27"
,
"a28"
,
"a29"
,
"a30"
,
"a31"
,
"a32"
,
"a33"
,
"a34"
,
"a35"
,
"a36"
,
"a37"
,
"a38"
,
"a39"
,
"a40"
,
"a41"
,
"a42"
,
"a43"
,
"a44"
,
"a45"
,
"a46"
,
"a47"
,
"a48"
,
"a49"
,
"a50"
,
"a51"
,
"a52"
,
"a53"
,
"a54"
,
"a55"
,
"a56"
,
"a57"
,
"a58"
,
"a59"
,
"a60"
,
"a61"
,
"a62"
,
"a63"
,
"a64"
,
"a65"
,
"a66"
,
"a67"
,
"a68"
,
"a69"
,
"a70"
,
"a71"
,
"a72"
,
"a73"
,
"a74"
,
"a75"
,
"a76"
,
"a77"
,
"a78"
,
"a79"
,
"a80"
,
"a81"
,
"a82"
,
"a83"
,
"a84"
,
"a85"
,
"a86"
,
"a87"
,
"a88"
,
"a89"
,
"a90"
,
"a91"
,
"a92"
,
"a93"
,
"a94"
,
"a95"
,
"a96"
,
"a97"
,
"a98"
,
"a99"
,
"a100"
,
"a101"
,
"a102"
,
"a103"
,
"a104"
,
"a105"
,
"a106"
,
"a107"
,
"a108"
,
"a109"
,
"a110"
,
"a111"
,
"a112"
,
"a113"
,
"a114"
,
"a115"
,
"a116"
,
"a117"
,
"a118"
,
"a119"
,
"a120"
,
"a121"
,
"a122"
,
"a123"
,
"a124"
,
"a125"
,
"a126"
,
"a127"
,
"a128"
,
"a129"
,
"a130"
,
"a131"
,
"a132"
,
"a133"
,
"a134"
,
"a135"
,
"a136"
,
"a137"
,
"a138"
,
"a139"
,
"a140"
,
"a141"
,
"a142"
,
"a143"
,
"a144"
,
"a145"
,
"a146"
,
"a147"
,
"a148"
,
"a149"
,
"a150"
,
"a151"
,
"a152"
,
"a153"
,
"a154"
,
"a155"
,
"a156"
,
"a157"
,
"a158"
,
"a159"
,
"a160"
,
"a161"
,
"a162"
,
"a163"
,
"a164"
,
"a165"
,
"a166"
,
"a167"
,
"a168"
,
"a169"
,
"a170"
,
"a171"
,
"a172"
,
"a173"
,
"a174"
,
"a175"
,
"a176"
,
"a177"
,
"a178"
,
"a179"
,
"a180"
,
"a181"
,
"a182"
,
"a183"
,
"a184"
,
"a185"
,
"a186"
,
"a187"
,
"a188"
,
"a189"
,
"a190"
,
"a191"
,
"a192"
,
"a193"
,
"a194"
,
"a195"
,
"a196"
,
"a197"
,
"a198"
,
"a199"
,
"a200"
,
"a201"
,
"a202"
,
"a203"
,
"a204"
,
"a205"
,
"a206"
,
"a207"
,
"a208"
,
"a209"
,
"a210"
,
"a211"
,
"a212"
,
"a213"
,
"a214"
,
"a215"
,
"a216"
,
"a217"
,
"a218"
,
"a219"
,
"a220"
,
"a221"
,
"a222"
,
"a223"
,
"a224"
,
"a225"
,
"a226"
,
"a227"
,
"a228"
,
"a229"
,
"a230"
,
"a231"
,
"a232"
,
"a233"
,
"a234"
,
"a235"
,
"a236"
,
"a237"
,
"a238"
,
"a239"
,
"a240"
,
"a241"
,
"a242"
,
"a243"
,
"a244"
,
"a245"
,
"a246"
,
"a247"
,
"a248"
,
"a249"
,
"a250"
,
"a251"
,
"a252"
,
"a253"
,
"a254"
,
"a255"
,
"s8"
,
"s9"
,
"s12"
,
"s13"
,
"s14"
,
"s15"
,
"s38"
,
"s39"
,
"s52"
,
"s86"
,
"s36"
,
"s37"
,
"s59"
,
"s80"
,
"v10"
,
"v11"
,
"v12"
,
"v13"
,
"v14"
,
"v15"
,
"v16"
,
"v17"
,
"v50"
,
"v54"
,
"v55"
,
"v64"
,
"v65"
,
"v66"
,
"v67"
,
"v68"
,
"v69"
,
"v70"
,
"v71"
,
"v72"
,
"v73"
,
"v74"
,
"v75"
,
"v76"
,
"v77"
,
"v78"
,
"v79"
,
"v80"
,
"v81"
,
"v82"
,
"v83"
,
"v84"
,
"v85"
,
"v86"
,
"v87"
,
"v88"
,
"v89"
,
"v90"
,
"v91"
,
"v92"
,
"v93"
,
"v94"
,
"v95"
,
"v128"
,
"v129"
,
"v130"
,
"v131"
,
"v132"
,
"v133"
,
"v134"
,
"v135"
,
"v136"
,
"v137"
,
"v138"
,
"v139"
,
"v140"
,
"v141"
,
"v142"
,
"v143"
,
"v144"
,
"v145"
,
"v146"
,
"v147"
,
"v148"
,
"v149"
,
"v150"
,
"v151"
,
"v152"
,
"v153"
,
"v154"
,
"v155"
,
"v156"
,
"v157"
,
"v158"
,
"v159"
,
"v160"
,
"v161"
,
"v162"
,
"v163"
,
"v164"
,
"v165"
,
"v166"
,
"v167"
,
"v168"
,
"v169"
,
"v170"
,
"v171"
,
"v172"
,
"v173"
,
"v174"
,
"v175"
,
"v176"
,
"v177"
,
"v178"
,
"v179"
,
"v180"
,
"v181"
,
"v182"
,
"v183"
,
"v184"
,
"v185"
,
"v186"
,
"v187"
,
"v188"
,
"v189"
,
"v190"
,
"v191"
,
"v192"
,
"v193"
,
"v194"
,
"v195"
,
"v196"
,
"v197"
,
"v198"
,
"v199"
,
"v200"
,
"v201"
,
"v202"
,
"v203"
,
"v204"
,
"v205"
,
"v206"
,
"v207"
,
"v208"
,
"v209"
,
"v210"
,
"v211"
,
"v212"
,
"v213"
,
"v214"
,
"v215"
,
"v216"
,
"v217"
,
"v218"
,
"v219"
,
"v220"
,
"v221"
,
"v222"
,
"v223"
,
"v224"
,
"v225"
,
"v226"
,
"v227"
,
"v228"
,
"v229"
,
"v230"
,
"v231"
,
"v232"
,
"v233"
,
"v234"
,
"v235"
,
"v236"
,
"v237"
,
"v238"
,
"v239"
,
"v240"
,
"v241"
,
"v242"
,
"v243"
,
"v244"
,
"v245"
,
"v246"
,
"v247"
,
"v248"
,
"v249"
,
"v250"
,
"v251"
,
"v252"
,
"v253"
,
"v254"
,
"v255"
);
#pragma clang diagnostic pop
// clang-format on
}
};
}
// namespace ck_tile
include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x256_1x4x1_16x16x16_itl.inc
0 → 100644
View file @
a75f162b
#ifndef CK_TILE_FLATMM_UK_MFMA
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_BF16
#endif
#if CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_BF16
#define _UK_MFMA_ "v_mfma_f32_16x16x16_bf16"
#define _UK_PK_CVT_(x0_, x1_, y_) \
" v_cmp_u_f32 s[36:37], "
x0_
", "
x0_
"
\n
"
\
" v_add3_u32 v50, "
x0_
", %[v_nan_lo], 1
\n
"
\
" v_cndmask_b32 v54, v50, %[v_nan_hi], s[36:37]
\n
"
\
" v_cmp_u_f32 s[36:37], "
x1_
", "
x1_
"
\n
"
\
" v_add3_u32 v50, "
x1_
", %[v_nan_lo], 1
\n
"
\
" v_cndmask_b32 v55, v50, %[v_nan_hi], s[36:37]
\n
"
\
" v_perm_b32 "
y_
", v55, v54, s52
\n
"
#define _UK_ATOMIC_ADD_ "global_atomic_pk_add_bf16"
#elif CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_FP16
#define _UK_MFMA_ "v_mfma_f32_16x16x16_f16"
#define _UK_PK_CVT_(x0_, x1_, y_) \
" v_cvt_f16_f32 v54, "
x0_
"
\n
"
\
" v_cvt_f16_f32 v55, "
x1_
"
\n
"
\
" v_pack_b32_f16 "
y_
", v54, v55
\n
"
#define _UK_ATOMIC_ADD_ "global_atomic_pk_add_f16"
#endif
";-------------------------------------------------------------
\n
"
" s_mov_b32 s52, 0x07060302 ; v_perm
\n
"
" s_mov_b64 s[38:39], exec ; save current exec
\n
"
" s_mov_b32 s8, %[s_res_o0]
\n
"
" s_mov_b32 s9, %[s_res_o1]
\n
"
" s_mov_b32 s12, %[s_res_b0]
\n
"
" s_mov_b32 s13, %[s_res_b1]
\n
"
" s_mov_b32 s14, %[s_res_b2]
\n
"
" s_mov_b32 s15, %[s_res_b3]
\n
"
" s_mov_b32 s59, 0
\n
"
" ds_read_b64 v[128:129], %[v_sld_y_os] offset:0 + %[sld_a_base]
\n
"
" ds_read_b64 v[130:131], %[v_sld_y_os] offset:128 + %[sld_a_base]
\n
"
" ds_read_b64 v[132:133], %[v_sld_y_os] offset:1024 + %[sld_a_base]
\n
"
" ds_read_b64 v[134:135], %[v_sld_y_os] offset:1152 + %[sld_a_base]
\n
"
" ds_read_b64 v[136:137], %[v_sld_y_os] offset:2048 + %[sld_a_base]
\n
"
" ds_read_b64 v[138:139], %[v_sld_y_os] offset:2176 + %[sld_a_base]
\n
"
" ds_read_b64 v[140:141], %[v_sld_y_os] offset:3072 + %[sld_a_base]
\n
"
" ds_read_b64 v[142:143], %[v_sld_y_os] offset:3200 + %[sld_a_base]
\n
"
" ds_read_b64 v[144:145], %[v_sld_y_os] offset:4096 + %[sld_a_base]
\n
"
" ds_read_b64 v[146:147], %[v_sld_y_os] offset:4224 + %[sld_a_base]
\n
"
" ds_read_b64 v[148:149], %[v_sld_y_os] offset:5120 + %[sld_a_base]
\n
"
" ds_read_b64 v[150:151], %[v_sld_y_os] offset:5248 + %[sld_a_base]
\n
"
" ds_read_b64 v[152:153], %[v_sld_y_os] offset:6144 + %[sld_a_base]
\n
"
" ds_read_b64 v[154:155], %[v_sld_y_os] offset:6272 + %[sld_a_base]
\n
"
" ds_read_b64 v[156:157], %[v_sld_y_os] offset:7168 + %[sld_a_base]
\n
"
" ds_read_b64 v[158:159], %[v_sld_y_os] offset:7296 + %[sld_a_base]
\n
"
// " ds_read_b64 v[160:161], %[v_sld_y_os] offset:8192 + %[sld_a_base] \n"
// " ds_read_b64 v[162:163], %[v_sld_y_os] offset:8320 + %[sld_a_base] \n"
// " ds_read_b64 v[164:165], %[v_sld_y_os] offset:9216 + %[sld_a_base] \n"
// " ds_read_b64 v[166:167], %[v_sld_y_os] offset:9344 + %[sld_a_base] \n"
// " ds_read_b64 v[168:169], %[v_sld_y_os] offset:10240 + %[sld_a_base] \n"
// " ds_read_b64 v[170:171], %[v_sld_y_os] offset:10368 + %[sld_a_base] \n"
// " ds_read_b64 v[172:173], %[v_sld_y_os] offset:11264 + %[sld_a_base] \n"
// " ds_read_b64 v[174:175], %[v_sld_y_os] offset:11392 + %[sld_a_base] \n"
// " ds_read_b64 v[176:177], %[v_sld_y_os] offset:12288 + %[sld_a_base] \n"
// " ds_read_b64 v[178:179], %[v_sld_y_os] offset:12416 + %[sld_a_base] \n"
// " ds_read_b64 v[180:181], %[v_sld_y_os] offset:13312 + %[sld_a_base] \n"
// " ds_read_b64 v[182:183], %[v_sld_y_os] offset:13440 + %[sld_a_base] \n"
// " ds_read_b64 v[184:185], %[v_sld_y_os] offset:14336 + %[sld_a_base] \n"
// " ds_read_b64 v[186:187], %[v_sld_y_os] offset:14464 + %[sld_a_base] \n"
// " ds_read_b64 v[188:189], %[v_sld_y_os] offset:15360 + %[sld_a_base] \n"
// " ds_read_b64 v[190:191], %[v_sld_y_os] offset:15488 + %[sld_a_base] \n"
" ds_read_b64 v[192:193], %[v_sld_y_os] offset:8192 + %[sld_a_base]
\n
"
" ds_read_b64 v[194:195], %[v_sld_y_os] offset:8320 + %[sld_a_base]
\n
"
" ds_read_b64 v[196:197], %[v_sld_y_os] offset:9216 + %[sld_a_base]
\n
"
" ds_read_b64 v[198:199], %[v_sld_y_os] offset:9344 + %[sld_a_base]
\n
"
" ds_read_b64 v[200:201], %[v_sld_y_os] offset:10240 + %[sld_a_base]
\n
"
" ds_read_b64 v[202:203], %[v_sld_y_os] offset:10368 + %[sld_a_base]
\n
"
" ds_read_b64 v[204:205], %[v_sld_y_os] offset:11264 + %[sld_a_base]
\n
"
" ds_read_b64 v[206:207], %[v_sld_y_os] offset:11392 + %[sld_a_base]
\n
"
" ds_read_b64 v[208:209], %[v_sld_y_os] offset:12288 + %[sld_a_base]
\n
"
" ds_read_b64 v[210:211], %[v_sld_y_os] offset:12416 + %[sld_a_base]
\n
"
" ds_read_b64 v[212:213], %[v_sld_y_os] offset:13312 + %[sld_a_base]
\n
"
" ds_read_b64 v[214:215], %[v_sld_y_os] offset:13440 + %[sld_a_base]
\n
"
" ds_read_b64 v[216:217], %[v_sld_y_os] offset:14336 + %[sld_a_base]
\n
"
" ds_read_b64 v[218:219], %[v_sld_y_os] offset:14464 + %[sld_a_base]
\n
"
" ds_read_b64 v[220:221], %[v_sld_y_os] offset:15360 + %[sld_a_base]
\n
"
" ds_read_b64 v[222:223], %[v_sld_y_os] offset:15488 + %[sld_a_base]
\n
"
// " ds_read_b64 v[192:193], %[v_sld_y_os] offset:16384 + %[sld_a_base] \n"
// " ds_read_b64 v[194:195], %[v_sld_y_os] offset:16512 + %[sld_a_base] \n"
// " ds_read_b64 v[196:197], %[v_sld_y_os] offset:17408 + %[sld_a_base] \n"
// " ds_read_b64 v[198:199], %[v_sld_y_os] offset:17536 + %[sld_a_base] \n"
// " ds_read_b64 v[200:201], %[v_sld_y_os] offset:18432 + %[sld_a_base] \n"
// " ds_read_b64 v[202:203], %[v_sld_y_os] offset:18560 + %[sld_a_base] \n"
// " ds_read_b64 v[204:205], %[v_sld_y_os] offset:19456 + %[sld_a_base] \n"
// " ds_read_b64 v[206:207], %[v_sld_y_os] offset:19584 + %[sld_a_base] \n"
// " ds_read_b64 v[208:209], %[v_sld_y_os] offset:20480 + %[sld_a_base] \n"
// " ds_read_b64 v[210:211], %[v_sld_y_os] offset:20608 + %[sld_a_base] \n"
// " ds_read_b64 v[212:213], %[v_sld_y_os] offset:21504 + %[sld_a_base] \n"
// " ds_read_b64 v[214:215], %[v_sld_y_os] offset:21632 + %[sld_a_base] \n"
// " ds_read_b64 v[216:217], %[v_sld_y_os] offset:22528 + %[sld_a_base] \n"
// " ds_read_b64 v[218:219], %[v_sld_y_os] offset:22656 + %[sld_a_base] \n"
// " ds_read_b64 v[220:221], %[v_sld_y_os] offset:23552 + %[sld_a_base] \n"
// " ds_read_b64 v[222:223], %[v_sld_y_os] offset:23680 + %[sld_a_base] \n"
// " ds_read_b64 v[224:225], %[v_sld_y_os] offset:24576 + %[sld_a_base] \n"
// " ds_read_b64 v[226:227], %[v_sld_y_os] offset:24704 + %[sld_a_base] \n"
// " ds_read_b64 v[228:229], %[v_sld_y_os] offset:25600 + %[sld_a_base] \n"
// " ds_read_b64 v[230:231], %[v_sld_y_os] offset:25728 + %[sld_a_base] \n"
// " ds_read_b64 v[232:233], %[v_sld_y_os] offset:26624 + %[sld_a_base] \n"
// " ds_read_b64 v[234:235], %[v_sld_y_os] offset:26752 + %[sld_a_base] \n"
// " ds_read_b64 v[236:237], %[v_sld_y_os] offset:27648 + %[sld_a_base] \n"
// " ds_read_b64 v[238:239], %[v_sld_y_os] offset:27776 + %[sld_a_base] \n"
// " ds_read_b64 v[240:241], %[v_sld_y_os] offset:28672 + %[sld_a_base] \n"
// " ds_read_b64 v[242:243], %[v_sld_y_os] offset:28800 + %[sld_a_base] \n"
// " ds_read_b64 v[244:245], %[v_sld_y_os] offset:29696 + %[sld_a_base] \n"
// " ds_read_b64 v[246:247], %[v_sld_y_os] offset:29824 + %[sld_a_base] \n"
// " ds_read_b64 v[248:249], %[v_sld_y_os] offset:30720 + %[sld_a_base] \n"
// " ds_read_b64 v[250:251], %[v_sld_y_os] offset:30848 + %[sld_a_base] \n"
// " ds_read_b64 v[252:253], %[v_sld_y_os] offset:31744 + %[sld_a_base] \n"
// " ds_read_b64 v[254:255], %[v_sld_y_os] offset:31872 + %[sld_a_base] \n"
" s_waitcnt 0
\n
"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072
\n
"
// " buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[12:15], 0 offen \n"
// " buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024 \n"
// " buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048 \n"
// " buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072 \n"
// " buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[12:15], 0 offen \n"
// " buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024 \n"
// " buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048 \n"
// " buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072 \n"
// " buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[12:15], 0 offen \n"
// " buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[12:15], 0 offen offset:1024 \n"
// " buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[12:15], 0 offen offset:2048 \n"
// " buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[12:15], 0 offen offset:3072 \n"
// " buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[12:15], 0 offen \n"
// " buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[12:15], 0 offen offset:1024 \n"
// " buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[12:15], 0 offen offset:2048 \n"
// " buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[12:15], 0 offen offset:3072 \n"
" s_add_u32 s12, %[s_tile_os_b], s12
\n
"
" s_addc_u32 s13, 0, s13
\n
"
" v_mov_b32 v64, 0
\n
"
" v_mov_b32 v80, 0
\n
"
" v_mov_b32 v65, 0
\n
"
" v_mov_b32 v81, 0
\n
"
" v_mov_b32 v66, 0
\n
"
" v_mov_b32 v82, 0
\n
"
" v_mov_b32 v67, 0
\n
"
" v_mov_b32 v83, 0
\n
"
" v_mov_b32 v68, 0
\n
"
" v_mov_b32 v84, 0
\n
"
" v_mov_b32 v69, 0
\n
"
" v_mov_b32 v85, 0
\n
"
" v_mov_b32 v70, 0
\n
"
" v_mov_b32 v86, 0
\n
"
" v_mov_b32 v71, 0
\n
"
" v_mov_b32 v87, 0
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c0],%[c1]] offset:16640
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c2],%[c3]] offset:20992
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c4],%[c5]] offset:18816
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c6],%[c7]] offset:23168
\n
"
" s_mov_b32 s80, 0
\n
"
" s_waitcnt vmcnt(8)
\n
"
"coreloop_top_%=:
\n
"
" s_waitcnt vmcnt(0) & lgkmcnt(0)
\n
"
" s_barrier
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[0:1], v[128:129], 0
\n
"
" ds_read_b32 v10, %[v_sfl_sld] offset:16640
\n
"
" ds_read_b32 v11, %[v_sfl_sld] offset:16672
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c16],%[c17]] offset:25344
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[2:3], v[130:131], v[64:67]
\n
"
" buffer_load_dwordx4 acc[128:131], %[v_os_b0], s[12:15], 0 offen
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c18],%[c19]] offset:29696
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[4:5], v[132:133], v[64:67]
\n
"
" ds_read_b32 v12, %[v_sfl_sld] offset:16704
\n
"
" ds_read_b32 v13, %[v_sfl_sld] offset:16736
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c20],%[c21]] offset:27520
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[6:7], v[134:135], v[64:67]
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c22],%[c23]] offset:31872
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[8:9], v[136:137], v[64:67]
\n
"
" ds_read_b32 v14, %[v_sfl_sld] offset:20992
\n
"
" ds_read_b32 v15, %[v_sfl_sld] offset:21024
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[10:11], v[138:139], v[64:67]
\n
"
" buffer_load_dwordx4 acc[132:135], %[v_os_b0], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[12:13], v[140:141], v[64:67]
\n
"
" ds_read_b32 v16, %[v_sfl_sld] offset:21056
\n
"
" ds_read_b32 v17, %[v_sfl_sld] offset:21088
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[14:15], v[142:143], v[64:67]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[0:1], v[192:193], 0
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[2:3], v[194:195], v[68:71]
\n
"
" buffer_load_dwordx4 acc[136:139], %[v_os_b0], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[4:5], v[196:197], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[6:7], v[198:199], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[8:9], v[200:201], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[10:11], v[202:203], v[68:71]
\n
"
" buffer_load_dwordx4 acc[140:143], %[v_os_b0], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[12:13], v[204:205], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[14:15], v[206:207], v[68:71]
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_mov_b64 exec, %[s_execflag_0]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o0], v10, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[16:17], v[128:129], 0
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[18:19], v[130:131], v[72:75]
\n
"
" buffer_load_dwordx4 acc[144:147], %[v_os_b1], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[20:21], v[132:133], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[22:23], v[134:135], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[24:25], v[136:137], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[26:27], v[138:139], v[72:75]
\n
"
" buffer_load_dwordx4 acc[148:151], %[v_os_b1], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[28:29], v[140:141], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[30:31], v[142:143], v[72:75]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[16:17], v[192:193], 0
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[18:19], v[194:195], v[76:79]
\n
"
" buffer_load_dwordx4 acc[152:155], %[v_os_b1], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[20:21], v[196:197], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[22:23], v[198:199], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[24:25], v[200:201], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[26:27], v[202:203], v[76:79]
\n
"
" buffer_load_dwordx4 acc[156:159], %[v_os_b1], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[28:29], v[204:205], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[30:31], v[206:207], v[76:79]
\n
"
" s_mov_b64 exec, %[s_execflag_1]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o1], v11, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_waitcnt vmcnt(0)
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[32:33], v[144:145], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[34:35], v[146:147], v[64:67]
\n
"
" buffer_load_dwordx4 acc[160:163], %[v_os_b2], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[36:37], v[148:149], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[38:39], v[150:151], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[40:41], v[152:153], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[42:43], v[154:155], v[64:67]
\n
"
" buffer_load_dwordx4 acc[164:167], %[v_os_b2], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[44:45], v[156:157], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[46:47], v[158:159], v[64:67]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[32:33], v[208:209], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[34:35], v[210:211], v[68:71]
\n
"
" buffer_load_dwordx4 acc[168:171], %[v_os_b2], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[36:37], v[212:213], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[38:39], v[214:215], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[40:41], v[216:217], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[42:43], v[218:219], v[68:71]
\n
"
" buffer_load_dwordx4 acc[172:175], %[v_os_b2], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[44:45], v[220:221], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[46:47], v[222:223], v[68:71]
\n
"
" s_mov_b64 exec, %[s_execflag_2]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o2], v12, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[48:49], v[144:145], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[50:51], v[146:147], v[72:75]
\n
"
" buffer_load_dwordx4 acc[176:179], %[v_os_b3], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[52:53], v[148:149], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[54:55], v[150:151], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[56:57], v[152:153], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[58:59], v[154:155], v[72:75]
\n
"
" buffer_load_dwordx4 acc[180:183], %[v_os_b3], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[60:61], v[156:157], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[62:63], v[158:159], v[72:75]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[48:49], v[208:209], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[50:51], v[210:211], v[76:79]
\n
"
" buffer_load_dwordx4 acc[184:187], %[v_os_b3], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[52:53], v[212:213], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[54:55], v[214:215], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[56:57], v[216:217], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[58:59], v[218:219], v[76:79]
\n
"
" buffer_load_dwordx4 acc[188:191], %[v_os_b3], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[60:61], v[220:221], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[62:63], v[222:223], v[76:79]
\n
"
" s_mov_b64 exec, %[s_execflag_3]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o3], v13, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_waitcnt vmcnt(0)
\n
"
// _UK_MFMA_
// " [%[c0], %[c1], %[c2], %[c3]], acc[64:65], v[160:161], v[64:67] \n" _UK_MFMA_
// " [%[c0], %[c1], %[c2], %[c3]], acc[66:67], v[162:163], v[64:67] \n"
// " buffer_load_dwordx4 acc[192:195], %[v_os_b4], s[12:15], 0 offen \n" _UK_MFMA_
// " [%[c0], %[c1], %[c2], %[c3]], acc[68:69], v[164:165], v[64:67] \n" _UK_MFMA_
// " [%[c0], %[c1], %[c2], %[c3]], acc[70:71], v[166:167], v[64:67] \n" _UK_MFMA_
// " [%[c0], %[c1], %[c2], %[c3]], acc[72:73], v[168:169], v[64:67] \n" _UK_MFMA_
// " [%[c0], %[c1], %[c2], %[c3]], acc[74:75], v[170:171], v[64:67] \n"
// " buffer_load_dwordx4 acc[196:199], %[v_os_b4], s[12:15], 0 offen offset:1024 \n" _UK_MFMA_
// " [%[c0], %[c1], %[c2], %[c3]], acc[76:77], v[172:173], v[64:67] \n" _UK_MFMA_
// " [%[c0], %[c1], %[c2], %[c3]], acc[78:79], v[174:175], v[64:67] \n" _UK_MFMA_
// " [%[c4], %[c5], %[c6], %[c7]], acc[64:65], v[224:225], v[68:71] \n" _UK_MFMA_
// " [%[c4], %[c5], %[c6], %[c7]], acc[66:67], v[226:227], v[68:71] \n"
// " buffer_load_dwordx4 acc[200:203], %[v_os_b4], s[12:15], 0 offen offset:2048 \n" _UK_MFMA_
// " [%[c4], %[c5], %[c6], %[c7]], acc[68:69], v[228:229], v[68:71] \n" _UK_MFMA_
// " [%[c4], %[c5], %[c6], %[c7]], acc[70:71], v[230:231], v[68:71] \n" _UK_MFMA_
// " [%[c4], %[c5], %[c6], %[c7]], acc[72:73], v[232:233], v[68:71] \n" _UK_MFMA_
// " [%[c4], %[c5], %[c6], %[c7]], acc[74:75], v[234:235], v[68:71] \n"
// " buffer_load_dwordx4 acc[204:207], %[v_os_b4], s[12:15], 0 offen offset:3072 \n" _UK_MFMA_
// " [%[c4], %[c5], %[c6], %[c7]], acc[76:77], v[236:237], v[68:71] \n" _UK_MFMA_
// " [%[c4], %[c5], %[c6], %[c7]], acc[78:79], v[238:239], v[68:71] \n"
" s_mov_b64 exec, %[s_execflag_4]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o4], v14, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
// _UK_MFMA_
// " [%[c8], %[c9], %[c10], %[c11]], acc[80:81], v[160:161], v[72:75] \n" _UK_MFMA_
// " [%[c8], %[c9], %[c10], %[c11]], acc[82:83], v[162:163], v[72:75] \n"
// " buffer_load_dwordx4 acc[208:211], %[v_os_b5], s[12:15], 0 offen \n" _UK_MFMA_
// " [%[c8], %[c9], %[c10], %[c11]], acc[84:85], v[164:165], v[72:75] \n" _UK_MFMA_
// " [%[c8], %[c9], %[c10], %[c11]], acc[86:87], v[166:167], v[72:75] \n" _UK_MFMA_
// " [%[c8], %[c9], %[c10], %[c11]], acc[88:89], v[168:169], v[72:75] \n" _UK_MFMA_
// " [%[c8], %[c9], %[c10], %[c11]], acc[90:91], v[170:171], v[72:75] \n"
// " buffer_load_dwordx4 acc[212:215], %[v_os_b5], s[12:15], 0 offen offset:1024 \n" _UK_MFMA_
// " [%[c8], %[c9], %[c10], %[c11]], acc[92:93], v[172:173], v[72:75] \n" _UK_MFMA_
// " [%[c8], %[c9], %[c10], %[c11]], acc[94:95], v[174:175], v[72:75] \n" _UK_MFMA_
// " [%[c12], %[c13], %[c14], %[c15]], acc[80:81], v[224:225], v[76:79] \n" _UK_MFMA_
// " [%[c12], %[c13], %[c14], %[c15]], acc[82:83], v[226:227], v[76:79] \n"
// " buffer_load_dwordx4 acc[216:219], %[v_os_b5], s[12:15], 0 offen offset:2048 \n" _UK_MFMA_
// " [%[c12], %[c13], %[c14], %[c15]], acc[84:85], v[228:229], v[76:79] \n" _UK_MFMA_
// " [%[c12], %[c13], %[c14], %[c15]], acc[86:87], v[230:231], v[76:79] \n" _UK_MFMA_
// " [%[c12], %[c13], %[c14], %[c15]], acc[88:89], v[232:233], v[76:79] \n" _UK_MFMA_
// " [%[c12], %[c13], %[c14], %[c15]], acc[90:91], v[234:235], v[76:79] \n"
// " buffer_load_dwordx4 acc[220:223], %[v_os_b5], s[12:15], 0 offen offset:3072 \n" _UK_MFMA_
// " [%[c12], %[c13], %[c14], %[c15]], acc[92:93], v[236:237], v[76:79] \n" _UK_MFMA_
// " [%[c12], %[c13], %[c14], %[c15]], acc[94:95], v[238:239], v[76:79] \n"
" s_mov_b64 exec, %[s_execflag_5]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o5], v15, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_waitcnt vmcnt(0)
\n
"
// _UK_MFMA_
// " [%[c0], %[c1], %[c2], %[c3]], acc[96:97], v[176:177], v[64:67] \n" _UK_MFMA_
// " [%[c0], %[c1], %[c2], %[c3]], acc[98:99], v[178:179], v[64:67] \n"
// " buffer_load_dwordx4 acc[224:227], %[v_os_b6], s[12:15], 0 offen \n" _UK_MFMA_
// " [%[c0], %[c1], %[c2], %[c3]], acc[100:101], v[180:181], v[64:67] \n" _UK_MFMA_
// " [%[c0], %[c1], %[c2], %[c3]], acc[102:103], v[182:183], v[64:67] \n" _UK_MFMA_
// " [%[c0], %[c1], %[c2], %[c3]], acc[104:105], v[184:185], v[64:67] \n" _UK_MFMA_
// " [%[c0], %[c1], %[c2], %[c3]], acc[106:107], v[186:187], v[64:67] \n"
// " buffer_load_dwordx4 acc[228:231], %[v_os_b6], s[12:15], 0 offen offset:1024 \n" _UK_MFMA_
// " [%[c0], %[c1], %[c2], %[c3]], acc[108:109], v[188:189], v[64:67] \n" _UK_MFMA_
// " [%[c0], %[c1], %[c2], %[c3]], acc[110:111], v[190:191], v[64:67] \n" _UK_MFMA_
// " [%[c4], %[c5], %[c6], %[c7]], acc[96:97], v[240:241], v[68:71] \n" _UK_MFMA_
// " [%[c4], %[c5], %[c6], %[c7]], acc[98:99], v[242:243], v[68:71] \n"
// " buffer_load_dwordx4 acc[232:235], %[v_os_b6], s[12:15], 0 offen offset:2048 \n" _UK_MFMA_
// " [%[c4], %[c5], %[c6], %[c7]], acc[100:101], v[244:245], v[68:71] \n" _UK_MFMA_
// " [%[c4], %[c5], %[c6], %[c7]], acc[102:103], v[246:247], v[68:71] \n" _UK_MFMA_
// " [%[c4], %[c5], %[c6], %[c7]], acc[104:105], v[248:249], v[68:71] \n" _UK_MFMA_
// " [%[c4], %[c5], %[c6], %[c7]], acc[106:107], v[250:251], v[68:71] \n"
// " buffer_load_dwordx4 acc[236:239], %[v_os_b6], s[12:15], 0 offen offset:3072 \n" _UK_MFMA_
// " [%[c4], %[c5], %[c6], %[c7]], acc[108:109], v[252:253], v[68:71] \n" _UK_MFMA_
// " [%[c4], %[c5], %[c6], %[c7]], acc[110:111], v[254:255], v[68:71] \n"
" s_mov_b64 exec, %[s_execflag_6]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o6], v16, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
// _UK_MFMA_
// " [%[c8], %[c9], %[c10], %[c11]], acc[112:113], v[176:177], v[72:75] \n" _UK_MFMA_
// " [%[c8], %[c9], %[c10], %[c11]], acc[114:115], v[178:179], v[72:75] \n"
// " buffer_load_dwordx4 acc[240:243], %[v_os_b7], s[12:15], 0 offen \n" _UK_MFMA_
// " [%[c8], %[c9], %[c10], %[c11]], acc[116:117], v[180:181], v[72:75] \n" _UK_MFMA_
// " [%[c8], %[c9], %[c10], %[c11]], acc[118:119], v[182:183], v[72:75] \n" _UK_MFMA_
// " [%[c8], %[c9], %[c10], %[c11]], acc[120:121], v[184:185], v[72:75] \n" _UK_MFMA_
// " [%[c8], %[c9], %[c10], %[c11]], acc[122:123], v[186:187], v[72:75] \n"
// " buffer_load_dwordx4 acc[244:247], %[v_os_b7], s[12:15], 0 offen offset:1024 \n" _UK_MFMA_
// " [%[c8], %[c9], %[c10], %[c11]], acc[124:125], v[188:189], v[72:75] \n" _UK_MFMA_
// " [%[c8], %[c9], %[c10], %[c11]], acc[126:127], v[190:191], v[72:75] \n" _UK_MFMA_
// " [%[c12], %[c13], %[c14], %[c15]], acc[112:113], v[240:241], v[76:79] \n" _UK_MFMA_
// " [%[c12], %[c13], %[c14], %[c15]], acc[114:115], v[242:243], v[76:79] \n"
// " buffer_load_dwordx4 acc[248:251], %[v_os_b7], s[12:15], 0 offen offset:2048 \n" _UK_MFMA_
// " [%[c12], %[c13], %[c14], %[c15]], acc[116:117], v[244:245], v[76:79] \n" _UK_MFMA_
// " [%[c12], %[c13], %[c14], %[c15]], acc[118:119], v[246:247], v[76:79] \n" _UK_MFMA_
// " [%[c12], %[c13], %[c14], %[c15]], acc[120:121], v[248:249], v[76:79] \n" _UK_MFMA_
// " [%[c12], %[c13], %[c14], %[c15]], acc[122:123], v[250:251], v[76:79] \n"
// " buffer_load_dwordx4 acc[252:255], %[v_os_b7], s[12:15], 0 offen offset:3072 \n" _UK_MFMA_
// " [%[c12], %[c13], %[c14], %[c15]], acc[124:125], v[252:253], v[76:79] \n" _UK_MFMA_
// " [%[c12], %[c13], %[c14], %[c15]], acc[126:127], v[254:255], v[76:79] \n"
" s_mov_b64 exec, %[s_execflag_7]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o7], v17, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_add_u32 s60, 0x00000100, s80
\n
"
" s_cmp_lt_u32 s60, %[s_loop_cnt]
\n
"
" s_cselect_b32 s56, %[s_tile_os_b], 0
\n
"
" s_add_u32 s12, s56, s12
\n
"
" s_addc_u32 s13, 0, s13
\n
"
" s_cmp_ge_u32 s80, 0x00000100
\n
"
" s_cselect_b32 s59, %[s_tile_os_o], s59
\n
"
" s_add_u32 s8, s59, s8
\n
"
" s_addc_u32 s9, 0, s9
\n
"
" v_mul_f32 %[c0], %[scale_0], %[c0]
\n
"
" v_mul_f32 %[c1], %[scale_0], %[c1]
\n
"
" v_mul_f32 %[c2], %[scale_0], %[c2]
\n
"
" v_mul_f32 %[c3], %[scale_0], %[c3]
\n
"
" v_mul_f32 %[c4], %[scale_1], %[c4]
\n
"
" v_mul_f32 %[c5], %[scale_1], %[c5]
\n
"
" v_mul_f32 %[c6], %[scale_1], %[c6]
\n
"
" v_mul_f32 %[c7], %[scale_1], %[c7]
\n
"
" v_mul_f32 %[c8], %[scale_0], %[c8]
\n
"
" v_mul_f32 %[c9], %[scale_0], %[c9]
\n
"
" v_mul_f32 %[c10], %[scale_0], %[c10]
\n
"
" v_mul_f32 %[c11], %[scale_0], %[c11]
\n
"
" v_mul_f32 %[c12], %[scale_1], %[c12]
\n
"
" v_mul_f32 %[c13], %[scale_1], %[c13]
\n
"
" v_mul_f32 %[c14], %[scale_1], %[c14]
\n
"
" v_mul_f32 %[c15], %[scale_1], %[c15]
\n
"
_UK_PK_CVT_
(
"%[c0]"
,
"%[c1]"
,
"%[c0]"
)
_UK_PK_CVT_
(
"%[c2]"
,
"%[c3]"
,
"%[c1]"
)
_UK_PK_CVT_
(
"%[c4]"
,
"%[c5]"
,
"%[c2]"
)
_UK_PK_CVT_
(
"%[c6]"
,
"%[c7]"
,
"%[c3]"
)
_UK_PK_CVT_
(
"%[c8]"
,
"%[c9]"
,
"%[c4]"
)
_UK_PK_CVT_
(
"%["
"c10]"
,
"%["
"c11]"
,
"%[c5]"
)
_UK_PK_CVT_
(
"%[c12]"
,
"%[c13]"
,
"%[c6]"
)
_UK_PK_CVT_
(
"%[c14]"
,
"%[c15]"
,
"%[c7]"
)
" s_addk_i32 s80, 0x0080
\n
"
" s_cmp_lt_i32 s80, %[s_loop_cnt]
\n
"
" s_cbranch_scc0 loop_atomic_%=
\n
"
" s_waitcnt vmcnt(0) & lgkmcnt(0)
\n
"
" s_barrier
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[128:129], v[128:129], 0
\n
"
" ds_read_b32 v10, %[v_sfl_sld] offset:25344
\n
"
" ds_read_b32 v11, %[v_sfl_sld] offset:25376
\n
"
" ds_write_b64 v3, v[64:65] offset:16640
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[130:131], v[130:131], v[80:83]
\n
"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen
\n
"
" ds_write_b64 v3, v[66:67] offset:20992
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[132:133], v[132:133], v[80:83]
\n
"
" ds_read_b32 v12, %[v_sfl_sld] offset:25408
\n
"
" ds_read_b32 v13, %[v_sfl_sld] offset:25440
\n
"
" ds_write_b64 v3, v[68:69] offset:18816
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[134:135], v[134:135], v[80:83]
\n
"
" ds_write_b64 v3, v[70:71] offset:23168
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[136:137], v[136:137], v[80:83]
\n
"
" ds_read_b32 v14, %[v_sfl_sld] offset:29696
\n
"
" ds_read_b32 v15, %[v_sfl_sld] offset:29728
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[138:139], v[138:139], v[80:83]
\n
"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024 "
"
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[140:141], v[140:141], v[80:83]
\n
"
" ds_read_b32 v16, %[v_sfl_sld] offset:29760
\n
"
" ds_read_b32 v17, %[v_sfl_sld] offset:29792
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[142:143], v[142:143], v[80:83] "
"
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[128:129], v[192:193], 0
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[130:131], v[194:195], v[84:87]
\n
"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048 "
"
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[132:133], v[196:197], "
"v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[134:135], v[198:199], v[84:87] "
"
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[136:137], v[200:201], "
"v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[138:139], v[202:203], v[84:87]
\n
"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072 "
"
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[140:141], v[204:205], "
"v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[142:143], v[206:207], v[84:87]
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_mov_b64 exec, %[s_execflag_0]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o0], v10, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[144:145], v[128:129], 0
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[146:147], v[130:131], v[88:91]
\n
"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[148:149], v[132:133], v[88:91] "
"
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[150:151], v[134:135], "
"v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[152:153], v[136:137], v[88:91] "
"
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[154:155], v[138:139], v[88:91]
\n
"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024 "
"
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[156:157], v[140:141], "
"v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[158:159], v[142:143], v[88:91] "
"
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[144:145], v[192:193], 0
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[146:147], v[194:195], v[92:95]
\n
"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048 "
"
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[148:149], v[196:197], "
"v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[150:151], v[198:199], v[92:95] "
"
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[152:153], v[200:201], "
"v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[154:155], v[202:203], v[92:95]
\n
"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072 "
"
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[156:157], v[204:205], "
"v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[158:159], v[206:207], v[92:95]
\n
"
" s_mov_b64 exec, %[s_execflag_1]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o1], v11, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_waitcnt vmcnt(0)
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[160:161], v[144:145], v[80:83] "
"
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[162:163], v[146:147], v[80:83]
\n
"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[164:165], v[148:149], v[80:83] "
"
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[166:167], v[150:151], "
"v[80:83]
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[168:169], v[152:153], v[80:83] "
"
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[170:171], v[154:155], v[80:83]
\n
"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024 "
"
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[172:173], v[156:157], "
"v[80:83]
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[174:175], v[158:159], v[80:83] "
"
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[160:161], v[208:209], "
"v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[162:163], v[210:211], v[84:87]
\n
"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048 "
"
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[164:165], v[212:213], "
"v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[166:167], v[214:215], v[84:87] "
"
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[168:169], v[216:217], "
"v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[170:171], v[218:219], v[84:87]
\n
"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072 "
"
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[172:173], v[220:221], "
"v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[174:175], v[222:223], v[84:87]
\n
"
" s_mov_b64 exec, %[s_execflag_2]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o2], v12, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[176:177], v[144:145], v[88:91] "
"
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[178:179], v[146:147], v[88:91]
\n
"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[180:181], v[148:149], v[88:91] "
"
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[182:183], v[150:151], "
"v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[184:185], v[152:153], v[88:91] "
"
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[186:187], v[154:155], v[88:91]
\n
"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024 "
"
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[188:189], v[156:157], "
"v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[190:191], v[158:159], v[88:91] "
"
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[176:177], v[208:209], "
"v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[178:179], v[210:211], v[92:95]
\n
"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048 "
"
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[180:181], v[212:213], "
"v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[182:183], v[214:215], v[92:95] "
"
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[184:185], v[216:217], "
"v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[186:187], v[218:219], v[92:95]
\n
"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072 "
"
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[188:189], v[220:221], "
"v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[190:191], v[222:223], v[92:95]
\n
"
" s_mov_b64 exec, %[s_execflag_3]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o3], v13, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_waitcnt vmcnt(0)
\n
"
// _UK_MFMA_
// " [%[c16], %[c17], %[c18], %[c19]], acc[192:193], v[160:161], v[80:83] "
// "\n" _UK_MFMA_
// " [%[c16], %[c17], %[c18], %[c19]], acc[194:195], v[162:163], v[80:83] \n"
// " buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[12:15], 0 offen \n" _UK_MFMA_
// " [%[c16], %[c17], %[c18], %[c19]], acc[196:197], v[164:165], v[80:83] "
// "\n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[198:199], v[166:167], "
// "v[80:83] \n" _UK_MFMA_
// " [%[c16], %[c17], %[c18], %[c19]], acc[200:201], v[168:169], v[80:83] "
// "\n" _UK_MFMA_
// " [%[c16], %[c17], %[c18], %[c19]], acc[202:203], v[170:171], v[80:83] \n"
// " buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024 "
// "\n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[204:205], v[172:173], "
// "v[80:83] \n" _UK_MFMA_
// " [%[c16], %[c17], %[c18], %[c19]], acc[206:207], v[174:175], v[80:83] "
// "\n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[192:193], v[224:225], "
// "v[84:87] \n" _UK_MFMA_
// " [%[c20], %[c21], %[c22], %[c23]], acc[194:195], v[226:227], v[84:87] \n"
// " buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048 "
// "\n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[196:197], v[228:229], "
// "v[84:87] \n" _UK_MFMA_
// " [%[c20], %[c21], %[c22], %[c23]], acc[198:199], v[230:231], v[84:87] "
// "\n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[200:201], v[232:233], "
// "v[84:87] \n" _UK_MFMA_
// " [%[c20], %[c21], %[c22], %[c23]], acc[202:203], v[234:235], v[84:87] \n"
// " buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072 "
// "\n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[204:205], v[236:237], "
// "v[84:87] \n" _UK_MFMA_
// " [%[c20], %[c21], %[c22], %[c23]], acc[206:207], v[238:239], v[84:87] \n"
" s_mov_b64 exec, %[s_execflag_4]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o4], v14, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
// _UK_MFMA_
// " [%[c24], %[c25], %[c26], %[c27]], acc[208:209], v[160:161], v[88:91] "
// "\n" _UK_MFMA_
// " [%[c24], %[c25], %[c26], %[c27]], acc[210:211], v[162:163], v[88:91] \n"
// " buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[12:15], 0 offen \n" _UK_MFMA_
// " [%[c24], %[c25], %[c26], %[c27]], acc[212:213], v[164:165], v[88:91] "
// "\n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[214:215], v[166:167], "
// "v[88:91] \n" _UK_MFMA_
// " [%[c24], %[c25], %[c26], %[c27]], acc[216:217], v[168:169], v[88:91] "
// "\n" _UK_MFMA_
// " [%[c24], %[c25], %[c26], %[c27]], acc[218:219], v[170:171], v[88:91] \n"
// " buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024 "
// "\n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[220:221], v[172:173], "
// "v[88:91] \n" _UK_MFMA_
// " [%[c24], %[c25], %[c26], %[c27]], acc[222:223], v[174:175], v[88:91] "
// "\n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[208:209], v[224:225], "
// "v[92:95] \n" _UK_MFMA_
// " [%[c28], %[c29], %[c30], %[c31]], acc[210:211], v[226:227], v[92:95] \n"
// " buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048 "
// "\n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[212:213], v[228:229], "
// "v[92:95] \n" _UK_MFMA_
// " [%[c28], %[c29], %[c30], %[c31]], acc[214:215], v[230:231], v[92:95] "
// "\n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[216:217], v[232:233], "
// "v[92:95] \n" _UK_MFMA_
// " [%[c28], %[c29], %[c30], %[c31]], acc[218:219], v[234:235], v[92:95] \n"
// " buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072 "
// "\n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[220:221], v[236:237], "
// "v[92:95] \n" _UK_MFMA_
// " [%[c28], %[c29], %[c30], %[c31]], acc[222:223], v[238:239], v[92:95] \n"
" s_mov_b64 exec, %[s_execflag_5]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o5], v15, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_waitcnt vmcnt(0)
\n
"
// _UK_MFMA_
// " [%[c16], %[c17], %[c18], %[c19]], acc[224:225], v[176:177], v[80:83] "
// "\n" _UK_MFMA_
// " [%[c16], %[c17], %[c18], %[c19]], acc[226:227], v[178:179], v[80:83] \n"
// " buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[12:15], 0 offen \n" _UK_MFMA_
// " [%[c16], %[c17], %[c18], %[c19]], acc[228:229], v[180:181], v[80:83] "
// "\n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[230:231], v[182:183], "
// "v[80:83] \n" _UK_MFMA_
// " [%[c16], %[c17], %[c18], %[c19]], acc[232:233], v[184:185], v[80:83] "
// "\n" _UK_MFMA_
// " [%[c16], %[c17], %[c18], %[c19]], acc[234:235], v[186:187], v[80:83] \n"
// " buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[12:15], 0 offen "
// "offset:1024 \n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[236:237], "
// "v[188:189], v[80:83] \n" _UK_MFMA_
// " [%[c16], %[c17], %[c18], %[c19]], acc[238:239], v[190:191], v[80:83] "
// "\n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[224:225], v[240:241], "
// "v[84:87] \n" _UK_MFMA_
// " [%[c20], %[c21], %[c22], %[c23]], acc[226:227], v[242:243], v[84:87] \n"
// " buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[12:15], 0 offen "
// "offset:2048 \n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[228:229], "
// "v[244:245], v[84:87] \n" _UK_MFMA_
// " [%[c20], %[c21], %[c22], %[c23]], acc[230:231], v[246:247], v[84:87] "
// "\n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[232:233], v[248:249], "
// "v[84:87] \n" _UK_MFMA_
// " [%[c20], %[c21], %[c22], %[c23]], acc[234:235], v[250:251], v[84:87] \n"
// " buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[12:15], 0 offen "
// "offset:3072 \n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[236:237], "
// "v[252:253], v[84:87] \n" _UK_MFMA_
// " [%[c20], %[c21], %[c22], %[c23]], acc[238:239], v[254:255], v[84:87] \n"
" s_mov_b64 exec, %[s_execflag_6]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o6], v16, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
// _UK_MFMA_
// " [%[c24], %[c25], %[c26], %[c27]], acc[240:241], v[176:177], v[88:91] "
// "\n" _UK_MFMA_
// " [%[c24], %[c25], %[c26], %[c27]], acc[242:243], v[178:179], v[88:91] \n"
// " buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[12:15], 0 offen "
// "\n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[244:245], v[180:181], "
// "v[88:91] \n" _UK_MFMA_
// " [%[c24], %[c25], %[c26], %[c27]], acc[246:247], v[182:183], v[88:91] "
// "\n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[248:249], v[184:185], "
// "v[88:91] \n" _UK_MFMA_
// " [%[c24], %[c25], %[c26], %[c27]], acc[250:251], v[186:187], v[88:91] \n"
// " buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[12:15], 0 offen "
// "offset:1024 \n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[252:253], "
// "v[188:189], v[88:91] \n" _UK_MFMA_
// " [%[c24], %[c25], %[c26], %[c27]], acc[254:255], v[190:191], v[88:91] "
// "\n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[240:241], v[240:241], "
// "v[92:95] \n" _UK_MFMA_
// " [%[c28], %[c29], %[c30], %[c31]], acc[242:243], v[242:243], v[92:95] \n"
// " buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[12:15], 0 offen "
// "offset:2048 \n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[244:245], "
// "v[244:245], v[92:95] \n" _UK_MFMA_
// " [%[c28], %[c29], %[c30], %[c31]], acc[246:247], v[246:247], v[92:95] "
// "\n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[248:249], v[248:249], "
// "v[92:95] \n" _UK_MFMA_
// " [%[c28], %[c29], %[c30], %[c31]], acc[250:251], v[250:251], v[92:95] \n"
// " buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[12:15], 0 offen "
// "offset:3072 \n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[252:253], "
// "v[252:253], v[92:95] \n" _UK_MFMA_
// " [%[c28], %[c29], %[c30], %[c31]], acc[254:255], v[254:255], v[92:95] \n"
" s_mov_b64 exec, %[s_execflag_7]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o7], v17, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_add_u32 s60, 0x00000100, s80
\n
"
" s_cmp_lt_u32 s60, %[s_loop_cnt]
\n
"
" s_cselect_b32 s56, s56, 0
\n
"
" s_add_u32 s12, s56, s12
\n
"
" s_addc_u32 s13, 0, s13
\n
"
" s_cmp_ge_u32 s80, 0x00000100
\n
"
" s_cselect_b32 s59, 0x00000100, s59
\n
"
" s_add_u32 s8, s59, s8
\n
"
" s_addc_u32 s9, 0, s9
\n
"
" v_mul_f32 %[c16], %[scale_0], %[c16]
\n
"
" v_mul_f32 %[c17], %[scale_0], %[c17]
\n
"
" v_mul_f32 %[c18], %[scale_0], %[c18]
\n
"
" v_mul_f32 %[c19], %[scale_0], %[c19]
\n
"
" v_mul_f32 %[c20], %[scale_1], %[c20]
\n
"
" v_mul_f32 %[c21], %[scale_1], %[c21]
\n
"
" v_mul_f32 %[c22], %[scale_1], %[c22]
\n
"
" v_mul_f32 %[c23], %[scale_1], %[c23]
\n
"
" v_mul_f32 %[c24], %[scale_0], %[c24]
\n
"
" v_mul_f32 %[c25], %[scale_0], %[c25]
\n
"
" v_mul_f32 %[c26], %[scale_0], %[c26]
\n
"
" v_mul_f32 %[c27], %[scale_0], %[c27]
\n
"
" v_mul_f32 %[c28], %[scale_1], %[c28]
\n
"
" v_mul_f32 %[c29], %[scale_1], %[c29]
\n
"
" v_mul_f32 %[c30], %[scale_1], %[c30]
\n
"
" v_mul_f32 %[c31], %[scale_1], %[c31]
\n
"
_UK_PK_CVT_
(
"%[c16]"
,
"%[c17]"
,
"%[c16]"
)
_UK_PK_CVT_
(
"%[c18]"
,
"%[c19]"
,
"%[c17]"
)
_UK_PK_CVT_
(
"%[c20]"
,
"%[c21]"
,
"%[c18]"
)
_UK_PK_CVT_
(
"%[c22]"
,
"%[c23]"
,
"%[c19]"
)
_UK_PK_CVT_
(
"%[c24]"
,
"%[c25]"
,
"%[c20]"
)
_UK_PK_CVT_
(
"%[c26]"
,
"%[c27]"
,
"%[c21]"
)
_UK_PK_CVT_
(
"%[c28]"
,
"%[c29]"
,
"%[c22]"
)
_UK_PK_CVT_
(
"%[c30]"
,
"%[c31]"
,
"%[c23]"
)
" s_addk_i32 s80, 0x0080
\n
"
" s_cmp_lt_i32 s80, %[s_loop_cnt]
\n
"
" s_cbranch_scc0 loop_atomic_%=
\n
"
" s_branch coreloop_top_%=
\n
"
" loop_atomic_%=:
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_barrier
\n
"
" ds_read_b32 v10, %[v_sfl_sld] offset:16640
\n
"
" ds_read_b32 v11, %[v_sfl_sld] offset:16672
\n
"
" ds_read_b32 v12, %[v_sfl_sld] offset:16704
\n
"
" ds_read_b32 v13, %[v_sfl_sld] offset:16736
\n
"
" ds_read_b32 v14, %[v_sfl_sld] offset:20992
\n
"
" ds_read_b32 v15, %[v_sfl_sld] offset:21024
\n
"
" ds_read_b32 v16, %[v_sfl_sld] offset:21056
\n
"
" ds_read_b32 v17, %[v_sfl_sld] offset:21088
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_mov_b64 exec, %[s_execflag_0]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o0], v10, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_1]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o1], v11, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_2]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o2], v12, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_3]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o3], v13, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_4]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o4], v14, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_5]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o5], v15, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_6]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o6], v16, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_7]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o7], v17, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39] "
"
\n
"
" s_add_u32 s8, s59, s8
\n
"
" s_addc_u32 s9, 0, s9
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c16],%[c17]] "
"offset:25344
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c18],%[c19]] "
"offset:29696
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c20],%[c21]] "
"offset:27520
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c22],%[c23]] "
"offset:31872
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_barrier
\n
"
" ds_read_b32 v10, %[v_sfl_sld] offset:25344
\n
"
" ds_read_b32 v11, %[v_sfl_sld] offset:25376
\n
"
" ds_read_b32 v12, %[v_sfl_sld] offset:25408
\n
"
" ds_read_b32 v13, %[v_sfl_sld] offset:25440
\n
"
" ds_read_b32 v14, %[v_sfl_sld] offset:29696
\n
"
" ds_read_b32 v15, %[v_sfl_sld] offset:29728
\n
"
" ds_read_b32 v16, %[v_sfl_sld] offset:29760
\n
"
" ds_read_b32 v17, %[v_sfl_sld] offset:29792
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_mov_b64 exec, %[s_execflag_0]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o0], v10, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_1]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o1], v11, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_2]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o2], v12, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_3]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o3], v13, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_4]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o4], v14, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_5]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o5], v15, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_6]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o6], v16, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_7]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o7], v17, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
#undef _UK_MFMA_
#undef _UK_PK_CVT_
#undef _UK_ATOMIC_ADD_
include/ck_tile/ops/fused_moe/kernel/fused_moegemm_shape.hpp
View file @
a75f162b
...
...
@@ -61,15 +61,16 @@ struct FusedMoeGemmShape
// TODO: we don't support half warps aound to 1 warp here
static_assert
(
NumWarps
==
reduce_on_sequence
(
WarpPerBlock_1
{},
multiplies
{},
number
<
1
>
{}));
static
constexpr
index_t
Block_M0
=
BlockTile_0
::
at
(
number
<
0
>
{});
static
constexpr
index_t
Block_N0
=
BlockTile_0
::
at
(
number
<
1
>
{});
static
constexpr
index_t
Block_K0
=
BlockTile_0
::
at
(
number
<
2
>
{});
static
constexpr
index_t
WarpPerBlock_M0
=
WarpPerBlock_0
::
at
(
number
<
0
>
{});
static
constexpr
index_t
WarpPerBlock_N0
=
WarpPerBlock_0
::
at
(
number
<
1
>
{});
static
constexpr
index_t
WarpPerBlock_K0
=
WarpPerBlock_0
::
at
(
number
<
2
>
{});
static
constexpr
index_t
Warp_M0
=
WarpTile_0
::
at
(
number
<
0
>
{});
static
constexpr
index_t
Warp_N0
=
WarpTile_0
::
at
(
number
<
1
>
{});
static
constexpr
index_t
Warp_K0
=
WarpTile_0
::
at
(
number
<
2
>
{});
// S<32, 512, 128>, S<1, 4, 1>, S<16, 16, 32>
static
constexpr
index_t
Block_M0
=
BlockTile_0
::
at
(
number
<
0
>
{});
//32
static
constexpr
index_t
Block_N0
=
BlockTile_0
::
at
(
number
<
1
>
{});
//512
static
constexpr
index_t
Block_K0
=
BlockTile_0
::
at
(
number
<
2
>
{});
// 128
static
constexpr
index_t
WarpPerBlock_M0
=
WarpPerBlock_0
::
at
(
number
<
0
>
{});
// 1
static
constexpr
index_t
WarpPerBlock_N0
=
WarpPerBlock_0
::
at
(
number
<
1
>
{});
// 4
static
constexpr
index_t
WarpPerBlock_K0
=
WarpPerBlock_0
::
at
(
number
<
2
>
{});
// 1
static
constexpr
index_t
Warp_M0
=
WarpTile_0
::
at
(
number
<
0
>
{});
// 16
static
constexpr
index_t
Warp_N0
=
WarpTile_0
::
at
(
number
<
1
>
{});
// 16
static
constexpr
index_t
Warp_K0
=
WarpTile_0
::
at
(
number
<
2
>
{});
// 32
static
constexpr
index_t
ThreadPerBlock_M0
=
Warp_M0
*
WarpPerBlock_M0
;
static
constexpr
index_t
ThreadPerBlock_N0
=
Warp_N0
*
WarpPerBlock_N0
;
...
...
@@ -77,19 +78,19 @@ struct FusedMoeGemmShape
static_assert
(
Block_M0
%
ThreadPerBlock_M0
==
0
);
static_assert
(
Block_N0
%
ThreadPerBlock_N0
==
0
);
static_assert
(
Block_K0
%
ThreadPerBlock_K0
==
0
);
static
constexpr
index_t
Repeat_M0
=
Block_M0
/
ThreadPerBlock_M0
;
static
constexpr
index_t
Repeat_N0
=
Block_N0
/
ThreadPerBlock_N0
;
static
constexpr
index_t
Repeat_K0
=
Block_K0
/
ThreadPerBlock_K0
;
static
constexpr
index_t
Repeat_M0
=
Block_M0
/
ThreadPerBlock_M0
;
// 2
static
constexpr
index_t
Repeat_N0
=
Block_N0
/
ThreadPerBlock_N0
;
// 8
static
constexpr
index_t
Repeat_K0
=
Block_K0
/
ThreadPerBlock_K0
;
// 4
static
constexpr
index_t
Block_M1
=
BlockTile_1
::
at
(
number
<
0
>
{});
static
constexpr
index_t
Block_N1
=
BlockTile_1
::
at
(
number
<
1
>
{});
static
constexpr
index_t
Block_K1
=
BlockTile_1
::
at
(
number
<
2
>
{});
static
constexpr
index_t
WarpPerBlock_M1
=
WarpPerBlock_1
::
at
(
number
<
0
>
{});
static
constexpr
index_t
WarpPerBlock_N1
=
WarpPerBlock_1
::
at
(
number
<
1
>
{});
static
constexpr
index_t
WarpPerBlock_K1
=
WarpPerBlock_1
::
at
(
number
<
2
>
{});
static
constexpr
index_t
Warp_M1
=
WarpTile_1
::
at
(
number
<
0
>
{});
static
constexpr
index_t
Warp_N1
=
WarpTile_1
::
at
(
number
<
1
>
{});
static
constexpr
index_t
Warp_K1
=
WarpTile_1
::
at
(
number
<
2
>
{});
static
constexpr
index_t
Block_M1
=
BlockTile_1
::
at
(
number
<
0
>
{});
//32
static
constexpr
index_t
Block_N1
=
BlockTile_1
::
at
(
number
<
1
>
{});
//128
static
constexpr
index_t
Block_K1
=
BlockTile_1
::
at
(
number
<
2
>
{});
//512
static
constexpr
index_t
WarpPerBlock_M1
=
WarpPerBlock_1
::
at
(
number
<
0
>
{});
// 1
static
constexpr
index_t
WarpPerBlock_N1
=
WarpPerBlock_1
::
at
(
number
<
1
>
{});
// 4
static
constexpr
index_t
WarpPerBlock_K1
=
WarpPerBlock_1
::
at
(
number
<
2
>
{});
// 1
static
constexpr
index_t
Warp_M1
=
WarpTile_1
::
at
(
number
<
0
>
{});
// 16
static
constexpr
index_t
Warp_N1
=
WarpTile_1
::
at
(
number
<
1
>
{});
// 16
static
constexpr
index_t
Warp_K1
=
WarpTile_1
::
at
(
number
<
2
>
{});
// 32
static
constexpr
index_t
ThreadPerBlock_M1
=
Warp_M1
*
WarpPerBlock_M1
;
static
constexpr
index_t
ThreadPerBlock_N1
=
Warp_N1
*
WarpPerBlock_N1
;
...
...
@@ -97,9 +98,9 @@ struct FusedMoeGemmShape
static_assert
(
Block_M1
%
ThreadPerBlock_M1
==
0
);
static_assert
(
Block_N1
%
ThreadPerBlock_N1
==
0
);
static_assert
(
Block_K1
%
ThreadPerBlock_K1
==
0
);
static
constexpr
index_t
Repeat_M1
=
Block_M1
/
ThreadPerBlock_M1
;
static
constexpr
index_t
Repeat_N1
=
Block_N1
/
ThreadPerBlock_N1
;
static
constexpr
index_t
Repeat_K1
=
Block_K1
/
ThreadPerBlock_K1
;
static
constexpr
index_t
Repeat_M1
=
Block_M1
/
ThreadPerBlock_M1
;
// 2
static
constexpr
index_t
Repeat_N1
=
Block_N1
/
ThreadPerBlock_N1
;
// 2
static
constexpr
index_t
Repeat_K1
=
Block_K1
/
ThreadPerBlock_K1
;
// 16
static
constexpr
index_t
BlockSize
=
warpSize
*
NumWarps
;
...
...
@@ -115,9 +116,9 @@ struct FusedMoeGemmShape
static
constexpr
index_t
Block_W0
=
Warp_N0
*
Warp_K0
;
static
constexpr
index_t
Block_Nr0
=
Block_N0
/
Warp_N0
;
static
constexpr
index_t
Block_Kr0
=
Block_K0
/
Warp_K0
;
static
constexpr
index_t
Block_W1
=
Warp_N1
*
Warp_K1
;
static
constexpr
index_t
Block_Nr1
=
Block_N1
/
Warp_N1
;
static
constexpr
index_t
Block_Kr1
=
Block_K1
/
Warp_K1
;
static
constexpr
index_t
Block_W1
=
Warp_N1
*
Warp_K1
;
// 512
static
constexpr
index_t
Block_Nr1
=
Block_N1
/
Warp_N1
;
// 8
static
constexpr
index_t
Block_Kr1
=
Block_K1
/
Warp_K1
;
// 16
static_assert
(
Block_W0
==
Block_W1
);
// static_assert(Block_Nr0 == Block_Kr1);
...
...
include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp
View file @
a75f162b
...
...
@@ -199,6 +199,8 @@ struct FusedMoeGemmPipeline_FlatmmUk
threadIdx
.
x
%
(
BlockShape
::
Block_K0
/
kAlignmentA
)
*
kAlignmentA
;
},
number
<
row_ids_a
.
size
()
>
{});
if
(
row_ids_a
[
0
]
>=
kargs
.
num_tokens
)
return
;
auto
a_res
=
make_wave_buffer_resource
(
reinterpret_cast
<
const
ADataType
*>
(
kargs
.
a_ptr
),
kargs
.
num_tokens
*
kargs
.
stride_token
*
sizeof
(
ADataType
));
...
...
@@ -238,7 +240,7 @@ struct FusedMoeGemmPipeline_FlatmmUk
const
auto
d_view_
=
make_naive_tensor_view
<
address_space_enum
::
global
>
(
d_ptr
,
make_tuple
(
nr_1
,
kr_1
,
BlockShape
::
Block_W1
),
make_tuple
(
nr_1
,
kr_1
,
BlockShape
::
Block_W1
),
// n/16, k/32, 512
make_tuple
(
kr_1
*
BlockShape
::
Block_W1
,
BlockShape
::
Block_W1
,
1
),
number
<
kAlignmentD
>
{},
number
<
1
>
{});
...
...
@@ -264,13 +266,13 @@ struct FusedMoeGemmPipeline_FlatmmUk
auto
d_coords
=
[
&
]()
{
constexpr
index_t
Nr_
=
2
;
constexpr
index_t
Nw_
=
4
;
constexpr
index_t
Kr0_
=
4
;
constexpr
index_t
Kr0_
=
BlockShape
::
Block_Kr1
/
Kr1_
;
//
4
constexpr
index_t
Kr1_
=
4
;
constexpr
index_t
Kl_
=
4
;
constexpr
index_t
Nl_
=
16
;
constexpr
index_t
Kv_
=
8
;
constexpr
index_t
W_
=
Kl_
*
Nl_
*
Kv_
;
constexpr
index_t
num_offsets_
=
Nr_
*
Kr0_
;
constexpr
index_t
W_
=
Kl_
*
Nl_
*
Kv_
;
// 512
constexpr
index_t
num_offsets_
=
Nr_
*
Kr0_
;
// 8
index_t
base_os_
=
(
threadIdx
.
x
%
64
)
*
Kv_
+
(
threadIdx
.
x
/
64
)
*
shared_intermediate_size_1
*
Nl_
;
// Kr0_ * Kr1_ * W_;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment