Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
fa335f31
Commit
fa335f31
authored
Jan 07, 2025
by
feifei14119
Browse files
remove debug 9.8 tflops
parent
888317e6
Changes
12
Show whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
3746 additions
and
0 deletions
+3746
-0
example/ck_tile/18_flatmm_uk/CMakeLists.txt
example/ck_tile/18_flatmm_uk/CMakeLists.txt
+19
-0
example/ck_tile/18_flatmm_uk/flatmm_uk.hpp
example/ck_tile/18_flatmm_uk/flatmm_uk.hpp
+100
-0
example/ck_tile/18_flatmm_uk/instances/flatmm_uk_api.cpp
example/ck_tile/18_flatmm_uk/instances/flatmm_uk_api.cpp
+192
-0
example/ck_tile/18_flatmm_uk/instances/flatmm_uk_api.hpp
example/ck_tile/18_flatmm_uk/instances/flatmm_uk_api.hpp
+76
-0
example/ck_tile/18_flatmm_uk/main.cpp
example/ck_tile/18_flatmm_uk/main.cpp
+692
-0
example/ck_tile/CMakeLists.txt
example/ck_tile/CMakeLists.txt
+1
-0
include/ck_tile/ops/flatmm/block/flatmm_ff_32x512x128_1x4x1_16x16x32.hpp
.../ops/flatmm/block/flatmm_ff_32x512x128_1x4x1_16x16x32.hpp
+665
-0
include/ck_tile/ops/flatmm/block/uk/flatmm_ff_uk_gfx9_32x512x128_1x1x1_16x16x16.inc
.../block/uk/flatmm_ff_uk_gfx9_32x512x128_1x1x1_16x16x16.inc
+574
-0
include/ck_tile/ops/flatmm_uk.hpp
include/ck_tile/ops/flatmm_uk.hpp
+17
-0
include/ck_tile/ops/fused_moe/kernel/flatmm_uk_kernel.hpp
include/ck_tile/ops/fused_moe/kernel/flatmm_uk_kernel.hpp
+264
-0
include/ck_tile/ops/fused_moe/pipeline/flatmm_uk_pipeline.hpp
...ude/ck_tile/ops/fused_moe/pipeline/flatmm_uk_pipeline.hpp
+337
-0
include/ck_tile/ops/fused_moe/pipeline/flatmm_uk_pipeline_policy.hpp
...tile/ops/fused_moe/pipeline/flatmm_uk_pipeline_policy.hpp
+809
-0
No files found.
example/ck_tile/18_flatmm_uk/CMakeLists.txt
0 → 100644
View file @
fa335f31
set
(
TILE_EXAPMLE_FLATMM_UK
"tile_example_flatmm_uk"
)
# not using add_example_executable() to add this target, since we don't want this to have
# to be included in "make all/install/check"
message
(
"adding
${
TILE_EXAPMLE_FLATMM_UK
}
"
)
file
(
GLOB INSTANCE_SRCS instances/*.cpp
)
add_executable
(
${
TILE_EXAPMLE_FLATMM_UK
}
EXCLUDE_FROM_ALL main.cpp
)
target_include_directories
(
${
TILE_EXAPMLE_FLATMM_UK
}
PRIVATE
${
CMAKE_CURRENT_LIST_DIR
}
)
target_sources
(
${
TILE_EXAPMLE_FLATMM_UK
}
PRIVATE
${
INSTANCE_SRCS
}
)
set
(
TILE_EXAPMLE_FLATMM_UK_COMPILE_OPTIONS
)
# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
list
(
APPEND TILE_EXAPMLE_FLATMM_UK_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal
)
list
(
APPEND TILE_EXAPMLE_FLATMM_UK_COMPILE_OPTIONS -DCK_TILE_BUFFER_LOAD_AGPR=1
)
# TODO: enable load to a
list
(
APPEND TILE_EXAPMLE_FLATMM_UK_COMPILE_OPTIONS -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=4
)
# rta
# list(APPEND TILE_EXAPMLE_FLATMM_UK_COMPILE_OPTIONS -mllvm -greedy-reverse-local-assignment=1)
# list(APPEND TILE_EXAPMLE_FLATMM_UK_COMPILE_OPTIONS -v --save-temps -Wno-gnu-line-marker)
target_compile_options
(
${
TILE_EXAPMLE_FLATMM_UK
}
PRIVATE
${
TILE_EXAPMLE_FLATMM_UK_COMPILE_OPTIONS
}
)
example/ck_tile/18_flatmm_uk/flatmm_uk.hpp
0 → 100644
View file @
fa335f31
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/kernel_launch.hpp"
#include "ck_tile/ops/flatmm_uk.hpp"
#include <string>
// this is only a convenient structure for creating an example
// this is not part of the host API
template
<
typename
I
,
typename
W
,
typename
O
,
typename
ST
,
typename
SW
,
typename
SQ
,
typename
KW
>
struct
FlatmmUkTypeConfig
;
template
<
typename
ST
,
typename
SW
,
typename
SQ
,
typename
KW
>
struct
FlatmmUkTypeConfig
<
ck_tile
::
bf16_t
,
ck_tile
::
bf16_t
,
ck_tile
::
bf16_t
,
ST
,
SW
,
SQ
,
KW
>
{
using
ADataType
=
ck_tile
::
bf16_t
;
using
GDataType
=
ck_tile
::
bf16_t
;
using
DDataType
=
ck_tile
::
bf16_t
;
using
AccDataType
=
float
;
using
ODataType
=
ck_tile
::
bf16_t
;
using
AScaleDataType
=
ck_tile
::
remove_cvref_t
<
ST
>
;
using
GScaleDataType
=
ck_tile
::
remove_cvref_t
<
SW
>
;
using
DScaleDataType
=
ck_tile
::
remove_cvref_t
<
SW
>
;
using
YSmoothScaleDataType
=
ck_tile
::
remove_cvref_t
<
SQ
>
;
using
TopkWeightDataType
=
ck_tile
::
remove_cvref_t
<
KW
>
;
using
IndexDataType
=
ck_tile
::
index_t
;
};
template
<
typename
ST
,
typename
SW
,
typename
SQ
,
typename
KW
>
struct
FlatmmUkTypeConfig
<
ck_tile
::
fp16_t
,
ck_tile
::
fp16_t
,
ck_tile
::
fp16_t
,
ST
,
SW
,
SQ
,
KW
>
{
using
ADataType
=
ck_tile
::
fp16_t
;
using
GDataType
=
ck_tile
::
fp16_t
;
using
DDataType
=
ck_tile
::
fp16_t
;
using
AccDataType
=
float
;
using
ODataType
=
ck_tile
::
fp16_t
;
using
AScaleDataType
=
ck_tile
::
remove_cvref_t
<
ST
>
;
using
GScaleDataType
=
ck_tile
::
remove_cvref_t
<
SW
>
;
using
DScaleDataType
=
ck_tile
::
remove_cvref_t
<
SW
>
;
using
YSmoothScaleDataType
=
ck_tile
::
remove_cvref_t
<
SQ
>
;
using
TopkWeightDataType
=
ck_tile
::
remove_cvref_t
<
KW
>
;
using
IndexDataType
=
ck_tile
::
index_t
;
};
template
<
typename
ST
,
typename
SW
,
typename
SQ
,
typename
KW
>
struct
FlatmmUkTypeConfig
<
ck_tile
::
int8_t
,
ck_tile
::
int8_t
,
ck_tile
::
bf16_t
,
ST
,
SW
,
SQ
,
KW
>
{
using
ADataType
=
ck_tile
::
int8_t
;
using
GDataType
=
ck_tile
::
int8_t
;
using
DDataType
=
ck_tile
::
int8_t
;
using
AccDataType
=
int32_t
;
using
ODataType
=
ck_tile
::
bf16_t
;
using
AScaleDataType
=
ck_tile
::
remove_cvref_t
<
ST
>
;
using
GScaleDataType
=
ck_tile
::
remove_cvref_t
<
SW
>
;
using
DScaleDataType
=
ck_tile
::
remove_cvref_t
<
SW
>
;
using
YSmoothScaleDataType
=
ck_tile
::
remove_cvref_t
<
SQ
>
;
using
TopkWeightDataType
=
ck_tile
::
remove_cvref_t
<
KW
>
;
using
IndexDataType
=
ck_tile
::
index_t
;
};
struct
flatmm_uk_args
{
const
void
*
a_ptr
;
// [m, k], input token
const
void
*
b_ptr
;
// [m, k], input token
const
void
*
c_ptr
;
// [m, k], output token (no need to do zeroing)
void
*
d_ptr
;
// [m, k], output token (no need to do zeroing)
void
*
dbg_int_ptr
;
// [m, k], output token (no need to do zeroing)
void
*
dbg_bf16_ptr
;
// [m, k], output token (no need to do zeroing)
void
*
dbg_fp32_ptr
;
// [m, k], output token (no need to do zeroing)
ck_tile
::
index_t
block_m
;
// block_m, used to devide the input
ck_tile
::
index_t
hidden_size
;
// k
ck_tile
::
index_t
intermediate_size
;
// n / TP, for Gate. if Gate+Up, Down need divide by 2
ck_tile
::
index_t
num_tokens
;
// input number of tokens for current iteration
ck_tile
::
index_t
num_experts
;
// number of groups
ck_tile
::
index_t
topk
;
// need this?
ck_tile
::
index_t
stride_token
;
// for input/output, stride for each row, should >= hidden_size
};
// This is the public API, will be generated by script
struct
flatmm_uk_traits
{
std
::
string
prec_i
;
// input precision
std
::
string
prec_w
;
// weight precision
std
::
string
prec_o
;
// output precision
std
::
string
prec_st
;
// token scale data type
std
::
string
prec_sw
;
// weight scale data type
std
::
string
prec_sq
;
// smooth quant scale
std
::
string
prec_kw
;
// topk-weight data type
int
block_m
;
int
gate_only
;
int
fused_quant
;
// 0:no-sweep, 1:smooth-dynamic-quant, 2:dynamic-quant
};
float
flatmm_uk
(
flatmm_uk_traits
,
flatmm_uk_args
,
const
ck_tile
::
stream_config
&
);
example/ck_tile/18_flatmm_uk/instances/flatmm_uk_api.cpp
0 → 100644
View file @
fa335f31
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "flatmm_uk.hpp"
#include "flatmm_uk_api.hpp"
#include "ck_tile/ops/flatmm_uk.hpp"
#include <iostream>
template
<
ck_tile
::
index_t
...
Is
>
using
S
=
ck_tile
::
sequence
<
Is
...
>
;
// do not the define of this tepmlate function inside the _api.cpp, otherwise will block make -j
template
<
typename
Ts_
>
float
flatmm_uk_
(
const
ck_tile
::
stream_config
&
s_
,
flatmm_uk_args_
a_
)
{
printf
(
"[FF] ======= fused_moegemm_() =======
\n
\t
get moe arg in a_ <flatmm_uk_args>, get "
"config in Ts_
\n
"
);
using
f_traits
=
ck_tile
::
FusedMoeGemmTraits
<
Ts_
::
GateOnly
,
Ts_
::
FusedQuant
==
1
,
1
/*atomic*/
>
;
using
f_shape
=
ck_tile
::
FusedMoeGemmShape
<
typename
Ts_
::
BlockTile_0
,
typename
Ts_
::
WarpPerBlock_0
,
typename
Ts_
::
WarpTile_0
,
typename
Ts_
::
BlockTile_1
,
typename
Ts_
::
WarpPerBlock_0
,
typename
Ts_
::
WarpTile_0
>
;
printf
(
"[FF] --- fused_moegemm_(): <FusedMoeGemmShape> ---
\n
"
);
printf
(
"[FF] f_shape::BlockSize = %d
\n
"
,
static_cast
<
uint32_t
>
(
f_shape
::
BlockSize
));
printf
(
"[FF] f_shape::NumWarps = %d
\n
"
,
static_cast
<
uint32_t
>
(
f_shape
::
NumWarps
));
printf
(
"[FF] ---------
\n
"
);
printf
(
"[FF] f_shape::Block_M0 = %d
\n
"
,
static_cast
<
uint32_t
>
(
f_shape
::
Block_M0
));
printf
(
"[FF] f_shape::Block_N0 = %d
\n
"
,
static_cast
<
uint32_t
>
(
f_shape
::
Block_N0
));
printf
(
"[FF] f_shape::Block_K0 = %d
\n
"
,
static_cast
<
uint32_t
>
(
f_shape
::
Block_K0
));
printf
(
"[FF] f_shape::WarpPerBlock_M0 = %d
\n
"
,
static_cast
<
uint32_t
>
(
f_shape
::
WarpPerBlock_M0
));
printf
(
"[FF] f_shape::WarpPerBlock_N0 = %d
\n
"
,
static_cast
<
uint32_t
>
(
f_shape
::
WarpPerBlock_N0
));
printf
(
"[FF] f_shape::WarpPerBlock_K0 = %d
\n
"
,
static_cast
<
uint32_t
>
(
f_shape
::
WarpPerBlock_K0
));
printf
(
"[FF] f_shape::Warp_M0 = %d
\n
"
,
static_cast
<
uint32_t
>
(
f_shape
::
Warp_M0
));
printf
(
"[FF] f_shape::Warp_N0 = %d
\n
"
,
static_cast
<
uint32_t
>
(
f_shape
::
Warp_N0
));
printf
(
"[FF] f_shape::Warp_K0 = %d
\n
"
,
static_cast
<
uint32_t
>
(
f_shape
::
Warp_K0
));
printf
(
"[FF] f_shape::ThreadPerBlock_M0 = %d
\n
"
,
static_cast
<
uint32_t
>
(
f_shape
::
ThreadPerBlock_M0
));
printf
(
"[FF] f_shape::ThreadPerBlock_N0 = %d
\n
"
,
static_cast
<
uint32_t
>
(
f_shape
::
ThreadPerBlock_N0
));
printf
(
"[FF] f_shape::ThreadPerBlock_K0 = %d
\n
"
,
static_cast
<
uint32_t
>
(
f_shape
::
ThreadPerBlock_K0
));
printf
(
"[FF] f_shape::Repeat_M0 = %d
\n
"
,
static_cast
<
uint32_t
>
(
f_shape
::
Repeat_M0
));
printf
(
"[FF] f_shape::Repeat_N0 = %d
\n
"
,
static_cast
<
uint32_t
>
(
f_shape
::
Repeat_N0
));
printf
(
"[FF] f_shape::Repeat_K0 = %d
\n
"
,
static_cast
<
uint32_t
>
(
f_shape
::
Repeat_K0
));
printf
(
"[FF] f_shape::Block_W0 = %d
\n
"
,
static_cast
<
uint32_t
>
(
f_shape
::
Block_W0
));
printf
(
"[FF] f_shape::Block_Nr0 = %d
\n
"
,
static_cast
<
uint32_t
>
(
f_shape
::
Block_Nr0
));
printf
(
"[FF] f_shape::Block_Kr0 = %d
\n
"
,
static_cast
<
uint32_t
>
(
f_shape
::
Block_Kr0
));
printf
(
"[FF] ---------
\n
"
);
printf
(
"[FF] f_shape::Block_M1 = %d
\n
"
,
static_cast
<
uint32_t
>
(
f_shape
::
Block_M1
));
printf
(
"[FF] f_shape::Block_N1 = %d
\n
"
,
static_cast
<
uint32_t
>
(
f_shape
::
Block_N1
));
printf
(
"[FF] f_shape::Block_K1 = %d
\n
"
,
static_cast
<
uint32_t
>
(
f_shape
::
Block_K1
));
printf
(
"[FF] f_shape::WarpPerBlock_M1 = %d
\n
"
,
static_cast
<
uint32_t
>
(
f_shape
::
WarpPerBlock_M1
));
printf
(
"[FF] f_shape::WarpPerBlock_N1 = %d
\n
"
,
static_cast
<
uint32_t
>
(
f_shape
::
WarpPerBlock_N1
));
printf
(
"[FF] f_shape::WarpPerBlock_K1 = %d
\n
"
,
static_cast
<
uint32_t
>
(
f_shape
::
WarpPerBlock_K1
));
printf
(
"[FF] f_shape::Warp_M1 = %d
\n
"
,
static_cast
<
uint32_t
>
(
f_shape
::
Warp_M1
));
printf
(
"[FF] f_shape::Warp_N1 = %d
\n
"
,
static_cast
<
uint32_t
>
(
f_shape
::
Warp_N1
));
printf
(
"[FF] f_shape::Warp_K1 = %d
\n
"
,
static_cast
<
uint32_t
>
(
f_shape
::
Warp_K1
));
printf
(
"[FF] f_shape::ThreadPerBlock_M1 = %d
\n
"
,
static_cast
<
uint32_t
>
(
f_shape
::
ThreadPerBlock_M1
));
printf
(
"[FF] f_shape::ThreadPerBlock_N1 = %d
\n
"
,
static_cast
<
uint32_t
>
(
f_shape
::
ThreadPerBlock_N1
));
printf
(
"[FF] f_shape::ThreadPerBlock_K1 = %d
\n
"
,
static_cast
<
uint32_t
>
(
f_shape
::
ThreadPerBlock_K1
));
printf
(
"[FF] f_shape::Repeat_M1 = %d
\n
"
,
static_cast
<
uint32_t
>
(
f_shape
::
Repeat_M1
));
printf
(
"[FF] f_shape::Repeat_N1 = %d
\n
"
,
static_cast
<
uint32_t
>
(
f_shape
::
Repeat_N1
));
printf
(
"[FF] f_shape::Repeat_K1 = %d
\n
"
,
static_cast
<
uint32_t
>
(
f_shape
::
Repeat_K1
));
printf
(
"[FF] f_shape::Block_W1 = %d
\n
"
,
static_cast
<
uint32_t
>
(
f_shape
::
Block_W1
));
printf
(
"[FF] f_shape::Block_Nr1 = %d
\n
"
,
static_cast
<
uint32_t
>
(
f_shape
::
Block_Nr1
));
printf
(
"[FF] f_shape::Block_Kr1 = %d
\n
"
,
static_cast
<
uint32_t
>
(
f_shape
::
Block_Kr1
));
using
f_problem
=
ck_tile
::
FusedMoeGemmPipelineProblem
<
typename
Ts_
::
ADataType
,
typename
Ts_
::
GDataType
,
typename
Ts_
::
DDataType
,
typename
Ts_
::
AccDataType
,
typename
Ts_
::
ODataType
,
typename
Ts_
::
AScaleDataType
,
typename
Ts_
::
GScaleDataType
,
typename
Ts_
::
DScaleDataType
,
typename
Ts_
::
YSmoothScaleDataType
,
typename
Ts_
::
TopkWeightDataType
,
typename
Ts_
::
IndexDataType
,
ck_tile
::
element_wise
::
FastGeluAsm
,
// TODO: hardcoded
f_shape
,
f_traits
>
;
// using f_pipeline = ck_tile::FusedMoeGemmPipeline_FlatmmEx<f_problem>;
using
f_pipeline
=
ck_tile
::
GemmPipeline_FlatmmUk
<
f_problem
>
;
using
f_kernel
=
ck_tile
::
FlatmmUkKernel
<
f_pipeline
,
void
>
;
const
dim3
grids
=
f_kernel
::
GridSize
(
a_
);
constexpr
dim3
blocks
=
f_kernel
::
BlockSize
();
constexpr
ck_tile
::
index_t
kBlockPerCu
=
1
;
printf
(
"[FF] grids = [%d, %d, %d]
\n
"
,
grids
.
x
,
grids
.
y
,
grids
.
z
);
printf
(
"[FF] blocks = [%d, %d, %d]
\n
"
,
blocks
.
x
,
blocks
.
y
,
blocks
.
z
);
static
int
printed
=
0
;
auto
kargs
=
f_kernel
::
MakeKargs
(
a_
);
f_kernel
kernel
{};
auto
lambda_kenrel
=
ck_tile
::
make_kernel
<
blocks
.
x
,
kBlockPerCu
>
(
kernel
,
grids
,
blocks
,
0
,
kargs
);
if
(
s_
.
log_level_
>
0
&&
printed
==
10
)
{
// std::cout << ", " << f_kernel::GetName() << std::flush;
printed
=
1
;
}
return
ck_tile
::
launch_kernel
(
s_
,
lambda_kenrel
// ck_tile::make_kernel<blocks.x, kBlockPerCu>(f_kernel{}, grids, blocks, 0, kargs)
);
}
float
flatmm_uk
(
flatmm_uk_traits
t
,
flatmm_uk_args
a
,
const
ck_tile
::
stream_config
&
s
)
{
// auto s_ = ck_tile::stream_config{s.stream_id_, false, s.log_level_, 0, 1};
auto
s_
=
s
;
auto
t_
=
flatmm_uk_traits_
{
t
.
prec_i
,
t
.
prec_w
,
t
.
prec_o
,
t
.
prec_st
,
t
.
prec_sw
,
t
.
prec_sq
,
t
.
prec_kw
,
t
.
block_m
,
t
.
gate_only
,
t
.
fused_quant
};
auto
a_
=
flatmm_uk_args_
{
a
.
a_ptr
,
// const void* a_ptr;
a
.
b_ptr
,
// const void* a_ptr;
a
.
c_ptr
,
// void* o_ptr;
a
.
d_ptr
,
// void* o_ptr;
a
.
dbg_int_ptr
,
a
.
dbg_bf16_ptr
,
a
.
dbg_fp32_ptr
,
a
.
hidden_size
,
// index_t hidden_size;
a
.
intermediate_size
,
// index_t intermediate_size;
a
.
num_tokens
,
// index_t num_tokens;
a
.
num_experts
,
// index_t num_experts;
a
.
topk
,
// index_t topk;
a
.
stride_token
// index_t stride_token;
};
float
r
=
-
1
;
if
(
t_
.
prec_i
==
"bf16"
&&
t_
.
prec_w
==
"bf16"
&&
t_
.
prec_o
==
"bf16"
&&
t_
.
prec_st
==
"fp32"
&&
t_
.
prec_sw
==
"fp32"
&&
t_
.
prec_sq
==
"fp32"
&&
t_
.
prec_kw
==
"fp32"
&&
t_
.
block_m
==
32
&&
t_
.
gate_only
==
1
)
{
using
t_
=
fmoe_
<
ck_tile
::
bf16_t
,
ck_tile
::
bf16_t
,
ck_tile
::
bf16_t
,
float
,
float
,
float
,
float
,
S
<
32
,
512
,
128
,
128
>
,
S
<
1
,
4
,
1
>
,
S
<
16
,
16
,
32
>
,
1
,
0
>
;
r
=
flatmm_uk_
<
t_
>
(
s_
,
a_
);
}
else
if
(
t_
.
prec_i
==
"fp16"
&&
t_
.
prec_w
==
"fp16"
&&
t_
.
prec_o
==
"fp16"
&&
t_
.
prec_st
==
"fp32"
&&
t_
.
prec_sw
==
"fp32"
&&
t_
.
prec_sq
==
"fp32"
&&
t_
.
prec_kw
==
"fp32"
&&
t_
.
block_m
==
32
&&
t_
.
gate_only
==
1
)
{
using
t_
=
fmoe_
<
ck_tile
::
fp16_t
,
ck_tile
::
fp16_t
,
ck_tile
::
fp16_t
,
float
,
float
,
float
,
float
,
S
<
32
,
512
,
128
,
128
>
,
S
<
1
,
4
,
1
>
,
S
<
16
,
16
,
32
>
,
1
,
0
>
;
r
=
flatmm_uk_
<
t_
>
(
s_
,
a_
);
}
// keep unsupported case return negative
if
(
r
<
0
)
return
-
1
;
return
r
;
}
example/ck_tile/18_flatmm_uk/instances/flatmm_uk_api.hpp
0 → 100644
View file @
fa335f31
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/kernel_launch.hpp"
#include "ck_tile/ops/flatmm_uk.hpp"
#include <string>
// runtime args
struct
flatmm_uk_args_
:
public
ck_tile
::
FlatmmUkHostArgs
{
};
// This is the public API, will be generated by script
struct
flatmm_uk_traits_
{
std
::
string
prec_i
;
// input precision
std
::
string
prec_w
;
// weight precision
std
::
string
prec_o
;
// output precision
std
::
string
prec_st
;
// token scale data type
std
::
string
prec_sw
;
// weight scale data type
std
::
string
prec_sq
;
// smooth quant scale
std
::
string
prec_kw
;
// topk-weight data type
int
block_m
;
int
gate_only
;
int
fused_quant
;
// 0:no-sweep, 1:smooth-dynamic-quant, 2:dynamic-quant
};
// this is used to pattern-match internl kernel implementation, not to instantiate kernel
template
<
typename
I
,
typename
W
,
typename
O
,
typename
ST
,
typename
SW
,
typename
SQ
,
typename
KW
,
typename
BlockTIle_
,
// seq<b_token, b_interm, b_hidden, b_down>
typename
WarpPerBlock_
,
typename
WarpTile_
,
// seq<*,*,*>, used to select mfma
ck_tile
::
index_t
GateOnly_
=
0
,
ck_tile
::
index_t
FusedQuant_
=
0
>
struct
fmoe_
// traits, ugly name, only used for internal
{
using
TypeConfig
=
FlatmmUkTypeConfig
<
I
,
W
,
O
,
ST
,
SW
,
SQ
,
KW
>
;
using
ADataType
=
ck_tile
::
remove_cvref_t
<
typename
TypeConfig
::
ADataType
>
;
using
GDataType
=
ck_tile
::
remove_cvref_t
<
typename
TypeConfig
::
GDataType
>
;
using
DDataType
=
ck_tile
::
remove_cvref_t
<
typename
TypeConfig
::
DDataType
>
;
using
AccDataType
=
ck_tile
::
remove_cvref_t
<
typename
TypeConfig
::
AccDataType
>
;
using
ODataType
=
ck_tile
::
remove_cvref_t
<
typename
TypeConfig
::
ODataType
>
;
using
AScaleDataType
=
ck_tile
::
remove_cvref_t
<
typename
TypeConfig
::
AScaleDataType
>
;
using
GScaleDataType
=
ck_tile
::
remove_cvref_t
<
typename
TypeConfig
::
GScaleDataType
>
;
using
DScaleDataType
=
ck_tile
::
remove_cvref_t
<
typename
TypeConfig
::
DScaleDataType
>
;
using
YSmoothScaleDataType
=
ck_tile
::
remove_cvref_t
<
typename
TypeConfig
::
YSmoothScaleDataType
>
;
using
TopkWeightDataType
=
ck_tile
::
remove_cvref_t
<
typename
TypeConfig
::
TopkWeightDataType
>
;
using
IndexDataType
=
ck_tile
::
remove_cvref_t
<
typename
TypeConfig
::
IndexDataType
>
;
static
constexpr
ck_tile
::
index_t
BT_
=
BlockTIle_
::
at
(
ck_tile
::
number
<
0
>
{});
// block token
static
constexpr
ck_tile
::
index_t
BI_
=
BlockTIle_
::
at
(
ck_tile
::
number
<
1
>
{});
// block intermediate
static
constexpr
ck_tile
::
index_t
BH_
=
BlockTIle_
::
at
(
ck_tile
::
number
<
2
>
{});
// block hidden
static
constexpr
ck_tile
::
index_t
BD_
=
BlockTIle_
::
at
(
ck_tile
::
number
<
3
>
{});
// block down
using
BlockTile_0
=
ck_tile
::
sequence
<
BT_
,
BI_
,
BH_
>
;
using
WarpPerBlock_0
=
ck_tile
::
remove_cvref_t
<
WarpPerBlock_
>
;
using
WarpTile_0
=
ck_tile
::
remove_cvref_t
<
WarpTile_
>
;
using
BlockTile_1
=
ck_tile
::
sequence
<
BT_
,
BD_
,
BI_
/
(
GateOnly_
?
1
:
2
)
>
;
using
WarpPerBlock_1
=
ck_tile
::
remove_cvref_t
<
WarpPerBlock_
>
;
using
WarpTile_1
=
ck_tile
::
remove_cvref_t
<
WarpTile_
>
;
static
constexpr
ck_tile
::
index_t
GateOnly
=
GateOnly_
;
static
constexpr
ck_tile
::
index_t
FusedQuant
=
FusedQuant_
;
};
example/ck_tile/18_flatmm_uk/main.cpp
0 → 100644
View file @
fa335f31
#include <algorithm>
#include <cstring>
#include <unordered_set>
#include <vector>
#include <set>
#include "ck_tile/host.hpp"
#include "flatmm_uk.hpp"
// different threshold for different dtype
template
<
typename
DataType
>
auto
get_elimit
()
{
double
rtol
=
1e-2
;
double
atol
=
1e-2
;
return
ck_tile
::
make_tuple
(
rtol
,
atol
);
}
template
<
>
auto
get_elimit
<
ck_tile
::
bf16_t
>
()
{
double
rtol
=
1e-2
;
double
atol
=
1e-2
;
return
ck_tile
::
make_tuple
(
rtol
,
atol
);
}
template
<
typename
ADataType
,
typename
BDataType
,
typename
AccDataType
,
typename
CDataType
,
typename
AElementOp
=
ck_tile
::
identity
,
typename
BElementOp
=
ck_tile
::
identity
,
typename
ACCElementOp
=
ck_tile
::
identity
>
CK_TILE_HOST
void
my_reference_gemm
(
const
ck_tile
::
HostTensor
<
ADataType
>&
a_m_k
,
const
ck_tile
::
HostTensor
<
BDataType
>&
b_k_n
,
ck_tile
::
HostTensor
<
CDataType
>&
c_m_n
,
float
t
,
const
AElementOp
&
a_element_op
=
{},
const
BElementOp
&
b_element_op
=
{},
const
ACCElementOp
&
acc_element_op
=
{})
{
const
std
::
size_t
M
=
a_m_k
.
get_length
(
0
);
const
std
::
size_t
N
=
b_k_n
.
get_length
(
0
);
const
std
::
size_t
K
=
a_m_k
.
get_length
(
1
);
printf
(
"[REF] M = %zu, N = %zu, K = %zu
\n
"
,
M
,
N
,
K
);
auto
cal_tflops
=
[
&
](
auto
ms
)
{
double
flop_gemm
=
2.0
*
M
*
N
*
K
;
return
(
flop_gemm
)
/
(
static_cast
<
double
>
(
ms
)
*
1e-3
)
/
1e12
;
};
auto
cal_tbps
=
[
&
](
auto
ms
)
{
double
a_bytes
=
static_cast
<
double
>
(
M
)
*
K
*
sizeof
(
ADataType
);
double
b_bytes
=
static_cast
<
double
>
(
N
)
*
K
*
sizeof
(
BDataType
);
double
o_bytes
=
static_cast
<
double
>
(
M
)
*
N
*
sizeof
(
CDataType
);
return
(
a_bytes
+
b_bytes
+
o_bytes
)
/
(
static_cast
<
double
>
(
ms
)
*
1e-3
)
/
1e12
;
};
std
::
cout
<<
", "
<<
t
*
1.E3
<<
" us, "
<<
cal_tflops
(
t
)
<<
" tflops, "
<<
cal_tbps
(
t
)
<<
" TB/s"
<<
std
::
endl
<<
std
::
flush
;
auto
f_mn
=
[
&
](
auto
m
,
auto
n
)
{
AccDataType
v_acc
=
0
;
for
(
std
::
size_t
k
=
0
;
k
<
K
;
++
k
)
{
ADataType
v_a
=
a_element_op
(
a_m_k
(
m
,
k
));
BDataType
v_b
=
b_element_op
(
b_k_n
(
n
,
k
));
v_acc
+=
ck_tile
::
type_convert
<
AccDataType
>
(
v_a
)
*
ck_tile
::
type_convert
<
AccDataType
>
(
v_b
);
}
c_m_n
(
m
,
n
)
=
ck_tile
::
type_convert
<
CDataType
>
(
acc_element_op
(
v_acc
));
};
ck_tile
::
make_ParallelTensorFunctor
(
f_mn
,
M
,
N
)(
std
::
thread
::
hardware_concurrency
());
}
// mfma_type, 0:32x32, 1:16x16
// TODO: padding?
template
<
typename
T
>
auto
shuffle_moe_weight
(
const
ck_tile
::
HostTensor
<
T
>&
t
,
std
::
string
mfma_dtype
,
int
mfma_type
=
0
)
{
assert
(
t
.
get_lengths
().
size
()
==
3
);
int
b_
=
t
.
get_lengths
()[
0
];
int
n_
=
t
.
get_lengths
()[
1
];
int
k_
=
t
.
get_lengths
()[
2
];
if
((
mfma_dtype
==
"bf16"
||
mfma_dtype
==
"fp16"
)
&&
mfma_type
==
0
)
{
ck_tile
::
HostTensor
<
T
>
t_view
({
b_
,
n_
/
32
,
32
,
k_
/
16
,
2
,
8
});
std
::
copy
(
t
.
begin
(),
t
.
end
(),
t_view
.
begin
());
return
ck_tile
::
reference_permute
(
t_view
,
{
0
,
1
,
3
,
4
,
2
,
5
});
}
else
if
((
mfma_dtype
==
"bf16"
||
mfma_dtype
==
"fp16"
)
&&
mfma_type
==
1
)
{
ck_tile
::
HostTensor
<
T
>
t_view
({
b_
,
n_
/
16
,
16
,
k_
/
32
,
4
,
8
});
std
::
copy
(
t
.
begin
(),
t
.
end
(),
t_view
.
begin
());
return
ck_tile
::
reference_permute
(
t_view
,
{
0
,
1
,
3
,
4
,
2
,
5
});
}
else
if
((
mfma_dtype
==
"int8"
||
mfma_dtype
==
"fp8"
)
&&
mfma_type
==
0
)
{
ck_tile
::
HostTensor
<
T
>
t_view
({
b_
,
n_
/
32
,
32
,
k_
/
32
,
2
,
16
});
std
::
copy
(
t
.
begin
(),
t
.
end
(),
t_view
.
begin
());
return
ck_tile
::
reference_permute
(
t_view
,
{
0
,
1
,
3
,
4
,
2
,
5
});
}
else
if
((
mfma_dtype
==
"int8"
||
mfma_dtype
==
"fp8"
)
&&
mfma_type
==
1
)
{
ck_tile
::
HostTensor
<
T
>
t_view
({
b_
,
n_
/
16
,
16
,
k_
/
64
,
4
,
16
});
std
::
copy
(
t
.
begin
(),
t
.
end
(),
t_view
.
begin
());
return
ck_tile
::
reference_permute
(
t_view
,
{
0
,
1
,
3
,
4
,
2
,
5
});
}
return
t
;
}
template
<
typename
T
>
auto
shuffle_weight
(
const
ck_tile
::
HostTensor
<
T
>&
t
,
std
::
string
mfma_dtype
,
int
mfma_type
=
0
)
{
assert
(
t
.
get_lengths
().
size
()
==
2
);
int
n_
=
t
.
get_lengths
()[
0
];
int
k_
=
t
.
get_lengths
()[
1
];
if
((
mfma_dtype
==
"bf16"
||
mfma_dtype
==
"fp16"
)
&&
mfma_type
==
0
)
{
ck_tile
::
HostTensor
<
T
>
t_view
({
n_
/
32
,
32
,
k_
/
16
,
2
,
8
});
std
::
copy
(
t
.
begin
(),
t
.
end
(),
t_view
.
begin
());
return
ck_tile
::
reference_permute
(
t_view
,
{
0
,
2
,
3
,
1
,
4
});
}
else
if
((
mfma_dtype
==
"bf16"
||
mfma_dtype
==
"fp16"
)
&&
mfma_type
==
1
)
{
ck_tile
::
HostTensor
<
T
>
t_view
({
n_
/
16
,
16
,
k_
/
32
,
4
,
8
});
printf
(
"[FF] permute: n_ = %d, k_ = %d, n_/16 = %d, k_/32 = %d
\n
"
,
n_
,
k_
,
n_
/
16
,
k_
/
32
);
std
::
copy
(
t
.
begin
(),
t
.
end
(),
t_view
.
begin
());
return
ck_tile
::
reference_permute
(
t_view
,
{
0
,
2
,
3
,
1
,
4
});
}
else
if
((
mfma_dtype
==
"int8"
||
mfma_dtype
==
"fp8"
)
&&
mfma_type
==
0
)
{
ck_tile
::
HostTensor
<
T
>
t_view
({
n_
/
32
,
32
,
k_
/
32
,
2
,
16
});
std
::
copy
(
t
.
begin
(),
t
.
end
(),
t_view
.
begin
());
return
ck_tile
::
reference_permute
(
t_view
,
{
0
,
2
,
3
,
1
,
4
});
}
else
if
((
mfma_dtype
==
"int8"
||
mfma_dtype
==
"fp8"
)
&&
mfma_type
==
1
)
{
ck_tile
::
HostTensor
<
T
>
t_view
({
n_
/
16
,
16
,
k_
/
64
,
4
,
16
});
std
::
copy
(
t
.
begin
(),
t
.
end
(),
t_view
.
begin
());
return
ck_tile
::
reference_permute
(
t_view
,
{
0
,
2
,
3
,
1
,
4
});
}
return
t
;
}
auto
create_args
(
int
argc
,
char
*
argv
[])
{
ck_tile
::
ArgParser
arg_parser
;
arg_parser
.
insert
(
"m"
,
"64"
,
"num of m"
)
.
insert
(
"n"
,
"1024"
,
"num of n"
)
.
insert
(
"k"
,
"8192"
,
"num of k"
)
.
insert
(
"t"
,
"64"
,
"num input tokens"
)
.
insert
(
"e"
,
"8"
,
"num of experts"
)
.
insert
(
"tk"
,
"1"
,
"topk"
)
.
insert
(
"h"
,
"4096"
,
"hidden_size of this model"
)
.
insert
(
"i"
,
"4096"
,
"intermediate_size between 2 gemms of FFN"
)
.
insert
(
"stride"
,
"-1"
,
"stride per row, if -1 then equal to hidden_size"
)
.
insert
(
"bm"
,
"32"
,
"blocking factor for sorted tokens"
)
.
insert
(
"tp"
,
"8"
,
"tensor parallel size"
)
.
insert
(
"v"
,
"1"
,
"cpu validation or not"
)
.
insert
(
"kname"
,
"1"
,
"print kernel name or not"
)
.
insert
(
"prec_i"
,
"bf16"
,
"input precision"
)
.
insert
(
"prec_w"
,
"bf16"
,
"weight precision"
)
.
insert
(
"prec_o"
,
"bf16"
,
"output precision"
)
.
insert
(
"prec_st"
,
"auto"
,
"token scale data type. auto will set to fp32"
)
.
insert
(
"prec_sw"
,
"auto"
,
"weight scale data type. auto will set to fp32"
)
.
insert
(
"prec_sq"
,
"auto"
,
"(dynamic) smooth quant data type. auto will set to fp32"
)
.
insert
(
"prec_kw"
,
"auto"
,
"topk-weight data type. auto will set to fp32"
)
.
insert
(
"fquant"
,
"0"
,
"fused-quant, 0:no, 1:smooth-dynamic-quant, 2:dynamic-quant"
)
.
insert
(
"gate_only"
,
"1"
,
"w0(gate/up) style, 0:gate+up will double interm size, 1:only gate"
)
.
insert
(
"api"
,
"0"
,
"benchmark api set: 0:fused-moe(moe-gemm+moe-sorting), 1:moe-gemm"
)
.
insert
(
"balance"
,
"0"
,
"if set to 1, will try balance the expert in topk-ids(convenient for testing)"
)
.
insert
(
"init"
,
"2"
,
"init method. 0:random stepped float(fast). 1: random uniform, 2:rand normalized"
"normalized(slow)"
)
.
insert
(
"seed"
,
"11939"
,
"seed used to do random"
)
.
insert
(
"warmup"
,
"1"
,
"cold iter"
)
.
insert
(
"repeat"
,
"4"
,
"hot iter"
);
bool
result
=
arg_parser
.
parse
(
argc
,
argv
);
return
std
::
make_tuple
(
result
,
arg_parser
);
}
// I:input-type, W:weight-type, O:output-type, ST:toke-scale-tpye, SW:weight-scale-type,
// SQ:smooth-quant-type, KW:topk-weight-type
template
<
typename
I
,
typename
W
,
typename
O
,
typename
ST
,
typename
SW
,
typename
SQ
,
typename
KW
>
bool
run
(
const
ck_tile
::
ArgParser
&
arg_parser
)
{
ck_tile
::
index_t
M
=
arg_parser
.
get_int
(
"m"
);
ck_tile
::
index_t
N
=
arg_parser
.
get_int
(
"n"
);
ck_tile
::
index_t
K
=
arg_parser
.
get_int
(
"k"
);
printf
(
"[FF] M = %d, N = %d, K = %d
\n
"
,
M
,
N
,
K
);
ck_tile
::
index_t
experts
=
arg_parser
.
get_int
(
"e"
);
ck_tile
::
index_t
topk
=
arg_parser
.
get_int
(
"tk"
);
ck_tile
::
index_t
stride
=
arg_parser
.
get_int
(
"stride"
);
ck_tile
::
index_t
block_m
=
arg_parser
.
get_int
(
"bm"
);
std
::
string
prec_i
=
arg_parser
.
get_str
(
"prec_i"
);
std
::
string
prec_w
=
arg_parser
.
get_str
(
"prec_w"
);
std
::
string
prec_o
=
arg_parser
.
get_str
(
"prec_o"
);
std
::
string
prec_st
=
arg_parser
.
get_str
(
"prec_st"
);
std
::
string
prec_sw
=
arg_parser
.
get_str
(
"prec_sw"
);
std
::
string
prec_sq
=
arg_parser
.
get_str
(
"prec_sq"
);
std
::
string
prec_kw
=
arg_parser
.
get_str
(
"prec_kw"
);
prec_st
=
(
prec_st
==
"auto"
)
?
"fp32"
:
prec_st
;
prec_sw
=
(
prec_sw
==
"auto"
)
?
"fp32"
:
prec_sw
;
prec_sq
=
(
prec_sq
==
"auto"
)
?
"fp32"
:
prec_sq
;
prec_kw
=
(
prec_kw
==
"auto"
)
?
"fp32"
:
prec_kw
;
int
kname
=
arg_parser
.
get_int
(
"kname"
);
int
do_validation
=
arg_parser
.
get_int
(
"v"
);
int
warmup
=
arg_parser
.
get_int
(
"warmup"
);
int
repeat
=
arg_parser
.
get_int
(
"repeat"
);
int
fused_quant
=
arg_parser
.
get_int
(
"fquant"
);
int
gate_only
=
arg_parser
.
get_int
(
"gate_only"
);
int
init
=
arg_parser
.
get_int
(
"init"
);
uint32_t
seed
=
arg_parser
.
get_uint32
(
"seed"
);
using
TypeConfig
=
FlatmmUkTypeConfig
<
I
,
W
,
O
,
ST
,
SW
,
SQ
,
KW
>
;
using
ADataType
=
typename
TypeConfig
::
ADataType
;
using
BDataType
=
ADataType
;
using
AccDataType
=
typename
TypeConfig
::
AccDataType
;
using
CDataType
=
AccDataType
;
using
DDataType
=
AccDataType
;
// host verify
ck_tile
::
HostTensor
<
ADataType
>
a_host
({
M
,
K
});
ck_tile
::
HostTensor
<
BDataType
>
b_host
({
N
,
K
});
ck_tile
::
HostTensor
<
CDataType
>
c_host
({
M
,
N
});
ck_tile
::
HostTensor
<
DDataType
>
d_host
({
M
,
N
});
ck_tile
::
HostTensor
<
int
>
dbg_int
({
M
*
N
,
K
});
ck_tile
::
HostTensor
<
float
>
dbg_fp32
({
M
*
N
,
K
});
ck_tile
::
HostTensor
<
ck_tile
::
bf16_t
>
dbg_bf16
({
M
*
N
,
K
});
if
(
init
==
0
)
{
ck_tile
::
FillStepRange
<
ADataType
>
{
-
.5
f
,
.5
f
,
0.01
f
}(
a_host
);
ck_tile
::
FillStepRange
<
BDataType
>
{
-
.5
f
,
.5
f
,
0.01
f
}(
b_host
);
}
else
if
(
init
==
1
)
{
ck_tile
::
FillUniformDistribution
<
ADataType
>
{
-
.5
f
,
.5
f
,
seed
,
true
}(
a_host
);
ck_tile
::
FillUniformDistribution
<
BDataType
>
{
-
.5
f
,
.5
f
,
seed
,
true
}(
b_host
);
}
else
if
(
init
==
2
)
{
ck_tile
::
FillNormalDistribution
<
ADataType
>
{
0.
f
,
1.
f
,
seed
,
true
}(
a_host
);
ck_tile
::
FillNormalDistribution
<
BDataType
>
{
0.
f
,
1.
f
,
seed
,
true
}(
b_host
);
}
/*
// a_host
{
int X = static_cast<int>(K);
int Y = static_cast<int>(M);
for(int y = 0; y < Y; y++)
{
for(int x = 0; x < X; x++)
{
int idx = X * y + x;
a_host.mData[idx] = ck_tile::type_convert<ADataType>(x * 1.0f);
//b_host.mData[idx] = ck_tile::type_convert<GDataType>(y * 1.0f);
//b_host.mData[idx] = ck_tile::type_convert<GDataType>(y*1.f + x * 0.0001f);
}
}
}
// b_host
{
int X = static_cast<int>(K);
int Y = static_cast<int>(N);
for(int y = 0; y < Y; y++)
{
for(int x = 0; x < X; x++)
{
int idx = X * y + x;
b_host.mData[idx] = ck_tile::type_convert<GDataType>(idx * 1.0f);
//b_host.mData[idx] = ck_tile::type_convert<GDataType>(y * 1.0f);
//b_host.mData[idx] = ck_tile::type_convert<GDataType>(y*1.f + x * 0.0001f);
}
}
}*/
// permute weight
ck_tile
::
HostTensor
<
BDataType
>
b_perm_host
=
shuffle_weight
(
b_host
,
prec_w
,
1
);
ck_tile
::
DeviceMem
a_buf
(
a_host
);
ck_tile
::
DeviceMem
b_buf
(
b_perm_host
);
// b_host -> b_perm_host
ck_tile
::
DeviceMem
c_buf
(
c_host
);
ck_tile
::
DeviceMem
d_buf
(
d_host
);
ck_tile
::
DeviceMem
dbg_int_buf
(
dbg_int
);
ck_tile
::
DeviceMem
dbg_bf16_buf
(
dbg_bf16
);
ck_tile
::
DeviceMem
dbg_fp32_buf
(
dbg_fp32
);
flatmm_uk_traits
traits
{
prec_i
,
prec_w
,
prec_o
,
prec_st
,
prec_sw
,
prec_sq
,
prec_kw
,
block_m
,
gate_only
,
fused_quant
};
printf
(
"[FF] --- run(): <flatmm_uk_traits> ---
\n
"
);
printf
(
"[FF] traits.prec_i = %s
\n
"
,
traits
.
prec_i
.
c_str
());
printf
(
"[FF] traits.prec_w = %s
\n
"
,
traits
.
prec_w
.
c_str
());
printf
(
"[FF] traits.prec_o = %s
\n
"
,
traits
.
prec_o
.
c_str
());
printf
(
"[FF] traits.prec_st = %s
\n
"
,
traits
.
prec_st
.
c_str
());
printf
(
"[FF] traits.prec_sw = %s
\n
"
,
traits
.
prec_sw
.
c_str
());
printf
(
"[FF] traits.prec_sq = %s
\n
"
,
traits
.
prec_sq
.
c_str
());
printf
(
"[FF] traits.prec_kw = %s
\n
"
,
traits
.
prec_kw
.
c_str
());
printf
(
"[FF] traits.block_m = %d
\n
"
,
traits
.
block_m
);
printf
(
"[FF] traits.gate_only = %d
\n
"
,
traits
.
gate_only
);
printf
(
"[FF] traits.fused_quant = %d
\n
"
,
traits
.
fused_quant
);
flatmm_uk_args
args
{
a_buf
.
GetDeviceBuffer
(),
b_buf
.
GetDeviceBuffer
(),
c_buf
.
GetDeviceBuffer
(),
d_buf
.
GetDeviceBuffer
(),
dbg_int_buf
.
GetDeviceBuffer
(),
dbg_bf16_buf
.
GetDeviceBuffer
(),
dbg_fp32_buf
.
GetDeviceBuffer
(),
block_m
,
K
,
N
,
M
,
experts
,
topk
,
stride
};
printf
(
"[FF] --- run(): <flatmm_uk_args> ---
\n
"
);
printf
(
"[FF] args.block_m = %d
\n
"
,
args
.
block_m
);
printf
(
"[FF] args.hidden_size = %d
\n
"
,
args
.
hidden_size
);
printf
(
"[FF] args.intermediate_size = %d
\n
"
,
args
.
intermediate_size
);
printf
(
"[FF] args.num_tokens = %d
\n
"
,
args
.
num_tokens
);
// 1
printf
(
"[FF] args.topk = %d
\n
"
,
args
.
topk
);
// 0
printf
(
"[FF] args.num_experts = %d
\n
"
,
args
.
num_experts
);
// 0
printf
(
"[FF] args.stride_token = %d
\n
"
,
args
.
stride_token
);
float
ave_time
=
flatmm_uk
(
traits
,
args
,
ck_tile
::
stream_config
{
nullptr
,
true
,
kname
?
1
:
0
,
warmup
,
repeat
});
if
(
ave_time
<
0
)
{
std
::
cout
<<
" not supported!"
<<
std
::
endl
<<
std
::
flush
;
return
false
;
}
bool
pass
=
true
;
if
(
do_validation
)
{
auto
d_dev
=
d_buf
.
ToHost
<
float
>
();
std
::
cout
<<
std
::
endl
<<
" =================== "
<<
std
::
endl
;
d_host
.
SetZero
();
my_reference_gemm
<
ADataType
,
BDataType
,
CDataType
,
DDataType
>
(
a_host
,
b_host
,
d_host
,
ave_time
);
pass
=
ck_tile
::
check_err
(
d_dev
,
d_host
);
std
::
cout
<<
"The CPU veification result is:"
<<
(
pass
?
"correct"
:
"fail"
)
<<
std
::
endl
;
}
#if 0
int GridDimX = 2;
int GridDimY = 1;
int BlockDimX = 64;
int BlockDimY = 4;
int BlockSize = BlockDimX * BlockDimY;
// dbg_int
{
auto dbg_int_dev = dbg_int_buf.ToHost<int>();
std::ofstream file("ff_dbg_int.txt");
file << " [dbg_int]: Grid = [" << GridDimX << ", " << GridDimY << "], Block = " << BlockSize
<< std::endl;
for(int bidy = 0; bidy < GridDimY; bidy++)
{
for(int bidx = 0; bidx < GridDimX; bidx++)
{
file << "\n ========== block : [" << bidx << ", " << bidy << "] ==========";
for(int tid = 0; tid < BlockSize; tid++)
{
int gid = (BlockSize * GridDimX) * bidy + BlockSize * bidx + tid;
if(tid % 64 == 0)
{
file << "\n [" << tid << " : " << tid + 63 << "]: ";
}
file << ck_tile::type_convert<int>(dbg_int_dev.mData[gid]) << ", ";
}
}
}
file.close();
}
// dbg_bf16 ---> kernel
{
auto dbg_bf16_dev = dbg_bf16_buf.ToHost<BDataType>();
std::ofstream file("ff_dbg_bf16_kernel.txt");
file << " [dbg_bf16]: Grid = [" << GridDimX << ", " << GridDimY
<< "], Block = " << BlockSize << std::endl;
for(int bidy = 0; bidy < GridDimY; bidy++)
{
for(int bidx = 0; bidx < GridDimX; bidx++)
{
file << "\n ========== block : [" << bidx << ", " << bidy << "] ==========";
for(int tid = 0; tid < BlockSize; tid++)
{
int gid = (BlockSize * bidx) * bidy + BlockSize * bidx + tid;
file << "\n [" << tid << "]: ";
for(int i = 0; i < 64; i++) // multi output per thread
file << ck_tile::type_convert<float>(dbg_bf16_dev.mData[gid * 64 + i])
<< ", ";
}
}
}
file.close();
}
// dbg_bf16
{
auto dbg_bf16_dev = dbg_bf16_buf.ToHost<BDataType>();
std::ofstream file("ff_dbg_bf16.txt");
int X = static_cast<int>(N);
int Y = static_cast<int>(M);
file << " [dbg_bf16]: Row = " << Y << ", Col = " << X << std::endl;
for(int m = 0; m < Y; m++)
{
file << "\n ========== row : [" << m << " / " << Y << "] ==========";
for(int n = 0; n < X; n++)
{
if(n % 64 == 0)
{
file << "\n [" << n << " : " << n + 63 << "]: ";
}
int idx = X * m + n;
file << ck_tile::type_convert<float>(dbg_bf16_dev.mData[idx]) << ", ";
}
}
file.close();
}
// dbg_fp32 ---> kernel
{
auto dbg_fp32_dev = dbg_fp32_buf.ToHost<float>();
std::ofstream file("ff_dbg_fp32_kernel.txt");
file << " [dbg_fp32]: Grid = [" << GridDimX << ", " << GridDimY
<< "], Block = " << BlockSize << std::endl;
for(int bidy = 0; bidy < GridDimY; bidy++)
{
for(int bidx = 0; bidx < GridDimX; bidx++)
{
file << "\n ========== block : [" << bidx << ", " << bidy << "] ==========";
for(int tid = 0; tid < BlockSize; tid++)
{
int gid = (BlockSize * bidx) * bidy + BlockSize * bidx + tid;
file << "\n [" << tid << "]: ";
for(int i = 0; i < 64; i++) // multi output per thread
file << ck_tile::type_convert<float>(dbg_fp32_dev.mData[gid * 64 + i])
<< ", ";
// if(tid % 64 == 0) // one output per thread
// file << "\n [" << tid << " : " << tid + 63 << "]: ";
// file << ck_tile::type_convert<float>(dbg_bf16.mData[gid]) << ", ";
}
}
}
file.close();
}
// dbg_fp32
{
auto dbg_fp32_dev = dbg_fp32_buf.ToHost<float>();
std::ofstream file("ff_dbg_fp32.txt");
int X = static_cast<int>(N);
int Y = static_cast<int>(M);
file << " [dbg_fp32]: Row = " << Y << ", Col = " << X << std::endl;
for(int m = 0; m < Y; m++)
{
file << "\n ========== row : [" << m << " / " << Y << "] ==========";
for(int n = 0; n < X; n++)
{
if(n % 64 == 0)
{
file << "\n [" << n << " : " << n + 63 << "]: ";
}
int idx = X * m + n;
file << ck_tile::type_convert<float>(dbg_fp32_dev.mData[idx]) << ", ";
}
}
file.close();
}
// a_host
{
std::ofstream file("ff_a_host.txt");
int X = static_cast<int>(K);
int Y = static_cast<int>(M);
file << " [a_host]: Row = " << Y << ", Col = " << X << std::endl;
for(int y = 0; y < Y; y++)
{
file << "\n ========== row : [" << y << " / " << Y << "] ==========";
for(int x = 0; x < X; x++)
{
int idx = X * y + x;
if(idx % 16 == 0)
{
file << "\n [" << x << " : " << x + 15 << " ]: ";
}
file << ck_tile::type_convert<float>(a_host.mData[idx]) << ", ";
}
}
file.close();
}
// b_host
{
std::ofstream file("ff_b_host.txt");
int X = static_cast<int>(K);
int Y = static_cast<int>(N);
file << " [b_host]: Row = " << Y << ", Col = " << X << std::endl;
for(int y = 0; y < Y; y++)
{
file << "\n ========== row : [" << y << " / " << Y << "] ==========";
for(int x = 0; x < X; x++)
{
int idx = X * y + x;
if(idx % 16 == 0)
{
file << "\n [" << x << " : " << x + 15 << " ]: ";
}
file << ck_tile::type_convert<float>(b_host.mData[idx]) << ", ";
}
}
file.close();
}
// permute_b
{
std::ofstream file("ff_b_perm_host.txt");
int X = static_cast<int>(K);
int Y = static_cast<int>(N);
file << " [b_perm_host]: Row = " << Y << ", Col = " << X << std::endl;
for(int y = 0; y < Y; y++)
{
file << "\n ========== row : [" << y << " / " << Y << "] ==========";
for(int x = 0; x < X; x++)
{
int idx = X * y + x;
if(idx % 16 == 0)
{
file << "\n [" << x << " : " << x + 15 << " ]: ";
}
file << ck_tile::type_convert<float>(b_perm_host.mData[idx]) << ", ";
}
}
file.close();
}
// d_dev ---> kernel
{
auto d_dev = d_buf.ToHost<float>();
std::ofstream file("ff_d_dev_kernel.txt");
file << " [d_dev]: Grid = [" << GridDimX << ", " << GridDimY << "], Block = " << BlockSize
<< std::endl;
for(int bidy = 0; bidy < GridDimY; bidy++)
{
for(int bidx = 0; bidx < GridDimX; bidx++)
{
file << "\n ========== block : [" << bidx << ", " << bidy << "] ==========";
for(int tid = 0; tid < BlockSize; tid++)
{
int gid = (BlockSize * bidx) * bidy + BlockSize * bidx + tid;
file << "\n [" << tid << "]: ";
for(int i = 0; i < 64; i++) // multi output per thread
file << ck_tile::type_convert<float>(d_dev.mData[gid * 64 + i]) << ", ";
}
}
}
file.close();
}
// d_dev
{
auto d_dev = d_buf.ToHost<float>();
std::ofstream file("ff_d_dev.txt");
int X = static_cast<int>(N);
int Y = static_cast<int>(M);
file << " [d_dev]: Row = " << Y << ", Col = " << X << std::endl;
for(int y = 0; y < Y; y++)
{
file << "\n ========== row : [" << y << " / " << Y << "] ==========";
for(int x = 0; x < X; x++)
{
if(x % 64 == 0)
{
file << "\n [" << x << " : " << x + 63 << "]: ";
}
int idx = X * y + x;
file << ck_tile::type_convert<float>(d_dev.mData[idx]) << ", ";
}
}
file.close();
}
// d_host
{
std::ofstream file("ff_d_host.txt");
int X = static_cast<int>(N);
int Y = static_cast<int>(M);
file << " [d_host]: Row = " << Y << ", Col = " << X << std::endl;
for(int y = 0; y < Y; y++)
{
file << "\n ========== row : [" << y << " / " << Y << "] ==========";
for(int x = 0; x < X; x++)
{
if(x % 64 == 0)
{
file << "\n [" << x << " : " << x + 63 << "]: ";
}
int idx = X * y + x;
file << ck_tile::type_convert<float>(d_host.mData[idx]) << ", ";
}
}
file.close();
}
#endif
std
::
cout
<<
std
::
flush
<<
std
::
endl
;
return
pass
;
}
int
main
(
int
argc
,
char
*
argv
[])
{
auto
[
result
,
arg_parser
]
=
create_args
(
argc
,
argv
);
if
(
!
result
)
return
-
1
;
std
::
string
prec_i
=
arg_parser
.
get_str
(
"prec_i"
);
std
::
string
prec_w
=
arg_parser
.
get_str
(
"prec_w"
);
std
::
string
prec_o
=
arg_parser
.
get_str
(
"prec_o"
);
std
::
string
prec_st
=
arg_parser
.
get_str
(
"prec_st"
);
std
::
string
prec_sw
=
arg_parser
.
get_str
(
"prec_sw"
);
std
::
string
prec_sq
=
arg_parser
.
get_str
(
"prec_sq"
);
std
::
string
prec_kw
=
arg_parser
.
get_str
(
"prec_kw"
);
prec_st
=
(
prec_st
==
"auto"
)
?
"fp32"
:
prec_st
;
prec_sw
=
(
prec_sw
==
"auto"
)
?
"fp32"
:
prec_sw
;
prec_sq
=
(
prec_sq
==
"auto"
)
?
"fp32"
:
prec_sq
;
prec_kw
=
(
prec_kw
==
"auto"
)
?
"fp32"
:
prec_kw
;
// no dynamic quant case
if
(
prec_i
==
"bf16"
&&
prec_w
==
"bf16"
&&
prec_o
==
"bf16"
&&
prec_kw
==
"fp32"
)
{
return
run
<
ck_tile
::
bf16_t
,
ck_tile
::
bf16_t
,
ck_tile
::
bf16_t
,
float
,
float
,
float
,
float
>
(
arg_parser
)
?
0
:
-
2
;
}
else
if
(
prec_i
==
"fp16"
&&
prec_w
==
"fp16"
&&
prec_o
==
"fp16"
&&
prec_kw
==
"fp32"
)
{
return
run
<
ck_tile
::
fp16_t
,
ck_tile
::
fp16_t
,
ck_tile
::
fp16_t
,
float
,
float
,
float
,
float
>
(
arg_parser
)
?
0
:
-
2
;
}
return
-
3
;
}
example/ck_tile/CMakeLists.txt
View file @
fa335f31
...
...
@@ -17,3 +17,4 @@ add_subdirectory(14_moe_smoothquant)
add_subdirectory
(
15_fused_moe
)
add_subdirectory
(
16_batched_gemm
)
add_subdirectory
(
17_grouped_gemm
)
add_subdirectory
(
18_flatmm_uk
)
include/ck_tile/ops/flatmm/block/flatmm_ff_32x512x128_1x4x1_16x16x32.hpp
0 → 100644
View file @
fa335f31
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/gemm/warp/warp_gemm.hpp"
#include "ck_tile/ops/flatmm/block/flatmm_uk_config.hpp"
namespace
ck_tile
{
// A async load to LDS, B direct to AGPR
// B matrix preshuffled in br*kr*w
// require 4 wave, occupancy=1c
// agpr useage:256
// vgpr usage:64(A local) + 64(acc) + 8(os_a) + 8(os_b) = 144 (rem:112)
//
// for this gemm, 4 16x16x16 transposed layout
// input A vpgpr layout
// v0-v15: [ 0:15](gemm_m)x128(gemm_k)
// v16-v31: [16:31](gemm_m)x128(gemm_k)
// input B vpgpr layout
// v0-v15: [ 0: 15](gemm_n)x128(gemm_k)
// v16-v31: [ 64: 79](gemm_n)x128(gemm_k)
// ......................
// v111-v127: [448:463](gemm_n)x128(gemm_k)
// output C vpgpr layout
// v0-v3 : [ 0:15](gemm_m)x[ 0: 15](gemm_n)
// v4-v7 : [16:31](gemm_m)x[ 0: 15](gemm_n)
// v8-v11: [ 0:15](gemm_m)x[64: 79](gemm_n)
// v12-v15: [16:31](gemm_m)x[64: 79](gemm_n)
// ......................
// v56-v59: [ 0:15](gemm_m)x[448:463](gemm_n)
// v60-v63: [16:31](gemm_m)x[448:463](gemm_n)
struct
Flatmm_ff_32x512x128_1x4x1_16x16x32_Base
// for f16/bf16
{
static
constexpr
index_t
Block_M
=
32
;
static
constexpr
index_t
Block_N
=
512
;
static
constexpr
index_t
Block_K
=
128
;
static
constexpr
index_t
WarpPerBlock_M
=
1
;
static
constexpr
index_t
WarpPerBlock_N
=
4
;
static
constexpr
index_t
WarpPerBlock_K
=
1
;
static
constexpr
index_t
NumWarps
=
4
;
static
constexpr
index_t
Warp_M
=
16
;
static
constexpr
index_t
Warp_N
=
16
;
static
constexpr
index_t
Warp_K
=
32
;
// 16 * SubKPacks
static
constexpr
index_t
BlockSize
=
256
;
static
constexpr
index_t
SubKPacks
=
2
;
// this is used to gurantee every threads can do dwordx4
// TODO: note Nr/Kr/W need consider SubKPacks
static
constexpr
index_t
Block_W
=
Warp_N
*
Warp_K
;
// 512 element
static
constexpr
index_t
Block_Nr
=
Block_N
/
Warp_N
;
// 32 element, 4 per wave
static
constexpr
index_t
Block_Kr
=
Block_K
/
Warp_K
;
// 4
static
constexpr
index_t
Repeat_M
=
Block_M
/
(
Warp_M
*
WarpPerBlock_M
);
// 2
static
constexpr
index_t
Repeat_N
=
Block_N
/
(
Warp_N
*
WarpPerBlock_N
);
// 8
static
constexpr
index_t
Repeat_K
=
Block_K
/
(
Warp_K
*
WarpPerBlock_K
);
// 8/2=4
static
CK_TILE_DEVICE
constexpr
auto
MakeCBlockDist
()
{
constexpr
auto
c_block_outer_dstr_encoding
=
tile_distribution_encoding
<
sequence
<>
,
tuple
<
sequence
<
Repeat_M
,
WarpPerBlock_M
>
,
sequence
<
Repeat_N
,
WarpPerBlock_N
>>
,
tuple
<
sequence
<
1
,
2
>>
,
tuple
<
sequence
<
1
,
1
>>
,
sequence
<
2
,
1
>
,
// !! note here is different
sequence
<
0
,
0
>>
{};
using
WG
=
WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution
;
constexpr
auto
c_block_dstr_encode
=
detail
::
make_embed_tile_distribution_encoding
(
c_block_outer_dstr_encoding
,
typename
WG
::
CWarpDstrEncoding
{});
constexpr
auto
c_block_dstr
=
make_static_tile_distribution
(
c_block_dstr_encode
);
return
c_block_dstr
;
}
static
CK_TILE_DEVICE
constexpr
auto
MakeCBlockTile
()
{
using
CDataType
=
float
;
constexpr
auto
c_block_dstr
=
MakeCBlockDist
();
auto
c_block_tensor
=
make_static_distributed_tensor
<
CDataType
>
(
c_block_dstr
);
return
c_block_tensor
;
}
CK_TILE_HOST_DEVICE
static
constexpr
auto
MakeLdsStoreDesc_A
()
{
// A async->LDS
// constexpr index_t Block_M = Problem::BlockShape::Block_M0;
// constexpr index_t Block_K = Problem::BlockShape::Block_K0;
// constexpr index_t BlockSize = Problem::BlockShape::BlockSize;
constexpr
index_t
warpSize
=
ck_tile
::
get_warp_size
();
// constexpr index_t NumWarps = Problem::BlockShape::NumWarps;
constexpr
index_t
KPack_
=
8
;
// GetSmemKPack_A<Problem>(); // LDS
constexpr
index_t
KVector
=
2
;
// GetAlignment_A<Problem>(); // async copy 1 dword
constexpr
index_t
KPad
=
KPack_
;
// pad between warps
static_assert
(
Block_K
%
KVector
==
0
);
constexpr
index_t
LanesPerK
=
Block_K
/
KVector
;
// how many thread loading K
if
constexpr
(
LanesPerK
>=
warpSize
)
{
// need multiple waves to load K
static_assert
(
LanesPerK
%
warpSize
==
0
);
constexpr
index_t
wavesPerK
=
LanesPerK
/
warpSize
;
if
constexpr
(
wavesPerK
>
NumWarps
)
{
// TODO: need multiple issues along K to load all data
}
else
{
constexpr
index_t
wavesPerM
=
NumWarps
/
wavesPerK
;
constexpr
index_t
NumIssues
=
Block_M
/
wavesPerM
;
constexpr
auto
lds_block_desc_0
=
make_naive_tensor_descriptor
(
make_tuple
(
number
<
NumIssues
>
{},
// m0
number
<
wavesPerM
>
{},
// m1
number
<
wavesPerK
>
{},
// k0
number
<
warpSize
>
{},
// k1
number
<
KVector
>
{}),
// k2
make_tuple
(
number
<
NumWarps
*
(
warpSize
*
KVector
+
KPad
)
>
{},
// m0
number
<
wavesPerK
*
(
warpSize
*
KVector
+
KPad
)
>
{},
// m1
number
<
warpSize
*
KVector
+
KPad
>
{},
// k0
number
<
KVector
>
{},
// k1
number
<
1
>
{}),
// k2
number
<
KVector
>
{},
// lds store vector(actually no explicit store)
number
<
1
>
{});
constexpr
auto
lds_block_desc_issues_warps_lanes
=
transform_tensor_descriptor
(
lds_block_desc_0
,
make_tuple
(
make_pass_through_transform
(
number
<
NumIssues
>
{}),
make_merge_transform
(
make_tuple
(
number
<
wavesPerM
>
{},
number
<
wavesPerK
>
{})),
make_merge_transform
(
make_tuple
(
number
<
warpSize
>
{},
number
<
KVector
>
{}))),
make_tuple
(
sequence
<
0
>
{},
sequence
<
1
,
2
>
{},
sequence
<
3
,
4
>
{}),
make_tuple
(
sequence
<
0
>
{},
sequence
<
1
>
{},
sequence
<
2
>
{}));
return
lds_block_desc_issues_warps_lanes
;
}
}
else
{
// lanes within a wave load different M but same K
static_assert
(
warpSize
%
LanesPerK
==
0
);
constexpr
index_t
LaneGroups
=
warpSize
/
LanesPerK
;
// along m
constexpr
index_t
NumIssues
=
Block_M
/
(
LaneGroups
*
NumWarps
);
constexpr
auto
lds_block_desc_0
=
make_naive_tensor_descriptor
(
make_tuple
(
number
<
NumIssues
>
{},
// m0
number
<
LaneGroups
>
{},
// m1
number
<
NumWarps
>
{},
// m2
number
<
LanesPerK
>
{},
// k0
number
<
KVector
>
{}),
// k1
make_tuple
(
number
<
NumWarps
*
(
warpSize
*
KVector
+
KPad
)
>
{},
// m0
number
<
Block_K
>
{},
// m1
number
<
warpSize
*
KVector
+
KPad
>
{},
// m2
number
<
KVector
>
{},
// k0
number
<
1
>
{}),
// k1
number
<
KVector
>
{},
// lds store vector(actually no explicit store)
number
<
1
>
{});
constexpr
auto
lds_block_desc_issues_warps_lanes
=
transform_tensor_descriptor
(
lds_block_desc_0
,
make_tuple
(
make_pass_through_transform
(
number
<
NumIssues
>
{}),
make_pass_through_transform
(
number
<
NumWarps
>
{}),
make_merge_transform
(
make_tuple
(
number
<
LaneGroups
>
{},
number
<
LanesPerK
>
{},
number
<
KVector
>
{}))),
make_tuple
(
sequence
<
0
>
{},
sequence
<
2
>
{},
sequence
<
1
,
3
,
4
>
{}),
make_tuple
(
sequence
<
0
>
{},
sequence
<
1
>
{},
sequence
<
2
>
{}));
return
lds_block_desc_issues_warps_lanes
;
}
}
// template <typename Problem>
CK_TILE_HOST_DEVICE
static
constexpr
auto
MakeLdsLoadDesc_A
()
{
// load from LDS to register, every wave has same layout
constexpr
index_t
KPack_
=
8
;
// GetSmemKPack_A<Problem>(); // LDS
constexpr
index_t
KPad
=
KPack_
;
// pad between warps
constexpr
index_t
kAMLane
=
16
;
constexpr
index_t
kABKLane
=
4
;
constexpr
index_t
kABKPerLane
=
4
;
constexpr
index_t
kKIter
=
2
;
static_assert
(
KPack_
==
(
kABKPerLane
*
kKIter
));
constexpr
auto
lds_block_desc_0
=
make_naive_tensor_descriptor
(
make_tuple
(
number
<
Repeat_M
>
{},
// m0 y
number
<
kAMLane
>
{},
// m1 p
number
<
Repeat_K
>
{},
// k0 y
number
<
kABKLane
>
{},
// k1 p
number
<
KPack_
>
{}),
// k2 y-vector
make_tuple
(
number
<
kAMLane
*
(
Block_K
+
KPad
)
>
{},
// m0
number
<
Block_K
+
KPad
>
{},
// m1
number
<
kABKLane
*
KPack_
>
{},
// k0
number
<
KPack_
>
{},
// k1
number
<
1
>
{}),
// k2
number
<
KPack_
>
{},
// lds load vector
number
<
1
>
{});
constexpr
auto
lds_desc_m_k
=
transform_tensor_descriptor
(
lds_block_desc_0
,
make_tuple
(
make_merge_transform
(
make_tuple
(
number
<
Repeat_M
>
{},
number
<
kAMLane
>
{})),
make_merge_transform
(
make_tuple
(
number
<
Repeat_K
>
{},
number
<
kABKLane
>
{},
number
<
KPack_
>
{}))),
make_tuple
(
sequence
<
0
,
1
>
{},
sequence
<
2
,
3
,
4
>
{}),
make_tuple
(
sequence
<
0
>
{},
sequence
<
1
>
{}));
return
lds_desc_m_k
;
}
static
constexpr
auto
GetGemm_AWarpEnc
()
{
constexpr
index_t
kAMLane
=
16
;
constexpr
index_t
kABKLane
=
4
;
constexpr
index_t
kABKPerLane
=
4
;
constexpr
index_t
kKIter
=
2
;
using
enc_
=
tile_distribution_encoding
<
sequence
<>
,
tuple
<
sequence
<
kAMLane
>
,
sequence
<
kABKLane
,
kABKPerLane
*
kKIter
>>
,
tuple
<
sequence
<
2
,
1
>>
,
tuple
<
sequence
<
0
,
0
>>
,
sequence
<
2
>
,
sequence
<
1
>>
;
return
enc_
{};
}
CK_TILE_HOST_DEVICE
static
constexpr
ck_tile
::
index_t
GetSmemSize
()
{
return
32
*
(
128
+
8
)
*
sizeof
(
bf16_t
);
}
};
struct
Flatmm_ff_32x512x128_1x4x1_16x16x32_BF16
:
public
Flatmm_ff_32x512x128_1x4x1_16x16x32_Base
{
using
ADataType
=
bf16_t
;
using
BDataType
=
bf16_t
;
// TODO: need paired with tile_window_linear!
// TODO: need call init_raw() before call this function!
template
<
typename
ARes
,
typename
ACoords
,
typename
BRes
,
typename
BCoords
>
CK_TILE_DEVICE
auto
operator
()(
const
ARes
&
res_a
,
const
ACoords
&
cached_coords_a
,
const
BRes
&
res_b
,
const
BCoords
&
cached_coords_b
,
CK_TILE_LDS_ADDR
void
*
smem
,
index_t
k
,
index_t
tile_offset_a
,
// for each tile, the offset to move for each unroll
index_t
tile_offset_b
,
// for each tile, the offset to move for each unroll
int
*
dbg_int
,
short
*
dbg_bf16
,
float
*
dbg_fp32
)
{
#if 0
if(threadIdx.x == 0 && blockIdx.x == 0 && threadIdx.y == 0 && blockIdx.y == 0)
{
printf("[UK] Flatmm_ff_32x512x128_1x4x1_16x16x32_BF16 =====\n");
}
[[maybe_unused]] uint32_t tidx = threadIdx.x; // 0~255
[[maybe_unused]] uint32_t tidy = threadIdx.y; // 0~0
[[maybe_unused]] uint32_t bidx = blockIdx.x; // 0~1
[[maybe_unused]] uint32_t bidy = blockIdx.y; // 0~51
[[maybe_unused]] uint32_t bdmx = blockDim.x; // 256
[[maybe_unused]] uint32_t bdmy = blockDim.y; // 1
[[maybe_unused]] uint32_t gdmx = gridDim.x; // 2
[[maybe_unused]] uint32_t gdmy = gridDim.y; // 52
[[maybe_unused]] uint32_t gid = ((bdmx * bdmy) * gdmx) * bidy
+ (bdmx * bdmy) * bidx
+ bdmx * tidy
+ tidx;
#endif
(
void
)
dbg_int
;
(
void
)
dbg_bf16
;
(
void
)
dbg_fp32
;
static_assert
(
ACoords
::
size
()
==
Block_M
*
Block_K
/
BlockSize
/
2
/*2x per dword*/
);
// 8
static_assert
(
BCoords
::
size
()
==
Repeat_N
);
auto
a_sst
=
make_tile_window
(
make_tensor_view
<
address_space_enum
::
lds
>
(
reinterpret_cast
<
CK_TILE_LDS_ADDR
ADataType
*>
(
smem
),
MakeLdsStoreDesc_A
()),
MakeLdsStoreDesc_A
().
get_lengths
(),
{
0
,
0
,
0
});
auto
a_sld
=
[
&
]()
{
constexpr
auto
a_warp_enc_
=
GetGemm_AWarpEnc
();
constexpr
auto
a_outer_dstr_enc
=
tile_distribution_encoding
<
sequence
<
WarpPerBlock_N
>
,
tuple
<
sequence
<
Repeat_M
,
WarpPerBlock_M
>
,
sequence
<
Repeat_K
>>
,
tuple
<
sequence
<
1
,
0
>>
,
tuple
<
sequence
<
1
,
0
>>
,
sequence
<
1
,
2
>
,
sequence
<
0
,
0
>>
{};
constexpr
auto
a_block_dstr_encode
=
detail
::
make_embed_tile_distribution_encoding
(
a_outer_dstr_enc
,
a_warp_enc_
);
return
make_tile_window_linear
(
make_tensor_view
<
address_space_enum
::
lds
>
(
reinterpret_cast
<
CK_TILE_LDS_ADDR
ADataType
*>
(
smem
),
MakeLdsLoadDesc_A
()),
MakeLdsLoadDesc_A
().
get_lengths
(),
{
0
,
0
},
make_static_tile_distribution
(
a_block_dstr_encode
));
}();
const
index_t
tile_offset_a_bytes
=
tile_offset_a
*
sizeof
(
ADataType
);
const
index_t
tile_offset_b_bytes
=
tile_offset_b
*
sizeof
(
BDataType
);
const
auto
[
m0_init_value
,
size_per_issue
]
=
get_async_store_smem_info
(
a_sst
);
constexpr
auto
smem_buf_size
=
MakeLdsLoadDesc_A
().
get_element_space_size
()
*
sizeof
(
ADataType
);
static_assert
(
a_sld
.
get_num_of_access
()
==
8
);
constexpr
auto
sld_os
=
generate_tuple
(
[
&
](
auto
i_access
)
{
return
number
<
a_sld
.
get_bottom_linear_offset
(
i_access
)
*
sizeof
(
ADataType
)
>
{};
},
number
<
a_sld
.
get_num_of_access
()
>
{});
index_t
loop_cnt
=
k
/
Block_K
;
// this is the acc thread buffer
fp32x4_t
v_acc
[
16
]{
.0
f
};
// float dbg_dword = 0.0f;
#pragma region ASM
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Winline-asm"
// clang-format off
asm
volatile
(
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_BF16
#include "uk/flatmm_ff_uk_gfx9_32x512x128_1x1x1_16x16x16.inc"
#undef CK_TILE_FLATMM_UK_MFMA
:
[
s_loop_cnt
]
"+s"
(
loop_cnt
),
[
v_acc_0
]
"+v"
(
v_acc
[
0
]),
[
v_acc_1
]
"+v"
(
v_acc
[
1
]),
[
v_acc_2
]
"+v"
(
v_acc
[
2
]),
[
v_acc_3
]
"+v"
(
v_acc
[
3
]),
[
v_acc_4
]
"+v"
(
v_acc
[
4
]),
[
v_acc_5
]
"+v"
(
v_acc
[
5
]),
[
v_acc_6
]
"+v"
(
v_acc
[
6
]),
[
v_acc_7
]
"+v"
(
v_acc
[
7
]),
[
v_acc_8
]
"+v"
(
v_acc
[
8
]),
[
v_acc_9
]
"+v"
(
v_acc
[
9
]),
[
v_acc_10
]
"+v"
(
v_acc
[
10
]),
[
v_acc_11
]
"+v"
(
v_acc
[
11
]),
[
v_acc_12
]
"+v"
(
v_acc
[
12
]),
[
v_acc_13
]
"+v"
(
v_acc
[
13
]),
[
v_acc_14
]
"+v"
(
v_acc
[
14
]),
[
v_acc_15
]
"+v"
(
v_acc
[
15
]),
//[v_dbg]"+v"(dbg_dword),
[
s_mem_
]
"+r"
(
smem
)
:
[
s_res_a0
]
"s"
(
res_a
[
0
]),
[
s_res_a1
]
"s"
(
res_a
[
1
]),
[
s_res_a2
]
"s"
(
res_a
[
2
]),
[
s_res_a3
]
"s"
(
res_a
[
3
]),
[
v_os_a0
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_a
[
number
<
0
>
{}]
*
sizeof
(
ADataType
))),
[
v_os_a1
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_a
[
number
<
1
>
{}]
*
sizeof
(
ADataType
))),
[
v_os_a2
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_a
[
number
<
2
>
{}]
*
sizeof
(
ADataType
))),
[
v_os_a3
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_a
[
number
<
3
>
{}]
*
sizeof
(
ADataType
))),
[
v_os_a4
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_a
[
number
<
4
>
{}]
*
sizeof
(
ADataType
))),
[
v_os_a5
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_a
[
number
<
5
>
{}]
*
sizeof
(
ADataType
))),
[
v_os_a6
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_a
[
number
<
6
>
{}]
*
sizeof
(
ADataType
))),
[
v_os_a7
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_a
[
number
<
7
>
{}]
*
sizeof
(
ADataType
))),
[
s_res_b0
]
"s"
(
res_b
[
0
]),
[
s_res_b1
]
"s"
(
res_b
[
1
]),
[
s_res_b2
]
"s"
(
res_b
[
2
]),
[
s_res_b3
]
"s"
(
res_b
[
3
]),
[
v_os_b0
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
0
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b1
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
1
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b2
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
2
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b3
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
3
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b4
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
4
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b5
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
5
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b6
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
6
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b7
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
7
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_slda
]
"v"
(
static_cast
<
index_t
>
(
a_sld
.
cached_coords_
[
number
<
0
>
{}].
get_offset
()
*
sizeof
(
ADataType
))),
[
s_m0_init
]
"s"
(
m0_init_value
),
[
s_size_per_issue
]
"s"
(
size_per_issue
),
[
smem_sz
]
"n"
(
smem_buf_size
),
//(smem_buf_size),
[
sld_os_0
]
"n"
(
sld_os
[
number
<
0
>
{}].
value
),
[
sld_os_1
]
"n"
(
sld_os
[
number
<
1
>
{}].
value
),
[
sld_os_2
]
"n"
(
sld_os
[
number
<
2
>
{}].
value
),
[
sld_os_3
]
"n"
(
sld_os
[
number
<
3
>
{}].
value
),
[
sld_os_4
]
"n"
(
sld_os
[
number
<
4
>
{}].
value
),
[
sld_os_5
]
"n"
(
sld_os
[
number
<
5
>
{}].
value
),
[
sld_os_6
]
"n"
(
sld_os
[
number
<
6
>
{}].
value
),
[
sld_os_7
]
"n"
(
sld_os
[
number
<
7
>
{}].
value
),
[
s_tile_os_a
]
"s"
(
tile_offset_a_bytes
),
[
s_tile_os_b
]
"s"
(
tile_offset_b_bytes
)
:
"memory"
,
"a0"
,
"a1"
,
"a2"
,
"a3"
,
"a4"
,
"a5"
,
"a6"
,
"a7"
,
"a8"
,
"a9"
,
"a10"
,
"a11"
,
"a12"
,
"a13"
,
"a14"
,
"a15"
,
"a16"
,
"a17"
,
"a18"
,
"a19"
,
"a20"
,
"a21"
,
"a22"
,
"a23"
,
"a24"
,
"a25"
,
"a26"
,
"a27"
,
"a28"
,
"a29"
,
"a30"
,
"a31"
,
"a32"
,
"a33"
,
"a34"
,
"a35"
,
"a36"
,
"a37"
,
"a38"
,
"a39"
,
"a40"
,
"a41"
,
"a42"
,
"a43"
,
"a44"
,
"a45"
,
"a46"
,
"a47"
,
"a48"
,
"a49"
,
"a50"
,
"a51"
,
"a52"
,
"a53"
,
"a54"
,
"a55"
,
"a56"
,
"a57"
,
"a58"
,
"a59"
,
"a60"
,
"a61"
,
"a62"
,
"a63"
,
"a64"
,
"a65"
,
"a66"
,
"a67"
,
"a68"
,
"a69"
,
"a70"
,
"a71"
,
"a72"
,
"a73"
,
"a74"
,
"a75"
,
"a76"
,
"a77"
,
"a78"
,
"a79"
,
"a80"
,
"a81"
,
"a82"
,
"a83"
,
"a84"
,
"a85"
,
"a86"
,
"a87"
,
"a88"
,
"a89"
,
"a90"
,
"a91"
,
"a92"
,
"a93"
,
"a94"
,
"a95"
,
"a96"
,
"a97"
,
"a98"
,
"a99"
,
"a100"
,
"a101"
,
"a102"
,
"a103"
,
"a104"
,
"a105"
,
"a106"
,
"a107"
,
"a108"
,
"a109"
,
"a110"
,
"a111"
,
"a112"
,
"a113"
,
"a114"
,
"a115"
,
"a116"
,
"a117"
,
"a118"
,
"a119"
,
"a120"
,
"a121"
,
"a122"
,
"a123"
,
"a124"
,
"a125"
,
"a126"
,
"a127"
,
"a128"
,
"a129"
,
"a130"
,
"a131"
,
"a132"
,
"a133"
,
"a134"
,
"a135"
,
"a136"
,
"a137"
,
"a138"
,
"a139"
,
"a140"
,
"a141"
,
"a142"
,
"a143"
,
"a144"
,
"a145"
,
"a146"
,
"a147"
,
"a148"
,
"a149"
,
"a150"
,
"a151"
,
"a152"
,
"a153"
,
"a154"
,
"a155"
,
"a156"
,
"a157"
,
"a158"
,
"a159"
,
"a160"
,
"a161"
,
"a162"
,
"a163"
,
"a164"
,
"a165"
,
"a166"
,
"a167"
,
"a168"
,
"a169"
,
"a170"
,
"a171"
,
"a172"
,
"a173"
,
"a174"
,
"a175"
,
"a176"
,
"a177"
,
"a178"
,
"a179"
,
"a180"
,
"a181"
,
"a182"
,
"a183"
,
"a184"
,
"a185"
,
"a186"
,
"a187"
,
"a188"
,
"a189"
,
"a190"
,
"a191"
,
"a192"
,
"a193"
,
"a194"
,
"a195"
,
"a196"
,
"a197"
,
"a198"
,
"a199"
,
"a200"
,
"a201"
,
"a202"
,
"a203"
,
"a204"
,
"a205"
,
"a206"
,
"a207"
,
"a208"
,
"a209"
,
"a210"
,
"a211"
,
"a212"
,
"a213"
,
"a214"
,
"a215"
,
"a216"
,
"a217"
,
"a218"
,
"a219"
,
"a220"
,
"a221"
,
"a222"
,
"a223"
,
"a224"
,
"a225"
,
"a226"
,
"a227"
,
"a228"
,
"a229"
,
"a230"
,
"a231"
,
"a232"
,
"a233"
,
"a234"
,
"a235"
,
"a236"
,
"a237"
,
"a238"
,
"a239"
,
"a240"
,
"a241"
,
"a242"
,
"a243"
,
"a244"
,
"a245"
,
"a246"
,
"a247"
,
"a248"
,
"a249"
,
"a250"
,
"a251"
,
"a252"
,
"a253"
,
"a254"
,
"a255"
,
"s16"
,
"s17"
,
"s18"
,
"s19"
,
"s20"
,
"s21"
,
"s22"
,
"s23"
,
"s86"
,
// s86 as tmp
"v64"
,
"v65"
,
"v66"
,
"v67"
,
"v68"
,
"v69"
,
"v70"
,
"v71"
,
"v72"
,
"v73"
,
"v74"
,
"v75"
,
"v76"
,
"v77"
,
"v78"
,
"v79"
,
"v80"
,
"v81"
,
"v82"
,
"v83"
,
"v84"
,
"v85"
,
"v86"
,
"v87"
,
"v88"
,
"v89"
,
"v90"
,
"v91"
,
"v92"
,
"v93"
,
"v94"
,
"v95"
,
"v96"
,
"v97"
,
"v98"
,
"v99"
,
"v100"
,
"v101"
,
"v102"
,
"v103"
,
"v104"
,
"v105"
,
"v106"
,
"v107"
,
"v108"
,
"v109"
,
"v110"
,
"v111"
,
"v112"
,
"v113"
,
"v114"
,
"v115"
,
"v116"
,
"v117"
,
"v118"
,
"v119"
,
"v120"
,
"v121"
,
"v122"
,
"v123"
,
"v124"
,
"v125"
,
"v126"
,
"v127"
);
// clang-format on
#pragma clang diagnostic pop
#pragma endregion
// return local scratch
auto
c
=
MakeCBlockTile
();
for
(
auto
i
=
0
;
i
<
16
;
i
++
)
{
c
.
get_thread_buffer
()[
4
*
i
+
0
]
=
v_acc
[
i
].
x
;
c
.
get_thread_buffer
()[
4
*
i
+
1
]
=
v_acc
[
i
].
y
;
c
.
get_thread_buffer
()[
4
*
i
+
2
]
=
v_acc
[
i
].
z
;
c
.
get_thread_buffer
()[
4
*
i
+
3
]
=
v_acc
[
i
].
w
;
}
/*float * vacc0x = reinterpret_cast<float*>(v_acc);
short * pdbgf16 = reinterpret_cast<short*>(vacc0x);
//short * pdbg_u8 = reinterpret_cast<short*>(&dbg_dword);
int dbgCntPerThd = 8;
for(int i = 0; i < dbgCntPerThd; i++)
{
dbg_bf16[gid * dbgCntPerThd + i] = *(pdbgf16 + i);
//dbg_bf16[gid * dbgCntPerThd + i] = *(pdbg_u8 + i);
}
int * pdbg_i32 = reinterpret_cast<int*>(&dbg_dword);
dbg_int[gid] = *(pdbg_i32);*/
return
c
;
}
};
struct
Flatmm_ff_32x512x128_1x4x1_16x16x32_FP16
:
public
Flatmm_ff_32x512x128_1x4x1_16x16x32_Base
{
using
ADataType
=
fp16_t
;
using
BDataType
=
fp16_t
;
// TODO: need paired with tile_window_linear!
// TODO: need call init_raw() before call this function!
template
<
typename
ARes
,
typename
ACoords
,
typename
BRes
,
typename
BCoords
>
CK_TILE_DEVICE
auto
operator
()(
const
ARes
&
res_a
,
const
ACoords
&
cached_coords_a
,
const
BRes
&
res_b
,
const
BCoords
&
cached_coords_b
,
CK_TILE_LDS_ADDR
void
*
smem
,
index_t
k
,
index_t
tile_offset_a
,
// for each tile, the offset to move for each unroll
index_t
tile_offset_b
,
int
*
dbg_int
,
short
*
dbg_bf16
,
float
*
dbg_fp32
)
{
(
void
)
dbg_int
;
(
void
)
dbg_fp32
;
(
void
)
dbg_bf16
;
static_assert
(
ACoords
::
size
()
==
Block_M
*
Block_K
/
BlockSize
/
2
/*2x per dword*/
);
// 8
static_assert
(
BCoords
::
size
()
==
Repeat_N
);
auto
a_sst
=
make_tile_window
(
make_tensor_view
<
address_space_enum
::
lds
>
(
reinterpret_cast
<
CK_TILE_LDS_ADDR
ADataType
*>
(
smem
),
MakeLdsStoreDesc_A
()),
MakeLdsStoreDesc_A
().
get_lengths
(),
{
0
,
0
,
0
});
auto
a_sld
=
[
&
]()
{
constexpr
auto
a_warp_enc_
=
GetGemm_AWarpEnc
();
constexpr
auto
a_outer_dstr_enc
=
tile_distribution_encoding
<
sequence
<
WarpPerBlock_N
>
,
tuple
<
sequence
<
Repeat_M
,
WarpPerBlock_M
>
,
sequence
<
Repeat_K
>>
,
tuple
<
sequence
<
1
,
0
>>
,
tuple
<
sequence
<
1
,
0
>>
,
sequence
<
1
,
2
>
,
sequence
<
0
,
0
>>
{};
constexpr
auto
a_block_dstr_encode
=
detail
::
make_embed_tile_distribution_encoding
(
a_outer_dstr_enc
,
a_warp_enc_
);
return
make_tile_window_linear
(
make_tensor_view
<
address_space_enum
::
lds
>
(
reinterpret_cast
<
CK_TILE_LDS_ADDR
ADataType
*>
(
smem
),
MakeLdsLoadDesc_A
()),
MakeLdsLoadDesc_A
().
get_lengths
(),
{
0
,
0
},
make_static_tile_distribution
(
a_block_dstr_encode
));
}();
const
index_t
tile_offset_a_bytes
=
tile_offset_a
*
sizeof
(
ADataType
);
const
index_t
tile_offset_b_bytes
=
tile_offset_b
*
sizeof
(
BDataType
);
const
auto
[
m0_init_value
,
size_per_issue
]
=
get_async_store_smem_info
(
a_sst
);
constexpr
auto
smem_buf_size
=
MakeLdsLoadDesc_A
().
get_element_space_size
()
*
sizeof
(
ADataType
);
static_assert
(
a_sld
.
get_num_of_access
()
==
8
);
constexpr
auto
sld_os
=
generate_tuple
(
[
&
](
auto
i_access
)
{
return
number
<
a_sld
.
get_bottom_linear_offset
(
i_access
)
*
sizeof
(
ADataType
)
>
{};
},
number
<
a_sld
.
get_num_of_access
()
>
{});
index_t
loop_cnt
=
k
/
Block_K
;
// this is the acc thread buffer
fp32x4_t
v_acc
[
16
]{
.0
f
};
float
dbg_dword
=
0.0
f
;
// B nr->kr
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Winline-asm"
// clang-format off
asm
volatile
(
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_FP16
#include "uk/flatmm_ff_uk_gfx9_32x512x128_1x1x1_16x16x16.inc"
#undef CK_TILE_FLATMM_UK_MFMA
:
[
s_loop_cnt
]
"+s"
(
loop_cnt
),
[
v_acc_0
]
"+v"
(
v_acc
[
0
]),
[
v_acc_1
]
"+v"
(
v_acc
[
1
]),
[
v_acc_2
]
"+v"
(
v_acc
[
2
]),
[
v_acc_3
]
"+v"
(
v_acc
[
3
]),
[
v_acc_4
]
"+v"
(
v_acc
[
4
]),
[
v_acc_5
]
"+v"
(
v_acc
[
5
]),
[
v_acc_6
]
"+v"
(
v_acc
[
6
]),
[
v_acc_7
]
"+v"
(
v_acc
[
7
]),
[
v_acc_8
]
"+v"
(
v_acc
[
8
]),
[
v_acc_9
]
"+v"
(
v_acc
[
9
]),
[
v_acc_10
]
"+v"
(
v_acc
[
10
]),
[
v_acc_11
]
"+v"
(
v_acc
[
11
]),
[
v_acc_12
]
"+v"
(
v_acc
[
12
]),
[
v_acc_13
]
"+v"
(
v_acc
[
13
]),
[
v_acc_14
]
"+v"
(
v_acc
[
14
]),
[
v_acc_15
]
"+v"
(
v_acc
[
15
]),
[
s_mem_
]
"+r"
(
smem
)
:
[
s_res_a0
]
"s"
(
res_a
[
0
]),
[
s_res_a1
]
"s"
(
res_a
[
1
]),
[
s_res_a2
]
"s"
(
res_a
[
2
]),
[
s_res_a3
]
"s"
(
res_a
[
3
]),
[
s_res_b0
]
"s"
(
res_b
[
0
]),
[
s_res_b1
]
"s"
(
res_b
[
1
]),
[
s_res_b2
]
"s"
(
res_b
[
2
]),
[
s_res_b3
]
"s"
(
res_b
[
3
]),
[
v_os_a0
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_a
[
number
<
0
>
{}]
*
sizeof
(
ADataType
))),
[
v_os_a1
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_a
[
number
<
1
>
{}]
*
sizeof
(
ADataType
))),
[
v_os_a2
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_a
[
number
<
2
>
{}]
*
sizeof
(
ADataType
))),
[
v_os_a3
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_a
[
number
<
3
>
{}]
*
sizeof
(
ADataType
))),
[
v_os_a4
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_a
[
number
<
4
>
{}]
*
sizeof
(
ADataType
))),
[
v_os_a5
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_a
[
number
<
5
>
{}]
*
sizeof
(
ADataType
))),
[
v_os_a6
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_a
[
number
<
6
>
{}]
*
sizeof
(
ADataType
))),
[
v_os_a7
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_a
[
number
<
7
>
{}]
*
sizeof
(
ADataType
))),
[
v_os_b0
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
0
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b1
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
1
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b2
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
2
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b3
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
3
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b4
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
4
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b5
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
5
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b6
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
6
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b7
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
7
>
{}]
*
sizeof
(
BDataType
))),
[
v_dbg
]
"v"
(
dbg_dword
),
[
v_os_slda
]
"v"
(
static_cast
<
index_t
>
(
a_sld
.
cached_coords_
[
number
<
0
>
{}].
get_offset
()
*
sizeof
(
ADataType
))),
[
s_m0_init
]
"s"
(
m0_init_value
),
[
s_size_per_issue
]
"s"
(
size_per_issue
),
[
smem_sz
]
"n"
(
smem_buf_size
),
//(smem_buf_size),
[
sld_os_0
]
"n"
(
sld_os
[
number
<
0
>
{}].
value
),
[
sld_os_1
]
"n"
(
sld_os
[
number
<
1
>
{}].
value
),
[
sld_os_2
]
"n"
(
sld_os
[
number
<
2
>
{}].
value
),
[
sld_os_3
]
"n"
(
sld_os
[
number
<
3
>
{}].
value
),
[
sld_os_4
]
"n"
(
sld_os
[
number
<
4
>
{}].
value
),
[
sld_os_5
]
"n"
(
sld_os
[
number
<
5
>
{}].
value
),
[
sld_os_6
]
"n"
(
sld_os
[
number
<
6
>
{}].
value
),
[
sld_os_7
]
"n"
(
sld_os
[
number
<
7
>
{}].
value
),
[
s_tile_os_a
]
"s"
(
tile_offset_a_bytes
),
[
s_tile_os_b
]
"s"
(
tile_offset_b_bytes
)
:
"memory"
,
"a0"
,
"a1"
,
"a2"
,
"a3"
,
"a4"
,
"a5"
,
"a6"
,
"a7"
,
"a8"
,
"a9"
,
"a10"
,
"a11"
,
"a12"
,
"a13"
,
"a14"
,
"a15"
,
"a16"
,
"a17"
,
"a18"
,
"a19"
,
"a20"
,
"a21"
,
"a22"
,
"a23"
,
"a24"
,
"a25"
,
"a26"
,
"a27"
,
"a28"
,
"a29"
,
"a30"
,
"a31"
,
"a32"
,
"a33"
,
"a34"
,
"a35"
,
"a36"
,
"a37"
,
"a38"
,
"a39"
,
"a40"
,
"a41"
,
"a42"
,
"a43"
,
"a44"
,
"a45"
,
"a46"
,
"a47"
,
"a48"
,
"a49"
,
"a50"
,
"a51"
,
"a52"
,
"a53"
,
"a54"
,
"a55"
,
"a56"
,
"a57"
,
"a58"
,
"a59"
,
"a60"
,
"a61"
,
"a62"
,
"a63"
,
"a64"
,
"a65"
,
"a66"
,
"a67"
,
"a68"
,
"a69"
,
"a70"
,
"a71"
,
"a72"
,
"a73"
,
"a74"
,
"a75"
,
"a76"
,
"a77"
,
"a78"
,
"a79"
,
"a80"
,
"a81"
,
"a82"
,
"a83"
,
"a84"
,
"a85"
,
"a86"
,
"a87"
,
"a88"
,
"a89"
,
"a90"
,
"a91"
,
"a92"
,
"a93"
,
"a94"
,
"a95"
,
"a96"
,
"a97"
,
"a98"
,
"a99"
,
"a100"
,
"a101"
,
"a102"
,
"a103"
,
"a104"
,
"a105"
,
"a106"
,
"a107"
,
"a108"
,
"a109"
,
"a110"
,
"a111"
,
"a112"
,
"a113"
,
"a114"
,
"a115"
,
"a116"
,
"a117"
,
"a118"
,
"a119"
,
"a120"
,
"a121"
,
"a122"
,
"a123"
,
"a124"
,
"a125"
,
"a126"
,
"a127"
,
"a128"
,
"a129"
,
"a130"
,
"a131"
,
"a132"
,
"a133"
,
"a134"
,
"a135"
,
"a136"
,
"a137"
,
"a138"
,
"a139"
,
"a140"
,
"a141"
,
"a142"
,
"a143"
,
"a144"
,
"a145"
,
"a146"
,
"a147"
,
"a148"
,
"a149"
,
"a150"
,
"a151"
,
"a152"
,
"a153"
,
"a154"
,
"a155"
,
"a156"
,
"a157"
,
"a158"
,
"a159"
,
"a160"
,
"a161"
,
"a162"
,
"a163"
,
"a164"
,
"a165"
,
"a166"
,
"a167"
,
"a168"
,
"a169"
,
"a170"
,
"a171"
,
"a172"
,
"a173"
,
"a174"
,
"a175"
,
"a176"
,
"a177"
,
"a178"
,
"a179"
,
"a180"
,
"a181"
,
"a182"
,
"a183"
,
"a184"
,
"a185"
,
"a186"
,
"a187"
,
"a188"
,
"a189"
,
"a190"
,
"a191"
,
"a192"
,
"a193"
,
"a194"
,
"a195"
,
"a196"
,
"a197"
,
"a198"
,
"a199"
,
"a200"
,
"a201"
,
"a202"
,
"a203"
,
"a204"
,
"a205"
,
"a206"
,
"a207"
,
"a208"
,
"a209"
,
"a210"
,
"a211"
,
"a212"
,
"a213"
,
"a214"
,
"a215"
,
"a216"
,
"a217"
,
"a218"
,
"a219"
,
"a220"
,
"a221"
,
"a222"
,
"a223"
,
"a224"
,
"a225"
,
"a226"
,
"a227"
,
"a228"
,
"a229"
,
"a230"
,
"a231"
,
"a232"
,
"a233"
,
"a234"
,
"a235"
,
"a236"
,
"a237"
,
"a238"
,
"a239"
,
"a240"
,
"a241"
,
"a242"
,
"a243"
,
"a244"
,
"a245"
,
"a246"
,
"a247"
,
"a248"
,
"a249"
,
"a250"
,
"a251"
,
"a252"
,
"a253"
,
"a254"
,
"a255"
,
"s16"
,
"s17"
,
"s18"
,
"s19"
,
"s20"
,
"s21"
,
"s22"
,
"s23"
,
"s86"
,
// s86 as tmp
"v64"
,
"v65"
,
"v66"
,
"v67"
,
"v68"
,
"v69"
,
"v70"
,
"v71"
,
"v72"
,
"v73"
,
"v74"
,
"v75"
,
"v76"
,
"v77"
,
"v78"
,
"v79"
,
"v80"
,
"v81"
,
"v82"
,
"v83"
,
"v84"
,
"v85"
,
"v86"
,
"v87"
,
"v88"
,
"v89"
,
"v90"
,
"v91"
,
"v92"
,
"v93"
,
"v94"
,
"v95"
,
"v96"
,
"v97"
,
"v98"
,
"v99"
,
"v100"
,
"v101"
,
"v102"
,
"v103"
,
"v104"
,
"v105"
,
"v106"
,
"v107"
,
"v108"
,
"v109"
,
"v110"
,
"v111"
,
"v112"
,
"v113"
,
"v114"
,
"v115"
,
"v116"
,
"v117"
,
"v118"
,
"v119"
,
"v120"
,
"v121"
,
"v122"
,
"v123"
,
"v124"
,
"v125"
,
"v126"
,
"v127"
);
// clang-format on
#pragma clang diagnostic pop
// return local scratch
auto
c
=
MakeCBlockTile
();
for
(
auto
i
=
0
;
i
<
16
;
i
++
)
{
c
.
get_thread_buffer
()[
4
*
i
+
0
]
=
v_acc
[
i
].
x
;
c
.
get_thread_buffer
()[
4
*
i
+
1
]
=
v_acc
[
i
].
y
;
c
.
get_thread_buffer
()[
4
*
i
+
2
]
=
v_acc
[
i
].
z
;
c
.
get_thread_buffer
()[
4
*
i
+
3
]
=
v_acc
[
i
].
w
;
}
return
c
;
}
};
}
// namespace ck_tile
include/ck_tile/ops/flatmm/block/uk/flatmm_ff_uk_gfx9_32x512x128_1x1x1_16x16x16.inc
0 → 100644
View file @
fa335f31
#ifndef CK_TILE_FLATMM_UK_MFMA
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_BF16
#endif
#if CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_BF16
#define _UK_MFMA_ "v_mfma_f32_16x16x16_bf16"
#elif CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_FP16
#define _UK_MFMA_ "v_mfma_f32_16x16x16_f16"
#endif
"s_mov_b32 s16, %[s_res_a0]
\n
"
"s_mov_b32 s17, %[s_res_a1]
\n
"
"s_mov_b32 s18, %[s_res_a2]
\n
"
"s_mov_b32 s19, %[s_res_a3]
\n
"
"s_mov_b32 s20, %[s_res_b0]
\n
"
"s_mov_b32 s21, %[s_res_b1]
\n
"
"s_mov_b32 s22, %[s_res_b2]
\n
"
"s_mov_b32 s23, %[s_res_b3]
\n
"
// "s_nop 4\n"
"; -- prefetch A0
\n
"
"s_add_u32 m0, 0, %[s_m0_init]
\n
"
"buffer_load_dword %[v_os_a0], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
// size_per_issue = 1088
"buffer_load_dword %[v_os_a1], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a2], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a3], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a4], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a5], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a6], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a7], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[smem_sz], %[s_m0_init]
\n
"
// smem_sz = 8688
"s_cmp_gt_i32 %[s_loop_cnt] 1 ; move a with cond
\n
"
"s_cselect_b32 s86, %[s_tile_os_a], 0 ; move a with cond
\n
"
// tile_offset_a_bytes = 256 = 128(bf16)K
"s_add_u32 s16, s86, s16 ; move a with cond
\n
"
"s_addc_u32 s17, 0, s17 ; move a with cond
\n
"
"; -- prefetch A1
\n
"
"buffer_load_dword %[v_os_a0], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a1], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a2], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a3], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a4], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a5], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a6], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a7], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, 0, %[s_m0_init]
\n
"
"s_cmp_gt_i32 %[s_loop_cnt] 2 ; move a with cond
\n
"
"s_cselect_b32 s86, %[s_tile_os_a], 0 ; move a with cond
\n
"
"s_add_u32 s16, s86, s16 ; move a with cond
\n
"
"s_addc_u32 s17, 0, s17 ; move a with cond
\n
"
"; -- prefetch B0
\n
"
"buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[20:23], 0 offen
\n
"
// bf16 * 8
"buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[20:23], 0 offen offset:3072
\n
"
"buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[20:23], 0 offen
\n
"
"buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[20:23], 0 offen offset:3072
\n
"
"buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[20:23], 0 offen
\n
"
"buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[20:23], 0 offen offset:3072
\n
"
"buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[20:23], 0 offen
\n
"
"buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[20:23], 0 offen offset:3072
\n
"
"buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[20:23], 0 offen
\n
"
"buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[20:23], 0 offen offset:3072
\n
"
"buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[20:23], 0 offen
\n
"
"buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[20:23], 0 offen offset:3072
\n
"
"buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[20:23], 0 offen
\n
"
"buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[20:23], 0 offen offset:3072
\n
"
"buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[20:23], 0 offen
\n
"
"buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[20:23], 0 offen offset:3072
\n
"
#if 0
// test a
//" s_waitcnt vmcnt(0) & lgkmcnt(0) \n"
//" s_barrier \n"
//"ds_read_b128 v[64:67], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_0]\n" // 1024: N stride, 64 // K stride
//"ds_read_b128 v[68:71], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_1]\n"
//"ds_read_b128 v[72:75], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_2]\n"
//"ds_read_b128 v[76:79], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_3]\n"
//"ds_read_b128 v[80:83], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_4]\n"
//"ds_read_b128 v[84:87], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_5]\n"
//"ds_read_b128 v[88:91], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_6]\n"
//"ds_read_b128 v[92:95], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_7]\n"
//" s_waitcnt vmcnt(0) & lgkmcnt(0) \n"
//" s_barrier \n"
//"v_add_u32 %[v_dbg], %[v_os_slda], %[sld_os_0] \n"
//"v_mov_b32 %[v_dbg], v64 \n"
// test b
" s_waitcnt vmcnt(0) & lgkmcnt(0)
\n
"
" s_barrier
\n
"
//"s_add_u32 s20, s86, s20 ; move b with cond \n"
//"s_addc_u32 s21, 0, s21 ; move b with cond \n"
"buffer_load_dwordx4 %[v_acc_0], %[v_os_b0], s[20:23], 0 offen offset:1024
\n
"
" s_waitcnt vmcnt(0) & lgkmcnt(0)
\n
"
" s_barrier
\n
"
"v_mov_b32 %[v_dbg], %[v_os_b2]
\n
"
#else
"s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond
\n
"
"s_cselect_b32 s86, %[s_tile_os_b], 0 ; move b with cond
\n
"
// s_tile_os_b = 4096B = 2048(bf16)K
"s_add_u32 s20, s86, s20 ; move b with cond
\n
"
"s_addc_u32 s21, 0, s21 ; move b with cond
\n
"
"s_waitcnt vmcnt(40)
\n
"
"s_barrier
\n
"
"ds_read_b128 v[64:67], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_0]
\n
"
// 1024: N stride, 64 // K stride
"ds_read_b128 v[68:71], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_1]
\n
"
"ds_read_b128 v[72:75], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_2]
\n
"
"ds_read_b128 v[76:79], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_3]
\n
"
"ds_read_b128 v[80:83], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_4]
\n
"
"ds_read_b128 v[84:87], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_5]
\n
"
"ds_read_b128 v[88:91], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_6]
\n
"
"ds_read_b128 v[92:95], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_7]
\n
"
//////////////////////////////////////////////////////////////////////
"L_start%=:
\n
"
" s_waitcnt vmcnt(24) & lgkmcnt(0)
\n
"
" s_barrier
\n
"
_UK_MFMA_
" %[v_acc_0], acc[0:1], v[64:65], %[v_acc_0]
\n
"
_UK_MFMA_
" %[v_acc_0], acc[2:3], v[66:67], %[v_acc_0]
\n
"
" buffer_load_dwordx4 acc[128:131], %[v_os_b0], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_0], acc[4:5], v[68:69], %[v_acc_0]
\n
"
_UK_MFMA_
" %[v_acc_0], acc[6:7], v[70:71], %[v_acc_0]
\n
"
" buffer_load_dword %[v_os_a0], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_0], acc[8:9], v[72:73], %[v_acc_0]
\n
"
_UK_MFMA_
" %[v_acc_0], acc[10:11], v[74:75], %[v_acc_0]
\n
"
" buffer_load_dwordx4 acc[132:135], %[v_os_b0], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_0], acc[12:13], v[76:77], %[v_acc_0]
\n
"
_UK_MFMA_
" %[v_acc_0], acc[14:15], v[78:79], %[v_acc_0]
\n
"
" buffer_load_dword %[v_os_a1], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_1], acc[0:1], v[80:81], %[v_acc_1]
\n
"
_UK_MFMA_
" %[v_acc_1], acc[2:3], v[82:83], %[v_acc_1]
\n
"
" buffer_load_dwordx4 acc[136:139], %[v_os_b0], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_1], acc[4:5], v[84:85], %[v_acc_1]
\n
"
_UK_MFMA_
" %[v_acc_1], acc[6:7], v[86:87], %[v_acc_1]
\n
"
" buffer_load_dword %[v_os_a2], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_1], acc[8:9], v[88:89], %[v_acc_1]
\n
"
_UK_MFMA_
" %[v_acc_1], acc[10:11], v[90:91], %[v_acc_1]
\n
"
" buffer_load_dwordx4 acc[140:143], %[v_os_b0], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_1], acc[12:13], v[92:93], %[v_acc_1]
\n
"
_UK_MFMA_
" %[v_acc_1], acc[14:15], v[94:95], %[v_acc_1]
\n
"
" buffer_load_dword %[v_os_a3], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_2], acc[16:17], v[64:65], %[v_acc_2]
\n
"
_UK_MFMA_
" %[v_acc_2], acc[18:19], v[66:67], %[v_acc_2]
\n
"
" buffer_load_dwordx4 acc[144:147], %[v_os_b1], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_2], acc[20:21], v[68:69], %[v_acc_2]
\n
"
_UK_MFMA_
" %[v_acc_2], acc[22:23], v[70:71], %[v_acc_2]
\n
"
" buffer_load_dword %[v_os_a4], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_2], acc[24:25], v[72:73], %[v_acc_2]
\n
"
_UK_MFMA_
" %[v_acc_2], acc[26:27], v[74:75], %[v_acc_2]
\n
"
" buffer_load_dwordx4 acc[148:151], %[v_os_b1], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_2], acc[28:29], v[76:77], %[v_acc_2]
\n
"
_UK_MFMA_
" %[v_acc_2], acc[30:31], v[78:79], %[v_acc_2]
\n
"
" buffer_load_dword %[v_os_a5], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_3], acc[16:17], v[80:81], %[v_acc_3]
\n
"
_UK_MFMA_
" %[v_acc_3], acc[18:19], v[82:83], %[v_acc_3]
\n
"
" buffer_load_dwordx4 acc[152:155], %[v_os_b1], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_3], acc[20:21], v[84:85], %[v_acc_3]
\n
"
_UK_MFMA_
" %[v_acc_3], acc[22:23], v[86:87], %[v_acc_3]
\n
"
" buffer_load_dword %[v_os_a6], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_3], acc[24:25], v[88:89], %[v_acc_3]
\n
"
_UK_MFMA_
" %[v_acc_3], acc[26:27], v[90:91], %[v_acc_3]
\n
"
" buffer_load_dwordx4 acc[156:159], %[v_os_b1], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_3], acc[28:29], v[92:93], %[v_acc_3]
\n
"
_UK_MFMA_
" %[v_acc_3], acc[30:31], v[94:95], %[v_acc_3]
\n
"
" buffer_load_dword %[v_os_a7], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[smem_sz], %[s_m0_init]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" %[v_acc_4], acc[32:33], v[64:65], %[v_acc_4]
\n
"
_UK_MFMA_
" %[v_acc_4], acc[34:35], v[66:67], %[v_acc_4]
\n
"
" buffer_load_dwordx4 acc[160:163], %[v_os_b2], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_4], acc[36:37], v[68:69], %[v_acc_4]
\n
"
_UK_MFMA_
" %[v_acc_4], acc[38:39], v[70:71], %[v_acc_4]
\n
"
" ds_read_b128 v[96:99], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_0]
\n
"
_UK_MFMA_
" %[v_acc_4], acc[40:41], v[72:73], %[v_acc_4]
\n
"
_UK_MFMA_
" %[v_acc_4], acc[42:43], v[74:75], %[v_acc_4]
\n
"
" buffer_load_dwordx4 acc[164:167], %[v_os_b2], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_4], acc[44:45], v[76:77], %[v_acc_4]
\n
"
_UK_MFMA_
" %[v_acc_4], acc[46:47], v[78:79], %[v_acc_4]
\n
"
" ds_read_b128 v[100:103], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_1]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[32:33], v[80:81], %[v_acc_5]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[34:35], v[82:83], %[v_acc_5]
\n
"
" buffer_load_dwordx4 acc[168:171], %[v_os_b2], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_5], acc[36:37], v[84:85], %[v_acc_5]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[38:39], v[86:87], %[v_acc_5]
\n
"
" ds_read_b128 v[104:107], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_2]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[40:41], v[88:89], %[v_acc_5]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[42:43], v[90:91], %[v_acc_5]
\n
"
" buffer_load_dwordx4 acc[172:175], %[v_os_b2], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_5], acc[44:45], v[92:93], %[v_acc_5]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[46:47], v[94:95], %[v_acc_5]
\n
"
" ds_read_b128 v[108:111], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_3]
\n
"
_UK_MFMA_
" %[v_acc_6], acc[48:49], v[64:65], %[v_acc_6]
\n
"
_UK_MFMA_
" %[v_acc_6], acc[50:51], v[66:67], %[v_acc_6]
\n
"
" buffer_load_dwordx4 acc[176:179], %[v_os_b3], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_6], acc[52:53], v[68:69], %[v_acc_6]
\n
"
_UK_MFMA_
" %[v_acc_6], acc[54:55], v[70:71], %[v_acc_6]
\n
"
" ds_read_b128 v[112:115], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_4]
\n
"
_UK_MFMA_
" %[v_acc_6], acc[56:57], v[72:73], %[v_acc_6]
\n
"
_UK_MFMA_
" %[v_acc_6], acc[58:59], v[74:75], %[v_acc_6]
\n
"
" buffer_load_dwordx4 acc[180:183], %[v_os_b3], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_6], acc[60:61], v[76:77], %[v_acc_6]
\n
"
_UK_MFMA_
" %[v_acc_6], acc[62:63], v[78:79], %[v_acc_6]
\n
"
" ds_read_b128 v[116:119], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_5]
\n
"
_UK_MFMA_
" %[v_acc_7], acc[48:49], v[80:81], %[v_acc_7]
\n
"
_UK_MFMA_
" %[v_acc_7], acc[50:51], v[82:83], %[v_acc_7]
\n
"
" buffer_load_dwordx4 acc[184:187], %[v_os_b3], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_7], acc[52:53], v[84:85], %[v_acc_7]
\n
"
_UK_MFMA_
" %[v_acc_7], acc[54:55], v[86:87], %[v_acc_7]
\n
"
" ds_read_b128 v[120:123], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_6]
\n
"
_UK_MFMA_
" %[v_acc_7], acc[56:57], v[88:89], %[v_acc_7]
\n
"
_UK_MFMA_
" %[v_acc_7], acc[58:59], v[90:91], %[v_acc_7]
\n
"
" buffer_load_dwordx4 acc[188:191], %[v_os_b3], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_7], acc[60:61], v[92:93], %[v_acc_7]
\n
"
_UK_MFMA_
" %[v_acc_7], acc[62:63], v[94:95], %[v_acc_7]
\n
"
" ds_read_b128 v[124:127], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_7]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" %[v_acc_8], acc[64:65], v[64:65], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_8], acc[66:67], v[66:67], %[v_acc_8]
\n
"
" buffer_load_dwordx4 acc[192:195], %[v_os_b4], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_8], acc[68:69], v[68:69], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_8], acc[70:71], v[70:71], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_8], acc[72:73], v[72:73], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_8], acc[74:75], v[74:75], %[v_acc_8]
\n
"
" buffer_load_dwordx4 acc[196:199], %[v_os_b4], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_8], acc[76:77], v[76:77], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_8], acc[78:79], v[78:79], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[64:65], v[80:81], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[66:67], v[82:83], %[v_acc_9]
\n
"
" buffer_load_dwordx4 acc[200:203], %[v_os_b4], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_9], acc[68:69], v[84:85], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[70:71], v[86:87], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[72:73], v[88:89], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[74:75], v[90:91], %[v_acc_9]
\n
"
" buffer_load_dwordx4 acc[204:207], %[v_os_b4], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_9], acc[76:77], v[92:93], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[78:79], v[94:95], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[80:81], v[64:65], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[82:83], v[66:67], %[v_acc_10]
\n
"
" buffer_load_dwordx4 acc[208:211], %[v_os_b5], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_10], acc[84:85], v[68:69], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[86:87], v[70:71], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[88:89], v[72:73], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[90:91], v[74:75], %[v_acc_10]
\n
"
" buffer_load_dwordx4 acc[212:215], %[v_os_b5], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_10], acc[92:93], v[76:77], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[94:95], v[78:79], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[80:81], v[80:81], %[v_acc_11]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[82:83], v[82:83], %[v_acc_11]
\n
"
" buffer_load_dwordx4 acc[216:219], %[v_os_b5], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_11], acc[84:85], v[84:85], %[v_acc_11]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[86:87], v[86:87], %[v_acc_11]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[88:89], v[88:89], %[v_acc_11]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[90:91], v[90:91], %[v_acc_11]
\n
"
" buffer_load_dwordx4 acc[220:223], %[v_os_b5], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_11], acc[92:93], v[92:93], %[v_acc_11]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[94:95], v[94:95], %[v_acc_11]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" %[v_acc_12], acc[96:97], v[64:65], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_12], acc[98:99], v[66:67], %[v_acc_12]
\n
"
" buffer_load_dwordx4 acc[224:227], %[v_os_b6], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_12], acc[100:101], v[68:69], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_12], acc[102:103], v[70:71], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_12], acc[104:105], v[72:73], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_12], acc[106:107], v[74:75], %[v_acc_12]
\n
"
" buffer_load_dwordx4 acc[228:231], %[v_os_b6], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_12], acc[108:109], v[76:77], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_12], acc[110:111], v[78:79], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[96:97], v[80:81], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[98:99], v[82:83], %[v_acc_13]
\n
"
" buffer_load_dwordx4 acc[232:235], %[v_os_b6], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_13], acc[100:101], v[84:85], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[102:103], v[86:87], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[104:105], v[88:89], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[106:107], v[90:91], %[v_acc_13]
\n
"
" buffer_load_dwordx4 acc[236:239], %[v_os_b6], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_13], acc[108:109], v[92:93], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[110:111], v[94:95], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[112:113], v[64:65], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[114:115], v[66:67], %[v_acc_14]
\n
"
" buffer_load_dwordx4 acc[240:243], %[v_os_b7], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_14], acc[116:117], v[68:69], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[118:119], v[70:71], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[120:121], v[72:73], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[122:123], v[74:75], %[v_acc_14]
\n
"
" buffer_load_dwordx4 acc[244:247], %[v_os_b7], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_14], acc[124:125], v[76:77], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[126:127], v[78:79], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[112:113], v[80:81], %[v_acc_15]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[114:115], v[82:83], %[v_acc_15]
\n
"
" buffer_load_dwordx4 acc[248:251], %[v_os_b7], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_15], acc[116:117], v[84:85], %[v_acc_15]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[118:119], v[86:87], %[v_acc_15]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[120:121], v[88:89], %[v_acc_15]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[122:123], v[90:91], %[v_acc_15]
\n
"
" buffer_load_dwordx4 acc[252:255], %[v_os_b7], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_15], acc[124:125], v[92:93], %[v_acc_15]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[126:127], v[94:95], %[v_acc_15]
\n
"
////////////////////////////////////////////////////
" s_sub_i32 %[s_loop_cnt], %[s_loop_cnt], 1
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 0
\n
"
" s_cbranch_scc0 L_end%=
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 2 ; move a with cond
\n
"
" s_cselect_b32 s86, %[s_tile_os_a], 0
\n
"
" s_add_u32 s16, s86, s16
\n
"
" s_addc_u32 s17, 0, s17
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond
\n
"
" s_cselect_b32 s86, %[s_tile_os_b], 0
\n
"
" s_add_u32 s20, s86, s20
\n
"
" s_addc_u32 s21, 0, s21
\n
"
" ;------------------------------------------
\n
"
" s_waitcnt vmcnt(24) & lgkmcnt(0)
\n
"
" s_barrier
\n
"
///////////////////////////////////////////////////////
_UK_MFMA_
" %[v_acc_0], acc[128:129], v[96:97], %[v_acc_0]
\n
"
_UK_MFMA_
" %[v_acc_0], acc[130:131], v[98:99], %[v_acc_0]
\n
"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_0], acc[132:133], v[100:101], %[v_acc_0]
\n
"
_UK_MFMA_
" %[v_acc_0], acc[134:135], v[102:103], %[v_acc_0]
\n
"
" buffer_load_dword %[v_os_a0], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_0], acc[136:137], v[104:105], %[v_acc_0]
\n
"
_UK_MFMA_
" %[v_acc_0], acc[138:139], v[106:107], %[v_acc_0]
\n
"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_0], acc[140:141], v[108:109], %[v_acc_0]
\n
"
_UK_MFMA_
" %[v_acc_0], acc[142:143], v[110:111], %[v_acc_0]
\n
"
" buffer_load_dword %[v_os_a1], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_1], acc[128:129], v[112:113], %[v_acc_1]
\n
"
_UK_MFMA_
" %[v_acc_1], acc[130:131], v[114:115], %[v_acc_1]
\n
"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_1], acc[132:133], v[116:117], %[v_acc_1]
\n
"
_UK_MFMA_
" %[v_acc_1], acc[134:135], v[118:119], %[v_acc_1]
\n
"
" buffer_load_dword %[v_os_a2], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_1], acc[136:137], v[120:121], %[v_acc_1]
\n
"
_UK_MFMA_
" %[v_acc_1], acc[138:139], v[122:123], %[v_acc_1]
\n
"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_1], acc[140:141], v[124:125], %[v_acc_1]
\n
"
_UK_MFMA_
" %[v_acc_1], acc[142:143], v[126:127], %[v_acc_1]
\n
"
" buffer_load_dword %[v_os_a3], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_2], acc[144:145], v[96:97], %[v_acc_2]
\n
"
_UK_MFMA_
" %[v_acc_2], acc[146:147], v[98:99], %[v_acc_2]
\n
"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_2], acc[148:149], v[100:101], %[v_acc_2]
\n
"
_UK_MFMA_
" %[v_acc_2], acc[150:151], v[102:103], %[v_acc_2]
\n
"
" buffer_load_dword %[v_os_a4], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_2], acc[152:153], v[104:105], %[v_acc_2]
\n
"
_UK_MFMA_
" %[v_acc_2], acc[154:155], v[106:107], %[v_acc_2]
\n
"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_2], acc[156:157], v[108:109], %[v_acc_2]
\n
"
_UK_MFMA_
" %[v_acc_2], acc[158:159], v[110:111], %[v_acc_2]
\n
"
" buffer_load_dword %[v_os_a5], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_3], acc[144:145], v[112:113], %[v_acc_3]
\n
"
_UK_MFMA_
" %[v_acc_3], acc[146:147], v[114:115], %[v_acc_3]
\n
"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_3], acc[148:149], v[116:117], %[v_acc_3]
\n
"
_UK_MFMA_
" %[v_acc_3], acc[150:151], v[118:119], %[v_acc_3]
\n
"
" buffer_load_dword %[v_os_a6], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_3], acc[152:153], v[120:121], %[v_acc_3]
\n
"
_UK_MFMA_
" %[v_acc_3], acc[154:155], v[122:123], %[v_acc_3]
\n
"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_3], acc[156:157], v[124:125], %[v_acc_3]
\n
"
_UK_MFMA_
" %[v_acc_3], acc[158:159], v[126:127], %[v_acc_3]
\n
"
" buffer_load_dword %[v_os_a7], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, 0, %[s_m0_init]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" %[v_acc_4], acc[160:161], v[96:97], %[v_acc_4]
\n
"
_UK_MFMA_
" %[v_acc_4], acc[162:163], v[98:99], %[v_acc_4]
\n
"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_4], acc[164:165], v[100:101], %[v_acc_4]
\n
"
_UK_MFMA_
" %[v_acc_4], acc[166:167], v[102:103], %[v_acc_4]
\n
"
" ds_read_b128 v[64:67], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_0]
\n
"
_UK_MFMA_
" %[v_acc_4], acc[168:169], v[104:105], %[v_acc_4]
\n
"
_UK_MFMA_
" %[v_acc_4], acc[170:171], v[106:107], %[v_acc_4]
\n
"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_4], acc[172:173], v[108:109], %[v_acc_4]
\n
"
_UK_MFMA_
" %[v_acc_4], acc[174:175], v[110:111], %[v_acc_4]
\n
"
" ds_read_b128 v[68:71], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_1]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[160:161], v[112:113], %[v_acc_5]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[162:163], v[114:115], %[v_acc_5]
\n
"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_5], acc[164:165], v[116:117], %[v_acc_5]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[166:167], v[118:119], %[v_acc_5]
\n
"
" ds_read_b128 v[72:75], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_2] "
"
\n
"
_UK_MFMA_
" %[v_acc_5], acc[168:169], v[120:121], %[v_acc_5]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[170:171], v[122:123], %[v_acc_5]
\n
"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_5], acc[172:173], v[124:125], %[v_acc_5]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[174:175], v[126:127], %[v_acc_5]
\n
"
" ds_read_b128 v[76:79], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_3] "
"
\n
"
_UK_MFMA_
" %[v_acc_6], acc[176:177], v[96:97], %[v_acc_6]
\n
"
_UK_MFMA_
" %[v_acc_6], acc[178:179], v[98:99], %[v_acc_6]
\n
"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_6], acc[180:181], v[100:101], %[v_acc_6]
\n
"
_UK_MFMA_
" %[v_acc_6], acc[182:183], v[102:103], %[v_acc_6]
\n
"
" ds_read_b128 v[80:83], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_4] "
"
\n
"
_UK_MFMA_
" %[v_acc_6], acc[184:185], v[104:105], %[v_acc_6]
\n
"
_UK_MFMA_
" %[v_acc_6], acc[186:187], v[106:107], %[v_acc_6]
\n
"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_6], acc[188:189], v[108:109], %[v_acc_6]
\n
"
_UK_MFMA_
" %[v_acc_6], acc[190:191], v[110:111], %[v_acc_6]
\n
"
" ds_read_b128 v[84:87], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_5] "
"
\n
"
_UK_MFMA_
" %[v_acc_7], acc[176:177], v[112:113], %[v_acc_7]
\n
"
_UK_MFMA_
" %[v_acc_7], acc[178:179], v[114:115], %[v_acc_7]
\n
"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_7], acc[180:181], v[116:117], %[v_acc_7]
\n
"
_UK_MFMA_
" %[v_acc_7], acc[182:183], v[118:119], %[v_acc_7]
\n
"
" ds_read_b128 v[88:91], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_6] "
"
\n
"
_UK_MFMA_
" %[v_acc_7], acc[184:185], v[120:121], %[v_acc_7]
\n
"
_UK_MFMA_
" %[v_acc_7], acc[186:187], v[122:123], %[v_acc_7]
\n
"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_7], acc[188:189], v[124:125], %[v_acc_7]
\n
"
_UK_MFMA_
" %[v_acc_7], acc[190:191], v[126:127], %[v_acc_7]
\n
"
" ds_read_b128 v[92:95], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_7]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" %[v_acc_8], acc[192:193], v[96:97], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_8], acc[194:195], v[98:99], %[v_acc_8]
\n
"
" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_8], acc[196:197], v[100:101], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_8], acc[198:199], v[102:103], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_8], acc[200:201], v[104:105], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_8], acc[202:203], v[106:107], %[v_acc_8]
\n
"
" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_8], acc[204:205], v[108:109], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_8], acc[206:207], v[110:111], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[192:193], v[112:113], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[194:195], v[114:115], %[v_acc_9]
\n
"
" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_9], acc[196:197], v[116:117], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[198:199], v[118:119], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[200:201], v[120:121], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[202:203], v[122:123], %[v_acc_9]
\n
"
" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_9], acc[204:205], v[124:125], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[206:207], v[126:127], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[208:209], v[96:97], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[210:211], v[98:99], %[v_acc_10]
\n
"
" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_10], acc[212:213], v[100:101], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[214:215], v[102:103], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[216:217], v[104:105], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[218:219], v[106:107], %[v_acc_10]
\n
"
" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_10], acc[220:221], v[108:109], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[222:223], v[110:111], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[208:209], v[112:113], %[v_acc_11]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[210:211], v[114:115], %[v_acc_11]
\n
"
" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_11], acc[212:213], v[116:117], %[v_acc_11]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[214:215], v[118:119], %[v_acc_11]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[216:217], v[120:121], %[v_acc_11]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[218:219], v[122:123], %[v_acc_11]
\n
"
" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_11], acc[220:221], v[124:125], %[v_acc_11]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[222:223], v[126:127], %[v_acc_11]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" %[v_acc_12], acc[224:225], v[96:97], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_12], acc[226:227], v[98:99], %[v_acc_12]
\n
"
" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_12], acc[228:229], v[100:101], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_12], acc[230:231], v[102:103], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_12], acc[232:233], v[104:105], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_12], acc[234:235], v[106:107], %[v_acc_12]
\n
"
" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_12], acc[236:237], v[108:109], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_12], acc[238:239], v[110:111], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[224:225], v[112:113], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[226:227], v[114:115], %[v_acc_13]
\n
"
" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_13], acc[228:229], v[116:117], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[230:231], v[118:119], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[232:233], v[120:121], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[234:235], v[122:123], %[v_acc_13]
\n
"
" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_13], acc[236:237], v[124:125], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[238:239], v[126:127], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[240:241], v[96:97], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[242:243], v[98:99], %[v_acc_14]
\n
"
" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_14], acc[244:245], v[100:101], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[246:247], v[102:103], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[248:249], v[104:105], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[250:251], v[106:107], %[v_acc_14]
\n
"
" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_14], acc[252:253], v[108:109], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[254:255], v[110:111], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[240:241], v[112:113], %[v_acc_15]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[242:243], v[114:115], %[v_acc_15]
\n
"
" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_15], acc[244:245], v[116:117], %[v_acc_15]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[246:247], v[118:119], %[v_acc_15]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[248:249], v[120:121], %[v_acc_15]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[250:251], v[122:123], %[v_acc_15]
\n
"
" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_15], acc[252:253], v[124:125], %[v_acc_15]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[254:255], v[126:127], %[v_acc_15]
\n
"
" s_sub_i32 %[s_loop_cnt], %[s_loop_cnt], 1
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 0
\n
"
" s_cbranch_scc0 L_end%=
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 2 ; move a with cond
\n
"
" s_cselect_b32 s86, %[s_tile_os_a], 0
\n
"
" s_add_u32 s16, s86, s16
\n
"
" s_addc_u32 s17, 0, s17
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond
\n
"
" s_cselect_b32 s86, %[s_tile_os_b], 0
\n
"
" s_add_u32 s20, s86, s20
\n
"
" s_addc_u32 s21, 0, s21
\n
"
" s_branch L_start%=
\n
"
"L_end%=:
\n
"
" s_nop 2
\n
"
#endif
#undef _UK_MFMA_
include/ck_tile/ops/flatmm_uk.hpp
0 → 100644
View file @
fa335f31
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/ops/flatmm/block/flatmm_uk_config.hpp"
#include "ck_tile/ops/flatmm/block/flatmm_32x512x128_1x4x1_16x16x32.hpp"
#include "ck_tile/ops/flatmm/block/flatmm_ff_32x512x128_1x4x1_16x16x32.hpp"
#include "ck_tile/ops/fused_moe/kernel/flatmm_uk_kernel.hpp"
#include "ck_tile/ops/fused_moe/kernel/fused_moegemm_shape.hpp"
#include "ck_tile/ops/fused_moe/kernel/fused_moegemm_tile_partitioner.hpp"
#include "ck_tile/ops/fused_moe/pipeline/flatmm_uk_pipeline_policy.hpp"
#include "ck_tile/ops/fused_moe/pipeline/flatmm_uk_pipeline.hpp"
#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_problem.hpp"
#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_traits.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
include/ck_tile/ops/fused_moe/kernel/flatmm_uk_kernel.hpp
0 → 100644
View file @
fa335f31
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/common.hpp"
#include "ck_tile/ops/elementwise.hpp"
#include <string>
#include <type_traits>
// clang-format off
// [indexing implementation-1]
// using M_a as constexpr block_size to partition all tokens into different slices
// each slice map to one expert, and one expert can have multiple slices
// e.g. num_experts = 6, topk=3, M_a = 4, input_tokens = 5
// before sort, topk_ids is : [[0, 3, 5], [2, 3, 5], [1, 3, 5], [1, 2, 3], [1, 3, 5]]
// tok-0 tok-1 tok-2 tok-3 tok-4
// topk_weight is : [[a, b, c], [d, e, f], [g, h, i], [j, k, l], [m, n, o]] (some float number)
//
// token_id_per_expert is : [[0], [2, 3, 4], [1, 3], [0, 1, 2, 3, 4], [], [0, 1, 2, 5]]
// (only for reference) exp-0 exp-1 exp-2 exp-3 exp-4 exp-5
// weight_id_per_expert is: [[a], [g, j, m], [d, k], [b, e, h, l, n], [], [c, f, i, o]]
//
// max_num_tokens_padded : topk * input_tokens + num_experts * (M_a - 1)
// * this could be larger than actual, since actual tokens are on GPU
//
// sorted_token_ids_ptr : [0, 6, 6, 6, 2, 3, 4, 6, 1, 3, 6, 6, 0, 1, 2, 3, 4, 6, 6, 6, 6, 6, 6, 6, 0, 1, 2, 5]
// |- exp-0 -|- exp-1 -|- exp-2 -|- exp-3 -|- exp-4 -|- exp-5 -|
// sorted_weight_ptr : [a, *, *, *, g, j, m, *, d, k, *, *, b, e, h, l, n, *, *, *, *, *, *, *, c, f, i, o]
//
// * length is max_num_tokens_padded, actual size is num_tokens_post_padded_ptr
//
// * Note on token_id_per_expert/sorted_token_ids_ptr data:
// currently we do not have topk information from the data of token_id_per_expert/sorted_token_ids_ptr.
// In some cases(like smooth-quant), we need topk information to indexing into tokens quant from
// different expert smooth quant. So we modify the number stored inside token_id_per_expert/sorted_token_ids_ptr
//
// 32bit 0........23 24.....31 bit
// (data) -> (token_id | topk_id)
// low 24 bit is for token id, top 8 bit is for topk id
//
// the input after smooth-quant is [token, topk, hidden_dim], originally it is [token, hidden_dim]
// the input scale for token is [topk, token, 1], the smooth-quant scale for first gemm is [expert, interm_dim]
//
// sorted_expert_ids_ptr : [0, 1, 2, 3, 3, 4, 5]
// * length is (max_num_tokens_padded + block_size - 1) / block_size
//
// num_tokens_post_padded_ptr : [28]
// num_sorted_tiles_ptr : [7]
//
// * different from vLLM
// 1) token_id stored in sorted_token_ids_ptr is actual token_id, not token_id*top_K expanded id
// 2)need sorted_weight_ptr
// 3) use num_sorted_tiles_ptr, already divided by M_a
//
// * below used for indexing
// 1) sorted_token_ids_ptr [max_num_tokens_padded]
// 2) sorted_weight_ptr
// 3) sorted_expert_ids_ptr
// 4)num_tokens_post_padded_ptr/num_sorted_tiles_ptr (select one)
//
// max_num_tokens_padded: opk_ids.numel() + num_experts * (block_size - 1)
//
// [indexing implementation-2]
// before sort, topk_ids is : [[0, 3, 5], [2, 3, 5], [1, 3, 5], [1, 2, 3], [1, 3, 5]]
// tok-0 tok-1 tok-2 tok-3 tok-4
// topk_weight is : [[a, b, c], [d, e, f], [g, h, i], [j, k, l], [m, n, o]] (some float number)
//
// we generate original rol/col id as
// topk_rc_ids : [[0, 5, A], [1, 6, B], [2, 7, C], [3, 8, D], [4, 9, E]]
// let x be one element of above, we can get:
// tpok_row_id(token_id) = x % num_tokens(5)
// tpok_col_id(expert_Id) = x / num_tokens
// topk_row_id/col_id can be used to access original topk_ids/topk_weight
//
// token_id_per_expert is : [[0], [2, 3, 4], [1, 3], [0, 1, 2, 3, 4], [], [0, 1, 5, 5]]
// (only for reference) exp-0 exp-1 exp-2 exp-3 exp-4 exp-5
// weight_id_per_expert is: [[a], [g, j, m], [d, k], [b, e, h, l, n], [], [c, f, i, o]]
//
// we can get permuted_rc_ids:
// [[0], [2, 3, 4], [1, 8], [5, 6, 7, D, 9], [], [A, B, C, E]]
//
//
// clang-format on
//
namespace
ck_tile
{
// m: num_tokens (or token*input-batch)
// k: intermediate_size
// n: intermediate_size used between 2 FC (TP slice this)
// e: num expert
// if doing pre-shuffle
// nr : n / Block_Nr
// kr : k / Block_Kr
// w : fattened 1d wave buffer
struct
FlatmmUkHostArgs
{
const
void
*
a_ptr
;
// [m, k], input token
const
void
*
b_ptr
;
// [m, k], input token
const
void
*
c_ptr
;
// [m, k], output token
void
*
d_ptr
;
// [m, k], output token
void
*
dbg_int_ptr
;
// [m, k], output token
void
*
dbg_bf16_ptr
;
// [m, k], output token
void
*
dbg_fp32_ptr
;
// [m, k], output token
index_t
hidden_size
;
// K
index_t
intermediate_size
;
// N
index_t
num_tokens
;
// M
index_t
num_experts
;
// number of groups
index_t
topk
;
// need this?
index_t
stride_token
;
// for input/output, stride for each row, should >= hidden_size
};
// This is scatter/gather b2b group-gemm
template
<
typename
Pipeline_
,
typename
Epilogue_
>
struct
FlatmmUkKernel
{
using
Pipeline
=
remove_cvref_t
<
Pipeline_
>
;
using
Epilogue
=
remove_cvref_t
<
Epilogue_
>
;
// TODO: not used
// static constexpr index_t kBlockPerCu = Pipeline::kBlockPerCu;
// static_assert(kBlockPerCu > 0);
using
BlockShape
=
typename
Pipeline
::
BlockShape
;
// this is FusedMoeGemmShape
static
constexpr
index_t
BlockSize_
=
BlockShape
::
BlockSize
;
using
ADataType
=
typename
Pipeline
::
Problem
::
ADataType
;
using
GDataType
=
typename
Pipeline
::
Problem
::
GDataType
;
using
DDataType
=
typename
Pipeline
::
Problem
::
AccDataType
;
using
AccDataType
=
typename
Pipeline
::
Problem
::
AccDataType
;
using
ODataType
=
typename
Pipeline
::
Problem
::
ODataType
;
using
AScaleDataType
=
typename
Pipeline
::
Problem
::
AScaleDataType
;
using
GScaleDataType
=
typename
Pipeline
::
Problem
::
GScaleDataType
;
using
DScaleDataType
=
typename
Pipeline
::
Problem
::
DScaleDataType
;
using
YSmoothScaleDataType
=
typename
Pipeline
::
Problem
::
YSmoothScaleDataType
;
using
TopkWeightDataType
=
typename
Pipeline
::
Problem
::
TopkWeightDataType
;
using
IndexDataType
=
typename
Pipeline
::
Problem
::
IndexDataType
;
using
YDataType
=
typename
Pipeline
::
Problem
::
YDataType
;
using
Traits
=
typename
Pipeline
::
Problem
::
Traits
;
static
constexpr
bool
UseUK
=
true
;
static
constexpr
bool
IsGateOnly
=
Traits
::
IsGateOnly
;
static
constexpr
bool
UseSmoothQuant
=
Traits
::
UseSmoothQuant
;
static
constexpr
bool
PadHiddenSize
=
Traits
::
PadHiddenSize
;
static
constexpr
bool
PadIntermediateSize
=
Traits
::
PadIntermediateSize
;
// clang-format off
template
<
typename
T
>
struct
t2s
;
template
<
>
struct
t2s
<
float
>
{
static
constexpr
const
char
*
name
=
"fp32"
;
};
template
<
>
struct
t2s
<
fp16_t
>
{
static
constexpr
const
char
*
name
=
"fp16"
;
};
template
<
>
struct
t2s
<
bf16_t
>
{
static
constexpr
const
char
*
name
=
"bf16"
;
};
template
<
>
struct
t2s
<
fp8_t
>
{
static
constexpr
const
char
*
name
=
"fp8"
;
};
template
<
>
struct
t2s
<
bf8_t
>
{
static
constexpr
const
char
*
name
=
"bf8"
;
};
template
<
>
struct
t2s
<
int8_t
>
{
static
constexpr
const
char
*
name
=
"int8"
;
};
// clang-format on
CK_TILE_HOST
static
std
::
string
GetName
()
{
#define _SS_ std::string
#define _TS_ std::to_string
// clang-format off
using
S_
=
BlockShape
;
auto
prec_str
=
[
&
]
()
{
std
::
string
base_str
=
_SS_
(
t2s
<
ADataType
>::
name
);
if
(
!
std
::
is_same_v
<
ADataType
,
GDataType
>
)
{
base_str
+=
_SS_
(
"_"
)
+
_SS_
(
t2s
<
GDataType
>::
name
);
}
return
base_str
;
}();
return
_SS_
(
"fused_moe_"
)
+
_SS_
(
prec_str
)
+
"_"
+
_TS_
(
S_
::
Block_M0
)
+
"x"
+
_TS_
(
S_
::
Block_N0
)
+
"x"
+
_TS_
(
S_
::
Block_K0
)
+
"x"
+
_TS_
(
S_
::
Block_N1
)
+
"_"
+
_TS_
(
S_
::
WarpPerBlock_M0
)
+
"x"
+
_TS_
(
S_
::
WarpPerBlock_N0
)
+
"x"
+
_TS_
(
S_
::
WarpPerBlock_K0
)
+
"_"
+
_TS_
(
S_
::
Warp_M0
)
+
"x"
+
_TS_
(
S_
::
Warp_N0
)
+
"x"
+
_TS_
(
S_
::
Warp_K0
)
+
"_"
+
_SS_
(
Pipeline
::
name
);
#undef _SS_
#undef _TS_
// clang-format on
}
struct
FusedMoeGemmKargs
{
const
void
*
a_ptr
;
// [m, k], input token
const
void
*
b_ptr
;
// [m, k], input token
const
void
*
c_ptr
;
// [m, k], output token
void
*
d_ptr
;
// [m, k], output token
void
*
dbg_int_ptr
;
// [m, k], output token
void
*
dbg_bf16_ptr
;
// [m, k], output token
void
*
dbg_fp32_ptr
;
// [m, k], output token
index_t
hidden_size
;
// K
index_t
intermediate_size
;
// N
index_t
num_tokens
;
// M
index_t
num_experts
;
// number of groups
index_t
topk
;
// need this?
index_t
stride_token
;
// for input/output, stride for each row, should >= hidden_size
};
// TODO: switch karg based on
using
Kargs
=
FusedMoeGemmKargs
;
using
Hargs
=
FlatmmUkHostArgs
;
CK_TILE_HOST
static
constexpr
Kargs
MakeKargs
(
const
Hargs
&
hargs
)
{
// TODO: hargs/kargs not guranteed to be the same
return
bit_cast
<
Kargs
>
(
hargs
);
}
CK_TILE_HOST
static
constexpr
auto
GridSize
(
const
Hargs
&
hargs
)
{
index_t
ms
=
ck_tile
::
integer_divide_ceil
(
hargs
.
num_tokens
,
BlockShape
::
Block_M0
);
index_t
ns
=
ck_tile
::
integer_divide_ceil
(
hargs
.
intermediate_size
,
BlockShape
::
Block_N0
);
return
dim3
(
ns
,
ms
,
1
);
}
CK_TILE_HOST
static
constexpr
auto
BlockSize
()
{
return
dim3
(
BlockSize_
);
}
CK_TILE_HOST_DEVICE
static
constexpr
index_t
GetSmemSize
()
{
return
Pipeline
::
GetSmemSize
();
}
CK_TILE_DEVICE
void
operator
()(
Kargs
kargs
)
const
{
#if 0
if(threadIdx.x == 0 && blockIdx.x == 0 && threadIdx.y == 0 && blockIdx.y == 0)
{
printf("[KERNEL] FlatmmUkKernel =====\n");
printf("[KERNEL] blockDim: [%d, %d], gridDim: [%d, %d]\n",
static_cast<int>(blockDim.x),
static_cast<int>(blockDim.y),
static_cast<int>(gridDim.x),
static_cast<int>(gridDim.y));
printf("[KERNEL] lds = %.3f (KB)\n", GetSmemSize() / 1024.0f);
}
[[maybe_unused]] uint32_t tidx = threadIdx.x; // 0~255
[[maybe_unused]] uint32_t tidy = threadIdx.y; // 0~0
[[maybe_unused]] uint32_t bidx = blockIdx.x; // 0~1
[[maybe_unused]] uint32_t bidy = blockIdx.y; // 0~51
[[maybe_unused]] uint32_t bdmx = blockDim.x; // 256
[[maybe_unused]] uint32_t bdmy = blockDim.y; // 1
[[maybe_unused]] uint32_t gdmx = gridDim.x; // 2
[[maybe_unused]] uint32_t gdmy = gridDim.y; // 52
[[maybe_unused]] uint32_t gid = ((bdmx * bdmy) * gdmx) * bidy
+ (bdmx * bdmy) * bidx
+ bdmx * tidy
+ tidx;
[[maybe_unused]]int * dbg_int = static_cast<int*>(kargs.dbg_int_ptr);
[[maybe_unused]]short * dbg_bf16 = static_cast<short*>(kargs.dbg_bf16_ptr);
[[maybe_unused]]float * dbg_fp32 = static_cast<float*>(kargs.dbg_fp32_ptr);
dbg_int[gid] = -1;
dbg_fp32[gid] = -1.0f;
#endif
__shared__
CK_TILE_LDS_ADDR
ADataType
smem
[
GetSmemSize
()];
Pipeline
{}(
kargs
,
smem
);
}
};
}
// namespace ck_tile
include/ck_tile/ops/fused_moe/pipeline/flatmm_uk_pipeline.hpp
0 → 100644
View file @
fa335f31
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/ops/fused_moe/pipeline/flatmm_uk_pipeline_policy.hpp"
namespace
ck_tile
{
/*
This pipeline deal with a gemm(actually 2 gemm) with one very small(token), one very big(weight)
we need to design the pipeline such that all waves along gemm-N dim (gemm-m only 1 wave)
<----- gemm-N ------>
+----+----+----+----+
| w0 | w1 | w2 | w3 | gemm-m
+----+----+----+----+
*/
template
<
typename
Problem_
,
typename
Policy_
=
GemmPipelineFlatmmPolicy
>
struct
GemmPipeline_FlatmmUk
{
using
Problem
=
remove_cvref_t
<
Problem_
>
;
using
Policy
=
remove_cvref_t
<
Policy_
>
;
using
BlockShape
=
typename
Problem
::
BlockShape
;
// this is FusedMoeGemmShape
using
ADataType
=
typename
Problem
::
ADataType
;
using
GDataType
=
typename
Problem
::
GDataType
;
using
DDataType
=
typename
Problem
::
AccDataType
;
using
AccDataType
=
typename
Problem
::
AccDataType
;
using
ODataType
=
typename
Problem
::
ODataType
;
using
AScaleDataType
=
typename
Problem
::
AScaleDataType
;
using
GScaleDataType
=
typename
Problem
::
GScaleDataType
;
using
DScaleDataType
=
typename
Problem
::
DScaleDataType
;
using
YSmoothScaleDataType
=
typename
Problem
::
YSmoothScaleDataType
;
using
TopkWeightDataType
=
typename
Problem
::
TopkWeightDataType
;
using
IndexDataType
=
typename
Problem
::
IndexDataType
;
using
YDataType
=
typename
Problem
::
YDataType
;
using
Traits
=
typename
Problem
::
Traits
;
static
constexpr
bool
IsGateOnly
=
Traits
::
IsGateOnly
;
static
constexpr
bool
UseSmoothQuant
=
Traits
::
UseSmoothQuant
;
static
constexpr
bool
PadHiddenSize
=
Traits
::
PadHiddenSize
;
static
constexpr
bool
PadIntermediateSize
=
Traits
::
PadIntermediateSize
;
static
constexpr
index_t
kAlignmentA
=
Policy
::
template
GetAlignment_A
<
Problem
>();
static
constexpr
index_t
kAlignmentG
=
Policy
::
template
GetAlignment_G
<
Problem
>();
static
constexpr
index_t
kAlignmentD
=
Policy
::
template
GetAlignment_D
<
Problem
>();
static
constexpr
index_t
kAlignmentO
=
Policy
::
template
GetAlignment_O
<
Problem
>();
static
constexpr
index_t
SLD_A
=
static_cast
<
index_t
>
(
FusedMoeGemmPipelineSequencerEnum
::
SLD_A
);
static
constexpr
index_t
GLD_A
=
static_cast
<
index_t
>
(
FusedMoeGemmPipelineSequencerEnum
::
GLD_A
);
static
constexpr
index_t
GLD_B
=
static_cast
<
index_t
>
(
FusedMoeGemmPipelineSequencerEnum
::
GLD_B
);
static
constexpr
index_t
GST_O
=
static_cast
<
index_t
>
(
FusedMoeGemmPipelineSequencerEnum
::
GST_O
);
static
constexpr
index_t
kBlockPerCu
=
[]()
{
if
constexpr
(
Problem
::
kBlockPerCu
!=
-
1
)
return
Problem
::
kBlockPerCu
;
else
{
// minimize occupancy
return
2
;
}
}();
static
constexpr
const
char
*
name
=
"flatmm_uk"
;
CK_TILE_HOST_DEVICE
static
constexpr
ck_tile
::
index_t
GetSmemSize
()
{
constexpr
index_t
smem_0
=
Policy
::
template
GetUK_0
<
Problem
>().
GetSmemSize
();
constexpr
index_t
smem_bridge
=
BlockShape
::
Block_M0
*
BlockShape
::
Block_N0
*
sizeof
(
YDataType
);
return
max
(
smem_0
,
smem_bridge
);
}
// this is the thread-offset along row/col
CK_TILE_HOST_DEVICE
static
auto
GetACoord
()
{
constexpr
auto
a_dist
=
Policy
::
template
MakeGlobalTileDistribution_A
<
Problem
>();
const
auto
a_coord
=
a_dist
.
calculate_index
();
return
a_coord
;
}
// this is the thread-offset along row/col
CK_TILE_HOST_DEVICE
static
auto
GetOCoord
()
{
constexpr
auto
o_dist
=
Policy
::
template
MakeOGlobalTileDistribution
<
Problem
>();
const
auto
o_coord
=
o_dist
.
calculate_index
();
return
o_coord
;
}
CK_TILE_DEVICE
constexpr
auto
GetNumRowCoords_A
()
{
constexpr
index_t
KLans
=
BlockShape
::
Block_K0
/
kAlignmentA
;
constexpr
index_t
MLans
=
BlockShape
::
BlockSize
/
KLans
;
constexpr
index_t
MRepeat
=
BlockShape
::
Block_M0
/
MLans
;
return
MRepeat
;
}
// TODO: properlly support scatter/gather
CK_TILE_DEVICE
auto
GetRowCoords_A
(
index_t
base_offset
)
{
constexpr
index_t
KLans
=
BlockShape
::
Block_K0
/
kAlignmentA
;
constexpr
index_t
MLans
=
BlockShape
::
BlockSize
/
KLans
;
constexpr
index_t
MRepeat
=
BlockShape
::
Block_M0
/
MLans
;
auto
base_coord
=
threadIdx
.
x
/
KLans
+
base_offset
;
array
<
index_t
,
MRepeat
>
coords
;
static_for
<
0
,
MRepeat
,
1
>
{}([
&
](
auto
i
)
{
coords
.
at
(
i
)
=
base_coord
+
i
*
MLans
;
});
return
coords
;
}
CK_TILE_DEVICE
auto
GetRowCoords_O2
(
index_t
base_offset
)
{
constexpr
index_t
NLans
=
BlockShape
::
Block_N0
/
kAlignmentO
;
constexpr
index_t
MLans
=
BlockShape
::
BlockSize
/
NLans
;
constexpr
index_t
MRepeat
=
BlockShape
::
Block_M0
/
MLans
;
auto
base_coord
=
threadIdx
.
x
/
NLans
+
base_offset
;
array
<
index_t
,
MRepeat
>
coords
;
static_for
<
0
,
MRepeat
,
1
>
{}([
&
](
auto
i
)
{
coords
.
at
(
i
)
=
base_coord
+
i
*
MLans
;
});
return
coords
;
}
template
<
typename
ROW_COORDS
>
CK_TILE_DEVICE
auto
GetRowID
(
const
ROW_COORDS
coords
,
const
IndexDataType
*
sorted_token_ids_ptr
)
{
constexpr
index_t
n_size
=
coords
.
size
();
array
<
index_t
,
n_size
>
row_ids
;
static_for
<
0
,
n_size
,
1
>
{}([
&
](
auto
i
)
{
row_ids
.
at
(
i
)
=
sorted_token_ids_ptr
[
coords
[
i
]];
// base_coord + i * MLans;
});
return
row_ids
;
}
template
<
typename
ROW_COORDS
>
CK_TILE_DEVICE
auto
GetWeightScale
(
const
ROW_COORDS
coords
,
const
TopkWeightDataType
*
sorted_weight_ptr
)
{
constexpr
index_t
n_size
=
coords
.
size
();
array
<
TopkWeightDataType
,
n_size
>
w
;
static_for
<
0
,
n_size
,
1
>
{}([
&
](
auto
i
)
{
w
.
at
(
i
)
=
sorted_weight_ptr
[
coords
[
i
]];
// base_coord + i * MLans;
});
return
w
;
}
// TODO: this row id is before shuffle atomic, need use acc distribution
CK_TILE_DEVICE
auto
GetRowCoords_O
(
index_t
base_offset
)
{
constexpr
index_t
MLanes
=
BlockShape
::
Warp_M1
;
constexpr
index_t
Repeat_M
=
BlockShape
::
Repeat_M1
;
auto
base_coord
=
threadIdx
.
x
%
MLanes
+
base_offset
;
array
<
index_t
,
Repeat_M
>
coords
;
static_for
<
0
,
Repeat_M
,
1
>
{}([
&
](
auto
i
)
{
coords
.
at
(
i
)
=
base_coord
+
i
*
MLanes
;
});
return
coords
;
}
template
<
typename
Karg
>
CK_TILE_DEVICE
auto
operator
()(
const
Karg
&
kargs
,
CK_TILE_LDS_ADDR
void
*
smem
)
{
#if 0
if(threadIdx.x == 0 && blockIdx.x == 0 && threadIdx.y == 0 && blockIdx.y == 0)
{
printf("[PIPE] GemmPipeline_FlatmmUk =====\n");
}
[[maybe_unused]] uint32_t tidx = threadIdx.x; // 0~255
[[maybe_unused]] uint32_t tidy = threadIdx.y; // 0~0
[[maybe_unused]] uint32_t bidx = blockIdx.x; // 0~1
[[maybe_unused]] uint32_t bidy = blockIdx.y; // 0~51
[[maybe_unused]] uint32_t bdmx = blockDim.x; // 256
[[maybe_unused]] uint32_t bdmy = blockDim.y; // 1
[[maybe_unused]] uint32_t gdmx = gridDim.x; // 2
[[maybe_unused]] uint32_t gdmy = gridDim.y; // 52
[[maybe_unused]] uint32_t gid = ((bdmx * bdmy) * gdmx) * bidy
+ (bdmx * bdmy) * bidx
+ bdmx * tidy
+ tidx;
#endif
[[
maybe_unused
]]
int
*
dbg_int
=
static_cast
<
int
*>
(
kargs
.
dbg_int_ptr
);
[[
maybe_unused
]]
short
*
dbg_bf16
=
static_cast
<
short
*>
(
kargs
.
dbg_bf16_ptr
);
[[
maybe_unused
]]
float
*
dbg_fp32
=
static_cast
<
float
*>
(
kargs
.
dbg_fp32_ptr
);
ck_tile
::
index_t
shared_intermediate_size_0
=
kargs
.
intermediate_size
;
// N
index_t
nr_0
=
shared_intermediate_size_0
/
BlockShape
::
Warp_N0
;
// divide N in W
index_t
kr_0
=
kargs
.
hidden_size
/
BlockShape
::
Warp_K0
;
// divide K in W
index_t
interm_idx_nr0
=
__builtin_amdgcn_readfirstlane
(
blockIdx
.
x
*
BlockShape
::
Block_Nr0
);
// intermediate_tile_id * Block_N / (N in W)
// ----------------------------------------------------------------------------
// a
auto
a_res
=
make_wave_buffer_resource
(
reinterpret_cast
<
const
ADataType
*>
(
kargs
.
a_ptr
),
kargs
.
num_tokens
*
kargs
.
hidden_size
*
sizeof
(
ADataType
));
auto
row_ids_a
=
GetRowCoords_A
(
blockIdx
.
y
*
BlockShape
::
Block_M0
);
auto
a_coords
=
generate_tuple
(
[
&
](
auto
i
)
{
return
row_ids_a
[
i
]
*
kargs
.
hidden_size
+
threadIdx
.
x
%
(
BlockShape
::
Block_K0
/
kAlignmentA
)
*
kAlignmentA
;
},
number
<
row_ids_a
.
size
()
>
{});
// ----------------------------------------------------------------------------
// b
auto
b_win
=
[
&
]()
{
const
GDataType
*
b_ptr
=
reinterpret_cast
<
const
GDataType
*>
(
kargs
.
b_ptr
)
+
interm_idx_nr0
*
kr_0
*
BlockShape
::
Block_W0
;
auto
b_view_
=
make_naive_tensor_view
<
address_space_enum
::
global
>
(
b_ptr
,
make_tuple
(
nr_0
,
kr_0
,
number
<
BlockShape
::
Block_W0
>
{}),
make_tuple
(
kr_0
*
BlockShape
::
Block_W0
,
number
<
BlockShape
::
Block_W0
>
{},
1
),
number
<
kAlignmentG
>
{},
number
<
1
>
{});
auto
b_window_
=
make_tile_window_linear_raw
(
b_view_
,
make_tuple
(
number
<
BlockShape
::
Block_Nr0
>
{},
number
<
BlockShape
::
Block_Kr0
>
{},
number
<
BlockShape
::
Block_W0
>
{}),
{
0
,
0
,
0
},
Policy
::
template
MakeGlobalTileDistribution_G
<
Problem
>(),
sequence
<
0
,
1
,
1
>
{});
return
b_window_
;
}();
auto
b_res
=
b_win
.
get_bottom_tensor_view
().
get_buffer_view
().
cached_buf_res_
;
auto
b_coords
=
generate_tuple
([
&
](
auto
i
)
{
return
b_win
.
cached_coords_
[
i
].
get_offset
();
},
number
<
decltype
(
b_win
)
::
NumAccess_NonLinear
>
{});
// ----------------------------------------------------------------------------
// core
auto
uk_0
=
Policy
::
template
GetUK_0
<
Problem
>();
auto
acc_0
=
uk_0
(
a_res
,
a_coords
,
b_res
,
b_coords
,
smem
,
kargs
.
hidden_size
,
BlockShape
::
Block_K0
,
// tile offset for B matrix each unroll
BlockShape
::
Block_Kr0
*
BlockShape
::
Block_W0
,
// tile offset for B matrix each unroll
dbg_int
,
dbg_bf16
,
dbg_fp32
);
// ----------------------------------------------------------------------------
{
int
tid
=
threadIdx
.
x
;
float
srdfp32
=
0.
f
;
float
*
smemfp32
=
static_cast
<
float
*>
(
smem
);
// ----------------------------------------------------------------------------
// store to lds
for
(
uint32_t
accIdx
=
0
;
accIdx
<
16
;
accIdx
++
)
{
float
*
accSmem
=
smemfp32
+
4
*
blockDim
.
x
*
accIdx
;
for
(
int
xyzw
=
0
;
xyzw
<
4
;
xyzw
++
)
{
accSmem
[
tid
*
4
+
xyzw
]
=
acc_0
.
get_thread_buffer
()[
accIdx
*
4
+
xyzw
];
}
}
block_sync_lds
();
// ----------------------------------------------------------------------------
// read from lds
int
sldIdx
=
0
;
// int MLn = 15;
// int Nln = tid / MLn;
int
tidInWave
=
tid
%
64
;
int
waveId
=
tid
/
64
;
// sldIdx = (tid64 % 16 * 16 + tid64 / 16) % 64
// + tid / 64;
sldIdx
=
(
tidInWave
%
16
*
16
+
tidInWave
/
16
)
+
waveId
*
4
;
const
int
accNLane
=
16
;
const
int
NLaneCnt
=
BlockShape
::
Block_N0
/
4
;
// xyzw 512 / 4 = 128
const
int
accBlkSize
=
blockDim
.
x
;
int
accInnerId
=
tid
%
accNLane
;
// 0~15
int
accNIdx
=
tid
/
NLaneCnt
;
// 0~127 = 0; 128~255 = 1
int
acc01BlkIdx
=
tid
%
NLaneCnt
/
16
;
// 0 ~ 7
int
accBlkIdx
=
acc01BlkIdx
*
2
;
// 0, 2, 4, ..., 14
int
acc4Id
=
accBlkIdx
*
accBlkSize
//
+
accNIdx
*
accBlkSize
+
accInnerId
*
16
;
sldIdx
=
acc4Id
;
float
*
d_buf
=
static_cast
<
float
*>
(
kargs
.
d_ptr
);
int
c_blk_offset
=
blockIdx
.
y
*
BlockShape
::
Block_M0
*
kargs
.
intermediate_size
/
4
+
blockIdx
.
x
*
BlockShape
::
Block_N0
/
4
;
for
(
uint32_t
accIdx
=
0
;
accIdx
<
16
;
accIdx
++
)
{
for
(
int
xyzw
=
0
;
xyzw
<
4
;
xyzw
++
)
{
srdfp32
=
smemfp32
[
accIdx
*
(
1
*
4
)
+
sldIdx
*
4
+
xyzw
];
acc_0
.
get_thread_buffer
()[
accIdx
*
4
+
xyzw
]
=
srdfp32
;
}
// ----------------------------------------------------------------------------
// store to vmem
int
c_m_idx_offset
=
(
accIdx
+
accNIdx
*
16
)
*
kargs
.
intermediate_size
/
4
;
int
c_idx_offset
=
c_blk_offset
+
c_m_idx_offset
+
(
tid
%
NLaneCnt
);
for
(
int
xyzw
=
0
;
xyzw
<
4
;
xyzw
++
)
{
srdfp32
=
acc_0
.
get_thread_buffer
()[
accIdx
*
4
+
xyzw
];
d_buf
[
c_idx_offset
*
4
+
xyzw
]
=
srdfp32
;
}
}
}
#if 0
// ----------------------------------------------------------------------------
// debug
for(uint32_t dbgi = 0; dbgi < 64; dbgi++)
{
dbg_fp32[gid * 64 + dbgi] = acc_0.get_thread_buffer()[dbgi];
}
#endif
}
};
}
// namespace ck_tile
include/ck_tile/ops/fused_moe/pipeline/flatmm_uk_pipeline_policy.hpp
0 → 100644
View file @
fa335f31
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_traits.hpp"
#include "ck_tile/ops/flatmm_uk.hpp"
#include "ck_tile/ops/gemm/warp/warp_gemm.hpp"
#include "ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp"
namespace
ck_tile
{
struct
GemmPipelineFlatmmPolicy
{
CK_TILE_HOST_DEVICE
static
constexpr
index_t
GetAsyncCopyDwords
()
{
// TODO: always 1 dword
return
1
;
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
GetAlignment_A
()
{
// using async
constexpr
index_t
copy_bytes
=
4
*
GetAsyncCopyDwords
();
constexpr
index_t
data_bytes
=
sizeof
(
typename
Problem
::
ADataType
);
static_assert
(
copy_bytes
%
data_bytes
==
0
);
return
copy_bytes
/
data_bytes
;
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
GetAlignment_G
()
{
constexpr
index_t
copy_bytes
=
[
&
]()
{
return
16
;
}();
constexpr
index_t
data_bytes
=
sizeof
(
typename
Problem
::
GDataType
);
static_assert
(
copy_bytes
%
data_bytes
==
0
);
return
copy_bytes
/
data_bytes
;
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
GetAlignment_D
()
{
constexpr
index_t
copy_bytes
=
[
&
]()
{
return
16
;
}();
constexpr
index_t
data_bytes
=
sizeof
(
typename
Problem
::
DDataType
);
static_assert
(
copy_bytes
%
data_bytes
==
0
);
return
copy_bytes
/
data_bytes
;
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
GetAlignment_O
()
{
if
constexpr
(
Problem
::
Traits
::
OAtomic
==
1
)
{
// pack fp16/bf16 atomic
static_assert
(
sizeof
(
typename
Problem
::
ODataType
)
==
2
);
return
2
;
}
else
if
constexpr
(
Problem
::
Traits
::
OAtomic
==
2
)
{
// fp32 atomic
return
1
;
}
else
{
return
16
/
sizeof
(
typename
Problem
::
ODataType
);
}
}
template
<
typename
DataType_
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
GetSmemKPack
()
{
// TODO: this is for 3d layout
return
16
/
sizeof
(
remove_cvref_t
<
DataType_
>
);
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
GetSmemKPack_A
()
{
return
GetSmemKPack
<
typename
Problem
::
ADataType
>
();
}
// used for bridge LDS shuffle
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
GetSmemKPack_Y
()
{
// TODO: this should match mfma layout
return
16
/
sizeof
(
typename
Problem
::
YDataType
);
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
ck_tile
::
index_t
GetSmemSize_A
()
{
constexpr
auto
a_sld_desc
=
MakeLdsLoadDesc_A
<
Problem
>
();
constexpr
auto
a_sst_desc
=
MakeLdsStoreDesc_A
<
Problem
>
();
static_assert
(
a_sld_desc
.
get_element_space_size
()
==
a_sst_desc
.
get_element_space_size
());
return
a_sld_desc
.
get_element_space_size
();
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
ck_tile
::
index_t
GetSmemSize_Bridge
()
{
constexpr
auto
bridge_sld_desc
=
MakeBridgeLdsLoadDesc
<
Problem
>
();
constexpr
auto
bridge_sst_desc
=
MakeBridgeLdsStoreDesc
<
Problem
>
();
static_assert
(
bridge_sld_desc
.
get_element_space_size
()
==
bridge_sst_desc
.
get_element_space_size
());
return
bridge_sld_desc
.
get_element_space_size
();
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
ck_tile
::
index_t
GetSmemSize
()
{
constexpr
index_t
a_lds
=
GetSmemSize_A
<
Problem
>
();
constexpr
index_t
bridge_lds
=
GetSmemSize_Bridge
<
Problem
>
();
return
max
(
a_lds
,
bridge_lds
);
}
template
<
index_t
MPerBlock
,
index_t
KPerBlock
,
index_t
NumWarps
,
index_t
Alignment
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
MakeGlobalTileDistribution_SimpleMxK
()
{
constexpr
index_t
K_vec
=
Alignment
;
constexpr
index_t
K_rem
=
KPerBlock
/
K_vec
;
if
constexpr
(
get_warp_size
()
<
K_rem
)
{
static_assert
(
K_rem
%
get_warp_size
()
==
0
);
constexpr
index_t
K_lan
=
get_warp_size
();
// lane within same wave is along gemm-k
constexpr
index_t
K_wav
=
K_rem
/
get_warp_size
();
static_assert
(
K_wav
<=
NumWarps
,
"not not support thread has repeat along K yet"
);
constexpr
index_t
M_wav
=
NumWarps
/
K_wav
;
static_assert
(
MPerBlock
%
M_wav
==
0
,
"this tile size is too small please check"
);
constexpr
index_t
M_rep
=
MPerBlock
/
M_wav
;
return
make_static_tile_distribution
(
tile_distribution_encoding
<
sequence
<
1
>
,
tuple
<
sequence
<
M_rep
,
M_wav
>
,
sequence
<
K_wav
,
K_lan
,
K_vec
>>
,
tuple
<
sequence
<
1
,
2
>
,
sequence
<
2
>>
,
tuple
<
sequence
<
1
,
0
>
,
sequence
<
1
>>
,
sequence
<
1
,
2
>
,
sequence
<
0
,
2
>>
{});
}
else
{
constexpr
index_t
K_lan
=
K_rem
;
constexpr
index_t
M_lan
=
get_warp_size
()
/
K_lan
;
constexpr
index_t
M_wav
=
NumWarps
;
static_assert
(
MPerBlock
%
(
M_lan
*
M_wav
)
==
0
,
"this tile size is too small please check"
);
constexpr
index_t
M_rep
=
MPerBlock
/
(
M_lan
*
M_wav
);
return
make_static_tile_distribution
(
tile_distribution_encoding
<
sequence
<
1
>
,
tuple
<
sequence
<
M_rep
,
M_wav
,
M_lan
>
,
sequence
<
K_lan
,
K_vec
>>
,
tuple
<
sequence
<
1
>
,
sequence
<
1
,
2
>>
,
tuple
<
sequence
<
1
>
,
sequence
<
2
,
0
>>
,
sequence
<
1
,
2
>
,
sequence
<
0
,
1
>>
{});
}
}
// optimized version for async, not same as simple MXK dist(pay attention!!)
template
<
index_t
MPerBlock
,
index_t
KPerBlock
,
index_t
NumWarps
,
index_t
Alignment
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
MakeGlobalTileDistribution_SimpleMxK_Async
()
{
constexpr
index_t
K_vec
=
Alignment
;
constexpr
index_t
K_rem
=
KPerBlock
/
K_vec
;
if
constexpr
(
get_warp_size
()
<=
K_rem
)
{
static_assert
(
K_rem
%
get_warp_size
()
==
0
);
constexpr
index_t
K_lan
=
get_warp_size
();
// lane within same wave is along gemm-k
constexpr
index_t
K_wav
=
K_rem
/
get_warp_size
();
static_assert
(
K_wav
<=
NumWarps
,
"do not support thread has repeat along K yet"
);
constexpr
index_t
M_wav
=
NumWarps
/
K_wav
;
static_assert
(
MPerBlock
%
M_wav
==
0
,
"this tile size is too small please check"
);
constexpr
index_t
M_rep
=
MPerBlock
/
M_wav
;
// NOTE: no swap, but hard to avoid LDS bank conflict
return
make_static_tile_distribution
(
tile_distribution_encoding
<
sequence
<
1
>
,
tuple
<
sequence
<
M_rep
,
M_wav
>
,
sequence
<
K_wav
,
K_lan
,
K_vec
>>
,
tuple
<
sequence
<
1
,
2
>
,
sequence
<
2
>>
,
tuple
<
sequence
<
1
,
0
>
,
sequence
<
1
>>
,
sequence
<
1
,
2
>
,
sequence
<
0
,
2
>>
{});
}
else
{
constexpr
index_t
K_lan
=
K_rem
;
constexpr
index_t
M_lan
=
get_warp_size
()
/
K_lan
;
constexpr
index_t
M_wav
=
NumWarps
;
static_assert
(
MPerBlock
%
(
M_lan
*
M_wav
)
==
0
,
"this tile size is too small please check"
);
constexpr
index_t
M_rep
=
MPerBlock
/
(
M_lan
*
M_wav
);
// NOTE: swapped for LDS load bank conflict free
return
make_static_tile_distribution
(
tile_distribution_encoding
<
sequence
<
1
>
,
// Note M_wave(num waves) is the fastest dim, different from sipmle 2d
// distribution
tuple
<
sequence
<
M_rep
,
M_lan
,
M_wav
>
,
sequence
<
K_lan
,
K_vec
>>
,
tuple
<
sequence
<
1
>
,
sequence
<
1
,
2
>>
,
tuple
<
sequence
<
2
>
,
sequence
<
1
,
0
>>
,
sequence
<
1
,
2
>
,
sequence
<
0
,
1
>>
{});
}
}
template
<
index_t
WarpPerBlock_N_
,
index_t
WarpPerBlock_K_
,
index_t
Repeat_N_
,
index_t
Repeat_K_
,
index_t
WarpSize_
,
index_t
Alignment_
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
MakeGlobalTileDistribution_Nr_Kr_W
()
{
return
make_static_tile_distribution
(
tile_distribution_encoding
<
sequence
<
1
>
,
tuple
<
sequence
<
Repeat_N_
,
WarpPerBlock_N_
>
,
sequence
<
Repeat_K_
,
WarpPerBlock_K_
>
,
sequence
<
WarpSize_
,
Alignment_
>>
,
tuple
<
sequence
<
1
,
2
>
,
sequence
<
3
>>
,
tuple
<
sequence
<
1
,
1
>
,
sequence
<
0
>>
,
sequence
<
1
,
2
,
3
>
,
sequence
<
0
,
0
,
1
>>
{});
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
MakeGlobalTileDistribution_A
()
{
constexpr
index_t
Block_M_
=
Problem
::
BlockShape
::
Block_M0
;
constexpr
index_t
Block_K_
=
Problem
::
BlockShape
::
Block_K0
;
constexpr
index_t
NumWarps_
=
Problem
::
BlockShape
::
NumWarps
;
constexpr
index_t
Alignment_
=
GetAlignment_A
<
Problem
>
();
return
MakeGlobalTileDistribution_SimpleMxK_Async
<
Block_M_
,
Block_K_
,
NumWarps_
,
Alignment_
>
();
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
MakeGlobalTileDistribution_G
()
{
constexpr
auto
PermuteEnum
=
Problem
::
Traits
::
PermuteEnum
;
// constexpr index_t hidden_radio_0 = Problem::Traits::IsGateOnly ? 1 : 2;
using
S_
=
typename
Problem
::
BlockShape
;
if
constexpr
(
PermuteEnum
==
FusedMoeGemmWeightPermuteEnum
::
b_nr_kr_waveflatten
)
{
// number<S_::WarpPerBlock_N0>{}.rrr();
// number<S_::Repeat_N0>{}.eee();
return
MakeGlobalTileDistribution_Nr_Kr_W
<
S_
::
WarpPerBlock_N0
,
S_
::
WarpPerBlock_K0
,
S_
::
Repeat_N0
,
/// hidden_radio_0,
S_
::
Repeat_K0
,
get_warp_size
(),
GetAlignment_G
<
Problem
>
()
>
();
}
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
MakeGlobalTileDistribution_D
()
{
constexpr
auto
PermuteEnum
=
Problem
::
Traits
::
PermuteEnum
;
using
S_
=
typename
Problem
::
BlockShape
;
if
constexpr
(
PermuteEnum
==
FusedMoeGemmWeightPermuteEnum
::
b_nr_kr_waveflatten
)
{
return
MakeGlobalTileDistribution_Nr_Kr_W
<
S_
::
WarpPerBlock_N1
,
S_
::
WarpPerBlock_K1
,
S_
::
Repeat_N1
,
S_
::
Repeat_K1
,
get_warp_size
(),
GetAlignment_D
<
Problem
>
()
>
();
}
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
MakeGlobalTileDistribution_O
()
{
using
S_
=
remove_cvref_t
<
typename
Problem
::
BlockShape
>
;
using
WarpGemm
=
remove_cvref_t
<
decltype
(
GetWarpGemm1
<
Problem
>
())
>
;
// using CDataType = typename WarpGemm::CDataType;
constexpr
auto
c_block_outer_dstr_encoding
=
tile_distribution_encoding
<
sequence
<>
,
tuple
<
sequence
<
S_
::
Repeat_M1
,
S_
::
WarpPerBlock_M1
>
,
sequence
<
S_
::
Repeat_N1
,
S_
::
WarpPerBlock_N1
>>
,
tuple
<
sequence
<
1
,
2
>>
,
tuple
<
sequence
<
1
,
1
>>
,
sequence
<
1
,
2
>
,
sequence
<
0
,
0
>>
{};
constexpr
auto
c_block_dstr_encode
=
detail
::
make_embed_tile_distribution_encoding
(
c_block_outer_dstr_encoding
,
typename
WarpGemm
::
CWarpDstrEncoding
{});
constexpr
auto
c_block_dstr
=
make_static_tile_distribution
(
c_block_dstr_encode
);
return
c_block_dstr
;
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
MakeLdsStoreDesc_A
()
{
// A async->LDS
constexpr
index_t
Block_M
=
Problem
::
BlockShape
::
Block_M0
;
constexpr
index_t
Block_K
=
Problem
::
BlockShape
::
Block_K0
;
// constexpr index_t BlockSize = Problem::BlockShape::BlockSize;
constexpr
index_t
warpSize
=
ck_tile
::
get_warp_size
();
constexpr
index_t
NumWarps
=
Problem
::
BlockShape
::
NumWarps
;
constexpr
index_t
KPack
=
GetSmemKPack_A
<
Problem
>
();
// LDS
constexpr
index_t
KVector
=
GetAlignment_A
<
Problem
>
();
// async copy 1 dword
constexpr
index_t
KPad
=
KPack
;
// pad between warps
static_assert
(
Block_K
%
KVector
==
0
);
constexpr
index_t
LanesPerK
=
Block_K
/
KVector
;
// how many thread loading K
if
constexpr
(
LanesPerK
>=
warpSize
)
{
// need multiple waves to load K
static_assert
(
LanesPerK
%
warpSize
==
0
);
constexpr
index_t
wavesPerK
=
LanesPerK
/
warpSize
;
if
constexpr
(
wavesPerK
>
NumWarps
)
{
// TODO: need multiple issues along K to load all data
}
else
{
constexpr
index_t
wavesPerM
=
NumWarps
/
wavesPerK
;
constexpr
index_t
NumIssues
=
Block_M
/
wavesPerM
;
constexpr
auto
lds_block_desc_0
=
make_naive_tensor_descriptor
(
make_tuple
(
number
<
NumIssues
>
{},
// m0
number
<
wavesPerM
>
{},
// m1
number
<
wavesPerK
>
{},
// k0
number
<
warpSize
>
{},
// k1
number
<
KVector
>
{}),
// k2
make_tuple
(
number
<
NumWarps
*
(
warpSize
*
KVector
+
KPad
)
>
{},
// m0
number
<
wavesPerK
*
(
warpSize
*
KVector
+
KPad
)
>
{},
// m1
number
<
warpSize
*
KVector
+
KPad
>
{},
// k0
number
<
KVector
>
{},
// k1
number
<
1
>
{}),
// k2
number
<
KVector
>
{},
// lds store vector(actually no explicit store)
number
<
1
>
{});
constexpr
auto
lds_block_desc_issues_warps_lanes
=
transform_tensor_descriptor
(
lds_block_desc_0
,
make_tuple
(
make_pass_through_transform
(
number
<
NumIssues
>
{}),
make_merge_transform
(
make_tuple
(
number
<
wavesPerM
>
{},
number
<
wavesPerK
>
{})),
make_merge_transform
(
make_tuple
(
number
<
warpSize
>
{},
number
<
KVector
>
{}))),
make_tuple
(
sequence
<
0
>
{},
sequence
<
1
,
2
>
{},
sequence
<
3
,
4
>
{}),
make_tuple
(
sequence
<
0
>
{},
sequence
<
1
>
{},
sequence
<
2
>
{}));
return
lds_block_desc_issues_warps_lanes
;
}
}
else
{
// lanes within a wave load different M but same K
static_assert
(
warpSize
%
LanesPerK
==
0
);
constexpr
index_t
LaneGroups
=
warpSize
/
LanesPerK
;
// along m
constexpr
index_t
NumIssues
=
Block_M
/
(
LaneGroups
*
NumWarps
);
constexpr
auto
lds_block_desc_0
=
make_naive_tensor_descriptor
(
make_tuple
(
number
<
NumIssues
>
{},
// m0
number
<
LaneGroups
>
{},
// m1
number
<
NumWarps
>
{},
// m2
number
<
LanesPerK
>
{},
// k0
number
<
KVector
>
{}),
// k1
make_tuple
(
number
<
NumWarps
*
(
warpSize
*
KVector
+
KPad
)
>
{},
// m0
number
<
Block_K
>
{},
// m1
number
<
warpSize
*
KVector
+
KPad
>
{},
// m2
number
<
KVector
>
{},
// k0
number
<
1
>
{}),
// k1
number
<
KVector
>
{},
// lds store vector(actually no explicit store)
number
<
1
>
{});
constexpr
auto
lds_block_desc_issues_warps_lanes
=
transform_tensor_descriptor
(
lds_block_desc_0
,
make_tuple
(
make_pass_through_transform
(
number
<
NumIssues
>
{}),
make_pass_through_transform
(
number
<
NumWarps
>
{}),
make_merge_transform
(
make_tuple
(
number
<
LaneGroups
>
{},
number
<
LanesPerK
>
{},
number
<
KVector
>
{}))),
make_tuple
(
sequence
<
0
>
{},
sequence
<
2
>
{},
sequence
<
1
,
3
,
4
>
{}),
make_tuple
(
sequence
<
0
>
{},
sequence
<
1
>
{},
sequence
<
2
>
{}));
return
lds_block_desc_issues_warps_lanes
;
}
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
MakeLdsLoadDesc_A
()
{
// A async->LDS
// Note that, this descriptor is only to construct the layout inside LDS
// in real Gemm pipeline, ds_read may not follow this pattern
// (may follow that in tile_distribution)
// below code is almost the same as SmemStore dist, with difference:
// 1). modify the GuaranteedLastDimensionVectorLength of naive tensor desc
// 2). return discriptor is in NxK 2d layout
constexpr
index_t
Block_M
=
Problem
::
BlockShape
::
Block_M0
;
constexpr
index_t
Block_K
=
Problem
::
BlockShape
::
Block_K0
;
// constexpr index_t BlockSize = Problem::BlockShape::BlockSize;
constexpr
index_t
warpSize
=
ck_tile
::
get_warp_size
();
constexpr
index_t
NumWarps
=
Problem
::
BlockShape
::
NumWarps
;
constexpr
index_t
KPack
=
GetSmemKPack_A
<
Problem
>
();
// LDS
constexpr
index_t
KVector
=
GetAlignment_A
<
Problem
>
();
// async copy 1 dword
constexpr
index_t
KPad
=
KPack
;
// pad between warps
static_assert
(
Block_K
%
KVector
==
0
);
constexpr
index_t
LanesPerK
=
Block_K
/
KVector
;
// how many thread loading K
if
constexpr
(
LanesPerK
>=
warpSize
)
{
// need multiple waves to load K
static_assert
(
LanesPerK
%
warpSize
==
0
);
constexpr
index_t
wavesPerK
=
LanesPerK
/
warpSize
;
if
constexpr
(
wavesPerK
>=
NumWarps
)
{
// TODO: need multiple issues along K to load all data
}
else
{
constexpr
index_t
wavesPerM
=
NumWarps
/
wavesPerK
;
constexpr
index_t
NumIssues
=
Block_M
/
wavesPerM
;
constexpr
auto
lds_block_desc_0
=
make_naive_tensor_descriptor
(
make_tuple
(
number
<
NumIssues
>
{},
// m0
number
<
wavesPerM
>
{},
// m1
number
<
wavesPerK
>
{},
// k0
number
<
warpSize
>
{},
// k1
number
<
KVector
>
{}),
// k2
make_tuple
(
number
<
NumWarps
*
(
warpSize
*
KVector
+
KPad
)
>
{},
// m0
number
<
wavesPerK
*
(
warpSize
*
KVector
+
KPad
)
>
{},
// m1
number
<
warpSize
*
KVector
+
KPad
>
{},
// k0
number
<
KVector
>
{},
// k1
number
<
1
>
{}),
// k2
number
<
KPack
>
{},
// lds load vector
number
<
1
>
{});
constexpr
auto
lds_desc_m_k
=
transform_tensor_descriptor
(
lds_block_desc_0
,
make_tuple
(
make_merge_transform
(
make_tuple
(
number
<
NumIssues
>
{},
number
<
wavesPerM
>
{})),
make_merge_transform
(
make_tuple
(
number
<
wavesPerK
>
{},
number
<
warpSize
>
{},
number
<
KVector
>
{}))),
make_tuple
(
sequence
<
0
,
1
>
{},
sequence
<
2
,
3
,
4
>
{}),
make_tuple
(
sequence
<
0
>
{},
sequence
<
1
>
{}));
return
lds_desc_m_k
;
}
}
else
{
// lanes within a wave load different M but same K
static_assert
(
warpSize
%
LanesPerK
==
0
);
constexpr
index_t
LaneGroups
=
warpSize
/
LanesPerK
;
// along m
constexpr
index_t
NumIssues
=
Block_M
/
(
LaneGroups
*
NumWarps
);
constexpr
auto
lds_block_desc_0
=
make_naive_tensor_descriptor
(
make_tuple
(
number
<
NumIssues
>
{},
// m0
number
<
LaneGroups
>
{},
// m1
number
<
NumWarps
>
{},
// m2
number
<
LanesPerK
>
{},
// k0
number
<
KVector
>
{}),
// k1
make_tuple
(
number
<
NumWarps
*
(
warpSize
*
KVector
+
KPad
)
>
{},
// m0
number
<
Block_K
>
{},
// m1
number
<
warpSize
*
KVector
+
KPad
>
{},
// m2
number
<
KVector
>
{},
// k0
number
<
1
>
{}),
// k1
number
<
KPack
>
{},
// lds load vector
number
<
1
>
{});
constexpr
auto
lds_desc_m_k
=
transform_tensor_descriptor
(
lds_block_desc_0
,
make_tuple
(
make_merge_transform
(
make_tuple
(
number
<
NumIssues
>
{},
number
<
LaneGroups
>
{},
number
<
NumWarps
>
{})),
make_merge_transform
(
make_tuple
(
number
<
LanesPerK
>
{},
number
<
KVector
>
{}))),
make_tuple
(
sequence
<
0
,
1
,
2
>
{},
sequence
<
3
,
4
>
{}),
make_tuple
(
sequence
<
0
>
{},
sequence
<
1
>
{}));
return
lds_desc_m_k
;
}
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
MakeBridgeLdsLoadDesc
()
{
constexpr
index_t
Block_M
=
Problem
::
BlockShape
::
Block_M0
;
constexpr
index_t
Block_N
=
Problem
::
BlockShape
::
Block_N0
;
constexpr
index_t
KVector
=
GetSmemKPack_Y
<
Problem
>
();
// async copy 1 dword
constexpr
index_t
KPad
=
0
;
// pad between warps
constexpr
auto
desc
=
make_naive_tensor_descriptor
(
make_tuple
(
number
<
Block_M
>
{},
number
<
Block_N
>
{}),
make_tuple
(
number
<
Block_N
+
KPad
>
{},
number
<
1
>
{}),
number
<
KVector
>
{},
number
<
1
>
{});
return
desc
;
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
MakeBridgeLdsStoreDesc
()
{
constexpr
index_t
Block_M
=
Problem
::
BlockShape
::
Block_M0
;
constexpr
index_t
Block_N
=
Problem
::
BlockShape
::
Block_N0
;
constexpr
index_t
KVector
=
GetSmemKPack_Y
<
Problem
>
();
// async copy 1 dword
constexpr
index_t
KPad
=
0
;
// KVector; // pad between warps
constexpr
auto
desc
=
make_naive_tensor_descriptor
(
make_tuple
(
number
<
Block_M
>
{},
number
<
Block_N
>
{}),
make_tuple
(
number
<
Block_N
+
KPad
>
{},
number
<
1
>
{}),
number
<
KVector
>
{},
number
<
1
>
{});
return
desc
;
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
MakeBridgeLdsStoreForUKDesc
()
{
constexpr
index_t
WarpPerBlock_N
=
Problem
::
BlockShape
::
WarpPerBlock_N0
;
constexpr
index_t
Repeat_N
=
Problem
::
BlockShape
::
Repeat_N0
;
constexpr
index_t
Repeat_M
=
Problem
::
BlockShape
::
Repeat_M0
;
constexpr
index_t
kAMLane
=
16
;
constexpr
index_t
kABKLane
=
4
;
constexpr
index_t
kABKPerLane
=
4
;
constexpr
index_t
KPack
=
kABKPerLane
;
constexpr
auto
lds_block_desc_0
=
make_naive_tensor_descriptor
(
make_tuple
(
number
<
Repeat_M
>
{},
// m
number
<
Repeat_N
>
{},
// n
number
<
WarpPerBlock_N
>
{},
// n
number
<
kABKLane
>
{},
// n
number
<
kAMLane
>
{},
// m
number
<
KPack
>
{}),
// n
make_tuple
(
number
<
Repeat_N
*
WarpPerBlock_N
*
kABKLane
*
kAMLane
*
KPack
>
{},
// m
number
<
WarpPerBlock_N
*
kABKLane
*
kAMLane
*
KPack
>
{},
// n
number
<
kABKLane
*
kAMLane
*
KPack
>
{},
// n
number
<
kAMLane
*
KPack
>
{},
// n
number
<
KPack
>
{},
// m
number
<
1
>
{}),
// n
number
<
KPack
>
{},
// lds store vector(actually no explicit store)
number
<
1
>
{});
constexpr
auto
desc
=
transform_tensor_descriptor
(
lds_block_desc_0
,
make_tuple
(
make_merge_transform
(
make_tuple
(
number
<
Repeat_M
>
{},
number
<
kAMLane
>
{})),
make_merge_transform
(
make_tuple
(
number
<
Repeat_N
>
{},
number
<
WarpPerBlock_N
>
{},
number
<
kABKLane
>
{},
number
<
KPack
>
{}))),
make_tuple
(
sequence
<
0
,
4
>
{},
sequence
<
1
,
2
,
3
,
5
>
{}),
make_tuple
(
sequence
<
0
>
{},
sequence
<
1
>
{}));
return
desc
;
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
GetWarpGemm0
()
{
using
S_
=
typename
Problem
::
BlockShape
;
// A is vgpr, B is agpr. But since we transposed, so also need swap this
// TODO: this is ugly
constexpr
auto
wg_ctrl
=
WGAttrCtlEnum
::
Raw_avv
;
// TODO: ugly
if
constexpr
(
std
::
is_same_v
<
typename
Problem
::
ADataType
,
ck_tile
::
bf16_t
>
&&
std
::
is_same_v
<
typename
Problem
::
GDataType
,
ck_tile
::
bf16_t
>
&&
S_
::
Warp_M0
==
32
&&
S_
::
Warp_N0
==
32
&&
S_
::
Warp_K0
==
16
)
{
return
WarpGemmImpl
<
WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB
<
WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8
<
wg_ctrl
>
,
2
>>
{};
}
else
if
constexpr
(
std
::
is_same_v
<
typename
Problem
::
ADataType
,
ck_tile
::
int8_t
>
&&
std
::
is_same_v
<
typename
Problem
::
GDataType
,
ck_tile
::
int8_t
>
&&
S_
::
Warp_M0
==
32
&&
S_
::
Warp_N0
==
32
&&
S_
::
Warp_K0
==
32
)
{
return
WarpGemmImpl
<
WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB
<
WarpGemmAttributeMfmaImpl_i32_32x32x16_i8
<
wg_ctrl
>
,
2
>>
{};
}
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
GetSequencer_0
()
{
// this function return seq<...> used to identify gld/sld/valu... inside mfma sequence
// the purpose is to hide thoes instructions under mfma
// every value inside seq<...> is a mask, indicating a specific operation
using
S_
=
typename
Problem
::
BlockShape
;
constexpr
index_t
SLD_A
=
static_cast
<
index_t
>
(
FusedMoeGemmPipelineSequencerEnum
::
SLD_A
);
constexpr
index_t
GLD_A
=
static_cast
<
index_t
>
(
FusedMoeGemmPipelineSequencerEnum
::
GLD_A
);
constexpr
index_t
GLD_B
=
static_cast
<
index_t
>
(
FusedMoeGemmPipelineSequencerEnum
::
GLD_B
);
if
constexpr
(
std
::
is_same_v
<
typename
Problem
::
YDataType
,
ck_tile
::
bf16_t
>
&&
std
::
is_same_v
<
typename
Problem
::
DDataType
,
ck_tile
::
bf16_t
>
&&
S_
::
Warp_M0
==
32
&&
S_
::
Warp_N0
==
32
&&
S_
::
Warp_K0
==
16
&&
S_
::
Block_M0
==
32
&&
S_
::
Block_N0
==
512
&&
S_
::
Block_K0
==
128
&&
S_
::
Block_N1
==
128
)
{
// Total 64 instructions, 32 buffer-load-dwordx4 gld_b, 8x buffer-load-dwordx1-async
// gld_a 8x ds_read_b128 sld_a total 64 slot :)
// clang-format off
constexpr
auto
seq_all
=
// 0 1 2 3 4 5 6 7
sequence
<
GLD_B
,
GLD_A
,
GLD_B
,
GLD_A
,
GLD_B
,
GLD_A
,
GLD_B
,
GLD_A
,
// 0
GLD_B
,
GLD_A
,
GLD_B
,
GLD_A
,
GLD_B
,
GLD_A
,
GLD_B
,
GLD_A
,
// 1
GLD_B
,
SLD_A
,
GLD_B
,
SLD_A
,
GLD_B
,
SLD_A
,
GLD_B
,
SLD_A
,
// 2
GLD_B
,
SLD_A
,
GLD_B
,
SLD_A
,
GLD_B
,
SLD_A
,
GLD_B
,
SLD_A
,
// 3
GLD_B
,
0
,
GLD_B
,
0
,
GLD_B
,
0
,
GLD_B
,
0
,
// 4
GLD_B
,
0
,
GLD_B
,
0
,
GLD_B
,
0
,
GLD_B
,
0
,
// 5
GLD_B
,
0
,
GLD_B
,
0
,
GLD_B
,
0
,
GLD_B
,
0
,
// 6
GLD_B
,
0
,
GLD_B
,
0
,
GLD_B
,
0
,
GLD_B
,
0
>
{};
// 7
return
seq_all
;
// clang-format on
}
else
if
constexpr
(
std
::
is_same_v
<
typename
Problem
::
YDataType
,
ck_tile
::
bf16_t
>
&&
std
::
is_same_v
<
typename
Problem
::
DDataType
,
ck_tile
::
bf16_t
>
&&
S_
::
Warp_M0
==
32
&&
S_
::
Warp_N0
==
32
&&
S_
::
Warp_K0
==
16
&&
S_
::
Block_M0
==
32
&&
S_
::
Block_N0
==
256
&&
S_
::
Block_K0
==
128
&&
S_
::
Block_N1
==
128
)
{
// Total 32 instructions, 16 buffer-load-dwordx4 gld_b, 8x buffer-load-dwordx1-async
// gld_a 8x ds_read_b128 sld_a total 64 slot :)
// clang-format off
constexpr
auto
seq_all
=
// 0 1 2 3 4 5 6 7
sequence
<
GLD_B
,
GLD_A
,
GLD_B
,
GLD_A
,
GLD_B
,
GLD_A
,
GLD_B
,
GLD_A
,
// 0
GLD_B
,
GLD_A
,
GLD_B
,
GLD_A
,
GLD_B
,
GLD_A
,
GLD_B
,
GLD_A
,
// 1
GLD_B
,
SLD_A
,
GLD_B
,
SLD_A
,
GLD_B
,
SLD_A
,
GLD_B
,
SLD_A
,
// 2
GLD_B
,
SLD_A
,
GLD_B
,
SLD_A
,
GLD_B
,
SLD_A
,
GLD_B
,
SLD_A
>
{};
// 3
return
seq_all
;
// clang-format on
}
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
GetSequencer_1
()
{
// this function return seq<...> used to identify gld/sld/valu... inside mfma sequence
// the purpose is to hide thoes instructions under mfma
// every value inside seq<...> is a mask, indicating a specific operation
using
S_
=
typename
Problem
::
BlockShape
;
constexpr
index_t
GLD_B
=
static_cast
<
index_t
>
(
FusedMoeGemmPipelineSequencerEnum
::
GLD_B
);
constexpr
index_t
GST_O
=
static_cast
<
index_t
>
(
FusedMoeGemmPipelineSequencerEnum
::
GST_O
);
if
constexpr
(
std
::
is_same_v
<
typename
Problem
::
YDataType
,
ck_tile
::
bf16_t
>
&&
std
::
is_same_v
<
typename
Problem
::
DDataType
,
ck_tile
::
bf16_t
>
&&
S_
::
Warp_M1
==
32
&&
S_
::
Warp_N1
==
32
&&
S_
::
Warp_K1
==
16
&&
S_
::
Block_M0
==
32
&&
S_
::
Block_N0
==
512
&&
S_
::
Block_K0
==
128
&&
S_
::
Block_N1
==
128
)
{
// Total 64 instructions, 32 buffer-load-dwordx4 gld_b, 8x buffer-load-dwordx1-async
// gld_a 8x ds_read_b128 sld_a total 64 slot :)
// clang-format off
constexpr
auto
seq_all
=
// 0 1 2 3 4 5 6 7
sequence
<
GLD_B
,
GST_O
,
GLD_B
,
GST_O
,
GLD_B
,
GST_O
,
GLD_B
,
GST_O
,
// 0
GLD_B
,
GST_O
,
GLD_B
,
GST_O
,
GLD_B
,
GST_O
,
GLD_B
,
GST_O
,
// 1
GLD_B
,
0
,
GLD_B
,
0
,
GLD_B
,
0
,
GLD_B
,
0
,
// 2
GLD_B
,
0
,
GLD_B
,
0
,
GLD_B
,
0
,
GLD_B
,
0
,
// 3
GLD_B
,
0
,
GLD_B
,
0
,
GLD_B
,
0
,
GLD_B
,
0
,
// 4
GLD_B
,
0
,
GLD_B
,
0
,
GLD_B
,
0
,
GLD_B
,
0
,
// 5
GLD_B
,
0
,
GLD_B
,
0
,
GLD_B
,
0
,
GLD_B
,
0
,
// 6
GLD_B
,
0
,
GLD_B
,
0
,
GLD_B
,
0
,
GLD_B
,
0
>
{};
// 7
return
seq_all
;
// clang-format on
}
else
if
constexpr
(
std
::
is_same_v
<
typename
Problem
::
YDataType
,
ck_tile
::
bf16_t
>
&&
std
::
is_same_v
<
typename
Problem
::
DDataType
,
ck_tile
::
bf16_t
>
&&
S_
::
Warp_M1
==
32
&&
S_
::
Warp_N1
==
32
&&
S_
::
Warp_K1
==
16
&&
S_
::
Block_M0
==
32
&&
S_
::
Block_N0
==
256
&&
S_
::
Block_K0
==
128
&&
S_
::
Block_N1
==
128
)
{
// Total 64 instructions, 32 buffer-load-dwordx4 gld_b, 8x buffer-load-dwordx1-async
// gld_a 8x ds_read_b128 sld_a total 64 slot :)
// clang-format off
constexpr
auto
seq_all
=
// 0 1 2 3 4 5 6 7
sequence
<
GLD_B
,
GST_O
,
GLD_B
,
GST_O
,
GLD_B
,
GST_O
,
GLD_B
,
GST_O
,
// 0
GLD_B
,
GST_O
,
GLD_B
,
GST_O
,
GLD_B
,
GST_O
,
GLD_B
,
GST_O
,
// 1
GLD_B
,
0
,
GLD_B
,
0
,
GLD_B
,
0
,
GLD_B
,
0
,
// 2
GLD_B
,
0
,
GLD_B
,
0
,
GLD_B
,
0
,
GLD_B
,
0
>
{};
// 3
return
seq_all
;
// clang-format on
}
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
GetWarpGemm1
()
{
using
S_
=
typename
Problem
::
BlockShape
;
constexpr
auto
wg_ctrl
=
WGAttrCtlEnum
::
Raw_avv
;
// TODO: ugly
if
constexpr
(
std
::
is_same_v
<
typename
Problem
::
YDataType
,
ck_tile
::
bf16_t
>
&&
std
::
is_same_v
<
typename
Problem
::
DDataType
,
ck_tile
::
bf16_t
>
&&
S_
::
Warp_M0
==
32
&&
S_
::
Warp_N0
==
32
&&
S_
::
Warp_K0
==
16
)
{
return
WarpGemmImpl
<
WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB
<
WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8
<
wg_ctrl
>
,
2
>>
{};
}
else
if
constexpr
(
std
::
is_same_v
<
typename
Problem
::
YDataType
,
ck_tile
::
int8_t
>
&&
std
::
is_same_v
<
typename
Problem
::
DDataType
,
ck_tile
::
int8_t
>
&&
S_
::
Warp_M0
==
32
&&
S_
::
Warp_N0
==
32
&&
S_
::
Warp_K0
==
32
)
{
return
WarpGemmImpl
<
WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB
<
WarpGemmAttributeMfmaImpl_i32_32x32x16_i8
<
wg_ctrl
>
,
2
>>
{};
}
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
MakeCBlockTile_Gemm0
()
{
using
S_
=
remove_cvref_t
<
typename
Problem
::
BlockShape
>
;
using
WarpGemm
=
remove_cvref_t
<
decltype
(
GetWarpGemm0
<
Problem
>
())
>
;
using
CDataType
=
typename
WarpGemm
::
CDataType
;
constexpr
auto
c_block_outer_dstr_encoding
=
tile_distribution_encoding
<
sequence
<>
,
tuple
<
sequence
<
S_
::
Repeat_M0
,
S_
::
WarpPerBlock_M0
>
,
sequence
<
S_
::
Repeat_N0
,
S_
::
WarpPerBlock_N0
>>
,
tuple
<
sequence
<
1
,
2
>>
,
tuple
<
sequence
<
1
,
1
>>
,
sequence
<
1
,
2
>
,
sequence
<
0
,
0
>>
{};
constexpr
auto
c_block_dstr_encode
=
detail
::
make_embed_tile_distribution_encoding
(
c_block_outer_dstr_encoding
,
typename
WarpGemm
::
CWarpDstrEncoding
{});
constexpr
auto
c_block_dstr
=
make_static_tile_distribution
(
c_block_dstr_encode
);
auto
c_block_tensor
=
make_static_distributed_tensor
<
CDataType
>
(
c_block_dstr
);
return
c_block_tensor
;
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
MakeCBlockTile_Gemm1
()
{
using
S_
=
remove_cvref_t
<
typename
Problem
::
BlockShape
>
;
using
WarpGemm
=
remove_cvref_t
<
decltype
(
GetWarpGemm1
<
Problem
>
())
>
;
using
CDataType
=
typename
WarpGemm
::
CDataType
;
constexpr
auto
c_block_outer_dstr_encoding
=
tile_distribution_encoding
<
sequence
<>
,
tuple
<
sequence
<
S_
::
Repeat_M1
,
S_
::
WarpPerBlock_M1
>
,
sequence
<
S_
::
Repeat_N1
,
S_
::
WarpPerBlock_N1
>>
,
tuple
<
sequence
<
1
,
2
>>
,
tuple
<
sequence
<
1
,
1
>>
,
sequence
<
1
,
2
>
,
sequence
<
0
,
0
>>
{};
constexpr
auto
c_block_dstr_encode
=
detail
::
make_embed_tile_distribution_encoding
(
c_block_outer_dstr_encoding
,
typename
WarpGemm
::
CWarpDstrEncoding
{});
constexpr
auto
c_block_dstr
=
make_static_tile_distribution
(
c_block_dstr_encode
);
auto
c_block_tensor
=
make_static_distributed_tensor
<
CDataType
>
(
c_block_dstr
);
return
c_block_tensor
;
}
// this is used as A matrix for 2nd gemm
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
MakeYTileDistribution
()
{
using
S_
=
remove_cvref_t
<
typename
Problem
::
BlockShape
>
;
using
WarpGemm
=
remove_cvref_t
<
decltype
(
GetWarpGemm1
<
Problem
>
())
>
;
// TODO: all waves a along different N, but same M
constexpr
auto
y_outer_dstr_enc
=
tile_distribution_encoding
<
sequence
<
S_
::
WarpPerBlock_M1
>
,
tuple
<
sequence
<
S_
::
Repeat_M1
>
,
sequence
<
S_
::
Repeat_K1
>>
,
tuple
<
sequence
<
0
>>
,
tuple
<
sequence
<
0
>>
,
sequence
<
1
,
2
>
,
sequence
<
0
,
0
>>
{};
constexpr
auto
y_block_dstr_encode
=
detail
::
make_embed_tile_distribution_encoding
(
y_outer_dstr_enc
,
typename
WarpGemm
::
AWarpDstrEncoding
{});
constexpr
auto
y_block_dstr
=
make_static_tile_distribution
(
y_block_dstr_encode
);
return
y_block_dstr
;
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
MakeYBlockTile
()
{
constexpr
auto
y_block_dstr
=
MakeYTileDistribution
<
Problem
>
();
auto
y_block_tensor
=
make_static_distributed_tensor
<
typename
Problem
::
YDataType
>
(
y_block_dstr
);
return
y_block_tensor
;
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
GetUK_0
()
{
using
S_
=
typename
Problem
::
BlockShape
;
if
constexpr
(
std
::
is_same_v
<
typename
Problem
::
ADataType
,
ck_tile
::
bf16_t
>
&&
std
::
is_same_v
<
typename
Problem
::
GDataType
,
ck_tile
::
bf16_t
>
&&
S_
::
Block_M0
==
32
&&
S_
::
Block_N0
==
512
&&
S_
::
Block_K0
==
128
&&
S_
::
Warp_M0
==
16
&&
S_
::
Warp_N0
==
16
&&
S_
::
Warp_K0
==
32
)
{
return
Flatmm_ff_32x512x128_1x4x1_16x16x32_BF16
{};
}
else
if
constexpr
(
std
::
is_same_v
<
typename
Problem
::
ADataType
,
ck_tile
::
fp16_t
>
&&
std
::
is_same_v
<
typename
Problem
::
GDataType
,
ck_tile
::
fp16_t
>
&&
S_
::
Block_M0
==
32
&&
S_
::
Block_N0
==
512
&&
S_
::
Block_K0
==
128
&&
S_
::
Warp_M0
==
16
&&
S_
::
Warp_N0
==
16
&&
S_
::
Warp_K0
==
32
)
{
return
Flatmm_ff_32x512x128_1x4x1_16x16x32_FP16
{};
}
}
};
}
// namespace ck_tile
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment