Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
6a07464b
Commit
6a07464b
authored
Nov 28, 2024
by
coderfeli
Browse files
change ways but still could not use immediate data as ds_read
parent
405c05c0
Changes
6
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
142 additions
and
253 deletions
+142
-253
cmake/EnableCompilerWarnings.cmake
cmake/EnableCompilerWarnings.cmake
+1
-0
example/ck_tile/03_gemm/gemm_basic.hpp
example/ck_tile/03_gemm/gemm_basic.hpp
+2
-1
example/ck_tile/03_gemm/run_gemm_example.inc
example/ck_tile/03_gemm/run_gemm_example.inc
+11
-8
include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops.hpp
...or_operation/gpu/block/blockwise_gemm_pipeline_xdlops.hpp
+12
-12
include/ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v2.hpp
...e/ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v2.hpp
+5
-3
include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp
...e/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp
+111
-229
No files found.
cmake/EnableCompilerWarnings.cmake
View file @
6a07464b
...
@@ -66,6 +66,7 @@ else()
...
@@ -66,6 +66,7 @@ else()
-Wunreachable-code
-Wunreachable-code
-Wunused
-Wunused
-Wno-reserved-identifier
-Wno-reserved-identifier
-v --save-temps -Wno-gnu-line-marker
# -Werror
# -Werror
-Wno-option-ignored
-Wno-option-ignored
-Wsign-compare
-Wsign-compare
...
...
example/ck_tile/03_gemm/gemm_basic.hpp
View file @
6a07464b
...
@@ -82,7 +82,8 @@ auto create_args(int argc, char* argv[])
...
@@ -82,7 +82,8 @@ auto create_args(int argc, char* argv[])
.
insert
(
"prec"
,
"fp16"
,
"data type. fp16/bf16/fp8/bf8"
)
.
insert
(
"prec"
,
"fp16"
,
"data type. fp16/bf16/fp8/bf8"
)
.
insert
(
"warmup"
,
"50"
,
"number of iterations before benchmark the kernel"
)
.
insert
(
"warmup"
,
"50"
,
"number of iterations before benchmark the kernel"
)
.
insert
(
"repeat"
,
"100"
,
"number of iterations to benchmark the kernel"
)
.
insert
(
"repeat"
,
"100"
,
"number of iterations to benchmark the kernel"
)
.
insert
(
"timer"
,
"gpu"
,
"gpu:gpu timer, cpu:cpu timer"
);
.
insert
(
"timer"
,
"gpu"
,
"gpu:gpu timer, cpu:cpu timer"
)
.
insert
(
"init"
,
"0"
,
"0:random, 1:linear, 2:constant(1)"
);
bool
result
=
arg_parser
.
parse
(
argc
,
argv
);
bool
result
=
arg_parser
.
parse
(
argc
,
argv
);
return
std
::
make_tuple
(
result
,
arg_parser
);
return
std
::
make_tuple
(
result
,
arg_parser
);
...
...
example/ck_tile/03_gemm/run_gemm_example.inc
View file @
6a07464b
...
@@ -69,6 +69,7 @@ int run_gemm_example_with_layouts(int argc,
...
@@ -69,6 +69,7 @@ int run_gemm_example_with_layouts(int argc,
ck_tile::index_t batch_size = arg_parser.get_int("
b
");
ck_tile::index_t batch_size = arg_parser.get_int("
b
");
int n_warmup = arg_parser.get_int("
warmup
");
int n_warmup = arg_parser.get_int("
warmup
");
int n_repeat = arg_parser.get_int("
repeat
");
int n_repeat = arg_parser.get_int("
repeat
");
ck_tile::index_t init_method = arg_parser.get_int("
init
");
using namespace ck_tile::literals;
using namespace ck_tile::literals;
...
@@ -114,14 +115,16 @@ int run_gemm_example_with_layouts(int argc,
...
@@ -114,14 +115,16 @@ int run_gemm_example_with_layouts(int argc,
f_host_tensor_descriptor(M, N, stride_C, CLayout{}));
f_host_tensor_descriptor(M, N, stride_C, CLayout{}));
// TODO: add different init types
// TODO: add different init types
if (init_method == 0) {
ck_tile::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k);
ck_tile::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k);
ck_tile::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n);
ck_tile::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n);
// ck_tile::FillMonotonicSeq<ADataType>{}(a_m_k);
} else if (init_method == 1) {
// ck_tile::FillMonotonicSeq<BDataType>{}(b_k_n);
ck_tile::FillMonotonicSeq<ADataType>{}(a_m_k);
// ck_tile::FillConstant<ADataType>
{
1.f
}
(a_m_k);
ck_tile::FillMonotonicSeq<BDataType>{}(b_k_n);
// ck_tile::FillConstant<BDataType>
{
1.f
}
(b_k_n);
} else {
ck_tile::FillConstant<ADataType>
{
1.f
}
(a_m_k);
ck_tile::FillConstant<BDataType>
{
1.f
}
(b_k_n);
}
ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes());
ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes());
ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes());
ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes());
ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes());
ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes());
...
...
include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops.hpp
View file @
6a07464b
...
@@ -374,29 +374,29 @@ struct BlockwiseGemmXdlops_pipeline_v4
...
@@ -374,29 +374,29 @@ struct BlockwiseGemmXdlops_pipeline_v4
{
{
// schedule
// schedule
constexpr
auto
num_ds_read_inst
=
constexpr
auto
num_ds_read_inst
=
HotLoopInstList
::
A_LDS_Read_Inst_Num
+
HotLoopInstList
::
B_LDS_Read_Inst_Num
;
HotLoopInstList
::
A_LDS_Read_Inst_Num
+
HotLoopInstList
::
B_LDS_Read_Inst_Num
;
//16
constexpr
auto
num_ds_write_inst
=
constexpr
auto
num_ds_write_inst
=
HotLoopInstList
::
A_LDS_Write_Inst_Num
+
HotLoopInstList
::
B_LDS_Write_Inst_Num
;
HotLoopInstList
::
A_LDS_Write_Inst_Num
+
HotLoopInstList
::
B_LDS_Write_Inst_Num
;
//8
;
;
constexpr
auto
num_buffer_load_inst
=
constexpr
auto
num_buffer_load_inst
=
HotLoopInstList
::
A_Buffer_Load_Inst_Num
+
HotLoopInstList
::
B_Buffer_Load_Inst_Num
;
HotLoopInstList
::
A_Buffer_Load_Inst_Num
+
HotLoopInstList
::
B_Buffer_Load_Inst_Num
;
//8
;
;
constexpr
auto
num_mfma_inst
=
HotLoopInstList
::
C_MFMA_Inst_Num
;
constexpr
auto
num_mfma_inst
=
HotLoopInstList
::
C_MFMA_Inst_Num
;
//64
constexpr
auto
num_issue
=
num_buffer_load_inst
;
constexpr
auto
num_issue
=
num_buffer_load_inst
;
// 8
static_for
<
0
,
num_issue
,
1
>
{}([
&
](
auto
i
)
{
static_for
<
0
,
num_issue
,
1
>
{}([
&
](
auto
i
)
{
ignore
=
i
;
ignore
=
i
;
__builtin_amdgcn_sched_group_barrier
(
0x008
,
1
,
0
);
// MFMA
__builtin_amdgcn_sched_group_barrier
(
0x008
,
1
,
0
);
// MFMA
: 1
__builtin_amdgcn_sched_group_barrier
(
__builtin_amdgcn_sched_group_barrier
(
0x100
,
num_ds_read_inst
/
num_buffer_load_inst
,
0
);
// DS read
0x100
,
num_ds_read_inst
/
num_buffer_load_inst
,
0
);
// DS read
: 2
__builtin_amdgcn_sched_group_barrier
(
0x008
,
1
,
0
);
// MFMA
__builtin_amdgcn_sched_group_barrier
(
0x008
,
1
,
0
);
// MFMA
: 1
__builtin_amdgcn_sched_group_barrier
(
__builtin_amdgcn_sched_group_barrier
(
0x200
,
num_ds_write_inst
/
num_buffer_load_inst
,
0
);
// DS write
0x200
,
num_ds_write_inst
/
num_buffer_load_inst
,
0
);
// DS write
: 1
__builtin_amdgcn_sched_group_barrier
(
0x008
,
1
,
0
);
// MFMA
__builtin_amdgcn_sched_group_barrier
(
0x008
,
1
,
0
);
// MFMA
: 1
__builtin_amdgcn_sched_group_barrier
(
0x020
,
1
,
0
);
// VMEM read
__builtin_amdgcn_sched_group_barrier
(
0x020
,
1
,
0
);
// VMEM read
:1
__builtin_amdgcn_sched_group_barrier
(
__builtin_amdgcn_sched_group_barrier
(
0x008
,
num_mfma_inst
/
num_buffer_load_inst
-
3
,
0
);
// MFMA
0x008
,
num_mfma_inst
/
num_buffer_load_inst
-
3
,
0
);
// MFMA
: 5
});
});
}
}
...
...
include/ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v2.hpp
View file @
6a07464b
...
@@ -184,7 +184,6 @@ struct BlockGemmARegBRegCRegV2
...
@@ -184,7 +184,6 @@ struct BlockGemmARegBRegCRegV2
a_block_outer_dstr_encoding
,
typename
WG
::
AWarpDstrEncoding
{});
a_block_outer_dstr_encoding
,
typename
WG
::
AWarpDstrEncoding
{});
constexpr
auto
a_block_dstr
=
make_static_tile_distribution
(
a_block_dstr_encode
);
constexpr
auto
a_block_dstr
=
make_static_tile_distribution
(
a_block_dstr_encode
);
return
a_block_dstr
;
return
a_block_dstr
;
// return make_static_distributed_tensor<ADataType>(a_block_dstr);
}
}
CK_TILE_DEVICE
static
constexpr
auto
MakeBBlockDistribution
()
CK_TILE_DEVICE
static
constexpr
auto
MakeBBlockDistribution
()
...
@@ -208,10 +207,13 @@ struct BlockGemmARegBRegCRegV2
...
@@ -208,10 +207,13 @@ struct BlockGemmARegBRegCRegV2
template
<
typename
BlockWindow
,
typename
BlockTensor
>
template
<
typename
BlockWindow
,
typename
BlockTensor
>
CK_TILE_DEVICE
static
auto
PrefetchLds
(
const
BlockWindow
&
block_window
,
BlockTensor
&
block_tensor
)
CK_TILE_DEVICE
static
auto
PrefetchLds
(
const
BlockWindow
&
block_window
,
BlockTensor
&
block_tensor
)
{
{
auto
tileDist
=
BlockTensor
::
get_tile_distribution
();
//.get_static_tile_distribution_encoding()
auto
tileDist
=
BlockTensor
::
get_tile_distribution
();
return
load_tile
(
block_tensor
,
make_tile_window
(
block_window
,
tileDist
));
return
load_tile
(
block_tensor
,
make_tile_window
(
block_window
,
tileDist
));
// load_tile_raw(block_tensor, make_tile_window_linear_raw(block_window, tileDist));
// return;
}
}
// C = A * B
// C = A * B
template
<
typename
ABlockTensor
,
typename
BBlockTensor
>
template
<
typename
ABlockTensor
,
typename
BBlockTensor
>
CK_TILE_DEVICE
auto
operator
()(
const
ABlockTensor
&
a_block_tensor
,
CK_TILE_DEVICE
auto
operator
()(
const
ABlockTensor
&
a_block_tensor
,
...
...
include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp
View file @
6a07464b
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment