Commit 6a07464b authored by coderfeli

Changed approach, but still could not use immediate data for ds_read

parent 405c05c0
@@ -66,6 +66,7 @@ else()
         -Wunreachable-code
         -Wunused
         -Wno-reserved-identifier
+        -v --save-temps -Wno-gnu-line-marker
         # -Werror
         -Wno-option-ignored
         -Wsign-compare
...
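Note on the added flags: -v prints the full clang driver invocation, --save-temps keeps the preprocessed/IR/assembly intermediates (presumably so the emitted ISA can be inspected for the ds_read addressing mentioned in the commit message), and -Wno-gnu-line-marker silences the GNU line-marker warnings that recompiling the saved preprocessed sources would otherwise trigger. That reading of the intent is an assumption; the flags themselves are standard clang options.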
@@ -82,7 +82,8 @@ auto create_args(int argc, char* argv[])
         .insert("prec", "fp16", "data type. fp16/bf16/fp8/bf8")
         .insert("warmup", "50", "number of iterations before benchmark the kernel")
         .insert("repeat", "100", "number of iterations to benchmark the kernel")
-        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer");
+        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
+        .insert("init", "0", "0:random, 1:linear, 2:constant(1)");
     bool result = arg_parser.parse(argc, argv);
     return std::make_tuple(result, arg_parser);
...
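The new "init" option selects how A and B are initialized before the run: 0 keeps the previous behaviour (uniform random values in [-1, 1]), 1 fills a monotonically increasing (linear) sequence, and 2 sets every element to the constant 1. Deterministic inits such as 1 and 2 are typically easier to debug data-movement issues with than random data; the default of "0" leaves the benchmark behaviour unchanged.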
@@ -69,6 +69,7 @@ int run_gemm_example_with_layouts(int argc,
     ck_tile::index_t batch_size = arg_parser.get_int("b");
     int n_warmup                = arg_parser.get_int("warmup");
     int n_repeat                = arg_parser.get_int("repeat");
+    ck_tile::index_t init_method = arg_parser.get_int("init");
     using namespace ck_tile::literals;
@@ -114,14 +115,16 @@ int run_gemm_example_with_layouts(int argc,
                             f_host_tensor_descriptor(M, N, stride_C, CLayout{}));
     // TODO: add different init types
-    ck_tile::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k);
-    ck_tile::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n);
-    // ck_tile::FillMonotonicSeq<ADataType>{}(a_m_k);
-    // ck_tile::FillMonotonicSeq<BDataType>{}(b_k_n);
-    // ck_tile::FillConstant<ADataType>{1.f}(a_m_k);
-    // ck_tile::FillConstant<BDataType>{1.f}(b_k_n);
+    if (init_method == 0) {
+        ck_tile::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k);
+        ck_tile::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n);
+    } else if (init_method == 1) {
+        ck_tile::FillMonotonicSeq<ADataType>{}(a_m_k);
+        ck_tile::FillMonotonicSeq<BDataType>{}(b_k_n);
+    } else {
+        ck_tile::FillConstant<ADataType>{1.f}(a_m_k);
+        ck_tile::FillConstant<BDataType>{1.f}(b_k_n);
+    }
     ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes());
     ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes());
     ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes());
...
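For reference, a minimal standalone sketch of what the three modes produce, written against plain std::vector rather than ck_tile's host tensors (fill_random/fill_linear/fill_constant are illustrative helpers, not ck_tile APIs):

    #include <algorithm>
    #include <numeric>
    #include <random>
    #include <vector>

    // init 0: uniform random values in [-1, 1], like ck_tile::FillUniformDistribution{-1.f, 1.f}
    inline void fill_random(std::vector<float>& v)
    {
        std::mt19937 gen{0};
        std::uniform_real_distribution<float> dist{-1.f, 1.f};
        std::generate(v.begin(), v.end(), [&] { return dist(gen); });
    }

    // init 1: monotonically increasing ("linear") sequence 0, 1, 2, ..., like ck_tile::FillMonotonicSeq
    inline void fill_linear(std::vector<float>& v) { std::iota(v.begin(), v.end(), 0.f); }

    // init 2: every element set to 1, like ck_tile::FillConstant{1.f}
    inline void fill_constant(std::vector<float>& v) { std::fill(v.begin(), v.end(), 1.f); }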
@@ -374,29 +374,29 @@ struct BlockwiseGemmXdlops_pipeline_v4
     {
         // schedule
         constexpr auto num_ds_read_inst =
-            HotLoopInstList::A_LDS_Read_Inst_Num + HotLoopInstList::B_LDS_Read_Inst_Num;
+            HotLoopInstList::A_LDS_Read_Inst_Num + HotLoopInstList::B_LDS_Read_Inst_Num; // 16
         constexpr auto num_ds_write_inst =
-            HotLoopInstList::A_LDS_Write_Inst_Num + HotLoopInstList::B_LDS_Write_Inst_Num;
+            HotLoopInstList::A_LDS_Write_Inst_Num + HotLoopInstList::B_LDS_Write_Inst_Num; // 8
         ;
         constexpr auto num_buffer_load_inst =
-            HotLoopInstList::A_Buffer_Load_Inst_Num + HotLoopInstList::B_Buffer_Load_Inst_Num;
+            HotLoopInstList::A_Buffer_Load_Inst_Num + HotLoopInstList::B_Buffer_Load_Inst_Num; // 8
         ;
-        constexpr auto num_mfma_inst = HotLoopInstList::C_MFMA_Inst_Num;
+        constexpr auto num_mfma_inst = HotLoopInstList::C_MFMA_Inst_Num; // 64
-        constexpr auto num_issue = num_buffer_load_inst;
+        constexpr auto num_issue = num_buffer_load_inst; // 8
         static_for<0, num_issue, 1>{}([&](auto i) {
             ignore = i;
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA : 1
             __builtin_amdgcn_sched_group_barrier(
-                0x100, num_ds_read_inst / num_buffer_load_inst, 0); // DS read
+                0x100, num_ds_read_inst / num_buffer_load_inst, 0); // DS read : 2
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA : 1
             __builtin_amdgcn_sched_group_barrier(
-                0x200, num_ds_write_inst / num_buffer_load_inst, 0); // DS write
+                0x200, num_ds_write_inst / num_buffer_load_inst, 0); // DS write : 1
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA : 1
-            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read : 1
             __builtin_amdgcn_sched_group_barrier(
-                0x008, num_mfma_inst / num_buffer_load_inst - 3, 0); // MFMA
+                0x008, num_mfma_inst / num_buffer_load_inst - 3, 0); // MFMA : 5
         });
     }
...
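Spelled out, the schedule emits one group of sched_group_barrier directives per buffer_load issue. Assuming the counts annotated by the new comments (16 ds_read, 8 ds_write, 8 buffer_load, 64 MFMA per hot loop), each of the 8 groups gets 2 DS reads, 1 DS write, 1 VMEM read, and 1 + 1 + 1 + 5 = 8 MFMAs, which accounts for all 64 MFMAs. The mask values follow the sched_group_barrier convention (0x008 = MFMA, 0x020 = VMEM read, 0x100 = DS read, 0x200 = DS write). A small standalone check of that arithmetic:

    // Standalone check of the per-group split; the instruction counts are the ones
    // annotated in this commit's comments, not queried from HotLoopInstList.
    constexpr int num_ds_read_inst     = 16;
    constexpr int num_ds_write_inst    = 8;
    constexpr int num_buffer_load_inst = 8;
    constexpr int num_mfma_inst        = 64;
    constexpr int num_issue            = num_buffer_load_inst; // one sched group per buffer_load

    static_assert(num_ds_read_inst / num_issue == 2, "2 DS reads per group");
    static_assert(num_ds_write_inst / num_issue == 1, "1 DS write per group");
    static_assert(num_mfma_inst / num_issue - 3 == 5, "5 trailing MFMAs per group");
    // Per group: 3 interleaved MFMAs + 5 trailing = 8 MFMAs; 8 groups * 8 = 64 total.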
@@ -184,7 +184,6 @@ struct BlockGemmARegBRegCRegV2
             a_block_outer_dstr_encoding, typename WG::AWarpDstrEncoding{});
         constexpr auto a_block_dstr = make_static_tile_distribution(a_block_dstr_encode);
         return a_block_dstr;
-        // return make_static_distributed_tensor<ADataType>(a_block_dstr);
     }

     CK_TILE_DEVICE static constexpr auto MakeBBlockDistribution()
@@ -208,10 +207,13 @@ struct BlockGemmARegBRegCRegV2
     template <typename BlockWindow, typename BlockTensor>
     CK_TILE_DEVICE static auto PrefetchLds(const BlockWindow& block_window, BlockTensor& block_tensor)
     {
-        auto tileDist = BlockTensor::get_tile_distribution();//.get_static_tile_distribution_encoding()
+        auto tileDist = BlockTensor::get_tile_distribution();
         return load_tile(block_tensor, make_tile_window(block_window, tileDist));
+        // load_tile_raw(block_tensor, make_tile_window_linear_raw(block_window, tileDist));
+        // return;
     }

     // C = A * B
     template <typename ABlockTensor, typename BBlockTensor>
     CK_TILE_DEVICE auto operator()(const ABlockTensor& a_block_tensor,
...
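PrefetchLds now takes the tile distribution from the destination tensor itself and re-windows the LDS block window with it before calling load_tile; the raw-window path is left commented out. A rough, hypothetical call-site sketch (block_gemm_t, a_lds_window and the surrounding kernel context are illustrative assumptions, not taken from this commit):

    // Illustrative only: block_gemm_t stands for a concrete BlockGemmARegBRegCRegV2<...>
    // instantiation and a_lds_window for the tile window over the A tile in LDS.
    auto a_block_tensor = ck_tile::make_static_distributed_tensor<ADataType>(
        block_gemm_t::MakeABlockDistribution());
    block_gemm_t::PrefetchLds(a_lds_window, a_block_tensor); // fills the register tensor from LDS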