"vscode:/vscode.git/clone" did not exist on "c54f8bcc25ace7b8d9ee86ddeb72738c87f908bb"
Commit 804e6803 authored by Harisankar Sadasivan
Browse files

Modify files so that the cold (warm-up) and timed runs each last about 1 s

parent 87efbb63
...@@ -20,6 +20,33 @@ float launch_and_time_kernel(const StreamConfig& stream_config, ...@@ -20,6 +20,33 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
#if CK_TIME_KERNEL #if CK_TIME_KERNEL
if(stream_config.time_kernel_) if(stream_config.time_kernel_)
{ {
if(ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx941" || ck::get_device_name() == "gfx942")
{
hipEvent_t start, stop;
hip_check_error(hipEventCreate(&start));
hip_check_error(hipEventCreate(&stop));
hip_check_error(hipDeviceSynchronize());
hip_check_error(hipEventRecord(start, stream_config.stream_id_));
for(int i = 0; i < stream_config.nrepeat_; ++i)
{
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
hip_check_error(hipGetLastError());
}
hip_check_error(hipEventRecord(stop, stream_config.stream_id_));
hip_check_error(hipEventSynchronize(stop));
float total_time = 0;
hip_check_error(hipEventElapsedTime(&total_time, start, stop));
total_time/=10;
stream_config.cold_niters_ = (1000.0 / total_time);//we need longer runtime to ramp up the clk on MI300s
stream_config.nrepeat_ = stream_config.cold_niters_;
}
#if DEBUG_LOG #if DEBUG_LOG
printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n", printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n",
__func__, __func__,
......
...@@ -11,6 +11,6 @@ struct StreamConfig ...@@ -11,6 +11,6 @@ struct StreamConfig
hipStream_t stream_id_ = nullptr; hipStream_t stream_id_ = nullptr;
bool time_kernel_ = false; bool time_kernel_ = false;
int log_level_ = 0; int log_level_ = 0;
int cold_niters_ = 5; mutable int cold_niters_ = 5;
int nrepeat_ = 50; mutable int nrepeat_ = 50;
}; };
...@@ -669,25 +669,14 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 ...@@ -669,25 +669,14 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
const BElementwiseOperation b_element_op = BElementwiseOperation{}, const BElementwiseOperation b_element_op = BElementwiseOperation{},
const CElementwiseOperation c_element_op = CElementwiseOperation{}) const CElementwiseOperation c_element_op = CElementwiseOperation{})
{ {
for(auto i = 0; i < 1500; i++)
{
const FloatA* p_a_grid = karg.p_a_grid; const FloatA* p_a_grid = karg.p_a_grid;
const FloatB* p_b_grid = karg.p_b_grid; const FloatB* p_b_grid = karg.p_b_grid;
FloatC* p_c_grid = karg.p_c_grid; FloatC* p_c_grid = karg.p_c_grid;
const auto a_b_k0_m_k1_grid_desc = MakeAGridDescriptor_KBatch_K0_M_K1(karg.M, const auto a_b_k0_m_k1_grid_desc = MakeAGridDescriptor_KBatch_K0_M_K1(
karg.MPadded, karg.M, karg.MPadded, karg.K, karg.StrideA, karg.k_batch, karg.K0Padded, karg.KPadded);
karg.K, const auto b_b_k0_n_k1_grid_desc = MakeBGridDescriptor_KBatch_K0_N_K1(
karg.StrideA, karg.K, karg.NPadded, karg.N, karg.StrideB, karg.k_batch, karg.K0Padded, karg.KPadded);
karg.k_batch,
karg.K0Padded,
karg.KPadded);
const auto b_b_k0_n_k1_grid_desc = MakeBGridDescriptor_KBatch_K0_N_K1(karg.K,
karg.NPadded,
karg.N,
karg.StrideB,
karg.k_batch,
karg.K0Padded,
karg.KPadded);
const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N(karg.M, karg.N, karg.StrideC); const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N(karg.M, karg.N, karg.StrideC);
const auto c_grid_desc_mblock_mperblock_nblock_nperblock = const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
...@@ -877,8 +866,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 ...@@ -877,8 +866,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
auto c_thread_buf = blockwise_gemm.GetCThreadBuffer(); auto c_thread_buf = blockwise_gemm.GetCThreadBuffer();
// LDS allocation for A and B: be careful of alignment // LDS allocation for A and B: be careful of alignment
constexpr auto a_block_space_size = math::integer_least_multiple( constexpr auto a_block_space_size =
a_k0_m_k1_block_desc.GetElementSpaceSize(), max_lds_align); math::integer_least_multiple(a_k0_m_k1_block_desc.GetElementSpaceSize(), max_lds_align);
auto p_a_block = reinterpret_cast<LDSTypeA*>(p_shared_block); auto p_a_block = reinterpret_cast<LDSTypeA*>(p_shared_block);
auto p_b_block = reinterpret_cast<LDSTypeB*>(p_a_block + a_block_space_size); auto p_b_block = reinterpret_cast<LDSTypeB*>(p_a_block + a_block_space_size);
...@@ -943,23 +932,20 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 ...@@ -943,23 +932,20 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor( constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor(
c_block_desc_mblock_mperblock_nblock_nperblock, c_block_desc_mblock_mperblock_nblock_nperblock,
make_tuple(make_freeze_transform(I0), // freeze mblock make_tuple(
make_unmerge_transform( make_freeze_transform(I0), // freeze mblock
make_tuple(CShuffleMRepeatPerShuffle, make_unmerge_transform(make_tuple(CShuffleMRepeatPerShuffle,
M1, M1,
M2, M2,
M3, M3,
M4)), // M1 = MWave, M2 * M3 * M4 = MPerXDL M4)), // M1 = MWave, M2 * M3 * M4 = MPerXDL
make_freeze_transform(I0), // freeze nblock make_freeze_transform(I0), // freeze nblock
make_unmerge_transform( make_unmerge_transform(make_tuple(CShuffleNRepeatPerShuffle,
make_tuple(CShuffleNRepeatPerShuffle,
N1, N1,
N2))), // M1 = MWave, M2 * M3 * M4 = MPerXDL N2))), // M1 = MWave, M2 * M3 * M4 = MPerXDL
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<>{}, make_tuple(
Sequence<0, 2, 4, 5, 6>{}, Sequence<>{}, Sequence<0, 2, 4, 5, 6>{}, Sequence<>{}, Sequence<1, 3, 7>{}));
Sequence<>{},
Sequence<1, 3, 7>{}));
// calculate origin of thread output tensor on global memory // calculate origin of thread output tensor on global memory
// blockwise GEMM c matrix starting index // blockwise GEMM c matrix starting index
...@@ -990,8 +976,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 ...@@ -990,8 +976,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
make_multi_index(n_thread_data_on_block)); make_multi_index(n_thread_data_on_block));
// VGPR to LDS // VGPR to LDS
auto c_thread_copy_vgpr_to_lds = ThreadwiseTensorSliceTransfer_v1r3< auto c_thread_copy_vgpr_to_lds =
FloatAcc, ThreadwiseTensorSliceTransfer_v1r3<FloatAcc,
FloatC, FloatC,
decltype(c_m0_n0_m1_n1_m2_m3_m4_n2_thread_desc), decltype(c_m0_n0_m1_n1_m2_m3_m4_n2_thread_desc),
decltype(c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2), decltype(c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2),
...@@ -1009,7 +995,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 ...@@ -1009,7 +995,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
1, 1,
InMemoryDataOperationEnum::Set, InMemoryDataOperationEnum::Set,
1, 1,
true>{c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, true>{
c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
make_multi_index(0, make_multi_index(0,
0, 0,
m_thread_data_on_block_idx[I1], m_thread_data_on_block_idx[I1],
...@@ -1082,8 +1069,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 ...@@ -1082,8 +1069,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
block_sync_lds(); block_sync_lds();
// LDS to global // LDS to global
c_block_copy_lds_to_global.Run( c_block_copy_lds_to_global.Run(c_block_desc_mblock_mperblock_nblock_nperblock,
c_block_desc_mblock_mperblock_nblock_nperblock,
c_block_buf, c_block_buf,
c_grid_desc_mblock_mperblock_nblock_nperblock, c_grid_desc_mblock_mperblock_nblock_nperblock,
c_grid_buf); c_grid_buf);
...@@ -1108,13 +1094,11 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 ...@@ -1108,13 +1094,11 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
if constexpr(mxdlperwave < MRepeat - CShuffleMRepeatPerShuffle) if constexpr(mxdlperwave < MRepeat - CShuffleMRepeatPerShuffle)
{ {
c_block_copy_lds_to_global.MoveDstSliceWindow( c_block_copy_lds_to_global.MoveDstSliceWindow(
c_grid_desc_mblock_mperblock_nblock_nperblock, c_grid_desc_mblock_mperblock_nblock_nperblock, mxdlperwave_forward_step);
mxdlperwave_forward_step);
} }
}); });
} }
} }
}
static std::string GetTypeString() static std::string GetTypeString()
{ {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment