Commit b59d5490 authored by ltqin
Browse files

fixed MPerBlock=96

parent 0eed5076
......@@ -165,7 +165,7 @@ struct ThreadwiseTensorSliceTransfer_v1r3
static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_access_idx[I0];
static_for<0, i, 1>{}([&](auto j) {
static_for<1, i, 1>{}([&](auto j) {
tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j];
});
......
......@@ -426,7 +426,6 @@ struct DeviceGemmSplitKXdl
float ave_time = 0;
const auto Run = [&](const auto& kernel) {
#if CK_RUN_KERNEL_AND_TIME
ave_time = launch_and_time_kernel(kernel,
nrepeat,
dim3(grid_size),
......@@ -442,8 +441,14 @@ struct DeviceGemmSplitKXdl
arg.b_element_op_,
arg.c_element_op_,
arg.block_2_ctile_map_);
#else
nrepeat++;
if(kbatch > 1)
{
hipGetErrorString(
hipMemset(arg.p_c_grid_,
0,
arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_.GetElementSpaceSize() *
sizeof(CDataType)));
launch_kernel(kernel,
dim3(grid_size),
dim3(BlockSize),
......@@ -458,7 +463,7 @@ struct DeviceGemmSplitKXdl
arg.b_element_op_,
arg.c_element_op_,
arg.block_2_ctile_map_);
#endif
}
};
if(has_main_k0_block_loop)
{
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment