Commit b59d5490 authored by ltqin
Browse files

fixed MPerBlock=96

parent 0eed5076
...@@ -165,7 +165,7 @@ struct ThreadwiseTensorSliceTransfer_v1r3 ...@@ -165,7 +165,7 @@ struct ThreadwiseTensorSliceTransfer_v1r3
static_for<1, nDim, 1>{}([&](auto i) { static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_access_idx[I0]; index_t tmp = ordered_access_idx[I0];
static_for<0, i, 1>{}([&](auto j) { static_for<1, i, 1>{}([&](auto j) {
tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j]; tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j];
}); });
......
...@@ -426,7 +426,6 @@ struct DeviceGemmSplitKXdl ...@@ -426,7 +426,6 @@ struct DeviceGemmSplitKXdl
float ave_time = 0; float ave_time = 0;
const auto Run = [&](const auto& kernel) { const auto Run = [&](const auto& kernel) {
#if CK_RUN_KERNEL_AND_TIME
ave_time = launch_and_time_kernel(kernel, ave_time = launch_and_time_kernel(kernel,
nrepeat, nrepeat,
dim3(grid_size), dim3(grid_size),
...@@ -442,8 +441,14 @@ struct DeviceGemmSplitKXdl ...@@ -442,8 +441,14 @@ struct DeviceGemmSplitKXdl
arg.b_element_op_, arg.b_element_op_,
arg.c_element_op_, arg.c_element_op_,
arg.block_2_ctile_map_); arg.block_2_ctile_map_);
#else if(kbatch > 1)
nrepeat++; {
hipGetErrorString(
hipMemset(arg.p_c_grid_,
0,
arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_.GetElementSpaceSize() *
sizeof(CDataType)));
launch_kernel(kernel, launch_kernel(kernel,
dim3(grid_size), dim3(grid_size),
dim3(BlockSize), dim3(BlockSize),
...@@ -458,7 +463,7 @@ struct DeviceGemmSplitKXdl ...@@ -458,7 +463,7 @@ struct DeviceGemmSplitKXdl
arg.b_element_op_, arg.b_element_op_,
arg.c_element_op_, arg.c_element_op_,
arg.block_2_ctile_map_); arg.block_2_ctile_map_);
#endif }
}; };
if(has_main_k0_block_loop) if(has_main_k0_block_loop)
{ {
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment