Commit 58b6996a authored by ltqin's avatar ltqin
Browse files

some gridwise gemm write to C matrix

parent 971220d8
......@@ -131,6 +131,17 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
return make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, I1, M0, M1, M2, N));
}
__host__ __device__ static constexpr auto GetCGM0N0M1N1M2M3M4N2ThreadDescriptor()
{
constexpr auto c_m0_m1_m2_n_tblk_lens = xdlops_gemm.GetCM0M1M2NThreadBlkLengths();
constexpr auto M0 = c_m0_m1_m2_n_tblk_lens[I0];
constexpr auto M1 = c_m0_m1_m2_n_tblk_lens[I1];
constexpr auto M2 = c_m0_m1_m2_n_tblk_lens[I2];
constexpr auto N = c_m0_m1_m2_n_tblk_lens[I3];
return make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}, I1, I1, I1, I1, M0, M1, M2, N));
}
__host__ __device__ static constexpr auto GetCM0N0M1N1M2M3M4N2BlockDescriptor()
{
constexpr auto c_m0_n0_m1_n1_m2_n2_block_desc =
......@@ -144,6 +155,20 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
return xdlops_gemm.MakeCM0N0M1N1M2M3M4N2Descriptor(c_m0_n0_m1_n1_m2_n2_block_desc);
}
__host__ __device__ static constexpr auto GetCGM0N0M1N1M2M3M4N2BlockDescriptor()
{
constexpr auto c_m0_n0_m1_n1_m2_n2_block_desc =
make_naive_tensor_descriptor_packed(make_tuple(Number<1>{},
Number<MRepeat>{},
Number<NRepeat>{},
Number<MWaves>{},
Number<NWaves>{},
Number<MPerXDL>{},
Number<NPerXDL>{}));
return xdlops_gemm.MakeCM0N0M1N1M2M3M4N2Descriptor(c_m0_n0_m1_n1_m2_n2_block_desc);
}
template <typename CMNGridDesc>
__host__ __device__ static constexpr auto
MakeCM0N0M1N1M2M3M4N2GridDescriptor(const CMNGridDesc& c_m_n_grid_desc)
......
......@@ -255,24 +255,24 @@ void device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwgc_gkyxc_nhwgk(
Sequence<0, 0, 0, 0, 0, 0, 0>{})); // 3-: GemmK1
constexpr auto out_gemmg_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: M0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 1+: M0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 2+: N0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3+: M1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 4+: N1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 5+: M2
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 6+: M3
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 7+: M4
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), // 8+: N2
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0-: M0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 1-: M0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 2-: N0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3-: M1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 4-: N1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 5-: M2
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 6-: M3
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 7-: M4
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 8-: N2
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: M0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 1+: M0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 2+: N0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3+: M1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 4+: N1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 5+: M2
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 6+: M3
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 7+: M4
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), // 8+: N2
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0-: M0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 1-: M0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 2-: N0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3-: M1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 4-: N1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 5-: M2
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 6-: M3
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 7-: M4
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 8-: N2
constexpr auto in_gemmg_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks =
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{};
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment