"git@developer.sourcefind.cn:gaoqiong/composable_kernel.git" did not exist on "83d926dcb1393a800f16d91d3609901ee87084de"
Commit 29d881df authored by Anthony Chang's avatar Anthony Chang
Browse files

format

parent e8c7de8d
...@@ -250,9 +250,8 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1 ...@@ -250,9 +250,8 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
#if CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING #if CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING
static constexpr index_t KPerInnerLoop = math::max( static constexpr index_t KPerInnerLoop =
KPerThread / CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING_MAC_CLUSTERS, math::max(KPerThread / CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING_MAC_CLUSTERS, KPack);
KPack);
// 2-wave optimized blockwise gemm // 2-wave optimized blockwise gemm
template <typename ABlockBuffer, typename BBlockBuffer, typename CThreadBuffer> template <typename ABlockBuffer, typename BBlockBuffer, typename CThreadBuffer>
...@@ -319,8 +318,9 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1 ...@@ -319,8 +318,9 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
// moved here B) reduce VMEM FIFO congestion by applying small delays to // moved here B) reduce VMEM FIFO congestion by applying small delays to
// different wavefronts It is performed near the end of MAC cluster to // different wavefronts It is performed near the end of MAC cluster to
// minimize lgkmcnt penalty // minimize lgkmcnt penalty
if constexpr(int(k) == KPerThread - KPerInnerLoop && int(k_) == KPerInnerLoop - KPack && if constexpr(int(k) == KPerThread - KPerInnerLoop &&
int(m0) == MRepeat - 1 && int(n0) == NRepeat - 1) int(k_) == KPerInnerLoop - KPack && int(m0) == MRepeat - 1 &&
int(n0) == NRepeat - 1)
{ {
__builtin_amdgcn_sched_barrier(); __builtin_amdgcn_sched_barrier();
block_sync_lds(); block_sync_lds();
...@@ -350,12 +350,12 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1 ...@@ -350,12 +350,12 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
private: private:
// A[M0, M1, M2, KPerInnerLoop] // A[M0, M1, M2, KPerInnerLoop]
static constexpr auto a_thread_desc_ = static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor_packed(
make_naive_tensor_descriptor_packed(make_tuple(Number<MRepeat>{}, I1, I1, Number<KPerInnerLoop>{})); make_tuple(Number<MRepeat>{}, I1, I1, Number<KPerInnerLoop>{}));
// B[N0, N1, N2, KPerInnerLoop] // B[N0, N1, N2, KPerInnerLoop]
static constexpr auto b_thread_desc_ = static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor_packed(
make_naive_tensor_descriptor_packed(make_tuple(Number<NRepeat>{}, I1, I1, Number<KPerInnerLoop>{})); make_tuple(Number<NRepeat>{}, I1, I1, Number<KPerInnerLoop>{}));
using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatAB, using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatAB,
FloatAB, FloatAB,
...@@ -377,7 +377,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1 ...@@ -377,7 +377,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
B_K1, B_K1,
B_K1>; B_K1>;
#else // #if CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING #else // #if CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING
template <typename ABlockBuffer, typename BBlockBuffer, typename CThreadBuffer> template <typename ABlockBuffer, typename BBlockBuffer, typename CThreadBuffer>
__device__ void Run(const ABlockBuffer& a_block_buf, __device__ void Run(const ABlockBuffer& a_block_buf,
...@@ -468,7 +468,6 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1 ...@@ -468,7 +468,6 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed( static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed(
make_tuple(Number<MRepeat>{}, Number<NRepeat>{}, xdlops_gemm.GetRegSizePerXdlops())); make_tuple(Number<MRepeat>{}, Number<NRepeat>{}, xdlops_gemm.GetRegSizePerXdlops()));
AThreadCopy a_thread_copy_{CalculateAThreadOriginDataIndex()}; AThreadCopy a_thread_copy_{CalculateAThreadOriginDataIndex()};
BThreadCopy b_thread_copy_{CalculateBThreadOriginDataIndex()}; BThreadCopy b_thread_copy_{CalculateBThreadOriginDataIndex()};
}; };
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment