shrink blockwise gemm v2 thread buffer size

f5e61549 · Anthony Chang · c62165da · f5e61549
Commit f5e61549 authored Aug 15, 2022 by Anthony Chang
Show whitespace changes
Inline Side-by-side

Showing 1 changed file with 4 additions and 4 deletions

include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp +4 -4

No files found.
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
@@ -920,13 +920,13 @@ struct BlockwiseGemmXdlops_v2
    }

    protected:
-    // A[M0, M1, M2, KPerThread]
+    // A[M0, M1, M2, KPack]
    static constexpr auto a_thread_desc_ =
-        make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number<KPerThread>{}));
+        make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number<KPack>{}));

-    // B[N0, N1, N2, KPerThread]
+    // B[N0, N1, N2, KPack]
    static constexpr auto b_thread_desc_ =
-        make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number<KPerThread>{}));
+        make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number<KPack>{}));

    // C[M, N, NumRegXdlops]
    static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed(