use fastest config

e6b32ffe · wangshaojie6 · ff6aacaf · e6b32ffe · e6b32ffe
Commit e6b32ffe authored Apr 27, 2022 by wangshaojie6
Showing with 5 additions and 5 deletions

include/ck/config.hpp +1 -1

include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp +4 -4

No files found.
--- a/include/ck/config.hpp
+++ b/include/ck/config.hpp
@@ -107,7 +107,7 @@
 // experimental feature: use __builtin_memcpy instead of pointer cast to access a vector from
 // pointer of scalar
-#define CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS 0
+#define CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS 1
 // experimental feature: use __builtin_memcpy instead of union to do bit_cast
 #define CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST 1

--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
@@ -312,8 +312,8 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
    static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed(
        make_tuple(Number<MRepeat>{}, Number<NRepeat>{}, xdlops_gemm.GetRegSizePerXdlops()));
-    static constexpr index_t A_K1_vec = A_K1 / 2;
+    // static constexpr index_t A_K1_vec = A_K1 / 2;
-    static constexpr index_t B_K1_vec = B_K1 / 2;
+    // static constexpr index_t B_K1_vec = B_K1 / 2;
    using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatAB,
                                                         FloatAB,
@@ -322,7 +322,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                                                         Sequence<1, 1, 1, KPerBlock>,
                                                         Sequence<0, 1, 2, 3>,
                                                         3,
-                                                         A_K1_vec,
+                                                         A_K1,
                                                         A_K1>;
    using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatAB,
@@ -332,7 +332,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                                                         Sequence<1, 1, 1, KPerBlock>,
                                                         Sequence<0, 1, 2, 3>,
                                                         3,
-                                                         B_K1_vec,
+                                                         B_K1,
                                                         B_K1>;
    AThreadCopy a_thread_copy_{CalculateAThreadOriginDataIndex()};