Modify a_thread_offset, since the A data is loaded differently from the B data.

f728087c · mtgu0705 · 1fcd3329 · f728087c
Commit f728087c authored Dec 25, 2024 by mtgu0705
Hide whitespace changes
Inline Side-by-side

Showing with 4 additions and 2 deletions

include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp +4 -2

No files found.
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp
@@ -1368,8 +1368,10 @@ struct GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3
            make_tuple(Number<ScaleSliceSizeN>{}, Number<ScaleSliceSizeK>{}));

        constexpr index_t MWaves = MPerBlock / (MXdlPerWave * MPerXdl);
-        auto a_thread_offset =
-            get_thread_local_1d_id() % MPerXdl + (get_thread_local_1d_id() / 64) % MWaves * MPerXdl;
+        // auto a_thread_offset =
+        //     get_thread_local_1d_id() % MPerXdl + (get_thread_local_1d_id() / 64) % MWaves * MPerXdl;
+
+        auto a_thread_offset = get_thread_local_1d_id() % MPerXdl + (get_thread_local_1d_id() / 128) * MPerXdl;
        
        auto a_scale_thread_copy =
            ThreadwiseTensorSliceTransfer_v2<AScaleType,