fixed output

2049ab57 · Jing Zhang · a3edbec9 · 2049ab57
Commit 2049ab57 authored Feb 01, 2021 by Jing Zhang
Hide whitespace changes
Inline Side-by-side

Showing with 1 addition and 34 deletions

composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_fp16_bfp16.hpp ...lude/tensor_operation/gridwise_gemm_xdlops_fp16_bfp16.hpp +1 -34

No files found.
--- a/composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_fp16_bfp16.hpp
+++ b/composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_fp16_bfp16.hpp
@@ -587,7 +587,6 @@ struct GridwiseBatchGemmXdlops_gkmkpack_gknkpack_gmn_v2
            constexpr index_t BlkSize = blockwise_gemm.GetBlkSize();
            constexpr index_t NumBlks = blockwise_gemm.GetNumBlks();
-#if 1
            // force unrolling the output loop to get ride of scratches
            static_for<0, NumBlks, 1>{}([&](auto blk_id) {
                // calculate origin of thread output tensor on global memory
@@ -618,40 +617,8 @@ struct GridwiseBatchGemmXdlops_gkmkpack_gknkpack_gmn_v2
                                     m_thread_data_on_global % (M2 * M1) / M2,
                                     m_thread_data_on_global % M2,
                                     n_thread_data_on_global))
-                    .Store(c_thread_vec, p_c_global);
+                    .Store(c_thread_vec.GetVector(Number<BlkSize>{})[Number<blk_id>{}], p_c_global);
            });
-#else
-            for(index_t i = 0; i < NumBlks; ++i)
-            {
-                // calculate origin of thread output tensor on global memory
-                //     blockwise GEMM c matrix starting index
-                const auto c_thread_mtx_on_block = blockwise_gemm.GetBeginOfThreadMatrixC(i);
-                const index_t m_thread_data_on_global =
-                    m_block_data_on_global + c_thread_mtx_on_block.row;
-                const index_t n_thread_data_on_global =
-                    n_block_data_on_global + c_thread_mtx_on_block.col;
-                ThreadwiseGenericTensorSliceCopy_v4r2<decltype(c_g_m0_m1_m2_n_thread_desc),
-                                                      decltype(c_g_m0_m1_m2_n_global_desc),
-                                                      CThreadCopySliceLengths,
-                                                      arithmetic_sequence_gen<0, 5, 1>::type,
-                                                      4,
-                                                      1,
-                                                      1,
-                                                      AddressSpace::Vgpr,
-                                                      AddressSpace::Global,
-                                                      CGlobalMemoryOp>(
-                    make_multi_index(0, 0, 0, 0, 0),
-                    make_multi_index(g_block_data_on_global,
-                                     m_thread_data_on_global / (M2 * M1),
-                                     m_thread_data_on_global % (M2 * M1) / M2,
-                                     m_thread_data_on_global % M2,
-                                     n_thread_data_on_global))
-                    .Run(c_thread_vec.n + i * BlkSize, p_c_global);
-            }
-#endif
        }
    }
 };