"docs/git@developer.sourcefind.cn:hehl2/torchaudio.git" did not exist on "c38ecd2eeac6d8497160343ba8080d3b9e780bd3"
Unverified commit cc0ffeb7 authored by Haocong WANG, committed by GitHub

Merge pull request #851 from ROCmSoftwarePlatform/perf_opt_fpAintB

New implementation of fpAintB (floating-point A, integer B) weight-only GEMM
parents 3ba0f0d7 bf75259f
@@ -27,7 +27,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle
BLayout,
CLayout,
ADataType,
-BDataType,
+BDataType,
CDataType,
AccDataType,
CShuffleDataType,
@@ -35,16 +35,16 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle
BElementOp,
CElementOp,
GemmDefault,
-2, // Prefetch stage
+1, // Prefetch stage
128, // BlockSize
-128, // MPerBlock
-64, // NPerBlock
+64, // MPerBlock
+128, // NPerBlock
64, // KPerBlock
8, // K1
16, // MPerWmma
16, // NPerWmma
-4, // M-Repeat // M-PerWmma / M-Repeat = M-Wave
-2, // N-Repeat // N-PerWmma / N-Repeat = N-Wave
+2, // M-Repeat // M-PerWmma / M-Repeat = M-Wave
+4, // N-Repeat // N-PerWmma / N-Repeat = N-Wave
S<4, 32, 1>,
S<1, 0, 2>,
S<1, 0, 2>,
......
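The retuned instance swaps the block tile from 128x64 (M x N) to 64x128, swaps the repeat counts from 4x2 to 2x4, and drops the prefetch depth to 1. A quick consistency check of the new numbers (a sketch; it assumes the wave size of 32 used by the WMMA-capable RDNA targets this example is written for):

// Consistency check for the retuned tile (assumption: wave size 32 on WMMA targets).
constexpr int BlockSize = 128;
constexpr int MPerBlock = 64, NPerBlock = 128;
constexpr int MPerWmma  = 16, NPerWmma  = 16;
constexpr int MRepeat   = 2,  NRepeat   = 4;
constexpr int MWaves = MPerBlock / (MPerWmma * MRepeat); // 64 / 32 = 2
constexpr int NWaves = NPerBlock / (NPerWmma * NRepeat); // 128 / 64 = 2
static_assert(MWaves * NWaves * 32 == BlockSize, "waves must exactly fill the workgroup");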
@@ -21,7 +21,7 @@ using QuantDataType = int8_t;
using BDataType = uint8_t;
using ScaleDataType = ck::half_t;
using AccDataType = float;
-using CShuffleDataType = ck::half_t;
+using CShuffleDataType = float;
using CDataType = ck::half_t;
using ALayout = Row;
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1_dequant.hpp"
namespace ck {
/**
* @brief Blockwise data transfer with dequantization
*
* RunRead loads the low-precision data and RunScaleRead loads the scale data.
* RunWrite performs the dequantization while writing the result out.
* Scale is assumed to be constant along the K dimension.
*
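* Typical call order, as used by the weight-only pipeline in this PR:
*   RunRead(src_desc, src_buf);           // load quantized data from global
*   RunScaleRead(scale_desc, scale_buf);  // load scales (once per block tile)
*   RunWrite(dst_desc, dst_buf);          // dequantize while writing out
*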
* This version does the following to avoid the scratch-memory issue:
* 1. Uses StaticallyIndexedArray instead of a C array for the thread buffer
* 2. ThreadwiseTensorSliceTransfer_v3 does not keep a reference to the tensor descriptor
* 3. ThreadwiseTensorSliceTransfer_v3::Run() does not construct a new tensor coordinate
*
*/
template <typename ThreadGroup,
typename SrcElementwiseOperation,
typename ScaleElementwiseOperation,
typename DstElementwiseOperation,
InMemoryDataOperationEnum DstInMemOp,
typename BlockSliceLengths,
typename BlockScaleSliceLengths,
typename ThreadClusterLengths,
typename ThreadClusterArrangeOrder,
typename SrcData,
typename ScaleData,
typename DstData,
typename SrcDesc,
typename ScaleDesc,
typename DstDesc,
typename SrcDimAccessOrder,
typename DstDimAccessOrder,
index_t SrcVectorDim,
index_t DstVectorDim,
index_t SrcScalarPerVector,
index_t ScaleScalarPerVector,
index_t DstScalarPerVector,
index_t SrcScalarStrideInVector,
index_t ScaleScalarStrideInVector,
index_t DstScalarStrideInVector,
bool ThreadTransferSrcResetCoordinateAfterRun,
bool ThreadTransferDstResetCoordinateAfterRun,
index_t NumThreadScratch = 1>
struct ThreadGroupTensorSliceTransfer_v4r1_dequant
{
static constexpr index_t nDim = remove_reference_t<SrcDesc>::GetNumOfDimension();
static constexpr auto thread_slice_lengths = BlockSliceLengths{} / ThreadClusterLengths{};
static constexpr auto scale_thread_slice_lengths =
BlockScaleSliceLengths{} / ThreadClusterLengths{};
using Index = MultiIndex<nDim>;
__device__ constexpr ThreadGroupTensorSliceTransfer_v4r1_dequant(
const SrcDesc& src_desc,
const Index& src_block_slice_origin,
const SrcElementwiseOperation& src_element_op,
const ScaleDesc& scale_desc,
const Index& scale_block_slice_origin,
const ScaleElementwiseOperation& scale_element_op,
const DstDesc& dst_desc,
const Index& dst_block_slice_origin,
const DstElementwiseOperation& dst_element_op)
: threadwise_transfer_(src_desc,
make_zero_multi_index<nDim>(),
src_element_op,
scale_desc,
make_zero_multi_index<nDim>(),
scale_element_op,
dst_desc,
make_zero_multi_index<nDim>(),
dst_element_op)
{
static_assert(nDim == remove_cvref_t<SrcDesc>::GetNumOfDimension() &&
nDim == remove_cvref_t<ScaleDesc>::GetNumOfDimension() &&
nDim == remove_cvref_t<DstDesc>::GetNumOfDimension() &&
nDim == ThreadClusterLengths::Size() &&
nDim == ThreadClusterArrangeOrder::Size() &&
nDim == SrcDimAccessOrder::Size() && nDim == DstDimAccessOrder::Size(),
"wrong! nDim not consistent");
static_assert(
is_same<BlockSliceLengths, decltype(thread_slice_lengths * ThreadClusterLengths{})>{} &&
is_same<BlockScaleSliceLengths,
decltype(scale_thread_slice_lengths * ThreadClusterLengths{})>{},
"wrong! threads should be mapped to cover entire slicing window");
static_assert(ThreadGroup::GetNumOfThread() >= thread_cluster_desc_.GetElementSize(),
"wrong! ThreadGroup::GetNumOfThread() too small");
if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
{
const auto thread_cluster_idx = thread_cluster_desc_.CalculateBottomIndex(
make_multi_index(ThreadGroup::GetThreadId()));
const auto thread_data_idx_begin = thread_cluster_idx * thread_slice_lengths;
threadwise_transfer_.SetSrcSliceOrigin(src_desc,
src_block_slice_origin + thread_data_idx_begin);
threadwise_transfer_.SetScaleSliceOrigin(
scale_desc, scale_block_slice_origin + thread_data_idx_begin);
threadwise_transfer_.SetDstSliceOrigin(dst_desc,
dst_block_slice_origin + thread_data_idx_begin);
}
}
template <typename SrcBuffer, index_t ThreadScratchId = 0>
__device__ void RunRead(const SrcDesc& src_desc,
const SrcBuffer& src_buf,
Number<ThreadScratchId> thread_scratch_id = Number<ThreadScratchId>{})
{
if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.RunRead(src_desc, src_buf, thread_scratch_id);
}
}
// Under the stated assumption, only one scale scratch buffer is ever needed
template <typename ScaleBuffer>
__device__ void RunScaleRead(const ScaleDesc& scale_desc, const ScaleBuffer& scale_buf)
{
if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.RunScaleRead(scale_desc, scale_buf);
}
}
template <typename DstBuffer, index_t ThreadScratchId = 0>
__device__ void RunWrite(const DstDesc& dst_desc,
DstBuffer& dst_buf,
Number<ThreadScratchId> thread_scratch_id = Number<ThreadScratchId>{})
{
if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.RunWrite(dst_desc, dst_buf, thread_scratch_id);
}
}
// Using this API directly is discouraged; prefer the separate RunRead/RunWrite calls
/*
template <typename SrcBuffer, typename DstBuffer, index_t ThreadScratchId>
__device__ void Run(const SrcDesc& src_desc,
const SrcBuffer& src_buf,
const DstDesc& dst_desc,
DstBuffer& dst_buf,
Number<ThreadScratchId> thread_scratch_id)
{
RunRead(src_desc, src_buf, thread_scratch_id);
RunWrite(dst_desc, dst_buf, thread_scratch_id);
}
*/
__device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, const Index& step)
{
if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.MoveSrcSliceWindow(src_desc, step);
}
}
__device__ void MoveDstSliceWindow(const DstDesc& dst_desc, const Index& step)
{
if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.MoveDstSliceWindow(dst_desc, step);
}
}
// Under the stated assumption, the scale buffer needs no move-slice-window method
private:
static constexpr auto thread_cluster_desc_ =
make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
using ThreadwiseTransfer =
ThreadwiseTensorSliceTransfer_v3r1_dequant<decltype(thread_slice_lengths),
decltype(scale_thread_slice_lengths),
SrcElementwiseOperation,
ScaleElementwiseOperation,
DstElementwiseOperation,
DstInMemOp,
SrcData,
ScaleData,
DstData,
SrcDesc,
ScaleDesc,
DstDesc,
SrcDimAccessOrder,
DstDimAccessOrder,
SrcVectorDim,
DstVectorDim,
SrcScalarPerVector,
ScaleScalarPerVector,
DstScalarPerVector,
SrcScalarStrideInVector,
ScaleScalarStrideInVector,
DstScalarStrideInVector,
ThreadTransferSrcResetCoordinateAfterRun,
ThreadTransferDstResetCoordinateAfterRun,
NumThreadScratch>;
ThreadwiseTransfer threadwise_transfer_;
};
} // namespace ck
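For intuition, the per-thread slice computed above is the elementwise quotient of the block window by the thread cluster, and the static_asserts guarantee the cluster tiles the window exactly. A standalone sketch with made-up shapes (the values below are illustrative assumptions, not taken from this PR):

#include <cstdio>

int main()
{
    // Hypothetical shapes: block window and thread cluster (not from this PR).
    const int block_slice[3]    = {1, 64, 128};
    const int thread_cluster[3] = {1, 4, 32};
    int per_thread[3], threads = 1;
    for (int d = 0; d < 3; ++d)
    {
        per_thread[d] = block_slice[d] / thread_cluster[d]; // elementwise quotient
        threads *= thread_cluster[d];
    }
    // 128 threads each own a 1 x 16 x 4 sub-slice of the 1 x 64 x 128 window.
    printf("threads = %d, per-thread slice = %d x %d x %d\n",
           threads, per_thread[0], per_thread[1], per_thread[2]);
    return 0;
}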
@@ -66,7 +66,7 @@ template <typename ALayout,
typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
ck::LoopScheduler LoopSched = make_default_loop_scheduler(),
-ck::PipelineVersion PipelineVer = ck::PipelineVersion::dequant_v1>
+ck::PipelineVersion PipelineVer = ck::PipelineVersion::weight_only>
struct DeviceFpAintBGemm_Wmma_CShuffle : public DeviceGemm_dequantB<ALayout,
BLayout,
CLayout,
@@ -95,7 +95,7 @@ struct DeviceFpAintBGemm_Wmma_CShuffle : public DeviceGemm_dequantB<ALayout,
static constexpr auto BEnableLds_auto = MWaves == 1 ? false : true;
// If true, LDS is used unconditionally
-// LDS bypass feature not checked.
+// LDS bypass feature not implemented for dequantization pipeline.
static constexpr auto AEnableLds_manu = true;
static constexpr auto BEnableLds_manu = true;
@@ -677,7 +677,7 @@ struct DeviceFpAintBGemm_Wmma_CShuffle : public DeviceGemm_dequantB<ALayout,
std::map<PipelineVersion, std::string> PipelineVersionToString{
{PipelineVersion::v1, "v1"},
{PipelineVersion::v2, "v2"},
-{PipelineVersion::dequant_v1, "dequant_v1"}};
+{PipelineVersion::weight_only, "weight_only"}};
// clang-format off
str << "DeviceFpAintBGemm_Wmma_CShuffle"
......
@@ -405,10 +405,10 @@ struct FastNumericArrayConverter<uint8_t, ck::half_t, 4>
half_2[1] = __builtin_amdgcn_perm(fp16_adder, uint8_4, byte_selector_23);
static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64806480;
-asm volatile("v_pk_add_f16 %0, %1, %2 neg_lo:[0,1] neg_hi:[0,1]\n"
+asm volatile("v_pk_add_f16 %0, %1, %2 neg_lo:[0,1] neg_hi:[0,1]"
: "=v"(half_2[0])
: "v"(half_2[0]), "s"(I8s_TO_F16s_MAGIC_NUM));
-asm volatile("v_pk_add_f16 %0, %1, %2 neg_lo:[0,1] neg_hi:[0,1]\n"
+asm volatile("v_pk_add_f16 %0, %1, %2 neg_lo:[0,1] neg_hi:[0,1]"
: "=v"(half_2[1])
: "v"(half_2[1]), "s"(I8s_TO_F16s_MAGIC_NUM));
......
@@ -12,7 +12,7 @@ enum struct PipelineVersion
{
v1,
v2,
-dequant_v1,
+weight_only,
};
template <PipelineVersion PipelineVer,
@@ -37,9 +37,9 @@ constexpr auto GridwiseGemmPipeline_Selector()
{
return GridwiseGemmPipeline_v2{};
}
-else if constexpr(PipelineVer == PipelineVersion::dequant_v1)
+else if constexpr(PipelineVer == PipelineVersion::weight_only)
{
-return GridwiseGemmPipeline_v1_dequant<NumPrefetch, AEnableLds, BEnableLds>{};
+return GridwiseGemmPipeline_v1_WeightOnly<NumPrefetch, AEnableLds, BEnableLds>{};
}
else
{
......
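After the rename, a kernel picks the pipeline through the same selector. A minimal sketch (the selector's full template parameter list is truncated in the hunk above, so the parameter order below is an assumption based on the dequant branch):

// Sketch: selecting the weight-only pipeline; <1, true, true> is the only
// specialization this PR keeps, so those are the values shown here.
using GridwiseGemmPipe = ck::remove_cvref_t<decltype(
    ck::GridwiseGemmPipeline_Selector<ck::PipelineVersion::weight_only,
                                      /*NumPrefetch=*/1,
                                      /*AEnableLds=*/true,
                                      /*BEnableLds=*/true>())>;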
@@ -551,10 +551,10 @@ struct GridwiseGemmPipeline_v1<1, false, false>
};
template <index_t NumPrefetch, bool AEnableLds, bool BEnableLds>
-struct GridwiseGemmPipeline_v1_dequant;
+struct GridwiseGemmPipeline_v1_WeightOnly;
template <>
-struct GridwiseGemmPipeline_v1_dequant<1, true, true>
+struct GridwiseGemmPipeline_v1_WeightOnly<1, true, true>
{
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
@@ -580,10 +580,7 @@ struct GridwiseGemmPipeline_v1_dequant<1, true, true>
typename BBlockBuffer,
typename BBlockTransferStep,
typename ScaleGridDesc,
-typename ScaleBlockDesc,
-typename ScaleBlockTransfer,
typename ScaleGridBuffer,
-typename ScaleBlockBuffer,
typename BlockwiseGemm,
typename CThreadBuffer>
__device__ static void Run(const AGridDesc& a_grid_desc,
@@ -599,18 +596,16 @@ struct GridwiseGemmPipeline_v1_dequant<1, true, true>
BBlockBuffer& b_block_buf,
const BBlockTransferStep& b_block_copy_step,
const ScaleGridDesc& scale_grid_desc,
-const ScaleBlockDesc& scale_block_desc,
-ScaleBlockTransfer& scale_blockwise_copy,
const ScaleGridBuffer& scale_grid_buf,
-ScaleBlockBuffer& scale_block_buf,
const BlockwiseGemm& blockwise_gemm,
CThreadBuffer& c_thread_buf,
index_t num_loop)
{
-// preload data into LDS
+// Global Prefetch Stage 1
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
-scale_blockwise_copy.RunRead(scale_grid_desc, scale_grid_buf);
+// Scale read once
+b_blockwise_copy.RunScaleRead(scale_grid_desc, scale_grid_buf);
a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
@@ -619,8 +614,8 @@ struct GridwiseGemmPipeline_v1_dequant<1, true, true>
c_thread_buf.Clear();
a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
+// Dequantization fused in blockwise_copy
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
-scale_blockwise_copy.RunWrite(scale_block_desc, scale_block_buf);
// main body
if constexpr(HasMainLoop)
@@ -635,7 +630,7 @@ struct GridwiseGemmPipeline_v1_dequant<1, true, true>
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
-blockwise_gemm.Run(a_block_buf, b_block_buf, scale_block_buf, c_thread_buf);
+blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
block_sync_lds();
@@ -653,118 +648,7 @@ struct GridwiseGemmPipeline_v1_dequant<1, true, true>
{
block_sync_lds();
-blockwise_gemm.Run(a_block_buf, b_block_buf, scale_block_buf, c_thread_buf);
-}
-}
-};
-template <>
-struct GridwiseGemmPipeline_v1_dequant<1, true, false>
-{
-static constexpr auto I0 = Number<0>{};
-static constexpr auto I1 = Number<1>{};
-__host__ __device__ static constexpr bool IsSupported(index_t /* num_loop */) { return true; }
-__host__ __device__ static constexpr bool CalculateHasMainLoop(index_t num_loop)
-{
-return num_loop > 1;
-}
-template <bool HasMainLoop,
-typename AGridDesc,
-typename ABlockDesc,
-typename ABlockTransfer,
-typename AGridBuffer,
-typename ABlockBuffer,
-typename ABlockTransferStep,
-typename BGridDesc,
-typename BBlockDesc,
-typename BBlockTransfer,
-typename BGridBuffer,
-typename BBlockBuffer,
-typename BBlockTransferStep,
-typename ScaleGridDesc,
-typename ScaleBlockDesc,
-typename ScaleBlockTransfer,
-typename ScaleGridBuffer,
-typename ScaleBlockBuffer,
-typename BlockwiseGemm,
-typename CThreadBuffer>
-__device__ static void Run(const AGridDesc& a_grid_desc,
-const ABlockDesc& a_block_desc,
-ABlockTransfer& a_blockwise_copy,
-const AGridBuffer& a_grid_buf,
-ABlockBuffer& a_block_buf,
-const ABlockTransferStep& a_block_copy_step,
-const BGridDesc& b_grid_desc,
-const BBlockDesc& b_block_desc,
-BBlockTransfer& b_blockwise_copy,
-const BGridBuffer& b_grid_buf,
-BBlockBuffer& b_block_buf,
-const BBlockTransferStep& b_block_copy_step,
-const ScaleGridDesc& scale_grid_desc,
-const ScaleBlockDesc& scale_block_desc,
-ScaleBlockTransfer& scale_blockwise_copy,
-const ScaleGridBuffer& scale_grid_buf,
-ScaleBlockBuffer& scale_block_buf,
-const BlockwiseGemm& blockwise_gemm,
-CThreadBuffer& c_thread_buf,
-index_t num_loop)
-{
-constexpr auto b_block_origin_idx = make_tuple(I0, I0, I0, I0, I0, I0, I0);
-auto b_block_buf_switch = b_block_buf;
-// preload data into LDS
-a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
-b_blockwise_copy.Run(
-b_grid_desc, b_grid_buf, b_block_desc, b_block_origin_idx, b_block_buf);
-scale_blockwise_copy.Run(
-scale_grid_desc, scale_grid_buf, scale_block_desc, b_block_origin_idx, scale_block_buf);
-a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
-b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
-// Initialize C
-c_thread_buf.Clear();
-a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
-// main body
-if constexpr(HasMainLoop)
-{
-index_t i = 0;
-do
-{
-b_blockwise_copy.Run(
-b_grid_desc, b_grid_buf, b_block_desc, b_block_origin_idx, b_block_buf_switch);
-block_sync_lds();
-a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
-blockwise_gemm.Run(a_block_buf, b_block_buf, scale_block_buf, c_thread_buf);
-block_sync_lds();
-a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
-b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
-a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
-b_block_buf = b_block_buf_switch;
-++i;
-} while(i < (num_loop - 1));
-}
-// tail
-{
-block_sync_lds();
-blockwise_gemm.Run(a_block_buf, b_block_buf, scale_block_buf, c_thread_buf);
-block_sync_lds();
+blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
}
}
};
......
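Net effect on the surviving <1, true, true> specialization: scale traffic is owned entirely by the B-side blockwise copy and dequantization happens during the LDS write, so the inner loop runs as a plain floating-point GEMM. Schematically (comments paraphrasing the code above, not a new API):

// Prologue: prefetch the first A/B tiles; scales are read exactly once.
//   a_blockwise_copy.RunRead(...);   b_blockwise_copy.RunRead(...);
//   b_blockwise_copy.RunScaleRead(scale_grid_desc, scale_grid_buf);
//   a_blockwise_copy.RunWrite(...);  b_blockwise_copy.RunWrite(...); // B dequantized here
// Main loop: prefetch tile i+1 from global while computing tile i from LDS.
//   blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);      // no scale operand
// Tail: one final blockwise_gemm.Run on the last resident tiles.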