Unverified Commit 4c850c90 authored by Rostyslav Geyyer, committed by GitHub

Merge branch 'develop' into lwpck-1815

parents ce30621d 1973903f
......@@ -13,7 +13,7 @@
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_gemm_tile_loop_multply.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_gemm_tile_loop_multiply.hpp"
#include "ck/host_utility/hip_check_error.hpp"
......
......@@ -13,7 +13,7 @@
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_gemm_tile_loop_multply.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_gemm_tile_loop_multiply.hpp"
#include "ck/host_utility/hip_check_error.hpp"
......
......@@ -63,7 +63,7 @@ using DeviceGemmInstance =
//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 4>;
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, S<4,4,4>>;
// clang-format on
struct ProblemSize final
......
......@@ -144,12 +144,12 @@ struct BlockwiseGemmXdlops_pipeline_v1<BlockGemmPipelineScheduler::Intrawave,
static constexpr index_t PrefillStages = 1;
static constexpr index_t GlobalBufferNum = 1;
__host__ static constexpr bool BlockHasHotloop(index_t num_loop)
__host__ __device__ static constexpr bool BlockHasHotloop(index_t num_loop)
{
return num_loop > PrefetchStages;
}
__host__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop)
__host__ __device__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop)
{
ignore = num_loop;
return TailNumber::Full;
......@@ -446,12 +446,12 @@ struct BlockwiseGemmXdlops_pipeline_v1<BlockGemmPipelineScheduler::Interwave,
static constexpr index_t PrefetchStages = 1;
static constexpr index_t PrefillStages = 1;
static constexpr index_t GlobalBufferNum = 1;
__host__ static constexpr bool BlockHasHotloop(index_t num_loop)
__host__ __device__ static constexpr bool BlockHasHotloop(index_t num_loop)
{
return num_loop > PrefetchStages;
}
__host__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop)
__host__ __device__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop)
{
ignore = num_loop;
return TailNumber::Full;
......
......@@ -153,12 +153,12 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Intrawave,
static constexpr index_t PrefillStages = 1;
static constexpr index_t GlobalBufferNum = PrefetchStages;
__host__ static constexpr bool BlockHasHotloop(index_t num_loop)
__host__ __device__ static constexpr bool BlockHasHotloop(index_t num_loop)
{
return num_loop > PrefetchStages;
}
__host__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop)
__host__ __device__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop)
{
if(num_loop % PrefetchStages == 1)
{
......@@ -646,12 +646,12 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Interwave,
static constexpr index_t PrefillStages = 1;
static constexpr index_t GlobalBufferNum = PrefetchStages;
__host__ static constexpr bool BlockHasHotloop(index_t num_loop)
__host__ __device__ static constexpr bool BlockHasHotloop(index_t num_loop)
{
return num_loop > PrefetchStages;
}
__host__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop)
__host__ __device__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop)
{
if(num_loop % PrefetchStages == 1)
{
......
......@@ -146,12 +146,12 @@ struct BlockwiseGemmXdlops_pipeline_v3<BlockGemmPipelineScheduler::Intrawave,
static constexpr index_t PrefillStages = 1;
static constexpr index_t GlobalBufferNum = 1;
__host__ static constexpr bool BlockHasHotloop(index_t num_loop)
__host__ __device__ static constexpr bool BlockHasHotloop(index_t num_loop)
{
return num_loop > PrefetchStages;
}
__host__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop)
__host__ __device__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop)
{
ignore = num_loop;
return TailNumber::Full;
......
......@@ -147,12 +147,12 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
static constexpr index_t GlobalBufferNum = 2;
static constexpr index_t HotloopUnroll = 2;
__host__ static constexpr bool BlockHasHotloop(index_t num_loop)
__host__ __device__ static constexpr bool BlockHasHotloop(index_t num_loop)
{
return num_loop > PrefetchStages;
}
__host__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop)
__host__ __device__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop)
{
if(num_loop % HotloopUnroll == 1)
{
......
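The hunks above widen the execution space of the pipelines' constexpr loop-shape helpers from __host__ to __host__ __device__, so the same predicate can be evaluated both by the host-side dispatcher and inside the kernel. A minimal sketch of what that enables, assuming a HIP toolchain; the names below are illustrative, not the CK ones:

#include <hip/hip_runtime.h>

// Hedged sketch, not CK code: a helper marked __host__ only cannot be called from
// device code; adding __device__ lets kernels evaluate the same constexpr predicate.
__host__ __device__ constexpr bool block_has_hotloop(int num_loop, int prefetch_stages)
{
    return num_loop > prefetch_stages;
}

__global__ void demo_kernel(const int* num_loop, int* has_hotloop)
{
    // Device-side use: legal only because the helper is also __device__.
    *has_hotloop = block_has_hotloop(*num_loop, /*prefetch_stages=*/1) ? 1 : 0;
}

inline bool host_dispatch(int num_loop)
{
    // Host-side use of the very same helper, e.g. to pick a kernel variant.
    return block_has_hotloop(num_loop, 1);
}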
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......@@ -501,29 +501,24 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
// for sanity check of vector memory access
for(index_t i = 0; i < NumATensor; ++i)
{
as_mz_consecutive_[i] = a_ms_ks_strides[i][NumDimM - 1] == 1;
as_kz_consecutive_[i] = a_ms_ks_strides[i][NumDimM + NumDimK - 1] == 1;
as_max_read_elems_[i] =
tie(as_continous_dim_[i], as_max_read_elems_[i]) =
CalculateMaxRead<NumDimM, NumDimK>(a_ms_ks_lengths[i], a_ms_ks_strides[i]);
}
for(index_t i = 0; i < NumBTensor; ++i)
{
bs_nz_consecutive_[i] = b_ns_ks_strides[i][NumDimN - 1] == 1;
bs_kz_consecutive_[i] = b_ns_ks_strides[i][NumDimN + NumDimK - 1] == 1;
bs_max_read_elems_[i] =
tie(bs_continous_dim_[i], bs_max_read_elems_[i]) =
CalculateMaxRead<NumDimN, NumDimK>(b_ns_ks_lengths[i], b_ns_ks_strides[i]);
}
for(index_t i = 0; i < NumDTensor; ++i)
{
ds_nz_consecutive_[i] = d_ms_ns_strides[i][NumDimM + NumDimN - 1] == 1;
ds_max_read_elems_[i] =
tie(ds_continous_dim_[i], ds_max_read_elems_[i]) =
CalculateMaxRead<NumDimM, NumDimN>(d_ms_ns_lengths[i], d_ms_ns_strides[i]);
}
e_nz_consecutive_ = e_ms_ns_stride[NumDimM + NumDimN - 1] == 1;
e_max_write_elems_ = CalculateMaxRead<NumDimM, NumDimN>(e_ms_ns_length, e_ms_ns_stride);
tie(e_continous_dim_, e_max_write_elems_) =
CalculateMaxRead<NumDimM, NumDimN>(e_ms_ns_length, e_ms_ns_stride);
}
// pointers
......@@ -553,14 +548,11 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
BElementwiseOperation b_element_op_;
CDEElementwiseOperation cde_element_op_;
// Describe whether the last part of a given dimension of A/B/D/E is consecutive
// in the memory or not.
std::array<bool, NumATensor> as_mz_consecutive_;
std::array<bool, NumATensor> as_kz_consecutive_;
std::array<bool, NumBTensor> bs_nz_consecutive_;
std::array<bool, NumBTensor> bs_kz_consecutive_;
std::array<bool, NumDTensor> ds_nz_consecutive_;
bool e_nz_consecutive_;
// Describe whether the last part of a given dimension of A/B/D/E is continues dim.
std::array<index_t, NumATensor> as_continous_dim_;
std::array<index_t, NumBTensor> bs_continous_dim_;
std::array<index_t, NumDTensor> ds_continous_dim_;
index_t e_continous_dim_;
std::array<index_t, NumATensor> as_max_read_elems_;
std::array<index_t, NumBTensor> bs_max_read_elems_;
......@@ -659,9 +651,9 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
const bool valid_a_vector_size =
arg.as_max_read_elems_[i] % ABlockTransferSrcScalarPerVector == 0;
const bool valid_a_access_dim_m =
ABlockTransferSrcVectorDim == 1 && arg.as_mz_consecutive_[i];
ABlockTransferSrcVectorDim == 1 && arg.as_continous_dim_[i] == 0;
const bool valid_a_access_dim_k =
ABlockTransferSrcVectorDim == 2 && arg.as_kz_consecutive_[i];
ABlockTransferSrcVectorDim == 2 && arg.as_continous_dim_[i] == 1;
const bool valid_a_access_dim = valid_a_access_dim_m || valid_a_access_dim_k;
if(!((valid_a_vector_size && valid_a_access_dim) ||
ABlockTransferSrcScalarPerVector == 1))
......@@ -679,9 +671,9 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
const bool valid_b_vector_size =
arg.bs_max_read_elems_[i] % BBlockTransferSrcScalarPerVector == 0;
const bool valid_b_access_dim_n =
BBlockTransferSrcVectorDim == 1 && arg.bs_nz_consecutive_[i];
BBlockTransferSrcVectorDim == 1 && arg.bs_continous_dim_[i] == 0;
const bool valid_b_access_dim_k =
BBlockTransferSrcVectorDim == 2 && arg.bs_kz_consecutive_[i];
BBlockTransferSrcVectorDim == 2 && arg.bs_continous_dim_[i] == 1;
const bool valid_b_access_dim = valid_b_access_dim_n || valid_b_access_dim_k;
if(!((valid_b_vector_size && valid_b_access_dim) ||
BBlockTransferSrcScalarPerVector == 1))
......@@ -699,7 +691,7 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
const bool valid_d_vector_size =
arg.ds_max_read_elems_[i] % CDEBlockTransferScalarPerVector_NPerBlock == 0;
// Vector read of Ds is always on N dimension.
const bool valid_d_access_dim = arg.ds_nz_consecutive_[i];
const bool valid_d_access_dim = arg.ds_continous_dim_[i] == 1;
if(!((valid_d_vector_size && valid_d_access_dim) ||
CDEBlockTransferScalarPerVector_NPerBlock == 1))
{
......@@ -714,7 +706,7 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
const bool valid_e_vector_size =
arg.e_max_write_elems_ % CDEBlockTransferScalarPerVector_NPerBlock == 0;
// Vector write of E is always on N dimension.
const bool valid_e_access_dim = arg.e_nz_consecutive_;
const bool valid_e_access_dim = arg.e_continous_dim_ == 1;
if(!((valid_e_vector_size && valid_e_access_dim) ||
CDEBlockTransferScalarPerVector_NPerBlock == 1))
{
......
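In the hunks above, the per-tensor boolean pairs (the mz/kz and nz/kz consecutive flags) collapse into a single continuous-dim index per tensor. A hedged sketch of the consolidated validity check, using illustrative names and the convention visible in the diff (0 means the first group, M or N, ends in the contiguous dimension; 1 means the second group, K, or N for D and E, does):

// Hedged sketch (illustrative free function, not the CK member check).
// src_vector_dim follows the convention in the hunks: 1 = vectorize along the
// first group (M for A, N for B), 2 = vectorize along the second group (K).
bool is_valid_vector_access(int src_vector_dim,
                            int continuous_dim,     // 0 or 1, from CalculateMaxRead
                            int max_read_elems,     // longest contiguous run, in elements
                            int scalar_per_vector)  // vector width of the transfer
{
    const bool size_ok = max_read_elems % scalar_per_vector == 0;
    const bool dim_ok  = (src_vector_dim == 1 && continuous_dim == 0) ||
                         (src_vector_dim == 2 && continuous_dim == 1);
    // A scalar (width-1) transfer is always valid, matching the fallback in the hunks.
    return (size_ok && dim_ok) || scalar_per_vector == 1;
}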
......@@ -442,25 +442,19 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
}
// for sanity check of vector memory access
a_mz_consecutive_ = a_ms_ks_strides[NumDimM - 1] == 1;
a_kz_consecutive_ = a_ms_ks_strides[NumDimM + NumDimK - 1] == 1;
a_max_read_elems_ =
tie(a_continous_dim_, a_max_read_elems_) =
CalculateMaxRead<NumDimM, NumDimK>(a_ms_ks_lengths, a_ms_ks_strides);
b_nz_consecutive_ = b_ns_ks_strides[NumDimN - 1] == 1;
b_kz_consecutive_ = b_ns_ks_strides[NumDimN + NumDimK - 1] == 1;
b_max_read_elems_ =
tie(b_continous_dim_, b_max_read_elems_) =
CalculateMaxRead<NumDimN, NumDimK>(b_ns_ks_lengths, b_ns_ks_strides);
for(index_t i = 0; i < NumDTensor; ++i)
{
ds_nz_consecutive_[i] = ds_ms_ns_strides[i][NumDimM + NumDimN - 1] == 1;
ds_max_read_elems_[i] =
tie(ds_continous_dim_[i], ds_max_read_elems_[i]) =
CalculateMaxRead<NumDimM, NumDimN>(ds_ms_ns_lengths[i], ds_ms_ns_strides[i]);
}
e_nz_consecutive_ = e_ms_ns_strides[NumDimM + NumDimN - 1] == 1;
e_max_write_elems_ =
tie(e_continous_dim_, e_max_write_elems_) =
CalculateMaxRead<NumDimM, NumDimN>(e_ms_ns_lengths, e_ms_ns_strides);
}
......@@ -501,14 +495,11 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
BElementwiseOperation b_element_op_;
CDEElementwiseOperation cde_element_op_;
// Describe whether the last part of a given dimension of A/B/D/E is consecutive
// in the memory or not.
bool a_mz_consecutive_;
bool a_kz_consecutive_;
bool b_nz_consecutive_;
bool b_kz_consecutive_;
std::array<bool, NumDTensor> ds_nz_consecutive_;
bool e_nz_consecutive_;
// Describe whether the last part of a given dimension of A/B/D/E is continues dim.
index_t a_continous_dim_;
index_t b_continous_dim_;
std::array<index_t, NumDTensor> ds_continous_dim_;
index_t e_continous_dim_;
index_t a_max_read_elems_;
index_t b_max_read_elems_;
......@@ -624,8 +615,10 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
const bool valid_a_vector_size =
arg.a_max_read_elems_ % ABlockTransferSrcScalarPerVector == 0;
const bool valid_a_access_dim_m = ABlockTransferSrcVectorDim == 1 && arg.a_mz_consecutive_;
const bool valid_a_access_dim_k = ABlockTransferSrcVectorDim == 2 && arg.a_kz_consecutive_;
const bool valid_a_access_dim_m =
ABlockTransferSrcVectorDim == 1 && arg.a_continous_dim_ == 0;
const bool valid_a_access_dim_k =
ABlockTransferSrcVectorDim == 2 && arg.a_continous_dim_ == 1;
const bool valid_a_access_dim =
valid_a_access_dim_m || valid_a_access_dim_k || ABlockTransferSrcScalarPerVector == 1;
if(!(valid_a_vector_size && valid_a_access_dim))
......@@ -635,8 +628,10 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
const bool valid_b_vector_size =
arg.b_max_read_elems_ % BBlockTransferSrcScalarPerVector == 0;
const bool valid_b_access_dim_n = BBlockTransferSrcVectorDim == 1 && arg.b_nz_consecutive_;
const bool valid_b_access_dim_k = BBlockTransferSrcVectorDim == 2 && arg.b_kz_consecutive_;
const bool valid_b_access_dim_n =
BBlockTransferSrcVectorDim == 1 && arg.b_continous_dim_ == 0;
const bool valid_b_access_dim_k =
BBlockTransferSrcVectorDim == 2 && arg.b_continous_dim_ == 1;
const bool valid_b_access_dim =
valid_b_access_dim_n || valid_b_access_dim_k || BBlockTransferSrcScalarPerVector == 1;
if(!(valid_b_vector_size && valid_b_access_dim))
......@@ -650,7 +645,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
arg.ds_max_read_elems_[i] % CDEBlockTransferScalarPerVector_NPerBlock == 0;
// Vector read of Ds is always on N dimension.
const bool valid_d_access_dim =
arg.ds_nz_consecutive_[i] || CDEBlockTransferScalarPerVector_NPerBlock == 1;
arg.ds_continous_dim_[i] == 1 || CDEBlockTransferScalarPerVector_NPerBlock == 1;
if(!(valid_d_vector_size && valid_d_access_dim))
{
valid_ds_access = false;
......@@ -665,7 +660,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
arg.e_max_write_elems_ % CDEBlockTransferScalarPerVector_NPerBlock == 0;
// Vector write of E is always on N dimension.
const bool valid_e_access_dim =
arg.e_nz_consecutive_ || CDEBlockTransferScalarPerVector_NPerBlock == 1;
arg.e_continous_dim_ == 1 || CDEBlockTransferScalarPerVector_NPerBlock == 1;
if(!(valid_e_vector_size && valid_e_access_dim))
{
return false;
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......@@ -50,25 +50,53 @@ auto CalculateMaxRead(const std::vector<index_t>& lengths, const std::vector<ind
}
// Determine the beginning and end idx of the group representing the FCD.
index_t begin_idx, end_idx;
if(strides[NumDim1 - 1] == 1)
index_t begin_idx, end_idx, continous_dim, consecutive_stride = 1;
if(strides[NumDim1 - 1] == 1 && strides[NumDim1 + NumDim2 - 1] == 1)
{
// MZ or KZ are ones
bool dims1_are_ones = true;
for(index_t dim_idx = 0; dim_idx < NumDim1; dim_idx++)
{
if(lengths[dim_idx] != 1)
{
dims1_are_ones = false;
}
}
if(dims1_are_ones)
{
begin_idx = NumDim1;
end_idx = NumDim1 + NumDim2 - 1;
continous_dim = 1;
}
else
{
begin_idx = 0;
end_idx = NumDim1 - 1;
continous_dim = 0;
}
}
else if(strides[NumDim1 - 1] == 1)
{
begin_idx = 0;
end_idx = NumDim1 - 1;
continous_dim = 0;
}
else if(strides[NumDim1 + NumDim2 - 1] == 1)
{
begin_idx = NumDim1;
end_idx = NumDim1 + NumDim2 - 1;
continous_dim = 1;
}
else
{
// The dimension consecutive in memory is not the last dimension of any group, so only
// one element can be read/written at once.
return 1;
consecutive_stride = 1;
continous_dim = 0;
return make_tuple(continous_dim, consecutive_stride);
}
index_t consecutive_stride = 1;
for(index_t dim_idx = end_idx; dim_idx >= begin_idx; --dim_idx)
{
if(strides[dim_idx] == consecutive_stride)
......@@ -81,7 +109,7 @@ auto CalculateMaxRead(const std::vector<index_t>& lengths, const std::vector<ind
}
}
const index_t max_subsequent_elems = consecutive_stride;
return max_subsequent_elems;
return make_tuple(continous_dim, max_subsequent_elems);
}
} // namespace device
......
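To make the new return value concrete, below is a hedged, simplified standalone re-sketch of the helper (it drops the dims1_are_ones special case and uses std types instead of CK's), followed by a made-up example:

#include <tuple>
#include <vector>

// Simplified sketch: the dims are split into two groups of NumDim1 and NumDim2
// dimensions; return {continuous_dim, max_consecutive_elems}.
std::tuple<int, int> calculate_max_read_sketch(int NumDim1, int NumDim2,
                                               const std::vector<int>& lengths,
                                               const std::vector<int>& strides)
{
    int begin_idx, end_idx, continuous_dim;
    if(strides[NumDim1 - 1] == 1)
    {
        begin_idx = 0; end_idx = NumDim1 - 1; continuous_dim = 0;                  // first group is FCD
    }
    else if(strides[NumDim1 + NumDim2 - 1] == 1)
    {
        begin_idx = NumDim1; end_idx = NumDim1 + NumDim2 - 1; continuous_dim = 1;  // second group is FCD
    }
    else
    {
        return {0, 1}; // no group ends in the contiguous dimension: scalar access only
    }

    int consecutive = 1;
    for(int d = end_idx; d >= begin_idx; --d)
    {
        if(strides[d] == consecutive)
            consecutive *= lengths[d];  // this dimension extends the contiguous run
        else
            break;
    }
    return {continuous_dim, consecutive};
}

// Example: an A tensor laid out as [M0, M1, K0, K1] with NumDim1 = NumDim2 = 2,
// lengths = {8, 4, 16, 2}, strides = {128, 32, 2, 1}.
// strides[3] == 1, so the K group is contiguous: the result is {1, 32}, i.e.
// continuous_dim = 1 and up to K0 * K1 = 32 elements can be read per vector access.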
......@@ -93,9 +93,12 @@ __global__ void
__builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
const long_index_t a_batch_offset = compute_ptr_offset_of_batch.GetAPtrOffset(g_idx);
const long_index_t b_batch_offset = compute_ptr_offset_of_batch.GetBPtrOffset(g_idx);
const long_index_t e_batch_offset = compute_ptr_offset_of_batch.GetEPtrOffset(g_idx);
const long_index_t a_batch_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
const long_index_t b_batch_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
const long_index_t e_batch_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
......
......@@ -54,9 +54,12 @@ __global__ void
__builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
const long_index_t a_batch_offset = compute_ptr_offset_of_batch.GetAPtrOffset(g_idx);
const long_index_t b_batch_offset = compute_ptr_offset_of_batch.GetBPtrOffset(g_idx);
const long_index_t c_batch_offset = compute_ptr_offset_of_batch.GetCPtrOffset(g_idx);
const long_index_t a_batch_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
const long_index_t b_batch_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
const long_index_t c_batch_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetCPtrOffset(g_idx));
__shared__ FloatAB p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB)];
......
......@@ -66,9 +66,12 @@ __global__ void
__builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
const long_index_t a_batch_offset = compute_ptr_offset_of_batch.GetAPtrOffset(g_idx);
const long_index_t b_batch_offset = compute_ptr_offset_of_batch.GetBPtrOffset(g_idx);
const long_index_t c_batch_offset = compute_ptr_offset_of_batch.GetCPtrOffset(g_idx);
const long_index_t a_batch_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
const long_index_t b_batch_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
const long_index_t c_batch_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetCPtrOffset(g_idx));
__shared__ FloatA p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatA)];
......
......@@ -59,9 +59,12 @@ __global__ void
const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z * NumBatchToMerge);
const index_t k_idx = __builtin_amdgcn_readfirstlane(blockIdx.y * num_k_per_block);
const long_index_t a_batch_offset = compute_ptr_offset_of_batch.GetAPtrOffset(g_idx);
const long_index_t b_batch_offset = compute_ptr_offset_of_batch.GetBPtrOffset(g_idx);
const long_index_t e_batch_offset = compute_ptr_offset_of_batch.GetEPtrOffset(g_idx);
const long_index_t a_batch_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
const long_index_t b_batch_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
const long_index_t e_batch_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
......@@ -113,9 +116,12 @@ __global__ void
const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z * NumBatchToMerge);
const index_t k_idx = __builtin_amdgcn_readfirstlane(blockIdx.y * num_k_per_block);
const long_index_t a_batch_offset = compute_ptr_offset_of_batch.GetAPtrOffset(g_idx);
const long_index_t b_batch_offset = compute_ptr_offset_of_batch.GetBPtrOffset(g_idx);
const long_index_t e_batch_offset = compute_ptr_offset_of_batch.GetEPtrOffset(g_idx);
const long_index_t a_batch_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
const long_index_t b_batch_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
const long_index_t e_batch_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
// Pass two lds pointer is the key to tell compiler that ds_read/write
// operate on different lds chunk at same time without order dependecy
......
......@@ -97,9 +97,12 @@ __global__ void
__builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
const long_index_t a_batch_offset = compute_ptr_offset_of_batch.GetAPtrOffset(g_idx);
const long_index_t b_batch_offset = compute_ptr_offset_of_batch.GetBPtrOffset(g_idx);
const long_index_t c_batch_offset = compute_ptr_offset_of_batch.GetEPtrOffset(g_idx);
const long_index_t a_batch_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
const long_index_t b_batch_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
const long_index_t c_batch_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
......
......@@ -106,10 +106,12 @@ __global__ void
const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.y / num_blocks_per_batch);
const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.y / num_blocks_per_n);
const long_index_t e_batch_offset = compute_ptr_offset_of_groups.GetEPtrOffset(g_idx);
const long_index_t e_batch_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetEPtrOffset(g_idx));
const auto& ds_batch_offset = compute_ptr_offset_of_groups.GetDsPtrOffset(g_idx);
const long_index_t e_n_offset = compute_ptr_offset_of_n.GetEPtrOffset(n_idx);
const long_index_t e_n_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_n.GetEPtrOffset(n_idx));
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
......@@ -170,10 +172,13 @@ __global__ void
}
else
{
const long_index_t a_batch_offset = compute_ptr_offset_of_groups.GetAPtrOffset(g_idx);
const long_index_t b_batch_offset = compute_ptr_offset_of_groups.GetBPtrOffset(g_idx);
const long_index_t a_batch_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetAPtrOffset(g_idx));
const long_index_t b_batch_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetBPtrOffset(g_idx));
const long_index_t a_n_offset = compute_ptr_offset_of_n.GetAPtrOffset(n_idx);
const long_index_t a_n_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_n.GetAPtrOffset(n_idx));
GridwiseGemm::template Run<HasMainKBlockLoop>(
p_as_grid + a_batch_offset + a_n_offset,
......
......@@ -85,12 +85,17 @@ __global__ void
const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.y / num_blocks_per_batch);
const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.y / num_blocks_per_n);
const long_index_t a_batch_offset = compute_ptr_offset_of_groups.GetAPtrOffset(g_idx);
const long_index_t b_batch_offset = compute_ptr_offset_of_groups.GetBPtrOffset(g_idx);
const long_index_t e_batch_offset = compute_ptr_offset_of_groups.GetEPtrOffset(g_idx);
const long_index_t a_n_offset = compute_ptr_offset_of_n.GetAPtrOffset(n_idx);
const long_index_t e_n_offset = compute_ptr_offset_of_n.GetEPtrOffset(n_idx);
const long_index_t a_batch_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetAPtrOffset(g_idx));
const long_index_t b_batch_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetBPtrOffset(g_idx));
const long_index_t e_batch_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetEPtrOffset(g_idx));
const long_index_t a_n_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_n.GetAPtrOffset(n_idx));
const long_index_t e_n_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_n.GetEPtrOffset(n_idx));
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
......@@ -142,12 +147,17 @@ __global__ void
const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.y / num_blocks_per_batch);
const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.y / num_blocks_per_n);
const long_index_t a_batch_offset = compute_ptr_offset_of_groups.GetAPtrOffset(g_idx);
const long_index_t b_batch_offset = compute_ptr_offset_of_groups.GetBPtrOffset(g_idx);
const long_index_t e_batch_offset = compute_ptr_offset_of_groups.GetEPtrOffset(g_idx);
const long_index_t a_n_offset = compute_ptr_offset_of_n.GetAPtrOffset(n_idx);
const long_index_t e_n_offset = compute_ptr_offset_of_n.GetEPtrOffset(n_idx);
const long_index_t a_batch_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetAPtrOffset(g_idx));
const long_index_t b_batch_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetBPtrOffset(g_idx));
const long_index_t e_batch_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetEPtrOffset(g_idx));
const long_index_t a_n_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_n.GetAPtrOffset(n_idx));
const long_index_t e_n_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_n.GetEPtrOffset(n_idx));
// Pass two lds pointer is the key to tell compiler that ds_read/write
// operate on different lds chunk at same time without order dependecy
......
......@@ -161,11 +161,11 @@ __global__ void
__builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
const long_index_t a_batch_offset = amd_wave_read_first_lane(
static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
const long_index_t b_batch_offset = amd_wave_read_first_lane(
static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
const long_index_t e_batch_offset = __builtin_amdgcn_readfirstlane(
const long_index_t e_batch_offset = amd_wave_read_first_lane(
static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
......
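These hunks, like the ones above, route the per-batch pointer offsets through the amd_wave_read_first_lane helper instead of the raw __builtin_amdgcn_readfirstlane. The builtin takes a 32-bit value while the offsets are long_index_t, so broadcasting lane 0's value one 32-bit half at a time is presumably what the helper does; a hedged sketch of that idea, not the CK implementation:

// Hedged sketch: broadcast lane 0's 64-bit value to the whole wavefront so the
// compiler can keep the offset in scalar (SGPR) registers rather than per-lane VGPRs.
__device__ long long read_first_lane_i64_sketch(long long value)
{
    unsigned int lo = static_cast<unsigned int>(value);
    unsigned int hi = static_cast<unsigned int>(static_cast<unsigned long long>(value) >> 32);
    lo = __builtin_amdgcn_readfirstlane(lo); // low 32 bits, uniform across the wave
    hi = __builtin_amdgcn_readfirstlane(hi); // high 32 bits
    return static_cast<long long>((static_cast<unsigned long long>(hi) << 32) | lo);
}

Either way, the intent of the annotation is unchanged: the offset depends only on the block index, so it is uniform across the wavefront and can be scalarized.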
......@@ -908,6 +908,51 @@ struct OffsettedBlockToCTileMap
UnderlyingBlockToCTileMap block_to_ctile_map_;
index_t block_start_;
};
// second version with 2 offsets
template <typename UnderlyingBlockToCTileMap>
struct OffsettedBlockToCTileMap2
{
using underlying_type = UnderlyingBlockToCTileMap;
__host__ __device__ OffsettedBlockToCTileMap2(UnderlyingBlockToCTileMap block_to_ctile_map,
index_t group_offset,
index_t tile_offset)
: block_to_ctile_map_{block_to_ctile_map},
group_offset_{group_offset},
tile_offset_{tile_offset}
{
}
template <typename TopIdx>
__host__ __device__ constexpr auto CalculateBottomIndex(const TopIdx& idx_top) const
{
return block_to_ctile_map_.CalculateBottomIndex(
make_multi_index(idx_top[Number<0>{}] + tile_offset_ - group_offset_));
}
template <typename CTileIdx, typename CTileDim>
__host__ __device__ bool ValidCTileIndex(const CTileIdx& c_tile_idx,
const CTileDim& c_tile_dim) const
{
return block_to_ctile_map_.ValidCTileIndex(c_tile_idx, c_tile_dim);
}
template <typename CGridDesc_M_N>
__host__ constexpr bool CheckValidity(const CGridDesc_M_N& c_grid_desc_m_n) const
{
return block_to_ctile_map_.CheckValidity(c_grid_desc_m_n);
}
__host__ __device__ constexpr index_t CalculateGridSize(index_t M, index_t N) const
{
return block_to_ctile_map_.CalculateGridSize(M, N);
}
__device__ void UpdateTileOffset(index_t offset) { tile_offset_ = offset; }
UnderlyingBlockToCTileMap block_to_ctile_map_;
index_t group_offset_;
index_t tile_offset_;
};
/**
* @brief Simple tile mapping which creates 3D grid of block of threads.
......
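A hedged, host-only sketch of how the second offset in the new OffsettedBlockToCTileMap2 interacts with the first, using a stub in place of the underlying block-to-C-tile map (all names and values below are made up): the block id is shifted by tile_offset - group_offset before it reaches the underlying map, and UpdateTileOffset lets a tile-loop kernel move the same block to a later tile.

#include <cassert>

// Stub standing in for the underlying map: treat the bottom index as a flat tile id.
struct IdentityTileMapStub
{
    int CalculateBottomIndex(int flat_idx) const { return flat_idx; }
};

// Mirrors the offset arithmetic of OffsettedBlockToCTileMap2 (simplified to 1D ints).
struct OffsettedMap2Sketch
{
    IdentityTileMapStub map_;
    int group_offset_; // first block id owned by this group
    int tile_offset_;  // global tile this block currently computes

    int CalculateBottomIndex(int block_id) const
    {
        return map_.CalculateBottomIndex(block_id + tile_offset_ - group_offset_);
    }
    void UpdateTileOffset(int offset) { tile_offset_ = offset; } // as in the new struct
};

int main()
{
    OffsettedMap2Sketch map{{}, /*group_offset=*/64, /*tile_offset=*/64};
    assert(map.CalculateBottomIndex(64) == 64); // block 64 starts on its group's first tile
    map.UpdateTileOffset(72);                   // a tile-loop kernel advances this block
    assert(map.CalculateBottomIndex(64) == 72);
    return 0;
}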