Commit 55a89c74 authored by Jun Liu

Merge branch 'develop' into amd-develop

parents 0dacd895 dcedf363
rocm-docs-core>=0.20.0 rocm-docs-core==0.30.1
sphinxcontrib-bibtex==2.6.1 sphinxcontrib-bibtex==2.6.1
...@@ -16,7 +16,7 @@ beautifulsoup4==4.11.2 ...@@ -16,7 +16,7 @@ beautifulsoup4==4.11.2
# via pydata-sphinx-theme # via pydata-sphinx-theme
breathe==4.34.0 breathe==4.34.0
# via rocm-docs-core # via rocm-docs-core
certifi==2022.12.7 certifi==2023.7.22
# via requests # via requests
cffi==1.15.1 cffi==1.15.1
# via # via
...@@ -26,7 +26,7 @@ charset-normalizer==3.1.0 ...@@ -26,7 +26,7 @@ charset-normalizer==3.1.0
# via requests # via requests
click==8.1.3 click==8.1.3
# via sphinx-external-toc # via sphinx-external-toc
cryptography==40.0.2 cryptography==41.0.6
# via pyjwt # via pyjwt
deprecated==1.2.13 deprecated==1.2.13
# via pygithub # via pygithub
...@@ -42,7 +42,7 @@ fastjsonschema==2.18.0 ...@@ -42,7 +42,7 @@ fastjsonschema==2.18.0
# via rocm-docs-core # via rocm-docs-core
gitdb==4.0.10 gitdb==4.0.10
# via gitpython # via gitpython
gitpython==3.1.35 gitpython==3.1.37
# via rocm-docs-core # via rocm-docs-core
idna==3.4 idna==3.4
# via requests # via requests
...@@ -88,9 +88,9 @@ pydata-sphinx-theme==0.13.3 ...@@ -88,9 +88,9 @@ pydata-sphinx-theme==0.13.3
# via # via
# rocm-docs-core # rocm-docs-core
# sphinx-book-theme # sphinx-book-theme
pygithub==1.58.2 pygithub==1.58.1
# via rocm-docs-core # via rocm-docs-core
pygments==2.14.0 pygments==2.15.0
# via # via
# accessible-pygments # accessible-pygments
# pydata-sphinx-theme # pydata-sphinx-theme
...@@ -109,11 +109,11 @@ pyyaml==6.0 ...@@ -109,11 +109,11 @@ pyyaml==6.0
# pybtex # pybtex
# rocm-docs-core # rocm-docs-core
# sphinx-external-toc # sphinx-external-toc
requests==2.28.2 requests==2.31.0
# via # via
# pygithub # pygithub
# sphinx # sphinx
rocm-docs-core==0.27.0 rocm-docs-core==0.30.1
# via -r requirements.in # via -r requirements.in
six==1.16.0 six==1.16.0
# via # via
...@@ -141,7 +141,7 @@ sphinx-book-theme==1.0.1 ...@@ -141,7 +141,7 @@ sphinx-book-theme==1.0.1
# via rocm-docs-core # via rocm-docs-core
sphinx-copybutton==0.5.1 sphinx-copybutton==0.5.1
# via rocm-docs-core # via rocm-docs-core
sphinx-design==0.3.0 sphinx-design==0.4.1
# via rocm-docs-core # via rocm-docs-core
sphinx-external-toc==0.3.1 sphinx-external-toc==0.3.1
# via rocm-docs-core # via rocm-docs-core
...@@ -163,7 +163,7 @@ sphinxcontrib-serializinghtml==1.1.5 ...@@ -163,7 +163,7 @@ sphinxcontrib-serializinghtml==1.1.5
# via sphinx # via sphinx
typing-extensions==4.5.0 typing-extensions==4.5.0
# via pydata-sphinx-theme # via pydata-sphinx-theme
urllib3==1.26.15 urllib3==1.26.18
# via requests # via requests
wrapt==1.15.0 wrapt==1.15.0
# via deprecated # via deprecated
......
===============
Wrapper
===============
-------------------------------------
Description
-------------------------------------
.. note::
The wrapper is under development and its functionality is limited.
CK provides a lightweight wrapper around the more complex operations implemented in
the library. It allows nested layouts to be indexed through a simple interface
(avoiding explicit descriptor transformations) and provides memory access through the Tensor class.
Example:
.. code-block:: cpp
const auto shape_4x2x4 = ck::make_tuple(4, ck::make_tuple(2, 4));
const auto strides_s2x1x8 = ck::make_tuple(2, ck::make_tuple(1, 8));
const auto layout = ck::wrapper::make_layout(shape_4x2x4, strides_s2x1x8);
std::array<ck::index_t, 32> data;
auto tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(&data[0], layout);
for(ck::index_t w = 0; w < size(tensor); w++) {
tensor(w) = w;
}
// slice() == slice(0, -1) (whole dimension)
auto tensor_slice = tensor(ck::wrapper::slice(1, 3), ck::make_tuple(ck::wrapper::slice(), ck::wrapper::slice()));
std::cout << "dims:2,(2,4) strides:2,(1,8)" << std::endl;
for(ck::index_t h = 0; h < ck::wrapper::size<0>(tensor_slice); h++)
{
for(ck::index_t w = 0; w < ck::wrapper::size<1>(tensor_slice); w++)
{
std::cout << tensor_slice(h, w) << " ";
}
std::cout << std::endl;
}
Output::
dims:2,(2,4) strides:2,(1,8)
1 5 9 13 17 21 25 29
2 6 10 14 18 22 26 30
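The layout itself can also be queried for the memory offset of a (possibly nested) index,
which is useful when combining the wrapper with hand-written indexing. The snippet below is
a minimal sketch that assumes the runtime ``operator()`` of ``ck::wrapper::Layout`` accepts an
index tuple nested in the same way as the shape; with the shape and strides from the example
above, the index (1, (0, 2)) resolves to offset 1 * 2 + 0 * 1 + 2 * 8 = 18.
.. code-block:: cpp
const auto layout = ck::wrapper::make_layout(shape_4x2x4, strides_s2x1x8);
// Nested index matching the shape nesting: dim0 = 1, nested dims = (0, 2).
const ck::index_t offset = layout(ck::make_tuple(1, ck::make_tuple(0, 2)));
// offset == 18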
-------------------------------------
Layout
-------------------------------------
.. doxygenstruct:: ck::wrapper::Layout
-------------------------------------
Layout helpers
-------------------------------------
.. doxygenfile:: layout_utils.hpp
-------------------------------------
Tensor
-------------------------------------
.. doxygenstruct:: ck::wrapper::Tensor
-------------------------------------
Tensor helpers
-------------------------------------
.. doxygenfile:: tensor_utils.hpp
...@@ -42,6 +42,8 @@ foreach(gpu IN LISTS GPU_TARGETS) ...@@ -42,6 +42,8 @@ foreach(gpu IN LISTS GPU_TARGETS)
# ScaleAdd ScaleAdd Relu # ScaleAdd ScaleAdd Relu
add_example_executable(example_convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16 convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16.cpp) add_example_executable(example_convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16 convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16.cpp)
add_example_dependencies(example_convnd_fwd_activ_xdl example_convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16) add_example_dependencies(example_convnd_fwd_activ_xdl example_convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16)
add_example_executable(example_convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16 convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16.cpp)
add_example_dependencies(example_convnd_fwd_activ_xdl example_convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16)
set(target 1) set(target 1)
endif() endif()
endforeach() endforeach()
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include <algorithm>
#include <array>
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <type_traits>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
constexpr ck::index_t NDimSpatial = 3;
using InDataType = ck::half_t;
using WeiDataType = ck::half_t;
using AccDataType = float;
using CShuffleDataType = ck::half_t;
using OutDataType = ck::half_t;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InLayout = ck::tensor_layout::convolution::NDHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
using OutLayout = ck::tensor_layout::convolution::NDHWGK;
using BiasLayout = ck::tensor_layout::convolution::G_K;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::ScaleAddScaleAddRelu;
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
template <typename OutElementOp>
using DeviceGroupedConvNDFwdInstance =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
NDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<OutLayout, BiasLayout>,
OutLayout,
InDataType,
WeiDataType,
AccDataType,
CShuffleDataType,
ck::Tuple<OutDataType, OutDataType>,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
ConvSpec, // ConvForwardSpecialization
GemmSpec, // GemmSpecialization
1, //
256, // BlockSize
128, // MPerBlock
256, // NPerBlock
32, // KPerBlock
8, // AK1
8, // BK1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
8, // ABlockTransferSrcScalarPerVector
8, // ABlockTransferDstScalarPerVector_AK1
1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
8, // BBlockTransferSrcScalarPerVector
8, // BBlockTransferDstScalarPerVector_BK1
1, // BBlockLdsExtraN
1,
1,
S<1, 32, 1, 8>,
8>;
using DeviceGroupedConvNDFwdActivInstance = DeviceGroupedConvNDFwdInstance<OutElementOp>;
namespace {
// Use a custom implementation to pass two additional tensors for the post-op
template <ck::index_t NDimSpatial,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename InElementOp,
typename WeiElementOp,
typename OutElementOp,
typename DeviceConvNDFwdInstance>
bool run_grouped_conv_fwd(bool do_verification,
int init_method,
bool time_kernel,
const ck::utils::conv::ConvParam& conv_param,
const HostTensorDescriptor& in_g_n_c_wis_desc,
const HostTensorDescriptor& wei_g_k_c_xs_desc,
const HostTensorDescriptor& out_g_n_k_wos_desc,
const InElementOp& in_element_op,
const WeiElementOp& wei_element_op,
const OutElementOp& out_element_op)
{
constexpr ck::index_t NumDs = 2;
const ck::index_t G = out_g_n_k_wos_desc.GetLengths()[0];
const ck::index_t K = out_g_n_k_wos_desc.GetLengths()[2];
// Logically broadcast the bias (bias lengths must be passed in the same format as the output - GNKDHW)
std::array<ck::index_t, NDimSpatial + 3> bias_g_k_lengths;
std::array<ck::index_t, NDimSpatial + 3> bias_g_k_strides;
// Fill lengths other than G and K with 1, and strides with 0
bias_g_k_lengths.fill(1);
bias_g_k_strides.fill(0);
bias_g_k_lengths[0] = G;
bias_g_k_lengths[2] = K;
bias_g_k_strides[0] = K; // G stride
bias_g_k_strides[2] = 1; // K stride
const auto broadcasted_bias_desc = HostTensorDescriptor(bias_g_k_lengths, bias_g_k_strides);
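// With NDimSpatial = 3 this descriptor has lengths (G, 1, K, 1, 1, 1) and strides
// (K, 0, 1, 0, 0, 0), so every (n, d, h, w) position of a given (g, k) reads the
// same bias element.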
// y = relu ( alpha1 * conv(x) + alpha2 * z + bias )
Tensor<InDataType> in(in_g_n_c_wis_desc);
Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
Tensor<OutDataType> out_device(out_g_n_k_wos_desc);
std::array<Tensor<OutDataType>, NumDs> d_tensors = {Tensor<OutDataType>(out_g_n_k_wos_desc),
Tensor<OutDataType>(broadcasted_bias_desc)};
std::cout << "in: " << in.mDesc << std::endl;
std::cout << "wei: " << wei.mDesc << std::endl;
std::cout << "out: " << out_host.mDesc << std::endl;
std::cout << "z_tensor: " << d_tensors[0].mDesc << std::endl;
std::cout << "bias_tensor: " << d_tensors[1].mDesc << std::endl;
// Make sure that we allocated only G * K values for bias
assert(static_cast<ck::index_t>(d_tensors[1].mData.size()) == G * K);
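// init_method: 0 - leave the generated tensors uninitialized, 1 - fill with values
// from [-2, 2], otherwise - fill with small floating-point values.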
switch(init_method)
{
case 0: break;
case 1:
in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-2, 2});
wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-2, 2});
d_tensors[0].GenerateTensorValue(GeneratorTensor_2<OutDataType>{-2, 2});
d_tensors[1].GenerateTensorValue(GeneratorTensor_2<OutDataType>{-2, 2});
break;
default:
in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-1.0, 1.0});
wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.05, 0.05});
d_tensors[0].GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.05, 0.05});
d_tensors[1].GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.05, 0.05});
}
DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
DeviceMem z_buf(sizeof(OutDataType) * d_tensors[0].mDesc.GetElementSpaceSize());
DeviceMem bias_buf(sizeof(OutDataType) * d_tensors[1].mDesc.GetElementSpaceSize());
DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
in_device_buf.ToDevice(in.mData.data());
wei_device_buf.ToDevice(wei.mData.data());
z_buf.ToDevice(d_tensors[0].mData.data());
bias_buf.ToDevice(d_tensors[1].mData.data());
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
std::array<ck::index_t, NDimSpatial> input_left_pads{};
std::array<ck::index_t, NDimSpatial> input_right_pads{};
auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
copy(conv_param.conv_filter_strides_, conv_filter_strides);
copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
copy(conv_param.input_left_pads_, input_left_pads);
copy(conv_param.input_right_pads_, input_right_pads);
const std::array<const void*, NumDs> ds = {z_buf.GetDeviceBuffer(), bias_buf.GetDeviceBuffer()};
auto conv = DeviceConvNDFwdInstance{};
auto invoker = conv.MakeInvoker();
auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
wei_device_buf.GetDeviceBuffer(),
ds,
out_device_buf.GetDeviceBuffer(),
a_g_n_c_wis_lengths,
a_g_n_c_wis_strides,
b_g_k_c_xs_lengths,
b_g_k_c_xs_strides,
std::array<std::array<ck::index_t, NDimSpatial + 3>, NumDs>{
e_g_n_k_wos_lengths, bias_g_k_lengths},
std::array<std::array<ck::index_t, NDimSpatial + 3>, NumDs>{
e_g_n_k_wos_strides, bias_g_k_strides},
e_g_n_k_wos_lengths,
e_g_n_k_wos_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
in_element_op,
wei_element_op,
out_element_op);
if(!conv.IsSupportedArgument(argument))
{
throw std::runtime_error("The device op with the specified compilation parameters does "
"not support this convolution problem.");
}
float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = conv_param.GetFlops() + G * K +
conv_param.GetOutputByte<OutDataType>() / sizeof(OutDataType);
std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>() +
G * K * sizeof(OutDataType) + conv_param.GetOutputByte<OutDataType>();
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_btype / 1.E6 / avg_time;
std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< conv.GetTypeString() << std::endl;
if(do_verification)
{
auto ref_conv =
ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
0, /*Num A Elementwise Tensors*/
0, /*Num B Elementwise Tensors*/
NumDs>();
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(in,
wei,
out_host,
conv_param.conv_filter_strides_,
conv_param.conv_filter_dilations_,
conv_param.input_left_pads_,
conv_param.input_right_pads_,
in_element_op,
wei_element_op,
out_element_op,
{},
{},
d_tensors);
ref_invoker.Run(ref_argument);
out_device_buf.FromDevice(out_device.mData.data());
return ck::utils::check_err(out_device, out_host, "Error: incorrect results!");
}
return true;
}
} // namespace
#include "run_convnd_fwd_activ_example.inc"
int main(int argc, char* argv[]) { return !run_convnd_fwd_example(argc, argv); }
...@@ -24,7 +24,7 @@ bool run_convnd_fwd_example(int argc, char* argv[]) ...@@ -24,7 +24,7 @@ bool run_convnd_fwd_example(int argc, char* argv[])
// Following shapes are selected to avoid overflow. Expect inf in case of // Following shapes are selected to avoid overflow. Expect inf in case of
// size increase for some elementwise ops. // size increase for some elementwise ops.
ck::utils::conv::ConvParam conv_param{ ck::utils::conv::ConvParam conv_param{
3, 1, 16, 128, 8, {3, 3, 3}, {17, 17, 17}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}; 3, 2, 16, 128, 8, {3, 3, 3}, {17, 17, 17}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}};
if(argc == 1) if(argc == 1)
{ {
......
add_example_executable(example_tensor_transform tensor_transform.cpp)
add_example_executable(example_tensor_transform_using_wrapper tensor_transform_using_wrapper.cpp)
...@@ -26,7 +26,7 @@ inline std::string get_device_name() ...@@ -26,7 +26,7 @@ inline std::string get_device_name()
} }
const std::string raw_name(props.gcnArchName); const std::string raw_name(props.gcnArchName);
// https://github.com/ROCmSoftwarePlatform/MIOpen/blob/8498875aef84878e04c1eabefdf6571514891086/src/target_properties.cpp#L40 // https://github.com/ROCm/MIOpen/blob/8498875aef84878e04c1eabefdf6571514891086/src/target_properties.cpp#L40
static std::map<std::string, std::string> device_name_map = { static std::map<std::string, std::string> device_name_map = {
{"Ellesmere", "gfx803"}, {"Ellesmere", "gfx803"},
{"Baffin", "gfx803"}, {"Baffin", "gfx803"},
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_abd.hpp" #include "ck/tensor_operation/gpu/device/device_contraction_multiple_abd.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" #include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_contraction_utils.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp"
#include "ck/host_utility/device_prop.hpp" #include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp" #include "ck/host_utility/kernel_launch.hpp"
...@@ -500,22 +501,29 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle ...@@ -500,22 +501,29 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
// for sanity check of vector memory access // for sanity check of vector memory access
for(index_t i = 0; i < NumATensor; ++i) for(index_t i = 0; i < NumATensor; ++i)
{ {
a_mz_stride_[i] = a_ms_ks_strides[i][NumDimM - 1]; as_mz_consecutive_[i] = a_ms_ks_strides[i][NumDimM - 1] == 1;
a_kz_stride_[i] = a_ms_ks_strides[i][NumDimM + NumDimK - 1]; as_kz_consecutive_[i] = a_ms_ks_strides[i][NumDimM + NumDimK - 1] == 1;
as_max_read_elems_[i] =
CalculateMaxRead<NumDimM, NumDimK>(a_ms_ks_lengths[i], a_ms_ks_strides[i]);
} }
for(index_t i = 0; i < NumBTensor; ++i) for(index_t i = 0; i < NumBTensor; ++i)
{ {
b_nz_stride_[i] = b_ns_ks_strides[i][NumDimN - 1]; bs_nz_consecutive_[i] = b_ns_ks_strides[i][NumDimN - 1] == 1;
b_kz_stride_[i] = b_ns_ks_strides[i][NumDimN + NumDimK - 1]; bs_kz_consecutive_[i] = b_ns_ks_strides[i][NumDimN + NumDimK - 1] == 1;
bs_max_read_elems_[i] =
CalculateMaxRead<NumDimN, NumDimK>(b_ns_ks_lengths[i], b_ns_ks_strides[i]);
} }
for(index_t i = 0; i < NumDTensor; ++i) for(index_t i = 0; i < NumDTensor; ++i)
{ {
ds_nz_stride_[i] = d_ms_ns_strides[i][NumDimM + NumDimN - 1]; ds_nz_consecutive_[i] = d_ms_ns_strides[i][NumDimM + NumDimN - 1] == 1;
ds_max_read_elems_[i] =
CalculateMaxRead<NumDimM, NumDimN>(d_ms_ns_lengths[i], d_ms_ns_strides[i]);
} }
e_nz_stride_ = e_ms_ns_stride[NumDimM + NumDimN - 1]; e_nz_consecutive_ = e_ms_ns_stride[NumDimM + NumDimN - 1] == 1;
e_max_write_elems_ = CalculateMaxRead<NumDimM, NumDimN>(e_ms_ns_length, e_ms_ns_stride);
} }
// pointers // pointers
...@@ -545,16 +553,19 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle ...@@ -545,16 +553,19 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
BElementwiseOperation b_element_op_; BElementwiseOperation b_element_op_;
CDEElementwiseOperation cde_element_op_; CDEElementwiseOperation cde_element_op_;
// Strides for the last M/N/K dimensions of A/B/Ds/E // Describe whether the last M/N/K dimension of each of A/B/Ds/E is consecutive
// for sanity check of vector load/store // in memory or not.
std::array<index_t, NumATensor> a_mz_stride_; std::array<bool, NumATensor> as_mz_consecutive_;
std::array<index_t, NumATensor> a_kz_stride_; std::array<bool, NumATensor> as_kz_consecutive_;
std::array<bool, NumBTensor> bs_nz_consecutive_;
std::array<index_t, NumBTensor> b_nz_stride_; std::array<bool, NumBTensor> bs_kz_consecutive_;
std::array<index_t, NumBTensor> b_kz_stride_; std::array<bool, NumDTensor> ds_nz_consecutive_;
bool e_nz_consecutive_;
std::array<index_t, NumDTensor> ds_nz_stride_;
index_t e_nz_stride_; std::array<index_t, NumATensor> as_max_read_elems_;
std::array<index_t, NumBTensor> bs_max_read_elems_;
std::array<index_t, NumDTensor> ds_max_read_elems_;
index_t e_max_write_elems_;
}; };
// Invoker // Invoker
...@@ -643,73 +654,65 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle ...@@ -643,73 +654,65 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
// check vector load/store // check vector load/store
{ {
bool all_valid = true; bool valid_as_access = true;
static_for<0, NumATensor, 1>{}([&](auto i) { static_for<0, NumATensor, 1>{}([&](auto i) {
// vector memory access of A: could be on M or AK1 dimension const bool valid_a_vector_size =
if constexpr(ABlockTransferSrcVectorDim == 1) arg.as_max_read_elems_[i] % ABlockTransferSrcScalarPerVector == 0;
{ const bool valid_a_access_dim_m =
if(!(arg.a_mz_stride_[i] == 1 && arg.as_grid_desc_ak0_m_ak1_[i].GetLength(I1) % ABlockTransferSrcVectorDim == 1 && arg.as_mz_consecutive_[i];
ABlockTransferSrcScalarPerVector == const bool valid_a_access_dim_k =
0)) ABlockTransferSrcVectorDim == 2 && arg.as_kz_consecutive_[i];
{ const bool valid_a_access_dim = valid_a_access_dim_m || valid_a_access_dim_k;
all_valid = false; if(!(valid_a_vector_size && valid_a_access_dim))
}
}
else
{ {
if(!(arg.a_kz_stride_[i] == 1 && arg.as_grid_desc_ak0_m_ak1_[i].GetLength(I2) % valid_as_access = false;
ABlockTransferSrcScalarPerVector ==
0))
{
all_valid = false;
}
} }
}); });
if(!valid_as_access)
{
return false;
}
// vector memory access of B: could be on N or BK1 dimension bool valid_bs_access = true;
static_for<0, NumBTensor, 1>{}([&](auto i) { static_for<0, NumBTensor, 1>{}([&](auto i) {
if constexpr(BBlockTransferSrcVectorDim == 1) const bool valid_b_vector_size =
arg.bs_max_read_elems_[i] % BBlockTransferSrcScalarPerVector == 0;
const bool valid_b_access_dim_n =
BBlockTransferSrcVectorDim == 1 && arg.bs_nz_consecutive_[i];
const bool valid_b_access_dim_k =
BBlockTransferSrcVectorDim == 2 && arg.bs_kz_consecutive_[i];
const bool valid_b_access_dim = valid_b_access_dim_n || valid_b_access_dim_k;
if(!(valid_b_vector_size && valid_b_access_dim))
{ {
if(!(arg.b_nz_stride_[i] == 1 && arg.bs_grid_desc_bk0_n_bk1_[i].GetLength(I1) % valid_bs_access = false;
BBlockTransferSrcScalarPerVector ==
0))
{
all_valid = false;
}
}
else
{
if(!(arg.b_kz_stride_[i] == 1 && arg.bs_grid_desc_bk0_n_bk1_[i].GetLength(I2) %
BBlockTransferSrcScalarPerVector ==
0))
{
all_valid = false;
}
} }
}); });
if(!valid_bs_access)
{
return false;
}
// check vector load of Ds bool valid_ds_access = true;
static_for<0, NumDTensor, 1>{}([&](auto i) { static_for<0, NumDTensor, 1>{}([&](auto i) {
if(!(arg.ds_nz_stride_[i] == 1 && const bool valid_d_vector_size =
arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_[i].GetLength(I3) % arg.ds_max_read_elems_[i] % CDEBlockTransferScalarPerVector_NPerBlock == 0;
CDEBlockTransferScalarPerVector_NPerBlock == // Vector read of Ds is always on N dimension.
0)) const bool valid_d_access_dim = arg.ds_nz_consecutive_[i];
if(!(valid_d_vector_size && valid_d_access_dim))
{ {
all_valid = false; valid_ds_access = false;
} }
}); });
if(!valid_ds_access)
// vector memory access of E: always on NPerBlock dimension
if(!(arg.e_nz_stride_ == 1 &&
arg.e_grid_desc_mblock_mperblock_nblock_nperblock_.GetLength(I3) %
CDEBlockTransferScalarPerVector_NPerBlock ==
0))
{ {
all_valid = false; return false;
} }
if(!all_valid) const bool valid_e_vector_size =
arg.e_max_write_elems_ % CDEBlockTransferScalarPerVector_NPerBlock == 0;
// Vector write of E is always on N dimension.
const bool valid_e_access_dim = arg.e_nz_consecutive_;
if(!(valid_e_vector_size && valid_e_access_dim))
{ {
return false; return false;
} }
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" #include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" #include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_contraction_utils.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/host_utility/device_prop.hpp" #include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp" #include "ck/host_utility/kernel_launch.hpp"
...@@ -183,7 +184,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle ...@@ -183,7 +184,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
return generate_tuple([&](auto i) { return vec[i]; }, num); return generate_tuple([&](auto i) { return vec[i]; }, num);
}; };
const auto a_ms_ns_lengths = to_tuple(a_ms_ks_lengths_vec, Number<NumDimM + NumDimK>{}); const auto a_ms_ks_lengths = to_tuple(a_ms_ks_lengths_vec, Number<NumDimM + NumDimK>{});
const auto a_ms_ks_strides = to_tuple(a_ms_ks_strides_vec, Number<NumDimM + NumDimK>{}); const auto a_ms_ks_strides = to_tuple(a_ms_ks_strides_vec, Number<NumDimM + NumDimK>{});
// dimension Ids for M0, M1, ... // dimension Ids for M0, M1, ...
...@@ -194,14 +195,14 @@ struct DeviceContractionMultipleD_Xdl_CShuffle ...@@ -194,14 +195,14 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
typename arithmetic_sequence_gen<NumDimM, NumDimM + NumDimK, 1>::type{}; typename arithmetic_sequence_gen<NumDimM, NumDimM + NumDimK, 1>::type{};
// lengths for M0, M1, ... // lengths for M0, M1, ...
const auto mLengths = get_container_subset(a_ms_ns_lengths, mDimIds); const auto mLengths = get_container_subset(a_ms_ks_lengths, mDimIds);
// lengths for K0, K1, ... // lengths for K0, K1, ...
const auto kLengths = get_container_subset(a_ms_ns_lengths, kDimIds); const auto kLengths = get_container_subset(a_ms_ks_lengths, kDimIds);
// naive tensor A[M0, M1, M2, ..., K0, K1, K2...] // naive tensor A[M0, M1, M2, ..., K0, K1, K2...]
const auto a_grid_desc_ms_ks = const auto a_grid_desc_ms_ks =
make_naive_tensor_descriptor(a_ms_ns_lengths, a_ms_ks_strides); make_naive_tensor_descriptor(a_ms_ks_lengths, a_ms_ks_strides);
// transformed tensor A[MRaw = M0 * M1 * M2 * ... , KRaw = K0 * K1 * K2 * ...] // transformed tensor A[MRaw = M0 * M1 * M2 * ... , KRaw = K0 * K1 * K2 * ...]
const auto a_grid_desc_mraw_kraw = transform_tensor_descriptor( const auto a_grid_desc_mraw_kraw = transform_tensor_descriptor(
...@@ -383,7 +384,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle ...@@ -383,7 +384,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
const void* p_b_grid, const void* p_b_grid,
std::array<const void*, NumDTensor> p_ds_grid, std::array<const void*, NumDTensor> p_ds_grid,
void* p_e_grid, void* p_e_grid,
const std::vector<index_t>& a_ms_ns_lengths, const std::vector<index_t>& a_ms_ks_lengths,
const std::vector<index_t>& a_ms_ks_strides, const std::vector<index_t>& a_ms_ks_strides,
const std::vector<index_t>& b_ns_ks_lengths, const std::vector<index_t>& b_ns_ks_lengths,
const std::vector<index_t>& b_ns_ks_strides, const std::vector<index_t>& b_ns_ks_strides,
...@@ -398,7 +399,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle ...@@ -398,7 +399,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
p_b_grid_{static_cast<const BDataType*>(p_b_grid)}, p_b_grid_{static_cast<const BDataType*>(p_b_grid)},
p_ds_grid_{}, p_ds_grid_{},
p_e_grid_{static_cast<EDataType*>(p_e_grid)}, p_e_grid_{static_cast<EDataType*>(p_e_grid)},
a_grid_desc_m_k_{DeviceOp::MakeAGridDescriptor_M_K(a_ms_ns_lengths, a_ms_ks_strides)}, a_grid_desc_m_k_{DeviceOp::MakeAGridDescriptor_M_K(a_ms_ks_lengths, a_ms_ks_strides)},
b_grid_desc_n_k_{DeviceOp::MakeBGridDescriptor_N_K(b_ns_ks_lengths, b_ns_ks_strides)}, b_grid_desc_n_k_{DeviceOp::MakeBGridDescriptor_N_K(b_ns_ks_lengths, b_ns_ks_strides)},
ds_grid_desc_m_n_{}, ds_grid_desc_m_n_{},
e_grid_desc_m_n_{DeviceOp::MakeEGridDescriptor_M_N(e_ms_ns_lengths, e_ms_ns_strides)}, e_grid_desc_m_n_{DeviceOp::MakeEGridDescriptor_M_N(e_ms_ns_lengths, e_ms_ns_strides)},
...@@ -411,13 +412,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle ...@@ -411,13 +412,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)}, block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
a_element_op_{a_element_op}, a_element_op_{a_element_op},
b_element_op_{b_element_op}, b_element_op_{b_element_op},
cde_element_op_{cde_element_op}, cde_element_op_{cde_element_op}
a_mz_stride_{},
a_kz_stride_{},
b_nz_stride_{},
b_kz_stride_{},
ds_nz_stride_{},
e_nz_stride_{}
{ {
// populate pointer, batch stride, desc for Ds // populate pointer, batch stride, desc for Ds
static_for<0, NumDTensor, 1>{}([&](auto i) { static_for<0, NumDTensor, 1>{}([&](auto i) {
...@@ -448,18 +443,26 @@ struct DeviceContractionMultipleD_Xdl_CShuffle ...@@ -448,18 +443,26 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
} }
// for sanity check of vector memory access // for sanity check of vector memory access
a_mz_stride_ = a_ms_ks_strides[NumDimM - 1]; a_mz_consecutive_ = a_ms_ks_strides[NumDimM - 1] == 1;
a_kz_stride_ = a_ms_ks_strides[NumDimM + NumDimK - 1]; a_kz_consecutive_ = a_ms_ks_strides[NumDimM + NumDimK - 1] == 1;
a_max_read_elems_ =
CalculateMaxRead<NumDimM, NumDimK>(a_ms_ks_lengths, a_ms_ks_strides);
b_nz_stride_ = b_ns_ks_strides[NumDimN - 1]; b_nz_consecutive_ = b_ns_ks_strides[NumDimN - 1] == 1;
b_kz_stride_ = b_ns_ks_strides[NumDimN + NumDimK - 1]; b_kz_consecutive_ = b_ns_ks_strides[NumDimN + NumDimK - 1] == 1;
b_max_read_elems_ =
CalculateMaxRead<NumDimN, NumDimK>(b_ns_ks_lengths, b_ns_ks_strides);
for(index_t i = 0; i < NumDTensor; ++i) for(index_t i = 0; i < NumDTensor; ++i)
{ {
ds_nz_stride_[i] = ds_ms_ns_strides[i][NumDimM + NumDimN - 1]; ds_nz_consecutive_[i] = ds_ms_ns_strides[i][NumDimM + NumDimN - 1] == 1;
ds_max_read_elems_[i] =
CalculateMaxRead<NumDimM, NumDimN>(ds_ms_ns_lengths[i], ds_ms_ns_strides[i]);
} }
e_nz_stride_ = e_ms_ns_strides[NumDimM + NumDimN - 1]; e_nz_consecutive_ = e_ms_ns_strides[NumDimM + NumDimN - 1] == 1;
e_max_write_elems_ =
CalculateMaxRead<NumDimM, NumDimN>(e_ms_ns_lengths, e_ms_ns_strides);
} }
void Print() const void Print() const
...@@ -499,15 +502,19 @@ struct DeviceContractionMultipleD_Xdl_CShuffle ...@@ -499,15 +502,19 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
BElementwiseOperation b_element_op_; BElementwiseOperation b_element_op_;
CDEElementwiseOperation cde_element_op_; CDEElementwiseOperation cde_element_op_;
// Strides for the last M/N/K dimensions of A/B/Ds/E // Describe whether the last M/N/K dimension of each of A/B/Ds/E is consecutive
// for sanity check of vector load/store // in memory or not.
index_t a_mz_stride_; bool a_mz_consecutive_;
index_t a_kz_stride_; bool a_kz_consecutive_;
index_t b_nz_stride_; bool b_nz_consecutive_;
index_t b_kz_stride_; bool b_kz_consecutive_;
std::array<index_t, NumDTensor> ds_nz_stride_; std::array<bool, NumDTensor> ds_nz_consecutive_;
index_t e_mz_stride_; bool e_nz_consecutive_;
index_t e_nz_stride_;
index_t a_max_read_elems_;
index_t b_max_read_elems_;
std::array<index_t, NumDTensor> ds_max_read_elems_;
index_t e_max_write_elems_;
}; };
// Invoker // Invoker
...@@ -616,65 +623,47 @@ struct DeviceContractionMultipleD_Xdl_CShuffle ...@@ -616,65 +623,47 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
(BBlockTransferSrcVectorDim == 1 || BBlockTransferSrcVectorDim == 2), (BBlockTransferSrcVectorDim == 1 || BBlockTransferSrcVectorDim == 2),
"wrong!"); "wrong!");
// vector memory access of A: could be on M or AK1 dimension const bool valid_a_vector_size =
if constexpr(ABlockTransferSrcVectorDim == 1) arg.a_max_read_elems_ % ABlockTransferSrcScalarPerVector == 0;
const bool valid_a_access_dim_m = ABlockTransferSrcVectorDim == 1 && arg.a_mz_consecutive_;
const bool valid_a_access_dim_k = ABlockTransferSrcVectorDim == 2 && arg.a_kz_consecutive_;
const bool valid_a_access_dim = valid_a_access_dim_m || valid_a_access_dim_k;
if(!(valid_a_vector_size && valid_a_access_dim))
{ {
if(!(arg.a_mz_stride_ == 1 && return false;
arg.a_grid_desc_ak0_m_ak1_.GetLength(I1) % ABlockTransferSrcScalarPerVector == 0))
{
return false;
}
}
else
{
if(!(arg.a_kz_stride_ == 1 &&
arg.a_grid_desc_ak0_m_ak1_.GetLength(I2) % ABlockTransferSrcScalarPerVector == 0))
{
return false;
}
} }
// vector memory access of B: could be on N or BK1 dimension const bool valid_b_vector_size =
if constexpr(BBlockTransferSrcVectorDim == 1) arg.b_max_read_elems_ % BBlockTransferSrcScalarPerVector == 0;
{ const bool valid_b_access_dim_n = BBlockTransferSrcVectorDim == 1 && arg.b_nz_consecutive_;
if(!(arg.b_nz_stride_ == 1 && const bool valid_b_access_dim_k = BBlockTransferSrcVectorDim == 2 && arg.b_kz_consecutive_;
arg.b_grid_desc_bk0_n_bk1_.GetLength(I1) % BBlockTransferSrcScalarPerVector == 0)) const bool valid_b_access_dim = valid_b_access_dim_n || valid_b_access_dim_k;
{ if(!(valid_b_vector_size && valid_b_access_dim))
return false;
}
}
else
{ {
if(!(arg.b_kz_stride_ == 1 && return false;
arg.b_grid_desc_bk0_n_bk1_.GetLength(I2) % BBlockTransferSrcScalarPerVector == 0))
{
return false;
}
} }
// vector memory access of Ds: always on NPerBlock dimension bool valid_ds_access = true;
bool valid_d_access = true;
static_for<0, NumDTensor, 1>{}([&](auto i) { static_for<0, NumDTensor, 1>{}([&](auto i) {
if(!(arg.ds_nz_stride_[i] == 1 && const bool valid_d_vector_size =
arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_[i].GetLength(I3) % arg.ds_max_read_elems_[i] % CDEBlockTransferScalarPerVector_NPerBlock == 0;
CDEBlockTransferScalarPerVector_NPerBlock == // Vector read of Ds is always on N dimension.
0)) const bool valid_d_access_dim = arg.ds_nz_consecutive_[i];
if(!(valid_d_vector_size && valid_d_access_dim))
{ {
valid_d_access = false; valid_ds_access = false;
} }
}); });
if(!valid_ds_access)
if(valid_d_access == false)
{ {
return false; return false;
} }
// vector memory access of E: always on NPerBlock dimension const bool valid_e_vector_size =
if(!(arg.e_nz_stride_ == 1 && arg.e_max_write_elems_ % CDEBlockTransferScalarPerVector_NPerBlock == 0;
arg.e_grid_desc_mblock_mperblock_nblock_nperblock_.GetLength(I3) % // Vector write of E is always on N dimension.
CDEBlockTransferScalarPerVector_NPerBlock == const bool valid_e_access_dim = arg.e_nz_consecutive_;
0)) if(!(valid_e_vector_size && valid_e_access_dim))
{ {
return false; return false;
} }
...@@ -692,7 +681,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle ...@@ -692,7 +681,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
const void* p_b, const void* p_b,
std::array<const void*, NumDTensor> p_ds, std::array<const void*, NumDTensor> p_ds,
void* p_e, void* p_e,
const std::vector<index_t>& a_ms_ns_lengths, const std::vector<index_t>& a_ms_ks_lengths,
const std::vector<index_t>& a_ms_ks_strides, const std::vector<index_t>& a_ms_ks_strides,
const std::vector<index_t>& b_ns_ks_lengths, const std::vector<index_t>& b_ns_ks_lengths,
const std::vector<index_t>& b_ns_ks_strides, const std::vector<index_t>& b_ns_ks_strides,
...@@ -708,7 +697,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle ...@@ -708,7 +697,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
p_b, p_b,
p_ds, p_ds,
p_e, p_e,
a_ms_ns_lengths, a_ms_ks_lengths,
a_ms_ks_strides, a_ms_ks_strides,
b_ns_ks_lengths, b_ns_ks_lengths,
b_ns_ks_strides, b_ns_ks_strides,
...@@ -729,7 +718,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle ...@@ -729,7 +718,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
const void* p_b, const void* p_b,
std::array<const void*, NumDTensor> p_ds, std::array<const void*, NumDTensor> p_ds,
void* p_e, void* p_e,
const std::vector<index_t>& a_ms_ns_lengths, const std::vector<index_t>& a_ms_ks_lengths,
const std::vector<index_t>& a_ms_ks_strides, const std::vector<index_t>& a_ms_ks_strides,
const std::vector<index_t>& b_ns_ks_lengths, const std::vector<index_t>& b_ns_ks_lengths,
const std::vector<index_t>& b_ns_ks_strides, const std::vector<index_t>& b_ns_ks_strides,
...@@ -745,7 +734,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle ...@@ -745,7 +734,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
p_b, p_b,
p_ds, p_ds,
p_e, p_e,
a_ms_ns_lengths, a_ms_ks_lengths,
a_ms_ks_strides, a_ms_ks_strides,
b_ns_ks_lengths, b_ns_ks_lengths,
b_ns_ks_strides, b_ns_ks_strides,
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cassert>
#include <sstream>
#include <stdexcept>
#include <vector>
#include "ck/ck.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
/**
* Calculates the maximum number of subsequent elements of the fastest-changing dimension
* that are consecutive in memory.
*
* Example:
* NumDimM = 2, NumDimK = 3
* A shape = [ 2, 3, 4, 5, 6]
* A strides = [360, 120, 30, 6, 1]
* | M | | K |
* It follows from strides that K is FCD and all the subsequent elements of K are consecutive
* in memory.
* But if strides were [360, 120, 6, 24, 1], then only 6 subsequent elements of K would be
* consecutive in memory.
*
* Assumes that the dimensions are split into two groups of `NumDim1` and `NumDim2` dimensions.
*/
template <index_t NumDim1, index_t NumDim2>
auto CalculateMaxRead(const std::vector<index_t>& lengths, const std::vector<index_t>& strides)
{
if(lengths.size() != NumDim1 + NumDim2)
{
std::ostringstream err;
err << "Incorrect number of lengths in " << __FILE__ << ":" << __LINE__
<< ", in function: " << __func__;
throw std::runtime_error(err.str());
}
if(strides.size() != NumDim1 + NumDim2)
{
std::ostringstream err;
err << "Incorrect number of strides in " << __FILE__ << ":" << __LINE__
<< ", in function: " << __func__;
throw std::runtime_error(err.str());
}
// Determine the beginning and end idx of the group representing the FCD.
index_t begin_idx, end_idx;
if(strides[NumDim1 - 1] == 1)
{
begin_idx = 0;
end_idx = NumDim1 - 1;
}
else if(strides[NumDim1 + NumDim2 - 1] == 1)
{
begin_idx = NumDim1;
end_idx = NumDim1 + NumDim2 - 1;
}
else
{
// The dimension consecutive in memory is not the last dimension of any group, so only
// one element can be read/written at once.
return 1;
}
index_t consecutive_stride = 1;
for(index_t dim_idx = end_idx; dim_idx >= begin_idx; --dim_idx)
{
if(strides[dim_idx] == consecutive_stride)
{
consecutive_stride *= lengths[dim_idx];
}
else
{
break;
}
}
const index_t max_subsequent_elems = consecutive_stride;
return max_subsequent_elems;
}
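// Illustrative values for the examples in the comment above, computed from the
// definition: the loop starts at the last dimension of the stride-1 group and keeps
// multiplying lengths for as long as the strides stay contiguous.
//   CalculateMaxRead<2, 3>({2, 3, 4, 5, 6}, {360, 120, 30, 6, 1}); // -> 120 (= 4 * 5 * 6)
//   CalculateMaxRead<2, 3>({2, 3, 4, 5, 6}, {360, 120, 6, 24, 1}); // -> 6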
} // namespace device
} // namespace tensor_operation
} // namespace ck
...@@ -357,15 +357,17 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle ...@@ -357,15 +357,17 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
return out_gemmm_gemmn_desc; return out_gemmm_gemmn_desc;
} }
// Ds must have the same shape as E; only the strides can differ.
// Pass e_g_n_k_wos_lengths so that a D tensor can be logically broadcast through its strides.
static auto MakeDsGridDescriptor_M_N( static auto MakeDsGridDescriptor_M_N(
const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>& ds_g_n_k_wos_lengths, const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_lengths,
const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>& ds_g_n_k_wos_strides) const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>& ds_g_n_k_wos_strides)
{ {
return generate_tuple( return generate_tuple(
[&](auto i) { [&](auto i) {
using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>; using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
return DeviceOp::MakeEGridDescriptor_M_N<DLayout>(ds_g_n_k_wos_lengths[i], return DeviceOp::MakeEGridDescriptor_M_N<DLayout>(e_g_n_k_wos_lengths,
ds_g_n_k_wos_strides[i]); ds_g_n_k_wos_strides[i]);
}, },
Number<NumDTensor>{}); Number<NumDTensor>{});
...@@ -569,7 +571,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle ...@@ -569,7 +571,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
// D desc // D desc
ds_grid_desc_m_n_(i) = DeviceOp::MakeEGridDescriptor_M_N<DLayout>( ds_grid_desc_m_n_(i) = DeviceOp::MakeEGridDescriptor_M_N<DLayout>(
ds_g_n_k_wos_lengths[i], ds_g_n_k_wos_strides[i]); e_g_n_k_wos_lengths, ds_g_n_k_wos_strides[i]);
}); });
compute_ptr_offset_of_batch_.BatchStrideE_ = e_g_n_k_wos_strides[0]; compute_ptr_offset_of_batch_.BatchStrideE_ = e_g_n_k_wos_strides[0];
...@@ -916,8 +918,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle ...@@ -916,8 +918,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
is_same_v<DLayout, ctc::G_NDHW_K> || is_same_v<DLayout, ctc::GNWK> || is_same_v<DLayout, ctc::G_NDHW_K> || is_same_v<DLayout, ctc::GNWK> ||
is_same_v<DLayout, ctc::GNHWK> || is_same_v<DLayout, ctc::GNDHWK> || is_same_v<DLayout, ctc::GNHWK> || is_same_v<DLayout, ctc::GNDHWK> ||
is_same_v<DLayout, ctc::NWGK> || is_same_v<DLayout, ctc::NHWGK> || is_same_v<DLayout, ctc::NWGK> || is_same_v<DLayout, ctc::NHWGK> ||
is_same_v<DLayout, ctc::NDHWGK> || is_same_v<DLayout, ctc::GK> || is_same_v<DLayout, ctc::NDHWGK> || is_same_v<DLayout, ctc::G_K>)
is_same_v<DLayout, ctc::G_K>)
{ {
const index_t K = arg.ds_g_n_k_wos_lengths_[i][2]; const index_t K = arg.ds_g_n_k_wos_lengths_[i][2];
...@@ -925,6 +926,27 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle ...@@ -925,6 +926,27 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
{ {
valid = false; valid = false;
} }
if constexpr(is_same_v<DLayout, ctc::G_K>)
{
// G and K of D must match G and K of E
if(arg.ds_g_n_k_wos_lengths_[i][0] != arg.e_g_n_k_wos_lengths_[0] ||
arg.ds_g_n_k_wos_lengths_[i][2] != arg.e_g_n_k_wos_lengths_[2])
{
valid = false;
}
}
else
{
// E and D must have the same shape
for(index_t d = 0; d < NDimSpatial + 3; d++)
{
if(arg.ds_g_n_k_wos_lengths_[i][d] != arg.e_g_n_k_wos_lengths_[d])
{
valid = false;
}
}
}
} }
else else
{ {
......
...@@ -631,8 +631,7 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle ...@@ -631,8 +631,7 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
is_same_v<DLayout, ctc::G_NDHW_K> || is_same_v<DLayout, ctc::GNWK> || is_same_v<DLayout, ctc::G_NDHW_K> || is_same_v<DLayout, ctc::GNWK> ||
is_same_v<DLayout, ctc::GNHWK> || is_same_v<DLayout, ctc::GNDHWK> || is_same_v<DLayout, ctc::GNHWK> || is_same_v<DLayout, ctc::GNDHWK> ||
is_same_v<DLayout, ctc::NWGK> || is_same_v<DLayout, ctc::NHWGK> || is_same_v<DLayout, ctc::NWGK> || is_same_v<DLayout, ctc::NHWGK> ||
is_same_v<DLayout, ctc::NDHWGK> || is_same_v<DLayout, ctc::GK> || is_same_v<DLayout, ctc::NDHWGK> || is_same_v<DLayout, ctc::G_K>)
is_same_v<DLayout, ctc::G_K>)
{ {
const index_t K = arg.ds_g_n_k_wos_lengths_[i][2]; const index_t K = arg.ds_g_n_k_wos_lengths_[i][2];
......
...@@ -308,12 +308,6 @@ struct GNDHWK : public BaseTensorLayout ...@@ -308,12 +308,6 @@ struct GNDHWK : public BaseTensorLayout
static constexpr const char* name = "GNDHWK"; static constexpr const char* name = "GNDHWK";
}; };
// for output bias
struct GK : public BaseTensorLayout
{
static constexpr const char* name = "GK";
};
// output tensor // output tensor
// packed NWGK/NHWGK/NDHWGK // packed NWGK/NHWGK/NDHWGK
struct NWGK : public BaseTensorLayout struct NWGK : public BaseTensorLayout
......
...@@ -50,7 +50,9 @@ __global__ void ...@@ -50,7 +50,9 @@ __global__ void
ignore = p_in_global; ignore = p_in_global;
ignore = out_grid_desc; ignore = out_grid_desc;
ignore = p_out_global; ignore = p_out_global;
ignore = batch_count;
ignore = block_2_tile_map; ignore = block_2_tile_map;
ignore = compute_ptr_offset_of_batch;
#endif #endif
} }
......
...@@ -522,22 +522,21 @@ struct TransformConvFwdToGemm ...@@ -522,22 +522,21 @@ struct TransformConvFwdToGemm
// for output bias // for output bias
template <typename CLayout, template <typename CLayout,
typename std::enable_if<is_same_v<CLayout, tensor_layout::convolution::GK> || typename std::enable_if<is_same_v<CLayout, tensor_layout::convolution::G_K>,
is_same_v<CLayout, tensor_layout::convolution::G_K>,
bool>::type = false> bool>::type = false>
static auto static auto MakeCDescriptor_M_N(const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths,
MakeCDescriptor_M_N(const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths, const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_strides)
const std::array<index_t, NDimSpatial + 3>& /* c_g_n_k_wos_strides */)
{ {
const index_t N = c_g_n_k_wos_lengths[1]; const index_t N = c_g_n_k_wos_lengths[1];
const index_t K = c_g_n_k_wos_lengths[2]; const index_t K = c_g_n_k_wos_lengths[2];
const index_t KStride = c_g_n_k_wos_strides[2];
const index_t NHoWo = const index_t NHoWo =
N * ck::accumulate_n<index_t>( N * ck::accumulate_n<index_t>(
c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>()); c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>());
const auto out_gemmm_gemmn_desc = const auto out_gemmm_gemmn_desc =
make_naive_tensor_descriptor(make_tuple(NHoWo, K), make_tuple(I0, I1)); make_naive_tensor_descriptor(make_tuple(NHoWo, K), make_tuple(I0, KStride));
return out_gemmm_gemmn_desc; return out_gemmm_gemmn_desc;
} }
......
...@@ -166,4 +166,16 @@ __host__ __device__ constexpr auto IsNestedTuple(const Tuple<Ts...>&) ...@@ -166,4 +166,16 @@ __host__ __device__ constexpr auto IsNestedTuple(const Tuple<Ts...>&)
return (is_detected<is_tuple, Ts>::value || ...); return (is_detected<is_tuple, Ts>::value || ...);
} }
template <index_t depth = 0, typename T>
__host__ __device__ constexpr auto TupleDepth(const T&)
{
return depth;
}
template <index_t depth = 0, typename... Ts>
__host__ __device__ constexpr auto TupleDepth(const Tuple<Ts...>&)
{
return math::max(TupleDepth<depth + 1>(Ts{})...);
}
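// Illustrative values, derived from the two overloads above:
//   TupleDepth(index_t{})                                  -> 0 (not a tuple)
//   TupleDepth(Tuple<index_t, index_t>{})                  -> 1
//   TupleDepth(Tuple<Tuple<index_t, index_t>, index_t>{})  -> 2 (deepest nesting wins)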
} // namespace ck } // namespace ck
...@@ -3,27 +3,13 @@ ...@@ -3,27 +3,13 @@
#pragma once #pragma once
#include "ck/ck.hpp" #include "ck/wrapper/utils/layout_utils.hpp"
#include "ck/utility/number.hpp"
#include "ck/utility/tuple.hpp"
#include "ck/utility/tuple_helper.hpp"
#include "ck/utility/sequence.hpp"
#include "ck/utility/sequence_helper.hpp"
#include "ck/utility/is_detected.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/multi_index_transform_helper.hpp"
namespace ck { namespace ck {
namespace tensor_transform_wrapper { namespace wrapper {
/** /**
* \brief Layout wrapper * \brief Layout wrapper that performs the tensor descriptor logic.
*
* \details
* Layout wrapper that performs the tensor descriptor logic.
* *
* \tparam Shape Tuple of Number<> (for compile-time layout) or index_t * \tparam Shape Tuple of Number<> (for compile-time layout) or index_t
* (dynamic layout). It is possible to pass nested shapes * (dynamic layout). It is possible to pass nested shapes
...@@ -32,21 +18,39 @@ namespace tensor_transform_wrapper { ...@@ -32,21 +18,39 @@ namespace tensor_transform_wrapper {
* (dynamic layout). Stride tuple should be nested if shape tuple is * (dynamic layout). Stride tuple should be nested if shape tuple is
* nested. * nested.
*/ */
template <typename Shape, typename Strides = Tuple<>> template <typename Shape, typename Strides>
struct Layout struct Layout
{ {
private: private:
static constexpr auto I0 = Number<0>{}; static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{}; static constexpr auto I1 = Number<1>{};
template <typename T> // Generate default idxs tuple (idx with all merged nested shapes)
using is_tuple = decltype(std::declval<T&>().IsTuple()); template <typename... Ts>
__host__ __device__ constexpr static auto GenerateDefaultIdxsTuple(const Tuple<Ts...>&)
{
return generate_tuple(
[&](auto) {
if constexpr(!FlattenDescriptorType::IsKnownAtCompileTime())
{
// runtime layout
return index_t(0);
}
else
{
// compiletime layout
return I0;
}
},
Number<Tuple<Ts...>::Size()>{});
}
// Generate packed (column-major) strides if not passed // Generate packed (column-major) strides if not passed
template <typename... Ts> template <typename... Ts>
__host__ __device__ constexpr static auto __host__ __device__ constexpr static auto
GenerateColumnMajorPackedStrides(const Tuple<Ts...>& tuple) GenerateColumnMajorPackedStrides(const Tuple<Ts...>& shape)
{ {
const auto unrolled_shape = UnrollNestedTuple(shape);
return generate_tuple( return generate_tuple(
[&](auto i) { [&](auto i) {
if constexpr(i.value == 0) if constexpr(i.value == 0)
...@@ -56,10 +60,10 @@ struct Layout ...@@ -56,10 +60,10 @@ struct Layout
else else
{ {
return TupleReduce<I0.value, i.value>([](auto x, auto y) { return x * y; }, return TupleReduce<I0.value, i.value>([](auto x, auto y) { return x * y; },
tuple); unrolled_shape);
} }
}, },
Number<Tuple<Ts...>::Size()>{}); Number<decltype(unrolled_shape)::Size()>{});
} }
// Generate LowerDims at compile time for MergeTransform using the passed type
...@@ -112,8 +116,8 @@ struct Layout ...@@ -112,8 +116,8 @@ struct Layout
// Example shape: (2, (2, 2)), 2, (2, 2) // Example shape: (2, (2, 2)), 2, (2, 2)
// Unrolled shape: 2, (2, 2), 2, (2, 2) // Unrolled shape: 2, (2, 2), 2, (2, 2)
template <typename... ShapeDims, typename... IdxDims> template <typename... ShapeDims, typename... IdxDims>
__host__ __device__ constexpr static auto UnrollShapeViaIdx(const Tuple<ShapeDims...>& shape, __host__ __device__ constexpr static auto AlignShapeToIdx(const Tuple<ShapeDims...>& shape,
const Tuple<IdxDims...>& idx) const Tuple<IdxDims...>& idx)
{ {
if constexpr(!IsNestedTuple(Tuple<IdxDims...>{})) if constexpr(!IsNestedTuple(Tuple<IdxDims...>{}))
{ {
...@@ -125,7 +129,7 @@ struct Layout ...@@ -125,7 +129,7 @@ struct Layout
// Iterate over shape tuple elements: // Iterate over shape tuple elements:
// 1. If corresponding idx element is tuple then return (will be unrolled) // 1. If corresponding idx element is tuple then return (will be unrolled)
// 2. If no, pack in tuple. It will be restored during unroll. // 2. If no, pack in tuple. It will be restored during unroll.
auto unrolled_shape_via_idx = generate_tuple( auto aligned_shape = generate_tuple(
[&](auto i) { [&](auto i) {
if constexpr(is_detected<is_tuple, if constexpr(is_detected<is_tuple,
tuple_element_t<i, Tuple<IdxDims...>>>::value) tuple_element_t<i, Tuple<IdxDims...>>>::value)
...@@ -140,37 +144,34 @@ struct Layout ...@@ -140,37 +144,34 @@ struct Layout
Number<Tuple<IdxDims...>::Size()>{}); Number<Tuple<IdxDims...>::Size()>{});
// Unroll and process next step // Unroll and process next step
return UnrollShapeViaIdx(UnrollNestedTuple<0, 1>(unrolled_shape_via_idx), return AlignShapeToIdx(UnrollNestedTuple<0, 1>(aligned_shape),
UnrollNestedTuple<0, 1>(idx)); UnrollNestedTuple<0, 1>(idx));
} }
} }
template <typename... ShapeDims, typename DescriptorToMerge> template <typename... ShapeDims, typename DescriptorToMerge>
__host__ __device__ constexpr static auto MakeMerge1d(const Tuple<ShapeDims...>& shape, __host__ __device__ constexpr static auto MakeMerge1d(const Tuple<ShapeDims...>& shape,
DescriptorToMerge& desc) const DescriptorToMerge& desc)
{ {
// Reverse each element in tuple // Reverse each element in tuple
using ReversedUnrolledShape = decltype(TupleReverse(UnrollNestedTuple(shape))); const auto merge_elems = TupleReverse(UnrollNestedTuple(shape));
const auto merge_elems = ReversedUnrolledShape{};
// Generate reverted indexes (column major traverse) // Generate reverted indexes (column major traverse)
using MergeElemsSequence = using MergeElemsSequence = typename arithmetic_sequence_gen<0, merge_elems.Size(), 1>::type;
typename arithmetic_sequence_gen<0, ReversedUnrolledShape::Size(), 1>::type; const auto lower_dims = make_tuple(MergeElemsSequence::Reverse());
const auto lower_dims = make_tuple(MergeElemsSequence::Reverse()); const auto upper_dims = make_tuple(Sequence<0>{});
const auto upper_dims = make_tuple(Sequence<0>{});
// Merge to 1d // Merge to 1d
return transform_tensor_descriptor( return transform_tensor_descriptor(
desc, make_tuple(make_merge_transform(merge_elems)), lower_dims, upper_dims); desc, make_tuple(make_merge_transform(merge_elems)), lower_dims, upper_dims);
} }
// Merge nested shape dims // Merge nested shape dims when corresponding index is also nested.
// Input desc shape: 2, 2, 2, 2, 2, 2 // Input desc shape: 2, 2, 2, 2, 2, 2
// Example idx: 1, 1, 1, 1 // Example idx: 1, 1, 1, 1
// Example shape: 2, (2, 2), 2, (2, 2) // Example shape: 2, (2, 2), 2, (2, 2)
// Merged shape: 2, 4, 2, 4 // Merged shape: 2, 4, 2, 4
template <typename... ShapeDims, typename... IdxDims, typename DescriptorToMerge> template <typename... ShapeDims, typename... IdxDims, typename DescriptorToMerge>
__host__ __device__ constexpr static auto __host__ __device__ constexpr static auto CreateMergedDescriptor(
MakeMerges(const Tuple<ShapeDims...>& shape, const Tuple<IdxDims...>&, DescriptorToMerge& desc) const Tuple<ShapeDims...>& shape, const Tuple<IdxDims...>&, DescriptorToMerge& desc)
{ {
const auto transforms = generate_tuple( const auto transforms = generate_tuple(
[&](auto i) { [&](auto i) {
...@@ -206,14 +207,38 @@ struct Layout ...@@ -206,14 +207,38 @@ struct Layout
return transform_tensor_descriptor(desc, transforms, lower_dims, upper_dims); return transform_tensor_descriptor(desc, transforms, lower_dims, upper_dims);
} }
template <typename LayoutShape, typename LayoutStrides>
__host__ __device__ static auto MakeFlattenDescriptor(const LayoutShape& shape,
const LayoutStrides& strides)
{
const auto unrolled_shape = UnrollNestedTuple(shape);
const auto unrolled_strides = UnrollNestedTuple(strides);
static_assert(unrolled_shape.Size() == unrolled_strides.Size(),
"Size of strides and shape are not consistent.");
return make_naive_tensor_descriptor(unrolled_shape, unrolled_strides);
}
// If Strides is not passed, deduce packed column-major strides via `GenerateColumnMajorPackedStrides`.
using DeducedStrides =
std::conditional_t<is_same_v<Strides, Tuple<>>,
remove_cvref_t<decltype(GenerateColumnMajorPackedStrides(Shape{}))>,
Strides>;
using FlattenDescriptorType =
remove_cvref_t<decltype(MakeFlattenDescriptor(Shape{}, DeducedStrides{}))>;
using Descriptor1dType =
remove_cvref_t<decltype(MakeMerge1d(Shape{}, FlattenDescriptorType{}))>;
using DefaultIdxsTupleType = remove_cvref_t<decltype(GenerateDefaultIdxsTuple(Shape{}))>;
template <typename... ShapeDims, typename... IdxDims> template <typename... ShapeDims, typename... IdxDims>
__host__ __device__ constexpr auto TransformDesc(const Tuple<ShapeDims...>& shape, __host__ __device__ constexpr static auto
const Tuple<IdxDims...>& idx) const TransformDesc(const Tuple<ShapeDims...>& shape,
const Tuple<IdxDims...>& idx,
const FlattenDescriptorType& naive_descriptor)
{ {
if constexpr(Tuple<IdxDims...>::Size() == I1) if constexpr(Tuple<IdxDims...>::Size() == I1)
{ {
// 1d idx path // 1d idx path
return MakeMerge1d(shape, descriptor_); return MakeMerge1d(shape, naive_descriptor);
} }
else else
{ {
...@@ -224,62 +249,55 @@ struct Layout ...@@ -224,62 +249,55 @@ struct Layout
static_assert(Tuple<ShapeDims...>::Size() == Tuple<IdxDims...>::Size(), static_assert(Tuple<ShapeDims...>::Size() == Tuple<IdxDims...>::Size(),
"Idx rank and Shape rank must be the same (except 1d)."); "Idx rank and Shape rank must be the same (except 1d).");
// Unroll while IdxDims is nested // Unroll while IdxDims is nested
const auto unrolled_shape_via_idx = UnrollShapeViaIdx(shape, idx); const auto aligned_shape = AlignShapeToIdx(shape, idx);
// Transform correct form of shape // Transform correct form of shape
return MakeMerges(unrolled_shape_via_idx, UnrollNestedTuple(idx), descriptor_); return CreateMergedDescriptor(aligned_shape, UnrollNestedTuple(idx), naive_descriptor);
} }
} }
template <typename LayoutShape, typename LayoutStrides> using MergedNestsDescriptorType = remove_cvref_t<decltype(TransformDesc(
__host__ __device__ static auto MakeNaiveDescriptor(const LayoutShape& shape, Shape{}, DefaultIdxsTupleType{}, FlattenDescriptorType{}))>;
const LayoutStrides& strides)
{
const auto unrolled_shape = UnrollNestedTuple(shape);
if constexpr(ck::is_same_v<LayoutStrides, Tuple<>>)
{
// If shape is packed
const auto column_major_packed_strides =
GenerateColumnMajorPackedStrides(unrolled_shape);
return make_naive_tensor_descriptor(unrolled_shape, column_major_packed_strides);
}
else
{
const auto unrolled_strides = UnrollNestedTuple(strides);
static_assert(unrolled_shape.Size() == unrolled_strides.Size(),
"Size of strides and shape are not consistent.");
return make_naive_tensor_descriptor(unrolled_shape, unrolled_strides);
}
}
public: public:
using NaiveDescriptorType = remove_cvref_t<decltype(MakeNaiveDescriptor(Shape{}, Strides{}))>; __host__ __device__ constexpr auto GetElementSpaceSize() const
{
return flatten_descriptor_.GetElementSpaceSize();
}
__host__ __device__ Layout() = delete;
/** /**
* \brief Layout constructor. * \brief Layout constructor.
* *
* \param shape Shape for layout. * \param shape Shape for layout.
* \param strides Strides for layout (optional if tensor is packed). * \param strides Strides for layout (optional if tensor is packed).
* \return Layout object.
*/ */
__host__ __device__ Layout() = delete; __host__ __device__ constexpr Layout(const Shape& shape, const Strides& strides)
__host__ __device__ Layout(const Shape& shape, const Strides& strides) : descriptor_{} : flatten_descriptor_{}, shape_(shape), strides_(strides)
{ {
// Construct if runtime mode // Construct if runtime mode
if constexpr(!NaiveDescriptorType::IsKnownAtCompileTime()) if constexpr(!FlattenDescriptorType::IsKnownAtCompileTime())
{ {
// Keep only shape, strides are not need for transforms flatten_descriptor_ = MakeFlattenDescriptor(shape_, strides_);
shape_ = shape; descriptor_1d_ = MakeMerge1d(shape_, flatten_descriptor_);
descriptor_ = MakeNaiveDescriptor(shape, strides); merged_nests_descriptor_ =
TransformDesc(shape_, DefaultIdxsTupleType{}, flatten_descriptor_);
} }
} }
__host__ __device__ Layout(const Shape& shape) : descriptor_{} /**
* \brief Layout constructor (with default packed column-major strides).
*
* \param shape Shape for layout.
*/
__host__ __device__ constexpr Layout(const Shape& shape)
: flatten_descriptor_{}, shape_(shape), strides_(GenerateColumnMajorPackedStrides(shape_))
{ {
if constexpr(!NaiveDescriptorType::IsKnownAtCompileTime()) if constexpr(!FlattenDescriptorType::IsKnownAtCompileTime())
{ {
shape_ = shape; flatten_descriptor_ = MakeFlattenDescriptor(shape_, strides_);
descriptor_ = MakeNaiveDescriptor(shape, Strides{}); descriptor_1d_ = MakeMerge1d(shape_, flatten_descriptor_);
merged_nests_descriptor_ =
TransformDesc(shape_, DefaultIdxsTupleType{}, flatten_descriptor_);
} }
} }
...@@ -292,7 +310,9 @@ struct Layout ...@@ -292,7 +310,9 @@ struct Layout
template <typename Idxs> template <typename Idxs>
__host__ __device__ constexpr index_t operator()() const __host__ __device__ constexpr index_t operator()() const
{ {
using TransformedDesc = decltype(TransformDesc(Shape{}, Idxs{})); static_assert(FlattenDescriptorType::IsKnownAtCompileTime(),
"Compiletime operator used on runtime layout.");
using TransformedDesc = decltype(TransformDesc(Shape{}, Idxs{}, FlattenDescriptorType{}));
using UnrolledIdx = decltype(UnrollNestedTuple(Idxs{})); using UnrolledIdx = decltype(UnrollNestedTuple(Idxs{}));
return TransformedDesc{}.CalculateOffset(UnrolledIdx{}); return TransformedDesc{}.CalculateOffset(UnrolledIdx{});
} }
...@@ -306,9 +326,22 @@ struct Layout ...@@ -306,9 +326,22 @@ struct Layout
template <typename... Ts> template <typename... Ts>
__host__ __device__ index_t operator()(const Tuple<Ts...>& Idx) const __host__ __device__ index_t operator()(const Tuple<Ts...>& Idx) const
{ {
// Static to construct transformed_desc only once if constexpr(!IsNestedTuple(Tuple<Ts...>{}) && Tuple<Ts...>::Size() == 1)
static const auto transformed_desc = TransformDesc(shape_, Idx); {
return transformed_desc.CalculateOffset(UnrollNestedTuple(Idx)); // if 1d access
return descriptor_1d_.CalculateOffset(Idx);
}
else if constexpr(!IsNestedTuple(Tuple<Ts...>{}) && Tuple<Ts...>::Size() == Shape::Size())
{
// if Shape::Size() access (merged nested shapes)
return merged_nests_descriptor_.CalculateOffset(UnrollNestedTuple(Idx));
}
else
{
// Custom index, need to transform descriptor
const auto transformed_desc = TransformDesc(shape_, Idx, flatten_descriptor_);
return transformed_desc.CalculateOffset(UnrollNestedTuple(Idx));
}
} }
/** /**
...@@ -338,7 +371,7 @@ struct Layout ...@@ -338,7 +371,7 @@ struct Layout
* *
* \return Calculated size. * \return Calculated size.
*/ */
__host__ __device__ constexpr index_t GetLength() const __host__ __device__ constexpr index_t GetLengths() const
{ {
const auto unrolled_shape = UnrollNestedTuple(shape_); const auto unrolled_shape = UnrollNestedTuple(shape_);
return TupleReduce<I0.value, unrolled_shape.Size()>([](auto x, auto y) { return x * y; }, return TupleReduce<I0.value, unrolled_shape.Size()>([](auto x, auto y) { return x * y; },
...@@ -346,80 +379,56 @@ struct Layout ...@@ -346,80 +379,56 @@ struct Layout
} }
/** /**
* \brief Dimension getter. * \brief Shape getter.
* *
* \tparam IDim Dimension idx. * \return Shape.
* \return Calculated size.
*/ */
template <index_t IDim> __host__ __device__ constexpr const Shape& GetShape() const { return shape_; }
__host__ __device__ constexpr auto Get() const
{
const auto elem = shape_.At(Number<IDim>{});
return elem;
}
private: /**
NaiveDescriptorType descriptor_; * \brief Strides getter.
Shape shape_; *
}; * \return Strides.
*/
// Layout helpers __host__ __device__ constexpr const DeducedStrides& GetStrides() const { return strides_; }
// Length getter (product if tuple)
template <index_t idx, typename Shape, typename Strides>
__host__ __device__ constexpr index_t size(const Layout<Shape, Strides>& layout)
{
return layout.template GetLength<idx>();
}
// Get shape size (product of dims if tuple)
template <typename... ShapeDims>
__host__ __device__ constexpr index_t size(const Tuple<ShapeDims...>& shape)
{
using UnrolledShape = decltype(UnrollNestedTuple(shape));
return TupleReduce<0, UnrolledShape::Size()>([](auto x, auto y) { return x * y; },
UnrolledShape{});
}
// Get dim size (could be returned from get function)
template <typename T>
__host__ __device__ T constexpr size(const T& dim)
{
return dim;
}
// Get layout size (product of shapes)
template <typename Shape, typename Strides>
__host__ __device__ constexpr index_t size(const Layout<Shape, Strides>& layout)
{
return layout.GetLength();
}
// Get shape element size /**
template <index_t idx, typename... ShapeDims> * \brief Get default lengths (tuple filled with Shape length elements).
__host__ __device__ constexpr index_t size(const Tuple<ShapeDims...>& shape) *
{ * \return Default lengths.
return size(shape.At(Number<idx>{})); */
} __host__ __device__ constexpr auto GetDefaultLengthsTuple() const
{
return generate_tuple([&](auto i) { return GetLength<i>(); }, Number<Shape::Size()>{});
}
// Dim getter (tuple if tuple) /**
template <index_t idx, typename Shape, typename Strides> * \brief Get default start idx (tuple filled with 0s of the same size as Shape).
__host__ __device__ constexpr auto get(const Layout<Shape, Strides>& layout) *
{ * \return Default start idx.
return layout.template Get<idx>(); */
} __host__ __device__ constexpr auto GetDefaultStartIdxs() const
{
return GenerateDefaultIdxsTuple(shape_);
}
template <typename Shape, typename Strides> /**
__host__ __device__ constexpr Layout<Shape, Strides> make_layout(const Shape& shape, * \brief Get default descriptor (with the same size as Shape)
const Strides& strides) *
{ * \return Default descriptor.
return Layout<Shape, Strides>(shape, strides); */
} __host__ __device__ constexpr MergedNestsDescriptorType GetDefaultDescriptor()
{
return merged_nests_descriptor_;
}
template <typename Shape> private:
__host__ __device__ constexpr Layout<Shape> make_layout(const Shape& shape) FlattenDescriptorType flatten_descriptor_;
{ Descriptor1dType descriptor_1d_;
return Layout<Shape>(shape); MergedNestsDescriptorType merged_nests_descriptor_;
} const Shape shape_;
const DeducedStrides strides_;
};
} // namespace tensor_transform_wrapper } // namespace wrapper
} // namespace ck } // namespace ck
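// Usage sketch (illustrative, not part of the original header): how the three
// operator() index paths above are intended to be used. It assumes the make_layout
// helper from layout_utils.hpp; the shape values are made up.
//
//   const auto layout = make_layout(make_tuple(2, make_tuple(2, 4)));   // packed column-major strides
//
//   layout(make_tuple(5));                    // 1d index -> descriptor_1d_ path
//   layout(make_tuple(1, 3));                 // one index per top-level dim -> merged_nests_descriptor_ path
//   layout(make_tuple(1, make_tuple(1, 2)));  // nested (custom) index -> transformed-descriptor path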
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "utils/tensor_utils.hpp"
#include "utils/layout_utils.hpp"
namespace ck {
namespace wrapper {
/**
* \brief Tensor wrapper that performs static and dynamic buffer logic.
*
* \tparam BufferAddressSpace Memory type (Generic, Global, LDS, VGPR, SGPR).
* \tparam ElementType Element data type.
* \tparam Shape Tensor shape (layout component).
* \tparam Strides Tensor strides (layout component).
* \tparam NumVectors Number of vectors (only for VGPR, SGPR).
* \tparam ScalarPerVector Scalars per vector (only for VGPR, SGPR).
*/
template <MemoryTypeEnum BufferAddressSpace,
typename ElementType,
typename Shape,
typename Strides,
index_t NumVectors, // param for Register memory
index_t ScalarPerVector // param for Register memory
>
struct Tensor
{
private:
// Check if Tuple contains Slice object
template <typename T>
constexpr static bool IsSlicing(T&&)
{
return is_detected<is_slice, T>::value;
}
template <typename... Ts>
constexpr static bool IsSlicing(Tuple<Ts...>&&)
{
return (IsSlicing(Ts{}) || ...);
}
// Calculate first index of new tensor after slice
// It is needed to calculate offset for new tensor
template <typename... Ts>
constexpr auto GetStartIdxForSlicedTensor(const Tuple<Ts...>& idx) const
{
const auto start_idx_for_sliced_tensor = generate_tuple(
[&](auto i) {
constexpr auto num_i = Number<i>{};
if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ts...>>>::value)
{
// if tuple, recurse into it
return GetStartIdxForSlicedTensor(idx.At(num_i));
}
else if constexpr(is_detected<is_slice,
tuple_element_t<i.value, Tuple<Ts...>>>::value)
{
// if slice, return the beginning of the interval
return idx.At(num_i).from_;
}
else
{
// if one dim selected
return idx.At(num_i);
}
},
Number<Tuple<Ts...>::Size()>{});
return start_idx_for_sliced_tensor;
}
// Calculate new tensor shape after slice
template <typename... Ts, typename ShapeTmpType>
constexpr auto GetShapeFromSlicedTensor(const Tuple<Ts...>& idx,
const ShapeTmpType& shape) const
{
// Pack each value in tuple to remove empty tuples after generation
auto new_shape = generate_tuple(
[&](auto i) {
constexpr auto num_i = Number<i>{};
if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ts...>>>::value)
{
if constexpr(!IsSlicing(tuple_element_t<i.value, Tuple<Ts...>>{}))
{
// if tuple does not have any slice then we can remove dimension
return Tuple<>{};
}
else
{
// if tuple, recurse into it
return make_tuple(GetShapeFromSlicedTensor(idx.At(num_i), shape.At(num_i)));
}
}
else if constexpr(is_detected<is_slice,
tuple_element_t<i.value, Tuple<Ts...>>>::value)
{
// calculate new dimension
const auto& dim = size(shape.At(num_i));
const auto val = idx.At(num_i).range(dim);
return make_tuple(val);
}
else
{
// a plain index selects a single element, so the dimension is removed
return Tuple<>{};
}
},
Number<Tuple<Ts...>::Size()>{});
// Remove empty tuples (deleted elements) and return
return UnrollNestedTuple<0, 1>(new_shape);
}
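// Calculate new tensor strides after slice: a sliced dimension keeps its
// original stride, a dimension indexed with a plain value is dropped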
template <typename... Ts, typename StridesTmpType>
constexpr auto GetStridesFromSlicedTensor(const Tuple<Ts...>& idx,
const StridesTmpType& strides) const
{
// Pack each value in tuple to remove empty tuples after generation
auto new_strides = generate_tuple(
[&](auto i) {
constexpr auto num_i = Number<i>{};
if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ts...>>>::value)
{
if constexpr(!IsSlicing(tuple_element_t<i.value, Tuple<Ts...>>{}))
{
// if tuple does not have any slice then we can remove dimension
return Tuple<>{};
}
else
{
// if tuple, recurse into it
return make_tuple(
GetStridesFromSlicedTensor(idx.At(num_i), strides.At(num_i)));
}
}
else if constexpr(is_detected<is_slice,
tuple_element_t<i.value, Tuple<Ts...>>>::value)
{
// Stride will be the same
return make_tuple(strides.At(num_i));
}
else
{
// a plain index selects a single element, so the dimension is removed
return Tuple<>{};
}
},
Number<Tuple<Ts...>::Size()>{});
// Remove empty tuples (deleted elements) and return
return UnrollNestedTuple<0, 1>(new_strides);
}
public:
using ElementSpaceSize = decltype(Layout<Shape, Strides>{
Shape{}, Strides{}}.GetElementSpaceSize()); // SpaceSize type for buffer
using TensorElementType = ElementType; // DataType
static constexpr MemoryTypeEnum TensorBufferAddressSpace = BufferAddressSpace;
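// Register address spaces (Sgpr/Vgpr) are backed by a static buffer; all other
// address spaces wrap the user-provided pointer in a dynamic buffer.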
static constexpr bool IsDynamicBuffer = !(BufferAddressSpace == MemoryTypeEnum::Sgpr ||
BufferAddressSpace == MemoryTypeEnum::Vgpr);
__host__ __device__ Tensor() = delete;
__host__ __device__ Tensor(ElementType* pointer, const Layout<Shape, Strides>& layout)
: layout_(layout),
buffer_(make_dynamic_buffer<BufferAddressSpace>(pointer, layout.GetElementSpaceSize()))
{
}
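// Constructor for register-backed tensors (Sgpr/Vgpr); no external pointer is needed.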
__host__ __device__ Tensor(const Layout<Shape, Strides>& layout) : layout_(layout)
{
static_assert(!IsDynamicBuffer, "Wrong BufferAddressSpace for register.");
}
__host__ __device__ constexpr const Layout<Shape, Strides>& GetLayout() const
{
return layout_;
}
// Getter for new sliced tensor
template <typename... Ts, enable_if_t<IsSlicing(Tuple<Ts...>{}), bool> = false>
__host__ __device__ auto operator[](const Tuple<Ts...>& idx) const
{
static_assert(IsDynamicBuffer, "Register slice is not supported");
// Calculate offset based on first idx for new tensor
const index_t offset = layout_(GetStartIdxForSlicedTensor(idx));
auto new_shape = GetShapeFromSlicedTensor(idx, layout_.GetShape());
if constexpr(is_same_v<Strides, Tuple<>>)
{
auto new_layout = make_layout(new_shape);
return make_tensor<BufferAddressSpace>(buffer_.p_data_ + offset, new_layout);
}
else
{
auto new_strides = GetStridesFromSlicedTensor(idx, layout_.GetStrides());
auto new_layout = make_layout(new_shape, new_strides);
return make_tensor<BufferAddressSpace>(buffer_.p_data_ + offset, new_layout);
}
}
template <typename... Ts, enable_if_t<IsSlicing(Tuple<Ts...>{}), bool> = false>
__host__ __device__ auto operator()(const Tuple<Ts...>& idx) const
{
return this->operator[](idx);
}
template <typename... Idxs, enable_if_t<IsSlicing(Tuple<Idxs...>{}), bool> = false>
__host__ __device__ auto operator()(Idxs... idxs) const
{
return this->operator[](make_tuple(idxs...));
}
// Getter for the const value
template <typename... Ts, enable_if_t<!IsSlicing(Tuple<Ts...>{}), bool> = false>
__host__ __device__ const ElementType& operator[](const Tuple<Ts...>& idx) const
{
if constexpr(IsDynamicBuffer)
{
const index_t offset = layout_(idx);
return buffer_[offset];
}
else
{
if constexpr(is_same_v<Strides, Tuple<>>)
{
constexpr index_t offset =
Layout<Shape, Strides>{Shape{}}.template operator()<Tuple<Ts...>>();
return buffer_[Number<offset>{}];
}
else
{
constexpr index_t offset =
Layout<Shape, Strides>{Shape{}, Strides{}}.template operator()<Tuple<Ts...>>();
return buffer_[Number<offset>{}];
}
}
}
template <typename... Ts, enable_if_t<!IsSlicing(Tuple<Ts...>{}), bool> = false>
__host__ __device__ const ElementType& operator()(const Tuple<Ts...>& idx) const
{
return this->operator[](idx);
}
template <typename... Idxs, enable_if_t<!IsSlicing(Tuple<Idxs...>{}), bool> = false>
__host__ __device__ const ElementType& operator()(Idxs... idxs) const
{
return this->operator[](make_tuple(idxs...));
}
// Getter for the value reference
template <typename... Ts, enable_if_t<!IsSlicing(Tuple<Ts...>{}), bool> = false>
__host__ __device__ ElementType& operator[](const Tuple<Ts...>& idx)
{
if constexpr(IsDynamicBuffer)
{
const index_t offset = layout_(idx);
return buffer_(offset);
}
else
{
if constexpr(is_same_v<Strides, Tuple<>>)
{
constexpr index_t offset =
Layout<Shape, Strides>{Shape{}}.template operator()<Tuple<Ts...>>();
return buffer_(Number<offset>{});
}
else
{
constexpr index_t offset =
Layout<Shape, Strides>{Shape{}, Strides{}}.template operator()<Tuple<Ts...>>();
return buffer_(Number<offset>{});
}
}
}
template <typename... Ts, enable_if_t<!IsSlicing(Tuple<Ts...>{}), bool> = false>
__host__ __device__ ElementType& operator()(const Tuple<Ts...>& idx)
{
return this->operator[](idx);
}
template <typename... Idxs, enable_if_t<!IsSlicing(Tuple<Idxs...>{}), bool> = false>
__host__ __device__ ElementType& operator()(Idxs... idxs)
{
return this->operator[](make_tuple(idxs...));
}
__host__ __device__ constexpr auto GetDefaultDescriptor()
{
return layout_.GetDefaultDescriptor();
}
private:
using DynamicBufferType = DynamicBuffer<BufferAddressSpace,
ElementType,
ElementSpaceSize,
true /*InvalidElementUseNumericalZeroValue*/>;
using StaticBufferType =
StaticBufferTupleOfVector<BufferAddressSpace,
ElementType,
NumVectors,
ScalarPerVector,
true /*InvalidElementUseNumericalZeroValue*/>;
// If register use static buffer, else use dynamic buffer
using Buffer = std::conditional_t<IsDynamicBuffer, DynamicBufferType, StaticBufferType>;
const Layout<Shape, Strides> layout_;
Buffer buffer_;
};
} // namespace wrapper
} // namespace ck
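// Usage sketch (illustrative, not part of the original source): wrapping a raw
// pointer in a Tensor and accessing elements. It assumes the make_tensor and
// make_layout helpers from the utils headers; the pointer and values are made up.
//
//   float* p_global = /* device pointer */;
//   const auto layout = make_layout(make_tuple(4, make_tuple(2, 4)));
//   auto tensor = make_tensor<MemoryTypeEnum::Global>(p_global, layout);
//
//   tensor(1, make_tuple(0, 2)) = 1.f;                         // write via the reference overload
//   const float x = tensor(make_tuple(1, make_tuple(0, 2)));   // read with a nested tuple index
//
//   // Passing Slice objects instead of plain indices to operator()/operator[]
//   // returns a smaller Tensor that views the same underlying buffer.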
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/ck.hpp"
#include "ck/utility/number.hpp"
#include "ck/utility/tuple.hpp"
#include "ck/utility/tuple_helper.hpp"
#include "ck/utility/sequence.hpp"
#include "ck/utility/sequence_helper.hpp"
#include "ck/utility/is_detected.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/multi_index_transform_helper.hpp"
namespace ck {
namespace wrapper {
// Disable from doxygen docs generation
/// @cond
// forward declaration
template <typename Shape, typename Strides>
struct Layout;
template <typename T>
using is_tuple = decltype(std::declval<T&>().IsTuple());
/// @endcond
// make_*
/**
* \brief Make layout function.
*
* \tparam Shape Shape for layout.
* \tparam Strides Strides for layout.
* \return Constructed layout.
*/
template <typename Shape, typename Strides>
__host__ __device__ constexpr Layout<Shape, Strides> make_layout(const Shape& shape,
const Strides& strides)
{
return Layout<Shape, Strides>(shape, strides);
}
/**
* \brief Make layout function with packed strides
* (column-major).
*
* \tparam Shape Shape for layout.
* \return Constructed layout.
*/
template <typename Shape>
__host__ __device__ constexpr Layout<Shape, Tuple<>> make_layout(const Shape& shape)
{
return Layout<Shape, Tuple<>>(shape);
}
// Layout helpers
// get
// Get dim (could be returned from get with empty Idxs)
/**
* \private
*/
template <typename T>
__host__ __device__ T constexpr get(const T& dim)
{
return dim;
}
/**
* \brief Get element from tuple (Shape/Strides/Idxs).
*
* \tparam idx Index to lookup.
* \param tuple Tuple to lookup.
* \return Requested element.
*/
template <index_t idx, typename... Dims>
__host__ __device__ constexpr auto get(const Tuple<Dims...>& tuple)
{
return tuple.At(Number<idx>{});
}
/**
* \brief Get sub layout.
*
* \tparam idx Index to lookup.
* \param layout Layout to create sub layout.
* \return Requested sub layout.
*/
template <index_t idx, typename Shape, typename Strides>
__host__ __device__ constexpr auto get(const Layout<Shape, Strides>& layout)
{
const auto& shape = layout.GetShape();
const auto& new_shape = get<idx>(shape);
static_assert(is_detected<is_tuple, decltype(new_shape)>::value,
"Shape of sub layout must be tuple");
if constexpr(is_same_v<Strides, Tuple<>>)
{
// If stride not passed, create without strides
return make_layout(new_shape);
}
else
{
const auto& strides = layout.GetStrides();
const auto& new_strides = get<idx>(strides);
static_assert(is_detected<is_tuple, decltype(new_strides)>::value,
"Strides of sub layout must be tuple");
return make_layout(new_shape, new_strides);
}
}
/**
* \brief Hierarchical get.
*
* \tparam Idxs Indexes to lookup.
* \param elem Element to lookup.
* \return Requested element.
*/
template <index_t Idx, index_t... Idxs, typename T>
__host__ __device__ constexpr auto get(const T& elem)
{
return get<Idxs...>(get<Idx>(elem));
}
// size
// Get dim size (could be returned from get function)
/**
* \private
*/
template <typename T>
__host__ __device__ T constexpr size(const T& dim)
{
return dim;
}
/**
* \brief Get length at index (product of dims if the element is a tuple).
*
* \tparam idx Index to lookup.
* \param layout Layout to get Shape of.
* \return Requested length.
*/
template <index_t idx, typename Shape, typename Strides>
__host__ __device__ constexpr index_t size(const Layout<Shape, Strides>& layout)
{
return layout.template GetLength<idx>();
}
/**
* \brief Shape size (product of dims).
*
* \param shape Shape to lookup.
* \return Requested size.
*/
template <typename... ShapeDims>
__host__ __device__ constexpr index_t size(const Tuple<ShapeDims...>& shape)
{
const auto unrolled_shape = UnrollNestedTuple(shape);
return TupleReduce<0, unrolled_shape.Size()>([](auto x, auto y) { return x * y; },
unrolled_shape);
}
/**
* \brief Layout size (product of dims).
*
* \param layout Layout to calculate shape size.
* \return Requested size.
*/
template <typename Shape, typename Strides>
__host__ __device__ constexpr index_t size(const Layout<Shape, Strides>& layout)
{
return layout.GetLengths();
}
/**
* \brief Get length from tuple at index (product of dims if the element is a tuple).
*
* \tparam idx Index to lookup.
* \param tuple Tuple to lookup.
* \return Requested length.
*/
template <index_t idx, typename... Ts>
__host__ __device__ constexpr index_t size(const Tuple<Ts...>& tuple)
{
return size(tuple.At(Number<idx>{}));
}
/**
* \brief Hierarchical size.
*
* \tparam Idx First index to lookup (to avoid empty Idxs).
* \tparam Idxs Next indexes to lookup.
* \param elem Element to lookup.
* \return Requested element.
*/
template <index_t Idx, index_t... Idxs, typename T>
__host__ __device__ constexpr auto size(const T& elem)
{
return size(get<Idx, Idxs...>(elem));
}
// rank
/**
* \brief Get layout rank (num elements in shape).
*
* \param layout Layout to calculate rank.
* \return Requested rank.
*/
template <typename Shape, typename Strides>
__host__ __device__ constexpr auto rank([[maybe_unused]] const Layout<Shape, Strides>& layout)
{
return Shape::Size();
}
/**
* \brief Get tuple rank (num elements in tuple).
* Return 1 if scalar passed.
*
* \param tuple Tuple to calculate rank.
* \return Requested rank.
*/
template <typename... Dims>
__host__ __device__ constexpr auto rank([[maybe_unused]] const Tuple<Dims...>& tuple)
{
return Tuple<Dims...>::Size();
}
/**
* \private
*/
template <index_t IDim>
__host__ __device__ constexpr index_t rank(const Number<IDim>&)
{
return 1;
}
/**
* \private
*/
__host__ __device__ constexpr index_t rank(const index_t&) { return 1; }
/**
* \brief Hierarchical rank.
*
* \tparam Idxs Indexes to lookup.
* \param elem Element to lookup.
* \return Requested rank.
*/
template <index_t... Idxs, typename T>
__host__ __device__ constexpr auto rank(const T& elem)
{
return rank(get<Idxs...>(elem));
}
// depth
/**
* \brief Get depth of the layout shape (return 0 if scalar).
*
* \param layout Layout to calculate depth.
* \return Requested depth.
*/
template <typename Shape, typename Strides>
__host__ __device__ constexpr auto depth(const Layout<Shape, Strides>& layout)
{
const auto& shape = layout.GetShape();
return TupleDepth(shape);
}
/**
* \brief Get depth of the tuple (return 0 if scalar).
*
* \param tuple Tuple to calculate depth.
* \return Requested depth.
*/
template <typename... Dims>
__host__ __device__ constexpr auto depth(const Tuple<Dims...>& tuple)
{
return TupleDepth(tuple);
}
/**
* \private
*/
template <index_t IDim>
__host__ __device__ constexpr index_t depth(const Number<IDim>&)
{
return 0;
}
/**
* \private
*/
__host__ __device__ constexpr index_t depth(const index_t&) { return 0; }
/**
* \brief Hierarchical depth.
*
* \tparam Idxs Indexes to lookup.
* \param elem Element to lookup.
* \return Requested depth.
*/
template <index_t... Idxs, typename T>
__host__ __device__ constexpr auto depth(const T& elem)
{
return depth(get<Idxs...>(elem));
}
/**
* \brief Get Layout strides.
*
* \param layout Layout to get strides from.
* \return Requested strides.
*/
template <typename Shape, typename Strides>
__host__ __device__ constexpr const auto& stride(const Layout<Shape, Strides>& layout)
{
return layout.GetStrides();
}
/**
* \brief Get Layout shape.
*
* \param layout Layout to get shape from.
* \return Requested shape.
*/
template <typename Shape, typename Strides>
__host__ __device__ constexpr const auto& shape(const Layout<Shape, Strides>& layout)
{
return layout.GetShape();
}
} // namespace wrapper
} // namespace ck
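// Usage sketch for the helpers above (illustrative, not part of the original
// source); the shape values are made up and the commented results assume the
// usual CK tuple semantics.
//
//   const auto layout = make_layout(make_tuple(6, make_tuple(4, 2)));
//
//   size(layout);                  // 48: product of all dimensions
//   size<1>(layout);               // 8:  product of the nested (4, 2) group
//   get<1, 0>(shape(layout));      // 4:  hierarchical lookup into the shape
//   rank(layout);                  // 2:  number of top-level dimensions
//   depth(layout);                 // 2:  nesting depth of the shape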