Merge remote-tracking branch 'origin/develop' into lwpck-367

a1841d55 · Chao Liu · 127bf7f4 · 500fa995 · a1841d55 · 127bf7f4
Commit a1841d55 authored Aug 01, 2022 by Chao Liu
20 changed files
--- a/example/23_softmax/softmax_blockwise.cpp
+++ b/example/23_softmax/softmax_blockwise.cpp
@@ -13,8 +13,8 @@
 #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
 #include "ck/library/utility/check_err.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/utility/device_memory.hpp"
-#include "ck/library/host_tensor/host_common_util.hpp"
+#include "ck/library/utility/host_common_util.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
 using namespace ck;
@@ -177,7 +177,7 @@ int main(int argc, char* argv[])
        }
        if(beta != 0.0f)
-            for(size_t i = 0; i < out_ref.mDesc.GetElementSpace(); i++)
+            for(size_t i = 0; i < out_ref.mDesc.GetElementSpaceSize(); i++)
                out.mData[i] = out_ref.mData[i];
    };
    // std::cout << "beta = " << beta << std::endl;
@@ -185,8 +185,8 @@ int main(int argc, char* argv[])
    // LogRangeAsType<float>(std::cout << "tensor prior out: " , out.mData, ",") << std::endl;
    // these buffers are usually provided by the user application
-    DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpace());
+    DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
-    DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpace());
+    DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize());
    in_dev.ToDevice(in.mData.data());

--- a/example/24_batched_gemm_c_permute/CMakeLists.txt
+++ b/example/24_batched_gemm_c_permute/CMakeLists.txt
-add_example_executable(example_batched_gemm_c_permute_xdl_fp16 batched_gemm_c_permute_xdl_fp16.cpp)
--- a/example/24_batched_gemm_e_permute/CMakeLists.txt
+++ b/example/24_batched_gemm_e_permute/CMakeLists.txt
+add_example_executable(example_batched_gemm_e_permute_xdl_fp16 batched_gemm_e_permute_xdl_fp16.cpp)
--- a/example/24_batched_gemm_c_permute/batched_gemm_c_permute_xdl_fp16.cpp
+++ b/example/24_batched_gemm_c_permute/batched_gemm_c_permute_xdl_fp16.cpp
@@ -6,13 +6,13 @@
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/device_batched_gemm_c_permute_xdl.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_e_permute_xdl.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/utility/check_err.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/utility/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/utility/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
 template <ck::index_t... Is>
@@ -30,7 +30,6 @@ using ADataType        = F16;
 using BDataType        = F16;
 using AccDataType      = F32;
 using CShuffleDataType = F16;
-using DsDataType       = ck::Tuple<>;
 using EDataType        = F16;
 using ALayout = Row;
@@ -42,16 +41,14 @@ using BElementOp   = PassThrough;
 using CDEElementOp = PassThrough;
 static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
-// static constexpr auto MNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
-// static constexpr auto MNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmEPermuteXdl
+    // clang-format off
-// clang-format off
+//######| ALayout| BLayout| ELayout|     AData|     BData|     AccData|         CShuffle|     EData|           A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmCPermuteXdl
+//######|        |        |        |      Type|      Type|        Type|         DataType|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-//######| ALayout| BLayout| ELayout|     AData|     BData|     AccData|         CShuffle|     DsData|     EData|           A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+//######|        |        |        |          |          |            |                 |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-//######|        |        |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+//######|        |        |        |          |          |            |                 |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-//######|        |        |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        < ALayout, BLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
-//######|        |        |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        < ALayout, BLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
 // clang-format on
 using ReferenceBatchedGemmInstance = ck::tensor_operation::host::
@@ -99,7 +96,7 @@ int main(int argc, char* argv[])
    }
    // GEMM shape
-    ck::tensor_operation::device::BatchedGemmCPermuteDesc batched_gemm_c_permute_desc{
+    ck::tensor_operation::device::BatchedGemmEPermuteDesc batched_gemm_e_permute_desc{
        G0, G1, M, N, stride_G0, stride_G1, stride_M, stride_N};
    auto f_host_tensor_descriptor = [](std::size_t batch_count_,
@@ -125,7 +122,7 @@ int main(int argc, char* argv[])
    Tensor<BDataType> b_g_k_n(
        f_host_tensor_descriptor(batch_count, K, N, stride_B, batch_stride_B, BLayout{}));
-    auto f_host_c_tensor_descriptor = [](std::size_t G0_,
+    auto f_host_e_tensor_descriptor = [](std::size_t G0_,
                                         std::size_t G1_,
                                         std::size_t M_,
                                         std::size_t N_,
@@ -138,15 +135,15 @@ int main(int argc, char* argv[])
            std::vector<std::size_t>({stride_G0_, stride_G1_, stride_M_, stride_N_}));
    };
-    Tensor<EDataType> c_g0_g1_m_n_host_result(
+    Tensor<EDataType> e_g0_g1_m_n_host_result(
-        f_host_c_tensor_descriptor(G0, G1, M, N, stride_G0, stride_G1, stride_M, stride_N));
+        f_host_e_tensor_descriptor(G0, G1, M, N, stride_G0, stride_G1, stride_M, stride_N));
-    Tensor<EDataType> c_g0_g1_m_n_device_result(
+    Tensor<EDataType> e_g0_g1_m_n_device_result(
-        f_host_c_tensor_descriptor(G0, G1, M, N, stride_G0, stride_G1, stride_M, stride_N));
+        f_host_e_tensor_descriptor(G0, G1, M, N, stride_G0, stride_G1, stride_M, stride_N));
    std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl;
    std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl;
-    std::cout << "c_g0_g1_m_n: " << c_g0_g1_m_n_host_result.mDesc << std::endl;
+    std::cout << "e_g0_g1_m_n: " << e_g0_g1_m_n_host_result.mDesc << std::endl;
    switch(init_method)
    {
@@ -161,9 +158,10 @@ int main(int argc, char* argv[])
        break;
    }
-    DeviceMem a_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpace());
+    DeviceMem a_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpaceSize());
-    DeviceMem b_device_buf(sizeof(BDataType) * b_g_k_n.mDesc.GetElementSpace());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_g_k_n.mDesc.GetElementSpaceSize());
-    DeviceMem c_device_buf(sizeof(EDataType) * c_g0_g1_m_n_device_result.mDesc.GetElementSpace());
+    DeviceMem e_device_buf(sizeof(EDataType) *
+                           e_g0_g1_m_n_device_result.mDesc.GetElementSpaceSize());
    a_device_buf.ToDevice(a_g_m_k.mData.data());
    b_device_buf.ToDevice(b_g_k_n.mData.data());
@@ -178,7 +176,7 @@ int main(int argc, char* argv[])
    // do GEMM
    auto argument = gemm.MakeArgument(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
                                      static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
-                                      static_cast<EDataType*>(c_device_buf.GetDeviceBuffer()),
+                                      static_cast<EDataType*>(e_device_buf.GetDeviceBuffer()),
                                      M,
                                      N,
                                      K,
@@ -186,7 +184,7 @@ int main(int argc, char* argv[])
                                      stride_B,
                                      batch_stride_A,
                                      batch_stride_B,
-                                      batched_gemm_c_permute_desc,
+                                      batched_gemm_e_permute_desc,
                                      batch_count,
                                      a_element_op,
                                      b_element_op,
@@ -217,7 +215,7 @@ int main(int argc, char* argv[])
    if(do_verification)
    {
-        c_device_buf.FromDevice(c_g0_g1_m_n_device_result.mData.data());
+        e_device_buf.FromDevice(e_g0_g1_m_n_device_result.mData.data());
        auto ref_batched_gemm = ReferenceBatchedGemmInstance{};
        auto ref_invoker      = ref_batched_gemm.MakeInvoker();
@@ -238,15 +236,16 @@ int main(int argc, char* argv[])
                {
                    for(int n = 0; n < N; n++)
                    {
-                        int g                                 = g0 * G1 + g1;
+                        int g = g0 * G1 + g1;
-                        c_g0_g1_m_n_host_result(g0, g1, m, n) = c_g_m_n_host_result(g, m, n);
+                        e_g0_g1_m_n_host_result(g0, g1, m, n) = c_g_m_n_host_result(g, m, n);
                    }
                }
            }
        }
-        pass = ck::utils::check_err(c_g0_g1_m_n_host_result.mData,
+        pass = ck::utils::check_err(e_g0_g1_m_n_host_result.mData,
-                                    c_g0_g1_m_n_device_result.mData,
+                                    e_g0_g1_m_n_device_result.mData,
                                    "Error: Incorrect results c");
    }

--- a/example/25_gemm_bias_c_permute/CMakeLists.txt
+++ b/example/25_gemm_bias_c_permute/CMakeLists.txt
-add_example_executable(example_gemm_bias_c_permute_xdl_fp16 gemm_bias_c_permute_xdl_fp16.cpp)
--- a/example/25_gemm_bias_e_permute/CMakeLists.txt
+++ b/example/25_gemm_bias_e_permute/CMakeLists.txt
+add_example_executable(example_gemm_bias_e_permute_xdl_fp16 gemm_bias_e_permute_xdl_fp16.cpp)
--- a/example/25_gemm_bias_c_permute/gemm_bias_c_permute_xdl_fp16.cpp
+++ b/example/25_gemm_bias_c_permute/gemm_bias_c_permute_xdl_fp16.cpp
@@ -9,12 +9,12 @@
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/device_gemm_bias_c_permute_xdl.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_bias_e_permute_xdl.hpp"
 #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/utility/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/utility/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"
@@ -49,7 +49,7 @@ using CDEElementOp = Add;
 static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
 // clang-format off
-using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmBiasCPermute_Xdl
+using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmBiasEPermute_Xdl
 //######| ALayout| BLayout| ELayout|     AData|     BData|     AccData|         CShuffle|     DsData|     EData|           A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
 //######|        |        |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 //######|        |        |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
@@ -186,12 +186,12 @@ int main(int argc, char* argv[])
        d_m0_m1_m2_n0_n1.GenerateTensorValue(GeneratorTensor_3<DDataType>{0.0, 1.0});
    }
-    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
+    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
-    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
+    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
    DeviceMem d_m0_m1_m2_n0_n1_device_buf(sizeof(DDataType) *
-                                          d_m0_m1_m2_n0_n1.mDesc.GetElementSpace());
+                                          d_m0_m1_m2_n0_n1.mDesc.GetElementSpaceSize());
-    DeviceMem e_m0_m1_m2_n0_n1_device_buf(sizeof(EDataType) *
+    DeviceMem e_m0_m1_m2_n0_n1_device_buf(
-                                          e_m0_m1_m2_n0_n1_device_result.mDesc.GetElementSpace());
+        sizeof(EDataType) * e_m0_m1_m2_n0_n1_device_result.mDesc.GetElementSpaceSize());
    a_m_k_device_buf.ToDevice(a_m_k.mData.data());
    b_k_n_device_buf.ToDevice(b_k_n.mData.data());

--- a/example/26_contraction/contraction_bilinear_xdl_fp32.cpp
+++ b/example/26_contraction/contraction_bilinear_xdl_fp32.cpp
@@ -12,9 +12,9 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/utility/check_err.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/utility/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/utility/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -324,10 +324,10 @@ int main(int argc, char* argv[])
        break;
    }
-    DeviceMem a_device_buf(sizeof(ADataType) * a_ms_ks.mDesc.GetElementSpace());
+    DeviceMem a_device_buf(sizeof(ADataType) * a_ms_ks.mDesc.GetElementSpaceSize());
-    DeviceMem b_device_buf(sizeof(BDataType) * b_ns_ks.mDesc.GetElementSpace());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_ns_ks.mDesc.GetElementSpaceSize());
-    DeviceMem d_device_buf(sizeof(DDataType) * d_ms_ns.mDesc.GetElementSpace());
+    DeviceMem d_device_buf(sizeof(DDataType) * d_ms_ns.mDesc.GetElementSpaceSize());
-    DeviceMem e_device_buf(sizeof(EDataType) * e_ms_ns_device_result.mDesc.GetElementSpace());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_ms_ns_device_result.mDesc.GetElementSpaceSize());
    a_device_buf.ToDevice(a_ms_ks.mData.data());
    b_device_buf.ToDevice(b_ns_ks.mData.data());

--- a/example/26_contraction/contraction_scale_xdl_fp32.cpp
+++ b/example/26_contraction/contraction_scale_xdl_fp32.cpp
@@ -12,9 +12,9 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/utility/check_err.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/utility/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/utility/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -260,16 +260,16 @@ int main(int argc, char* argv[])
        e_ms_ns_lengths = {M0, M1, N0, N1};
        e_ms_ns_strides = {
-            std::stoi(argv[22]), std::stoi(argv[23]), std::stoi(argv[24]), std::stoi(argv[25])};
+            std::stoi(argv[18]), std::stoi(argv[19]), std::stoi(argv[20]), std::stoi(argv[21])};
-        scale = std::stof(argv[26]);
+        scale = std::stof(argv[22]);
    }
    else
    {
        printf("arg1: verification (0=no, 1=yes)\n");
        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
        printf("arg3: time kernel (0=no, 1=yes)\n");
-        printf("arg4 to 7: M0, M1, N0, N1, K0, K1\n");
+        printf("arg4 to 9: M0, M1, N0, N1, K0, K1\n");
        printf("arg10 to 13: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n");
        printf("arg14 to 17: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1\n");
        printf("arg18 to 21: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n");
@@ -307,9 +307,9 @@ int main(int argc, char* argv[])
        break;
    }
-    DeviceMem a_device_buf(sizeof(ADataType) * a_ms_ks.mDesc.GetElementSpace());
+    DeviceMem a_device_buf(sizeof(ADataType) * a_ms_ks.mDesc.GetElementSpaceSize());
-    DeviceMem b_device_buf(sizeof(BDataType) * b_ns_ks.mDesc.GetElementSpace());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_ns_ks.mDesc.GetElementSpaceSize());
-    DeviceMem e_device_buf(sizeof(EDataType) * e_ms_ns_device_result.mDesc.GetElementSpace());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_ms_ns_device_result.mDesc.GetElementSpaceSize());
    a_device_buf.ToDevice(a_ms_ks.mData.data());
    b_device_buf.ToDevice(b_ns_ks.mData.data());

--- a/example/27_layernorm/layernorm_blockwise.cpp
+++ b/example/27_layernorm/layernorm_blockwise.cpp
@@ -13,10 +13,10 @@
 #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
 #include "ck/library/utility/check_err.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/utility/device_memory.hpp"
-#include "ck/library/host_tensor/host_common_util.hpp"
+#include "ck/library/utility/host_common_util.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/utility/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
 using XDataType     = ck::half_t;
@@ -75,10 +75,10 @@ int main()
    gamma.GenerateTensorValue(GeneratorTensor_3<GammaDataType>{0.0, 1.0});
    beta.GenerateTensorValue(GeneratorTensor_3<BetaDataType>{0.0, 1.0});
-    DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpace());
+    DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
-    DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpace());
+    DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
-    DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpace());
+    DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize());
-    DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpace());
+    DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize());
    x_dev.ToDevice(x.mData.data());
    gamma_dev.ToDevice(gamma.mData.data());

--- a/example/28_grouped_gemm_bias/grouped_gemm_bias_xdl_fp16.cpp
+++ b/example/28_grouped_gemm_bias/grouped_gemm_bias_xdl_fp16.cpp
@@ -13,9 +13,9 @@
 #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
 #include "ck/library/utility/check_err.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/utility/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/utility/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 template <ck::index_t... Is>
@@ -34,13 +34,15 @@ using ADataType        = F16;
 using BDataType        = F16;
 using AccDataType      = F32;
 using CShuffleDataType = F16;
-using D0DataType       = F16;
+using DDataType        = F16;
-using DsDataType       = ck::Tuple<D0DataType>;
+using DsDataType       = ck::Tuple<DDataType>;
 using EDataType        = F16;
-using ALayout = Row;
+using ALayout  = Row;
-using BLayout = Col;
+using BLayout  = Col;
-using ELayout = Row;
+using DLayout  = Row;
+using DsLayout = ck::Tuple<DLayout>;
+using ELayout  = Row;
 using AElementOp   = PassThrough;
 using BElementOp   = PassThrough;
@@ -48,13 +50,13 @@ using CDEElementOp = Add;
 static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
-using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemmXdl
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl
    // clang-format off
-//######| ALayout| BLayout| ELayout|     AData|     BData|     AccData|         CShuffle|     DsData|     EData|           A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+//######| ALayout| BLayout| DsLayout| ELayout|     AData|     BData|     AccData|         CShuffle|     DsData|     EData|           A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-//######|        |        |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+//######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-//######|        |        |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+//######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-//######|        |        |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+//######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        < ALayout, BLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
 // clang-format on
 int main(int argc, char* argv[])
@@ -118,24 +120,24 @@ int main(int argc, char* argv[])
    std::vector<Tensor<ADataType>> a_tensors;
    std::vector<Tensor<BDataType>> b_tensors;
-    std::vector<Tensor<D0DataType>> d0_tensors;
+    std::vector<Tensor<DDataType>> d_tensors;
    std::vector<Tensor<EDataType>> e_host_tensors;
    std::vector<Tensor<EDataType>> e_device_tensors;
    a_tensors.reserve(group_count);
    b_tensors.reserve(group_count);
-    d0_tensors.reserve(group_count);
+    d_tensors.reserve(group_count);
    e_host_tensors.reserve(group_count);
    e_device_tensors.reserve(group_count);
    using DeviceMemPtr = std::unique_ptr<DeviceMem>;
-    std::vector<DeviceMemPtr> a_tensors_device, b_tensors_device, d0_tensors_device,
+    std::vector<DeviceMemPtr> a_tensors_device, b_tensors_device, d_tensors_device,
        e_tensors_device;
    a_tensors_device.reserve(group_count);
    b_tensors_device.reserve(group_count);
-    d0_tensors_device.reserve(group_count);
+    d_tensors_device.reserve(group_count);
    e_tensors_device.reserve(group_count);
    std::size_t flop = 0, num_btype = 0;
@@ -146,7 +148,7 @@ int main(int argc, char* argv[])
            gemm_descs[i].M_, gemm_descs[i].K_, gemm_descs[i].stride_A_, ALayout{})));
        b_tensors.push_back(Tensor<BDataType>(f_host_tensor_descriptor(
            gemm_descs[i].K_, gemm_descs[i].N_, gemm_descs[i].stride_B_, BLayout{})));
-        d0_tensors.push_back(Tensor<D0DataType>(f_host_tensor_descriptor(
+        d_tensors.push_back(Tensor<DDataType>(f_host_tensor_descriptor(
            gemm_descs[i].M_, gemm_descs[i].N_, gemm_descs[i].stride_Ds_[0], ELayout{})));
        e_host_tensors.push_back(Tensor<EDataType>(f_host_tensor_descriptor(
            gemm_descs[i].M_, gemm_descs[i].N_, gemm_descs[i].stride_C_, ELayout{})));
@@ -168,38 +170,38 @@ int main(int argc, char* argv[])
        case 1:
            a_tensors[i].GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
            b_tensors[i].GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
-            d0_tensors[i].GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+            d_tensors[i].GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
            break;
        case 2:
            a_tensors[i].GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
            b_tensors[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
-            d0_tensors[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+            d_tensors[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
            break;
        default:
            a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{});
            b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{});
-            d0_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{});
+            d_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{});
        }
    }
    for(std::size_t i = 0; i < gemm_descs.size(); i++)
    {
-        a_tensors_device.emplace_back(
+        a_tensors_device.emplace_back(std::make_unique<DeviceMem>(
-            std::make_unique<DeviceMem>(sizeof(ADataType) * a_tensors[i].mDesc.GetElementSpace()));
+            sizeof(ADataType) * a_tensors[i].mDesc.GetElementSpaceSize()));
-        b_tensors_device.emplace_back(
+        b_tensors_device.emplace_back(std::make_unique<DeviceMem>(
-            std::make_unique<DeviceMem>(sizeof(BDataType) * b_tensors[i].mDesc.GetElementSpace()));
+            sizeof(BDataType) * b_tensors[i].mDesc.GetElementSpaceSize()));
-        d0_tensors_device.emplace_back(std::make_unique<DeviceMem>(
+        d_tensors_device.emplace_back(std::make_unique<DeviceMem>(
-            sizeof(D0DataType) * d0_tensors[i].mDesc.GetElementSpace()));
+            sizeof(DDataType) * d_tensors[i].mDesc.GetElementSpaceSize()));
        e_tensors_device.emplace_back(std::make_unique<DeviceMem>(
-            sizeof(EDataType) * e_device_tensors[i].mDesc.GetElementSpace()));
+            sizeof(EDataType) * e_device_tensors[i].mDesc.GetElementSpaceSize()));
        a_tensors_device[i]->ToDevice(a_tensors[i].mData.data());
        b_tensors_device[i]->ToDevice(b_tensors[i].mData.data());
-        d0_tensors_device[i]->ToDevice(d0_tensors[i].mData.data());
+        d_tensors_device[i]->ToDevice(d_tensors[i].mData.data());
        p_a.push_back(a_tensors_device[i]->GetDeviceBuffer());
        p_b.push_back(b_tensors_device[i]->GetDeviceBuffer());
-        p_ds.push_back({d0_tensors_device[i]->GetDeviceBuffer()});
+        p_ds.push_back({d_tensors_device[i]->GetDeviceBuffer()});
        p_c.push_back(e_tensors_device[i]->GetDeviceBuffer());
    }
@@ -266,7 +268,7 @@ int main(int argc, char* argv[])
                for(int n = 0; n < gemm_descs[i].N_; ++n)
                {
                    cde_element_op(
-                        e_host_tensors[i](m, n), e_host_tensors[i](m, n), d0_tensors[i](m, n));
+                        e_host_tensors[i](m, n), e_host_tensors[i](m, n), d_tensors[i](m, n));
                }
            }

--- a/example/29_batched_gemm_multi_d/batched_gemm_bias_xdl_fp16.cpp
+++ b/example/29_batched_gemm_multi_d/batched_gemm_bias_xdl_fp16.cpp
@@ -10,9 +10,9 @@
 #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
 #include "ck/library/utility/check_err.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/utility/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/utility/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
 template <ck::index_t... Is>
@@ -37,7 +37,9 @@ using EDataType        = F16;
 using ALayout  = Row;
 using BLayout  = Col;
-using DELayout = Row;
+using DLayout  = Row;
+using DsLayout = ck::Tuple<DLayout>;
+using ELayout  = Row;
 using AElementOp   = PassThrough;
 using BElementOp   = PassThrough;
@@ -48,12 +50,12 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
 // static constexpr auto MNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
 // clang-format off
-using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmMultiDXdl
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmMultiD_Xdl
-//######| ALayout| BLayout| DELayout|     AData|     BData|     AccData|         CShuffle|     DsData|     EData|           A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+//######| ALayout| BLayout| DsLayout| ELayout|     AData|     BData|     AccData|         CShuffle|     DsData|     EData|           A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-//######|        |        |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+//######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-//######|        |        |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+//######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-//######|        |        |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+//######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        < ALayout, BLayout, DELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
 // clang-format on
 int main(int argc, char* argv[])
@@ -117,10 +119,10 @@ int main(int argc, char* argv[])
        f_host_tensor_descriptor(batch_count, K, N, stride_B, batch_stride_B, BLayout{}));
    Tensor<DDataType> d_g_m_n(
-        f_host_tensor_descriptor(batch_count, M, N, stride_D, batch_stride_D, DELayout{}));
+        f_host_tensor_descriptor(batch_count, M, N, stride_D, batch_stride_D, DLayout{}));
    Tensor<EDataType> e_g_m_n_device_result(
-        f_host_tensor_descriptor(batch_count, M, N, stride_E, batch_stride_E, DELayout{}));
+        f_host_tensor_descriptor(batch_count, M, N, stride_E, batch_stride_E, ELayout{}));
    std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl;
    std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl;
@@ -142,10 +144,10 @@ int main(int argc, char* argv[])
        break;
    }
-    DeviceMem a_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpace());
+    DeviceMem a_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpaceSize());
-    DeviceMem b_device_buf(sizeof(BDataType) * b_g_k_n.mDesc.GetElementSpace());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_g_k_n.mDesc.GetElementSpaceSize());
-    DeviceMem d_device_buf(sizeof(DDataType) * d_g_m_n.mDesc.GetElementSpace());
+    DeviceMem d_device_buf(sizeof(DDataType) * d_g_m_n.mDesc.GetElementSpaceSize());
-    DeviceMem c_device_buf(sizeof(EDataType) * e_g_m_n_device_result.mDesc.GetElementSpace());
+    DeviceMem c_device_buf(sizeof(EDataType) * e_g_m_n_device_result.mDesc.GetElementSpaceSize());
    a_device_buf.ToDevice(a_g_m_k.mData.data());
    b_device_buf.ToDevice(b_g_k_n.mData.data());
@@ -166,6 +168,7 @@ int main(int argc, char* argv[])
                                      M,
                                      N,
                                      K,
+                                      batch_count,
                                      stride_A,
                                      stride_B,
                                      {stride_D},
@@ -174,7 +177,6 @@ int main(int argc, char* argv[])
                                      batch_stride_B,
                                      {batch_stride_D},
                                      batch_stride_E,
-                                      batch_count,
                                      a_element_op,
                                      b_element_op,
                                      cde_element_op);
@@ -218,7 +220,7 @@ int main(int argc, char* argv[])
        auto ref_invoker      = ref_batched_gemm.MakeInvoker();
        Tensor<EDataType> e_g_m_n_host_result(
-            f_host_tensor_descriptor(batch_count, M, N, stride_E, batch_stride_E, DELayout{}));
+            f_host_tensor_descriptor(batch_count, M, N, stride_E, batch_stride_E, ELayout{}));
        auto ref_argument = ref_batched_gemm.MakeArgument(
            a_g_m_k, b_g_k_n, e_g_m_n_host_result, a_element_op, b_element_op, PassThrough{});

--- a/example/29_batched_gemm_multi_d/batched_gemm_xdl_fp16.cpp
+++ b/example/29_batched_gemm_multi_d/batched_gemm_xdl_fp16.cpp
@@ -10,9 +10,9 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/utility/check_err.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/utility/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/utility/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
 template <ck::index_t... Is>
@@ -33,9 +33,10 @@ using CShuffleDataType = F16;
 using DsDataType       = ck::Tuple<>;
 using EDataType        = F16;
-using ALayout = Row;
+using ALayout  = Row;
-using BLayout = Col;
+using BLayout  = Col;
-using ELayout = Row;
+using DsLayout = ck::Tuple<>;
+using ELayout  = Row;
 using AElementOp   = PassThrough;
 using BElementOp   = PassThrough;
@@ -46,12 +47,12 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
 // static constexpr auto MNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
 // clang-format off
-using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmMultiDXdl
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmMultiD_Xdl
-//######| ALayout| BLayout| ELayout|     AData|     BData|     AccData|         CShuffle|     DsData|     EData|           A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+//######| ALayout| BLayout| DsLayout| ELayout|     AData|     BData|     AccData|         CShuffle|     DsData|     EData|           A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-//######|        |        |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+//######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-//######|        |        |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+//######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-//######|        |        |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+//######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        < ALayout, BLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
 // clang-format on
 using ReferenceBatchedGemmInstance = ck::tensor_operation::host::
@@ -135,9 +136,9 @@ int main(int argc, char* argv[])
        break;
    }
-    DeviceMem a_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpace());
+    DeviceMem a_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpaceSize());
-    DeviceMem b_device_buf(sizeof(BDataType) * b_g_k_n.mDesc.GetElementSpace());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_g_k_n.mDesc.GetElementSpaceSize());
-    DeviceMem c_device_buf(sizeof(EDataType) * e_g_m_n_device_result.mDesc.GetElementSpace());
+    DeviceMem c_device_buf(sizeof(EDataType) * e_g_m_n_device_result.mDesc.GetElementSpaceSize());
    a_device_buf.ToDevice(a_g_m_k.mData.data());
    b_device_buf.ToDevice(b_g_k_n.mData.data());
@@ -157,6 +158,7 @@ int main(int argc, char* argv[])
                                      M,
                                      N,
                                      K,
+                                      batch_count,
                                      stride_A,
                                      stride_B,
                                      {},
@@ -165,7 +167,6 @@ int main(int argc, char* argv[])
                                      batch_stride_B,
                                      {},
                                      batch_stride_C,
-                                      batch_count,
                                      a_element_op,
                                      b_element_op,
                                      cde_element_op);

--- a/example/30_grouped_convnd_fwd_bias_relu/CMakeLists.txt
+++ b/example/30_grouped_convnd_fwd_bias_relu/CMakeLists.txt
+add_example_executable(example_grouped_convnd_fwd_bias_relu_xdl_fp16 grouped_convnd_fwd_bias_relu_xdl_fp16.cpp)
+target_link_libraries(example_grouped_convnd_fwd_bias_relu_xdl_fp16 PRIVATE utility)
--- a/example/30_grouped_convnd_fwd_bias_relu/README.md
+++ b/example/30_grouped_convnd_fwd_bias_relu/README.md
+```bash
+#arg1: verification (0=no, 1=yes)
+#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
+#arg3: time kernel (0=no, 1=yes)
+#Following arguments (depending on number of spatial dims):
+# N spatial dimensions
+# G, N, K, C,
+# <filter spatial dimensions>, (ie Y, X for 2D)
+# <input image spatial dimensions>, (ie Hi, Wi for 2D)
+# <strides>, (ie Sy, Sx for 2D)
+# <dilations>, (ie Dy, Dx for 2D)
+# <left padding>, (ie LeftPy, LeftPx for 2D)
+# <right padding>, (ie RightPy, RightPx for 2D)
+bin/example_grouped_convnd_fwd_bias_relu_xdl_fp16 1 1 1
+```
+Result (MI100)
+```
+in: dim 5, lengths {1, 128, 192, 71, 71}, strides {6912, 967872, 1, 13632, 192}
+wei: dim 5, lengths {1, 256, 192, 3, 3}, strides {192, 1728, 1, 576, 192}
+bias: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0}
+out: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 331776, 1, 9216, 256}
+launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
+Warm up 1 time
+Start running 10 times...
+Perf: 1.19215 ms, 123.112 TFlops, 279.827 GB/s, DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<256, 128, 256, 32, Default>
+```
--- a/example/30_grouped_convnd_fwd_bias_relu/grouped_convnd_fwd_bias_common.hpp
+++ b/example/30_grouped_convnd_fwd_bias_relu/grouped_convnd_fwd_bias_common.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <cstdlib>
+#include <iostream>
+#include <numeric>
+#include <type_traits>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
+// Prints the command-line usage text of the example to stdout.
+//
+// The first three lines describe the leading positional arguments
+// (verification, initialization, kernel timing); the remainder is whatever
+// ck::utils::conv::get_conv_param_parser_helper_msg() returns for the
+// convolution-parameter arguments.
+//
+// Marked inline because the definition lives in a header: a plain function
+// definition here would be emitted once per including translation unit and
+// break the link as soon as a second source file includes this header.
+inline void print_helper_msg()
+{
+    std::cout << "arg1: verification (0=no, 1=yes)\n"
+              << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
+              << "arg3: time kernel (0=no, 1=yes)\n"
+              << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
+}
+// Runs one grouped N-d forward convolution with a single additional "bias"
+// input on the device, prints timing figures, and optionally checks the
+// device result against a host reference.
+//
+// Template parameters:
+//   NDimSpatial             - number of spatial dimensions; the length/stride
+//                             arrays handed to the device op hold
+//                             NDimSpatial + 3 entries, the filter stride,
+//                             dilation and padding arrays hold NDimSpatial.
+//   InDataType, WeiDataType, OutDataType
+//                           - element types of the input, weight and output
+//                             tensors; the bias tensor also uses OutDataType.
+//   InElementOp, WeiElementOp, OutElementOp
+//                           - element-wise functors forwarded to the device op.
+//   DeviceConvNDFwdInstance - device operation type; it is default-constructed
+//                             and must provide MakeInvoker, MakeArgument,
+//                             IsSupportedArgument and GetTypeString.
+//
+// Parameters:
+//   do_verification - when true, a host reference is computed and compared
+//                     with the device output.
+//   init_method     - 0 leaves the host tensors as constructed, 1 fills them
+//                     with GeneratorTensor_2{-5, 5}, any other value fills them
+//                     with GeneratorTensor_3 (see print_helper_msg for the
+//                     user-facing description).
+//   time_kernel     - forwarded to StreamConfig.
+//   conv_param      - supplies filter strides, dilations, paddings and the
+//                     flop/byte counts used for the performance printout.
+//   in_g_n_c_wis_desc, wei_g_k_c_xs_desc, bias_g_n_k_wos_desc,
+//   out_g_n_k_wos_desc
+//                   - host descriptors of the input, weight, bias and output.
+//   in_element_op, wei_element_op, out_element_op
+//                   - functor instances passed to the device op; out_element_op
+//                     is also applied on the host during verification.
+//
+// Returns 0 when verification is skipped or passes, 1 when it fails.
+//
+// Throws std::runtime_error when a descriptor or parameter list holds more
+// entries than the fixed-size array it is copied into, or when the device op
+// reports the argument as unsupported.
+template <ck::index_t NDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename InElementOp,
+          typename WeiElementOp,
+          typename OutElementOp,
+          typename DeviceConvNDFwdInstance>
+int run_grouped_conv_fwd_bias(bool do_verification,
+                              int init_method,
+                              bool time_kernel,
+                              const ck::utils::conv::ConvParam& conv_param,
+                              const HostTensorDescriptor& in_g_n_c_wis_desc,
+                              const HostTensorDescriptor& wei_g_k_c_xs_desc,
+                              const HostTensorDescriptor& bias_g_n_k_wos_desc,
+                              const HostTensorDescriptor& out_g_n_k_wos_desc,
+                              const InElementOp& in_element_op,
+                              const WeiElementOp& wei_element_op,
+                              const OutElementOp& out_element_op)
+{
+    // Host-side tensors. out_host receives the reference result, out_device
+    // receives the data read back from the device.
+    Tensor<InDataType> in(in_g_n_c_wis_desc);
+    Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
+    Tensor<OutDataType> bias(bias_g_n_k_wos_desc);
+    Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
+    Tensor<OutDataType> out_device(out_g_n_k_wos_desc);
+    std::cout << "in: " << in.mDesc << std::endl;
+    std::cout << "wei: " << wei.mDesc << std::endl;
+    std::cout << "bias: " << bias.mDesc << std::endl;
+    std::cout << "out: " << out_host.mDesc << std::endl;
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
+        wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
+        bias.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
+        break;
+    default:
+        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
+        wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
+        bias.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5});
+    }
+    // Device buffers are sized from each descriptor's element space; only the
+    // three inputs are uploaded, the output buffer is written by the kernel.
+    DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
+    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
+    DeviceMem bias_device_buf(sizeof(OutDataType) * bias.mDesc.GetElementSpaceSize());
+    DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
+    in_device_buf.ToDevice(in.mData.data());
+    wei_device_buf.ToDevice(wei.mData.data());
+    bias_device_buf.ToDevice(bias.mData.data());
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> d_g_n_k_wos_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> d_g_n_k_wos_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
+    std::array<ck::index_t, NDimSpatial> input_left_pads{};
+    std::array<ck::index_t, NDimSpatial> input_right_pads{};
+    // Copies a runtime-sized container into one of the fixed-size arrays above.
+    // The array size is fixed by NDimSpatial, so a longer source would make
+    // std::copy write past the end of the array; reject that case explicitly.
+    auto copy = [](auto& x, auto& y) {
+        if(x.size() > y.size())
+        {
+            throw std::runtime_error(
+                "wrong! source has more elements than the fixed-size destination array");
+        }
+        std::copy(x.begin(), x.end(), y.begin());
+    };
+    copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
+    copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
+    copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
+    copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
+    copy(bias_g_n_k_wos_desc.GetLengths(), d_g_n_k_wos_lengths);
+    copy(bias_g_n_k_wos_desc.GetStrides(), d_g_n_k_wos_strides);
+    copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
+    copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
+    copy(conv_param.conv_filter_strides_, conv_filter_strides);
+    copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
+    copy(conv_param.input_left_pads_, input_left_pads);
+    copy(conv_param.input_right_pads_, input_right_pads);
+    // Run the convolution on the device. The bias is passed as the single
+    // entry of the one-element pointer/length/stride arrays.
+    auto conv     = DeviceConvNDFwdInstance{};
+    auto invoker  = conv.MakeInvoker();
+    auto argument = conv.MakeArgument(
+        in_device_buf.GetDeviceBuffer(),
+        wei_device_buf.GetDeviceBuffer(),
+        std::array<const void*, 1>{bias_device_buf.GetDeviceBuffer()},
+        out_device_buf.GetDeviceBuffer(),
+        a_g_n_c_wis_lengths,
+        a_g_n_c_wis_strides,
+        b_g_k_c_xs_lengths,
+        b_g_k_c_xs_strides,
+        std::array<std::array<ck::index_t, NDimSpatial + 3>, 1>{{d_g_n_k_wos_lengths}},
+        std::array<std::array<ck::index_t, NDimSpatial + 3>, 1>{{d_g_n_k_wos_strides}},
+        e_g_n_k_wos_lengths,
+        e_g_n_k_wos_strides,
+        conv_filter_strides,
+        conv_filter_dilations,
+        input_left_pads,
+        input_right_pads,
+        in_element_op,
+        wei_element_op,
+        out_element_op);
+    if(!conv.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_conv with the specified compilation parameters does "
+            "not support this Conv problem");
+    }
+    float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+    std::size_t flop      = conv_param.GetFlops();
+    std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
+    // avg_time is printed in ms, hence the 1e9 / 1e6 divisors for TFlops / GB/s.
+    float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+    float gb_per_sec = num_btype / 1.E6 / avg_time;
+    std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << conv.GetTypeString() << std::endl;
+    if(do_verification)
+    {
+        using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+        // The reference convolution writes its plain result (PassThrough
+        // output op) into c_host; the output element-wise op is applied
+        // afterwards on the host.
+        Tensor<OutDataType> c_host(out_g_n_k_wos_desc);
+        auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
+                                                                     InDataType,
+                                                                     WeiDataType,
+                                                                     OutDataType,
+                                                                     InElementOp,
+                                                                     WeiElementOp,
+                                                                     PassThrough>();
+        auto ref_invoker  = ref_conv.MakeInvoker();
+        auto ref_argument = ref_conv.MakeArgument(in,
+                                                  wei,
+                                                  c_host,
+                                                  conv_param.conv_filter_strides_,
+                                                  conv_param.conv_filter_dilations_,
+                                                  conv_param.input_left_pads_,
+                                                  conv_param.input_right_pads_,
+                                                  in_element_op,
+                                                  wei_element_op,
+                                                  PassThrough{});
+        ref_invoker.Run(ref_argument);
+        // TODO: implement elementwise operation for host
+        // Until then, combine the reference result with the bias element by
+        // element using the same out_element_op the device op received.
+        out_host.ForEach(
+            [&](auto&, auto idx) { out_element_op(out_host(idx), c_host(idx), bias(idx)); });
+        out_device_buf.FromDevice(out_device.mData.data());
+        // check_err yields true on a match; map that to exit code 0.
+        return ck::utils::check_err(
+                   out_device.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f)
+                   ? 0
+                   : 1;
+    }
+    return 0;
+}
--- a/example/30_grouped_convnd_fwd_bias_relu/grouped_convnd_fwd_bias_relu_xdl_fp16.cpp
+++ b/example/30_grouped_convnd_fwd_bias_relu/grouped_convnd_fwd_bias_relu_xdl_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include "grouped_convnd_fwd_bias_common.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+using InDataType       = ck::half_t; // element type of the input tensor
+using WeiDataType      = ck::half_t; // element type of the weight tensor
+using AccDataType      = float; // handed to the device op below; presumably the accumulator type
+using CShuffleDataType = ck::half_t; // handed to the device op below right after AccDataType
+using BiasDataType     = ck::half_t; // element type of the bias; wrapped in ck::Tuple<> below
+using OutDataType      = ck::half_t; // element type of the output tensor
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>; // shorthand for the integer sequences in the tuning list below
+using InElementOp  = ck::tensor_operation::element_wise::PassThrough; // op applied to the input
+using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; // op applied to the weights
+using OutElementOp = ck::tensor_operation::element_wise::AddRelu; // presumably relu(conv + bias)
+static constexpr auto ConvSpec =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; // ConvForwardSpecialization argument of the instance below
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; // GemmSpecialization argument of the instance below
+template <ck::index_t NDimSpatial, // number of spatial dimensions; main instantiates 1, 2 and 3
+          typename InLayout,
+          typename WeiLayout,
+          typename BiasLayout,
+          typename OutLayout>
+using DeviceGroupedConvNDFwdInstance = // one fixed fp16 tuning configuration, parameterized only on rank and layouts
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
+        NDimSpatial,
+        InLayout,
+        WeiLayout,
+        ck::Tuple<BiasLayout>, // bias is the only entry in the layout tuple
+        OutLayout,
+        InDataType,
+        WeiDataType,
+        AccDataType,
+        CShuffleDataType,
+        ck::Tuple<BiasDataType>, // bias is the only entry in the data-type tuple
+        OutDataType,
+        InElementOp,
+        WeiElementOp,
+        OutElementOp,
+        ConvSpec,    // ConvForwardSpecialization
+        GemmSpec,    // GemmSpecialization
+        1,           // presumably NumGemmKPrefetchStage -- TODO confirm against the device op header
+        256,         // BlockSize
+        128,         // MPerBlock
+        256,         // NPerBlock
+        32,          // KPerBlock
+        8,           // AK1
+        8,           // BK1
+        32,          // MPerXdl
+        32,          // NPerXdl
+        2,           // MXdlPerWave
+        4,           // NXdlPerWave
+        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
+        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
+        2,           // ABlockTransferSrcVectorDim
+        8,           // ABlockTransferSrcScalarPerVector
+        8,           // ABlockTransferDstScalarPerVector_AK1
+        1,           // ABlockLdsExtraM
+        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
+        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
+        2,           // BBlockTransferSrcVectorDim
+        8,           // BBlockTransferSrcScalarPerVector
+        8,           // BBlockTransferDstScalarPerVector_BK1
+        1,           // BBlockLdsExtraN
+        1, // presumably CShuffleMXdlPerWavePerShuffle -- TODO confirm against the device op header
+        1, // presumably CShuffleNXdlPerWavePerShuffle -- TODO confirm against the device op header
+        S<1, 32, 1, 8>, // presumably the C-shuffle block-transfer thread cluster lengths -- TODO confirm
+        8>; // presumably the C-shuffle block-transfer scalars per vector -- TODO confirm
+// Example driver: grouped N-D forward convolution with bias and the AddRelu output op (fp16).
+//
+// Command line:
+//   (no arguments)                 -> use the built-in 2D problem defined below
+//   <verify> <init> <time>         -> override only the three run flags
+//   <verify> <init> <time> <ndim> <conv params...>
+//                                  -> user-defined problem; the remaining arguments are handed
+//                                     to ck::utils::conv::parse_conv_param (presumably read
+//                                     starting at argv[5] -- confirm against that helper)
+//
+// For every supported rank the host descriptors list their lengths as (G, N, C|K, spatial...),
+// while their strides describe the memory order (N, spatial..., G, C|K): the channel dimension
+// has stride 1, the group dimension has stride C (or K), and the innermost spatial dimension
+// has stride G * C (or G * K).
+//
+// Returns the result of run_grouped_conv_fwd_bias for 1D/2D/3D problems, 1 when too few
+// arguments are given, and 0 for any other number of spatial dimensions.
+int main(int argc, char* argv[])
+{
+    namespace ctc = ck::tensor_layout::convolution;
+    print_helper_msg();
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+    ck::utils::conv::ConvParam conv_param{
+        2, 2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc < 5)
+    {
+        // argc is 2 or 3 here: there are not enough arguments for the three run flags, let
+        // alone num_dim_spatial in argv[4]. Bail out rather than pass a null / out-of-range
+        // argv entry to std::stoi.
+        return 1;
+    }
+    else
+    {
+        do_verification                   = std::stoi(argv[1]);
+        init_method                       = std::stoi(argv[2]);
+        time_kernel                       = std::stoi(argv[3]);
+        const ck::index_t num_dim_spatial = std::stoi(argv[4]);
+        conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
+    }
+    const auto in_element_op  = InElementOp{};
+    const auto wei_element_op = WeiElementOp{};
+    const auto out_element_op = OutElementOp{};
+    if(conv_param.num_dim_spatial_ == 1)
+    {
+        using InLayout   = ctc::G_NW_C;
+        using WeiLayout  = ctc::G_K_X_C;
+        using BiasLayout = ctc::G_NW_K;
+        using OutLayout  = ctc::G_NW_K;
+        const auto in_g_n_c_wis_desc = HostTensorDescriptor(
+            {conv_param.G_, conv_param.N_, conv_param.C_, conv_param.input_spatial_lengths_[0]},
+            {
+                conv_param.C_,                                                        // g
+                conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n
+                1,                                                                    // c
+                conv_param.G_ * conv_param.C_                                         // wi
+            });
+        const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
+            {conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]},
+            {
+                conv_param.C_,                                                         // g
+                conv_param.filter_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // k
+                1,                                                                     // c
+                conv_param.G_ * conv_param.C_                                          // x
+            });
+        // Zero strides on n and wo: every (n, wo) position reads the same K bias values of
+        // its group.
+        const auto bias_g_n_k_wos_desc = HostTensorDescriptor(
+            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
+            {
+                conv_param.K_, // g
+                0,             // n
+                1,             // k
+                0              // wo
+            });
+        const auto out_g_n_k_wos_desc = HostTensorDescriptor(
+            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
+            {
+                conv_param.K_,                                                         // g
+                conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
+                1,                                                                     // k
+                conv_param.G_ * conv_param.K_                                          // wo
+            });
+        return run_grouped_conv_fwd_bias<
+            1,
+            InDataType,
+            WeiDataType,
+            OutDataType,
+            InElementOp,
+            WeiElementOp,
+            OutElementOp,
+            DeviceGroupedConvNDFwdInstance<1, InLayout, WeiLayout, BiasLayout, OutLayout>>(
+            do_verification,
+            init_method,
+            time_kernel,
+            conv_param,
+            in_g_n_c_wis_desc,
+            wei_g_k_c_xs_desc,
+            bias_g_n_k_wos_desc,
+            out_g_n_k_wos_desc,
+            in_element_op,
+            wei_element_op,
+            out_element_op);
+    }
+    else if(conv_param.num_dim_spatial_ == 2)
+    {
+        using InLayout   = ctc::G_NHW_C;
+        using WeiLayout  = ctc::G_K_YX_C;
+        using BiasLayout = ctc::G_NHW_K;
+        using OutLayout  = ctc::G_NHW_K;
+        const auto in_g_n_c_wis_desc = HostTensorDescriptor(
+            {conv_param.G_,
+             conv_param.N_,
+             conv_param.C_,
+             conv_param.input_spatial_lengths_[0],
+             conv_param.input_spatial_lengths_[1]},
+            {
+                // The group stride is the channel count, matching the 1D input descriptor and
+                // the weight/output descriptors; it must not depend on the output height.
+                conv_param.C_, // g
+                conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
+                    conv_param.G_ * conv_param.C_,                                    // n
+                1,                                                                    // c
+                conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi
+                conv_param.G_ * conv_param.C_                                         // wi
+            });
+        const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
+            {conv_param.G_,
+             conv_param.K_,
+             conv_param.C_,
+             conv_param.filter_spatial_lengths_[0],
+             conv_param.filter_spatial_lengths_[1]},
+            {
+                conv_param.C_, // g
+                conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] *
+                    conv_param.G_ * conv_param.C_,                                     // k
+                1,                                                                     // c
+                conv_param.filter_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // y
+                conv_param.G_ * conv_param.C_                                          // x
+            });
+        // Zero strides on n, ho and wo: every (n, ho, wo) position reads the same K bias
+        // values of its group.
+        const auto bias_g_n_k_wos_desc =
+            HostTensorDescriptor({conv_param.G_,
+                                  conv_param.N_,
+                                  conv_param.K_,
+                                  conv_param.output_spatial_lengths_[0],
+                                  conv_param.output_spatial_lengths_[1]},
+                                 {
+                                     conv_param.K_, // g
+                                     0,             // n
+                                     1,             // k
+                                     0,             // ho
+                                     0              // wo
+                                 });
+        const auto out_g_n_k_wos_desc = HostTensorDescriptor(
+            {conv_param.G_,
+             conv_param.N_,
+             conv_param.K_,
+             conv_param.output_spatial_lengths_[0],
+             conv_param.output_spatial_lengths_[1]},
+            {
+                conv_param.K_, // g
+                conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
+                    conv_param.G_ * conv_param.K_,                                     // n
+                1,                                                                     // k
+                conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
+                conv_param.G_ * conv_param.K_                                          // wo
+            });
+        return run_grouped_conv_fwd_bias<
+            2,
+            InDataType,
+            WeiDataType,
+            OutDataType,
+            InElementOp,
+            WeiElementOp,
+            OutElementOp,
+            DeviceGroupedConvNDFwdInstance<2, InLayout, WeiLayout, BiasLayout, OutLayout>>(
+            do_verification,
+            init_method,
+            time_kernel,
+            conv_param,
+            in_g_n_c_wis_desc,
+            wei_g_k_c_xs_desc,
+            bias_g_n_k_wos_desc,
+            out_g_n_k_wos_desc,
+            in_element_op,
+            wei_element_op,
+            out_element_op);
+    }
+    else if(conv_param.num_dim_spatial_ == 3)
+    {
+        using InLayout   = ctc::G_NDHW_C;
+        using WeiLayout  = ctc::G_K_ZYX_C;
+        using BiasLayout = ctc::G_NDHW_K;
+        using OutLayout  = ctc::G_NDHW_K;
+        const auto in_g_n_c_wis_desc = HostTensorDescriptor(
+            {conv_param.G_,
+             conv_param.N_,
+             conv_param.C_,
+             conv_param.input_spatial_lengths_[0],
+             conv_param.input_spatial_lengths_[1],
+             conv_param.input_spatial_lengths_[2]},
+            {
+                // The group stride is the channel count, matching the 1D input descriptor and
+                // the weight/output descriptors; it must not depend on the output depth.
+                conv_param.C_, // g
+                conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
+                    conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // n
+                1,                                                                        // c
+                conv_param.input_spatial_lengths_[1] * conv_param.input_spatial_lengths_[2] *
+                    conv_param.G_ * conv_param.C_,                                    // di
+                conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi
+                conv_param.G_ * conv_param.C_                                         // wi
+            });
+        const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
+            {conv_param.G_,
+             conv_param.K_,
+             conv_param.C_,
+             conv_param.filter_spatial_lengths_[0],
+             conv_param.filter_spatial_lengths_[1],
+             conv_param.filter_spatial_lengths_[2]},
+            {
+                conv_param.C_, // g
+                conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] *
+                    conv_param.filter_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // k
+                1,                                                                         // c
+                conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
+                    conv_param.G_ * conv_param.C_,                                     // z
+                conv_param.filter_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // y
+                conv_param.G_ * conv_param.C_                                          // x
+            });
+        // Zero strides on n, do, ho and wo: every (n, do, ho, wo) position reads the same K
+        // bias values of its group.
+        const auto bias_g_n_k_wos_desc =
+            HostTensorDescriptor({conv_param.G_,
+                                  conv_param.N_,
+                                  conv_param.K_,
+                                  conv_param.output_spatial_lengths_[0],
+                                  conv_param.output_spatial_lengths_[1],
+                                  conv_param.output_spatial_lengths_[2]},
+                                 {
+                                     conv_param.K_, // g
+                                     0,             // n
+                                     1,             // k
+                                     0,             // do
+                                     0,             // ho
+                                     0              // wo
+                                 });
+        const auto out_g_n_k_wos_desc = HostTensorDescriptor(
+            {conv_param.G_,
+             conv_param.N_,
+             conv_param.K_,
+             conv_param.output_spatial_lengths_[0],
+             conv_param.output_spatial_lengths_[1],
+             conv_param.output_spatial_lengths_[2]},
+            {
+                conv_param.K_, // g
+                conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
+                    conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // n
+                1,                                                                         // k
+                conv_param.output_spatial_lengths_[1] * conv_param.output_spatial_lengths_[2] *
+                    conv_param.G_ * conv_param.K_,                                     // do
+                conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho
+                conv_param.G_ * conv_param.K_                                          // wo
+            });
+        return run_grouped_conv_fwd_bias<
+            3,
+            InDataType,
+            WeiDataType,
+            OutDataType,
+            InElementOp,
+            WeiElementOp,
+            OutElementOp,
+            DeviceGroupedConvNDFwdInstance<3, InLayout, WeiLayout, BiasLayout, OutLayout>>(
+            do_verification,
+            init_method,
+            time_kernel,
+            conv_param,
+            in_g_n_c_wis_desc,
+            wei_g_k_c_xs_desc,
+            bias_g_n_k_wos_desc,
+            out_g_n_k_wos_desc,
+            in_element_op,
+            wei_element_op,
+            out_element_op);
+    }
+    // NOTE(review): any other num_dim_spatial_ runs nothing and still reports success; kept
+    // as-is so the exit code seen by existing callers does not change.
+    return 0;
+}
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -8,7 +8,7 @@ add_custom_target(examples)
 function(add_example_executable EXAMPLE_NAME FILE_NAME)
    message("adding example ${EXAMPLE_NAME}")
    add_executable(${EXAMPLE_NAME} ${FILE_NAME})
-    target_link_libraries(${EXAMPLE_NAME} PRIVATE host_tensor)
+    target_link_libraries(${EXAMPLE_NAME} PRIVATE utility)
    add_test(NAME ${EXAMPLE_NAME} COMMAND $<TARGET_FILE:${EXAMPLE_NAME}> ${ARGN})
    add_dependencies(examples ${EXAMPLE_NAME})
    add_dependencies(check ${EXAMPLE_NAME})
@@ -17,7 +17,7 @@ endfunction(add_example_executable EXAMPLE_NAME)
 function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME)
    message("adding example ${EXAMPLE_NAME}")
    add_executable(${EXAMPLE_NAME} ${FILE_NAME})
-    target_link_libraries(${EXAMPLE_NAME} PRIVATE host_tensor)
+    target_link_libraries(${EXAMPLE_NAME} PRIVATE utility)
    add_dependencies(examples ${EXAMPLE_NAME})
 endfunction(add_example_executable_no_testing EXAMPLE_NAME)
@@ -25,26 +25,23 @@ add_subdirectory(01_gemm)
 add_subdirectory(02_gemm_bilinear)
 add_subdirectory(03_gemm_bias_relu)
 add_subdirectory(04_gemm_add_add_fastgelu)
-add_subdirectory(06_conv2d_fwd_bias_relu)
-add_subdirectory(07_conv2d_fwd_bias_relu_add)
 add_subdirectory(09_convnd_fwd)
-add_subdirectory(10_conv2d_bwd_data)
-add_subdirectory(11_conv2d_bwd_weight)
 add_subdirectory(12_reduce)
 add_subdirectory(13_pool2d_fwd)
 add_subdirectory(14_gemm_xdl_requant_relu_requant)
 add_subdirectory(15_grouped_gemm)
 add_subdirectory(16_gemm_reduce)
-add_subdirectory(17_convnd_bwd_data_xdl)
+add_subdirectory(17_convnd_bwd_data)
 add_subdirectory(18_batched_gemm_reduce)
 add_subdirectory(19_binary_elementwise)
-add_subdirectory(20_convnd_bwd_weight_xdl)
+add_subdirectory(20_convnd_bwd_weight)
 add_subdirectory(21_gemm_layernorm)
 add_subdirectory(22_cgemm)
 add_subdirectory(23_softmax)
-add_subdirectory(24_batched_gemm_c_permute)
+add_subdirectory(24_batched_gemm_e_permute)
-add_subdirectory(25_gemm_bias_c_permute)
+add_subdirectory(25_gemm_bias_e_permute)
 add_subdirectory(26_contraction)
 add_subdirectory(27_layernorm)
 add_subdirectory(28_grouped_gemm_bias)
 add_subdirectory(29_batched_gemm_multi_d)
\ No newline at end of file
+add_subdirectory(30_grouped_convnd_fwd_bias_relu)
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -146,7 +146,7 @@
 // workaround: verification failure, due to compiler regression, for conv bwd-data fp16 using some
 // tuning parameter
-#define CK_WORKAROUND_SWDEV_325164 1
+#define CK_WORKAROUND_SWDEV_325164 0
 namespace ck {

--- a/include/ck/device_utility/device_prop.hpp
+++ b/include/ck/device_utility/device_prop.hpp