Use new utilities to shorten code

e4e99a49 · Po-Yen, Chen · 7acbf104 · e4e99a49 · e4e99a49 · e4e99a49
Commit e4e99a49 authored Sep 22, 2022 by Po-Yen, Chen
20 changed files
--- a/profiler/include/profile_conv_bwd_weight_impl.hpp
+++ b/profiler/include/profile_conv_bwd_weight_impl.hpp
@@ -3,25 +3,26 @@

 #pragma once

-#include "ck/ck.hpp"
 #include <iomanip>
 #include <iostream>
 #include <typeinfo>

 #include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+
+#include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 #include "ck/library/tensor_operation_instance/gpu/convolution_backward_weight.hpp"

+#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp"
 #include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/utility/convolution_parameter.hpp"
-#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp"

 namespace ck {
 namespace profiler {
@@ -30,16 +31,16 @@ template <typename DataType>
 void show_data_nhwc_layout(Tensor<DataType>& nhwc)
 {
    std::cout << "[";
-    for(int n = 0; n < ck::type_convert<int>(nhwc.mDesc.GetLengths()[0]); n++)
+    for(int n = 0; n < ck::type_convert<int>(nhwc.GetLengths()[0]); n++)
    {
        std::cout << "[";
-        for(int hi = 0; hi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[2]); hi++)
+        for(int hi = 0; hi < ck::type_convert<int>(nhwc.GetLengths()[2]); hi++)
        {
            std::cout << "[";
-            for(int wi = 0; wi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[3]); wi++)
+            for(int wi = 0; wi < ck::type_convert<int>(nhwc.GetLengths()[3]); wi++)
            {
                std::cout << "[";
-                for(int c = 0; c < ck::type_convert<int>(nhwc.mDesc.GetLengths()[1]); c++)
+                for(int c = 0; c < ck::type_convert<int>(nhwc.GetLengths()[1]); c++)
                {
                    std::cout << static_cast<float>(nhwc(n, c, hi, wi)) << "  ";
                }
@@ -88,9 +89,9 @@ bool profile_conv_bwd_weight_impl(int do_verification,
    Tensor<WeiDataType> weight_device_result(wei_g_k_c_xs_desc);
    Tensor<OutDataType> output(out_g_n_k_wos_desc);

-    std::cout << "input: " << input.mDesc << std::endl;
-    std::cout << "weight: " << weight_host_result.mDesc << std::endl;
-    std::cout << "output: " << output.mDesc << std::endl;
+    std::cout << "input: " << input.GetDesc() << std::endl;
+    std::cout << "weight: " << weight_host_result.GetDesc() << std::endl;
+    std::cout << "output: " << output.GetDesc() << std::endl;

    switch(init_method)
    {
@@ -104,13 +105,12 @@ bool profile_conv_bwd_weight_impl(int do_verification,
        output.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5});
    }

-    DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpaceSize());
-    DeviceMem wei_device_buf(sizeof(WeiDataType) *
-                             weight_device_result.mDesc.GetElementSpaceSize());
-    DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpaceSize());
+    DeviceMem in_device_buf(input.GetMemorySize());
+    DeviceMem wei_device_buf(weight_device_result.GetMemorySize());
+    DeviceMem out_device_buf(output.GetMemorySize());

-    in_device_buf.ToDevice(input.mData.data());
-    out_device_buf.ToDevice(output.mData.data());
+    in_device_buf.ToDevice(input.data());
+    out_device_buf.ToDevice(output.data());

    if(do_verification)
    {
@@ -165,10 +165,9 @@ bool profile_conv_bwd_weight_impl(int do_verification,

    for(auto& op_ptr : op_ptrs)
    {
-        auto argument_ptr =
-            op_ptr->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
-                                        static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
-                                        static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+        auto argument_ptr = op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
+                                                        wei_device_buf.GetDeviceBuffer(),
+                                                        out_device_buf.GetDeviceBuffer(),
                                                        conv_param.N_,
                                                        conv_param.K_,
                                                        conv_param.C_,
@@ -215,10 +214,9 @@ bool profile_conv_bwd_weight_impl(int do_verification,

            if(do_verification)
            {
-                wei_device_buf.FromDevice(weight_device_result.mData.data());
+                wei_device_buf.FromDevice(weight_device_result.data());

-                bool pass =
-                    ck::utils::check_err(weight_host_result.mData, weight_device_result.mData);
+                bool pass = ck::utils::check_err(weight_host_result, weight_device_result);

                if(!pass)
                {

--- a/profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp
+++ b/profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp
@@ -4,15 +4,16 @@
 #pragma once

 #include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation_add.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp"
+#include "ck/library/utility/literals.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -66,21 +67,21 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
    const ck::index_t Ho = output_spatial_lengths[0];
    const ck::index_t Wo = output_spatial_lengths[1];

+    using namespace ck::literals;
+
    auto f_host_tensor_descriptor =
        [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) {
-            if constexpr(is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
-                         is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
-                         is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
+            if constexpr(is_same_v<decltype(layout), ck::tensor_layout::convolution::NCHW> ||
+                         is_same_v<decltype(layout), ck::tensor_layout::convolution::KCYX> ||
+                         is_same_v<decltype(layout), ck::tensor_layout::convolution::NKHW>)
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
-                                            std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
+                return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz});
            }
-            else if constexpr(is_same<decltype(layout), tensor_layout::convolution::NHWC>::value ||
-                              is_same<decltype(layout), tensor_layout::convolution::KYXC>::value ||
-                              is_same<decltype(layout), tensor_layout::convolution::NHWK>::value)
+            else if constexpr(is_same_v<decltype(layout), tensor_layout::convolution::NHWC> ||
+                              is_same_v<decltype(layout), tensor_layout::convolution::KYXC> ||
+                              is_same_v<decltype(layout), tensor_layout::convolution::NHWK>)
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
-                                            std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
+                return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
            }
        };

@@ -92,17 +93,16 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
        f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));

    // bias: assume contiguous 1d vector
-    Tensor<OutDataType> bias_k(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(K)})));
+    Tensor<OutDataType> bias_k(HostTensorDescriptor({K}));

    // residual: assume same layout as output tensor
    Tensor<OutDataType> resi_n_k_ho_wo(f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));

-    std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
-    std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
-    std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl;
-    std::cout << "bias_k: " << bias_k.mDesc << std::endl;
-    std::cout << "resi_n_k_ho_wo: " << resi_n_k_ho_wo.mDesc << std::endl;
+    std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.GetDesc() << std::endl;
+    std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.GetDesc() << std::endl;
+    std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.GetDesc() << std::endl;
+    std::cout << "bias_k: " << bias_k.GetDesc() << std::endl;
+    std::cout << "resi_n_k_ho_wo: " << resi_n_k_ho_wo.GetDesc() << std::endl;

    switch(init_method)
    {
@@ -157,17 +157,16 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
        ref_invoker.Run(ref_argument);
    }

-    DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpaceSize());
-    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpaceSize());
-    DeviceMem out_device_buf(sizeof(OutDataType) *
-                             out_n_k_ho_wo_device_result.mDesc.GetElementSpaceSize());
-    DeviceMem bias_device_buf(sizeof(OutDataType) * bias_k.mDesc.GetElementSpaceSize());
-    DeviceMem resi_device_buf(sizeof(OutDataType) * resi_n_k_ho_wo.mDesc.GetElementSpaceSize());
+    DeviceMem in_device_buf(in_n_c_hi_wi.GetMemorySize());
+    DeviceMem wei_device_buf(wei_k_c_y_x.GetMemorySize());
+    DeviceMem out_device_buf(out_n_k_ho_wo_device_result.GetMemorySize());
+    DeviceMem bias_device_buf(bias_k.GetMemorySize());
+    DeviceMem resi_device_buf(resi_n_k_ho_wo.GetMemorySize());

-    in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
-    wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
-    bias_device_buf.ToDevice(bias_k.mData.data());
-    resi_device_buf.ToDevice(resi_n_k_ho_wo.mData.data());
+    in_device_buf.ToDevice(in_n_c_hi_wi.data());
+    wei_device_buf.ToDevice(wei_k_c_y_x.data());
+    bias_device_buf.ToDevice(bias_k.data());
+    resi_device_buf.ToDevice(resi_n_k_ho_wo.data());

    using DeviceConvFwdBiasReluAddPtr = ck::tensor_operation::device::
        DeviceConvFwdBiasActivationAddPtr<InElementOp, WeiElementOp, OutElementOp>;
@@ -196,12 +195,11 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
    // profile device Conv instances
    for(auto& op_ptr : op_ptrs)
    {
-        auto argument_ptr = op_ptr->MakeArgumentPointer(
-            static_cast<const InDataType*>(in_device_buf.GetDeviceBuffer()),
-            static_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
-            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-            static_cast<const OutDataType*>(bias_device_buf.GetDeviceBuffer()),
-            static_cast<const OutDataType*>(resi_device_buf.GetDeviceBuffer()),
+        auto argument_ptr = op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
+                                                        wei_device_buf.GetDeviceBuffer(),
+                                                        out_device_buf.GetDeviceBuffer(),
+                                                        bias_device_buf.GetDeviceBuffer(),
+                                                        resi_device_buf.GetDeviceBuffer(),
                                                        N,
                                                        K,
                                                        C,
@@ -225,7 +223,7 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
            float ave_time =
                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

-            std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
+            std::size_t flop = 2_uz * N * K * Ho * Wo * C * Y * X;

            std::size_t num_btype =
                sizeof(InDataType) * (N * C * Hi * Wi) + sizeof(WeiDataType) * (K * C * Y * X) +
@@ -249,22 +247,19 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,

            if(do_verification)
            {
-                out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
+                out_device_buf.FromDevice(out_n_k_ho_wo_device_result.data());

-                ck::utils::check_err(out_n_k_ho_wo_device_result.mData,
-                                     out_n_k_ho_wo_host_result.mData);
+                ck::utils::check_err(out_n_k_ho_wo_device_result, out_n_k_ho_wo_host_result);

                if(do_log)
                {
-                    LogRangeAsType<float>(std::cout << "in : ", in_n_c_hi_wi.mData, ",")
-                        << std::endl;
-                    LogRangeAsType<float>(std::cout << "wei: ", wei_k_c_y_x.mData, ",")
-                        << std::endl;
+                    LogRangeAsType<float>(std::cout << "in : ", in_n_c_hi_wi, ",") << std::endl;
+                    LogRangeAsType<float>(std::cout << "wei: ", wei_k_c_y_x, ",") << std::endl;
                    LogRangeAsType<float>(
-                        std::cout << "out_host  : ", out_n_k_ho_wo_host_result.mData, ",")
+                        std::cout << "out_host  : ", out_n_k_ho_wo_host_result, ",")
                        << std::endl;
                    LogRangeAsType<float>(
-                        std::cout << "out_device: ", out_n_k_ho_wo_device_result.mData, ",")
+                        std::cout << "out_device: ", out_n_k_ho_wo_device_result, ",")
                        << std::endl;
                }
            }

--- a/profiler/include/profile_conv_fwd_bias_relu_impl.hpp
+++ b/profiler/include/profile_conv_fwd_bias_relu_impl.hpp
@@ -4,15 +4,16 @@
 #pragma once

 #include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp"
+#include "ck/library/utility/literals.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -66,21 +67,21 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
    const ck::index_t Ho = output_spatial_lengths[0];
    const ck::index_t Wo = output_spatial_lengths[1];

+    using namespace ck::literals;
+
    auto f_host_tensor_descriptor =
        [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) {
-            if constexpr(is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
-                         is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
-                         is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
+            if constexpr(is_same_v<decltype(layout), ck::tensor_layout::convolution::NCHW> ||
+                         is_same_v<decltype(layout), ck::tensor_layout::convolution::KCYX> ||
+                         is_same_v<decltype(layout), ck::tensor_layout::convolution::NKHW>)
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
-                                            std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
+                return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz});
            }
-            else if constexpr(is_same<decltype(layout), tensor_layout::convolution::NHWC>::value ||
-                              is_same<decltype(layout), tensor_layout::convolution::KYXC>::value ||
-                              is_same<decltype(layout), tensor_layout::convolution::NHWK>::value)
+            else if constexpr(is_same_v<decltype(layout), tensor_layout::convolution::NHWC> ||
+                              is_same_v<decltype(layout), tensor_layout::convolution::KYXC> ||
+                              is_same_v<decltype(layout), tensor_layout::convolution::NHWK>)
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
-                                            std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
+                return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
            }
        };

@@ -92,13 +93,12 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
        f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));

    // bias: assume contiguous 1d vector
-    Tensor<OutDataType> bias_k(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(K)})));
+    Tensor<OutDataType> bias_k(HostTensorDescriptor({K}));

-    std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
-    std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
-    std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl;
-    std::cout << "bias_k: " << bias_k.mDesc << std::endl;
+    std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.GetDesc() << std::endl;
+    std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.GetDesc() << std::endl;
+    std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.GetDesc() << std::endl;
+    std::cout << "bias_k: " << bias_k.GetDesc() << std::endl;

    switch(init_method)
    {
@@ -149,15 +149,14 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
        ref_invoker.Run(ref_argument);
    }

-    DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpaceSize());
-    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpaceSize());
-    DeviceMem out_device_buf(sizeof(OutDataType) *
-                             out_n_k_ho_wo_device_result.mDesc.GetElementSpaceSize());
-    DeviceMem bias_device_buf(sizeof(OutDataType) * bias_k.mDesc.GetElementSpaceSize());
+    DeviceMem in_device_buf(in_n_c_hi_wi.GetMemorySize());
+    DeviceMem wei_device_buf(wei_k_c_y_x.GetMemorySize());
+    DeviceMem out_device_buf(out_n_k_ho_wo_device_result.GetMemorySize());
+    DeviceMem bias_device_buf(bias_k.GetMemorySize());

-    in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
-    wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
-    bias_device_buf.ToDevice(bias_k.mData.data());
+    in_device_buf.ToDevice(in_n_c_hi_wi.data());
+    wei_device_buf.ToDevice(wei_k_c_y_x.data());
+    bias_device_buf.ToDevice(bias_k.data());

    using DeviceConvFwdBiasReluPtr = ck::tensor_operation::device::
        DeviceConvFwdBiasActivationPtr<InElementOp, WeiElementOp, OutElementOp>;
@@ -186,11 +185,10 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
    // profile device Conv instances
    for(auto& op_ptr : op_ptrs)
    {
-        auto argument_ptr = op_ptr->MakeArgumentPointer(
-            static_cast<const InDataType*>(in_device_buf.GetDeviceBuffer()),
-            static_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
-            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-            static_cast<const OutDataType*>(bias_device_buf.GetDeviceBuffer()),
+        auto argument_ptr = op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
+                                                        wei_device_buf.GetDeviceBuffer(),
+                                                        out_device_buf.GetDeviceBuffer(),
+                                                        bias_device_buf.GetDeviceBuffer(),
                                                        N,
                                                        K,
                                                        C,
@@ -214,7 +212,7 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
            float ave_time =
                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

-            std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
+            std::size_t flop = 2_uz * N * K * Ho * Wo * C * Y * X;

            std::size_t num_btype =
                sizeof(InDataType) * (N * C * Hi * Wi) + sizeof(WeiDataType) * (K * C * Y * X) +
@@ -237,22 +235,19 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,

            if(do_verification)
            {
-                out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
+                out_device_buf.FromDevice(out_n_k_ho_wo_device_result.data());

-                ck::utils::check_err(out_n_k_ho_wo_device_result.mData,
-                                     out_n_k_ho_wo_host_result.mData);
+                ck::utils::check_err(out_n_k_ho_wo_device_result, out_n_k_ho_wo_host_result);

                if(do_log)
                {
-                    LogRangeAsType<float>(std::cout << "in : ", in_n_c_hi_wi.mData, ",")
-                        << std::endl;
-                    LogRangeAsType<float>(std::cout << "wei: ", wei_k_c_y_x.mData, ",")
-                        << std::endl;
+                    LogRangeAsType<float>(std::cout << "in : ", in_n_c_hi_wi, ",") << std::endl;
+                    LogRangeAsType<float>(std::cout << "wei: ", wei_k_c_y_x, ",") << std::endl;
                    LogRangeAsType<float>(
-                        std::cout << "out_host  : ", out_n_k_ho_wo_host_result.mData, ",")
+                        std::cout << "out_host  : ", out_n_k_ho_wo_host_result, ",")
                        << std::endl;
                    LogRangeAsType<float>(
-                        std::cout << "out_device: ", out_n_k_ho_wo_device_result.mData, ",")
+                        std::cout << "out_device: ", out_n_k_ho_wo_device_result, ",")
                        << std::endl;
                }
            }

--- a/profiler/include/profile_conv_fwd_impl.hpp
+++ b/profiler/include/profile_conv_fwd_impl.hpp
@@ -8,19 +8,19 @@
 #include <typeinfo>

 #include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 #include "ck/library/tensor_operation_instance/gpu/convolution_forward.hpp"

+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
 #include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/utility/convolution_parameter.hpp"
-#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"

 namespace ck {
 namespace profiler {
@@ -60,9 +60,9 @@ bool profile_conv_fwd_impl(int do_verification,
    Tensor<OutDataType> host_output(out_g_n_k_wos_desc);
    Tensor<OutDataType> device_output(out_g_n_k_wos_desc);

-    std::cout << "input: " << input.mDesc << std::endl;
-    std::cout << "weight: " << weight.mDesc << std::endl;
-    std::cout << "output: " << host_output.mDesc << std::endl;
+    std::cout << "input: " << input.GetDesc() << std::endl;
+    std::cout << "weight: " << weight.GetDesc() << std::endl;
+    std::cout << "output: " << host_output.GetDesc() << std::endl;

    switch(init_method)
    {
@@ -76,12 +76,12 @@ bool profile_conv_fwd_impl(int do_verification,
        weight.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
    }

-    DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpaceSize());
-    DeviceMem wei_device_buf(sizeof(WeiDataType) * weight.mDesc.GetElementSpaceSize());
-    DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpaceSize());
+    DeviceMem in_device_buf(input.GetMemorySize());
+    DeviceMem wei_device_buf(weight.GetMemorySize());
+    DeviceMem out_device_buf(device_output.GetMemorySize());

-    in_device_buf.ToDevice(input.mData.data());
-    wei_device_buf.ToDevice(weight.mData.data());
+    in_device_buf.ToDevice(input.data());
+    wei_device_buf.ToDevice(weight.data());

    // run reference op
    if(do_verification)
@@ -139,10 +139,9 @@ bool profile_conv_fwd_impl(int do_verification,

    for(auto& op_ptr : op_ptrs)
    {
-        auto argument_ptr =
-            op_ptr->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
-                                        static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
-                                        static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+        auto argument_ptr = op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
+                                                        wei_device_buf.GetDeviceBuffer(),
+                                                        out_device_buf.GetDeviceBuffer(),
                                                        conv_param.N_,
                                                        conv_param.K_,
                                                        conv_param.C_,
@@ -189,17 +188,17 @@ bool profile_conv_fwd_impl(int do_verification,

            if(do_verification)
            {
-                out_device_buf.FromDevice(device_output.mData.data());
+                out_device_buf.FromDevice(device_output.data());

-                pass = pass & ck::utils::check_err(device_output.mData, host_output.mData);
+                pass = pass & ck::utils::check_err(device_output, host_output);

                if(do_log)
                {
-                    LogRangeAsType<float>(std::cout << "input : ", input.mData, ",") << std::endl;
-                    LogRangeAsType<float>(std::cout << "weight: ", weight.mData, ",") << std::endl;
-                    LogRangeAsType<float>(std::cout << "host_output  : ", host_output.mData, ",")
+                    LogRangeAsType<float>(std::cout << "input : ", input, ",") << std::endl;
+                    LogRangeAsType<float>(std::cout << "weight: ", weight, ",") << std::endl;
+                    LogRangeAsType<float>(std::cout << "host_output  : ", host_output, ",")
                        << std::endl;
-                    LogRangeAsType<float>(std::cout << "device_output: ", device_output.mData, ",")
+                    LogRangeAsType<float>(std::cout << "device_output: ", device_output, ",")
                        << std::endl;
                }
            }

--- a/profiler/include/profile_convnd_bwd_data_impl.hpp
+++ b/profiler/include/profile_convnd_bwd_data_impl.hpp
@@ -4,16 +4,17 @@
 #pragma once

 #include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

-#include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/conv_util.hpp"
 #include "ck/library/host_tensor/device_memory.hpp"
 #include "ck/library/host_tensor/host_tensor.hpp"
 #include "ck/library/host_tensor/host_tensor_generator.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/conv_util.hpp"
+#include "ck/library/utility/ranges.hpp"

 using F16  = ck::half_t;
 using F32  = float;
@@ -241,16 +242,16 @@ template <typename DataType>
 void show_data_nhwc_layout(Tensor<DataType>& nhwc)
 {
    std::cout << "[";
-    for(int n = 0; n < ck::type_convert<int>(nhwc.mDesc.GetLengths()[0]); n++)
+    for(int n = 0; n < ck::type_convert<int>(nhwc.GetLengths()[0]); n++)
    {
        std::cout << "[";
-        for(int hi = 0; hi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[2]); hi++)
+        for(int hi = 0; hi < ck::type_convert<int>(nhwc.GetLengths()[2]); hi++)
        {
            std::cout << "[";
-            for(int wi = 0; wi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[3]); wi++)
+            for(int wi = 0; wi < ck::type_convert<int>(nhwc.GetLengths()[3]); wi++)
            {
                std::cout << "[";
-                for(int c = 0; c < ck::type_convert<int>(nhwc.mDesc.GetLengths()[1]); c++)
+                for(int c = 0; c < ck::type_convert<int>(nhwc.GetLengths()[1]); c++)
                {
                    std::cout << static_cast<float>(nhwc(n, c, hi, wi)) << "  ";
                }
@@ -294,16 +295,16 @@ bool profile_convnd_bwd_data_impl(int do_verification,
    const auto wei_element_op = WeiElementOp{};
    const auto out_element_op = OutElementOp{};

-    std::vector<std::size_t> input_dims{static_cast<std::size_t>(N), static_cast<std::size_t>(C)};
+    auto input_dims = ck::ranges::to<std::vector<std::size_t>>({N, C});
    input_dims.insert(
        std::end(input_dims), std::begin(input_spatial_lengths), std::end(input_spatial_lengths));

-    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(K), static_cast<std::size_t>(C)};
+    auto filter_dims = ck::ranges::to<std::vector<std::size_t>>({K, C});
    filter_dims.insert(std::end(filter_dims),
                       std::begin(filter_spatial_lengths),
                       std::end(filter_spatial_lengths));

-    std::vector<std::size_t> output_dims{static_cast<std::size_t>(N), static_cast<std::size_t>(K)};
+    auto output_dims = ck::ranges::to<std::vector<std::size_t>>({N, K});
    output_dims.insert(std::end(output_dims),
                       std::begin(output_spatial_lengths),
                       std::end(output_spatial_lengths));
@@ -317,9 +318,9 @@ bool profile_convnd_bwd_data_impl(int do_verification,
    Tensor<OutDataType> output(
        get_output_host_ensor_descriptor<OutLayout>(output_dims, NDimSpatial));

-    std::cout << "input: " << input_host_result.mDesc << std::endl;
-    std::cout << "weights: " << weights.mDesc << std::endl;
-    std::cout << "output: " << output.mDesc << std::endl;
+    std::cout << "input: " << input_host_result.GetDesc() << std::endl;
+    std::cout << "weights: " << weights.GetDesc() << std::endl;
+    std::cout << "output: " << output.GetDesc() << std::endl;

    switch(init_method)
    {
@@ -333,12 +334,12 @@ bool profile_convnd_bwd_data_impl(int do_verification,
        weights.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{1});
    }

-    DeviceMem in_device_buf(sizeof(InDataType) * input_device_result.mDesc.GetElementSpace());
-    DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace());
-    DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpace());
+    DeviceMem in_device_buf(input_device_result.GetMemorySize());
+    DeviceMem wei_device_buf(weights.GetMemorySize());
+    DeviceMem out_device_buf(output.GetMemorySize());

-    out_device_buf.ToDevice(output.mData.data());
-    wei_device_buf.ToDevice(weights.mData.data());
+    out_device_buf.ToDevice(output.data());
+    wei_device_buf.ToDevice(weights.data());

    // reset input to zero
    in_device_buf.SetZero();
@@ -391,10 +392,9 @@ bool profile_convnd_bwd_data_impl(int do_verification,
    bool success = true;
    for(auto& conv_ptr : conv_ptrs)
    {
-        auto argument_ptr = conv_ptr->MakeArgumentPointer(
-            static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
-            static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
-            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+        auto argument_ptr = conv_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
+                                                          wei_device_buf.GetDeviceBuffer(),
+                                                          out_device_buf.GetDeviceBuffer(),
                                                          N,
                                                          K,
                                                          C,
@@ -440,7 +440,7 @@ bool profile_convnd_bwd_data_impl(int do_verification,

            if(do_verification)
            {
-                in_device_buf.FromDevice(input_device_result.mData.data());
+                in_device_buf.FromDevice(input_device_result.data());

                if(!check_out(input_host_result, input_device_result))
                {
@@ -453,7 +453,7 @@ bool profile_convnd_bwd_data_impl(int do_verification,
                    std::cout << "Pass Info: " << conv_ptr->GetTypeString() << std::endl;
                }

-                success = ck::utils::check_err(input_host_result.mData, input_device_result.mData);
+                success = ck::utils::check_err(input_host_result, input_device_result);

                if(do_log)
                {

--- a/profiler/include/profile_convnd_bwd_weight_impl.hpp
+++ b/profiler/include/profile_convnd_bwd_weight_impl.hpp
 #pragma once

 #include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_conv_backward_weight.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

-#include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/conv_util.hpp"
 #include "ck/library/host_tensor/device_memory.hpp"
 #include "ck/library/host_tensor/host_tensor.hpp"
 #include "ck/library/host_tensor/host_tensor_generator.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/conv_util.hpp"
+#include "ck/library/utility/ranges.hpp"

 using F16  = ck::half_t;
 using F32  = float;
@@ -205,16 +206,16 @@ template <typename DataType>
 void show_data_nhwc_layout(Tensor<DataType>& nhwc)
 {
    std::cout << "[";
-    for(int n = 0; n < ck::type_convert<int>(nhwc.mDesc.GetLengths()[0]); n++)
+    for(int n = 0; n < ck::type_convert<int>(nhwc.GetLengths()[0]); n++)
    {
        std::cout << "[";
-        for(int hi = 0; hi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[2]); hi++)
+        for(int hi = 0; hi < ck::type_convert<int>(nhwc.GetLengths()[2]); hi++)
        {
            std::cout << "[";
-            for(int wi = 0; wi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[3]); wi++)
+            for(int wi = 0; wi < ck::type_convert<int>(nhwc.GetLengths()[3]); wi++)
            {
                std::cout << "[";
-                for(int c = 0; c < ck::type_convert<int>(nhwc.mDesc.GetLengths()[1]); c++)
+                for(int c = 0; c < ck::type_convert<int>(nhwc.GetLengths()[1]); c++)
                {
                    std::cout << static_cast<float>(nhwc(n, c, hi, wi)) << "  ";
                }
@@ -258,16 +259,16 @@ bool profile_convnd_bwd_weight_impl(int do_verification,
    const auto wei_element_op = WeiElementOp{};
    const auto out_element_op = OutElementOp{};

-    std::vector<std::size_t> input_dims{static_cast<std::size_t>(N), static_cast<std::size_t>(C)};
+    auto input_dims = ck::ranges::to<std::vector<std::size_t>>({N, C});
    input_dims.insert(
        std::end(input_dims), std::begin(input_spatial_lengths), std::end(input_spatial_lengths));

-    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(K), static_cast<std::size_t>(C)};
+    auto filter_dims = ck::ranges::to<std::vector<std::size_t>>({K, C});
    filter_dims.insert(std::end(filter_dims),
                       std::begin(filter_spatial_lengths),
                       std::end(filter_spatial_lengths));

-    std::vector<std::size_t> output_dims{static_cast<std::size_t>(N), static_cast<std::size_t>(K)};
+    auto output_dims = ck::ranges::to<std::vector<std::size_t>>({N, K});
    output_dims.insert(std::end(output_dims),
                       std::begin(output_spatial_lengths),
                       std::end(output_spatial_lengths));
@@ -280,9 +281,9 @@ bool profile_convnd_bwd_weight_impl(int do_verification,
    Tensor<OutDataType> output(
        get_output_host_ensor_descriptor<OutLayout>(output_dims, NDimSpatial));

-    std::cout << "input: " << input.mDesc << std::endl;
-    std::cout << "weights: " << weights_host_result.mDesc << std::endl;
-    std::cout << "output: " << output.mDesc << std::endl;
+    std::cout << "input: " << input.GetDesc() << std::endl;
+    std::cout << "weights: " << weights_host_result.GetDesc() << std::endl;
+    std::cout << "output: " << output.GetDesc() << std::endl;

    switch(init_method)
    {
@@ -296,12 +297,12 @@ bool profile_convnd_bwd_weight_impl(int do_verification,
        output.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{1});
    }

-    DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
-    DeviceMem wei_device_buf(sizeof(WeiDataType) * weights_device_result.mDesc.GetElementSpace());
-    DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpace());
+    DeviceMem in_device_buf(input.GetMemorySize());
+    DeviceMem wei_device_buf(weights_device_result.GetMemorySize());
+    DeviceMem out_device_buf(output.GetMemorySize());

-    in_device_buf.ToDevice(input.mData.data());
-    out_device_buf.ToDevice(output.mData.data());
+    in_device_buf.ToDevice(input.data());
+    out_device_buf.ToDevice(output.data());

    // reset input to zero
    wei_device_buf.SetZero();
@@ -359,10 +360,9 @@ bool profile_convnd_bwd_weight_impl(int do_verification,
        //    wei_device_buf.SetZero();
        //}

-        auto argument_ptr = conv_ptr->MakeArgumentPointer(
-            static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
-            static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
-            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+        auto argument_ptr = conv_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
+                                                          wei_device_buf.GetDeviceBuffer(),
+                                                          out_device_buf.GetDeviceBuffer(),
                                                          N,
                                                          K,
                                                          C,
@@ -390,7 +390,7 @@ bool profile_convnd_bwd_weight_impl(int do_verification,
        std::string conv_name = conv_ptr->GetTypeString();
        float ave_time        = 0;

-        if(std::is_same<InDataType, ck::bhalf_t>::value && split_k > 1)
+        if(std::is_same_v<InDataType, ck::bhalf_t> && split_k > 1) // split_k assumed runtime
        {
            // alloc work space
            size_t bwd_weight_workspace_size = conv_ptr->GetWorkSpaceSize(argument_ptr.get());
@@ -431,9 +431,9 @@ bool profile_convnd_bwd_weight_impl(int do_verification,

        if(do_verification)
        {
-            wei_device_buf.FromDevice(weights_device_result.mData.data());
+            wei_device_buf.FromDevice(weights_device_result.data());

-            success = ck::utils::check_err(weights_host_result.mData, weights_device_result.mData);
+            success = ck::utils::check_err(weights_host_result, weights_device_result);

            if(success == false)
            {

--- a/profiler/include/profile_gemm_add_add_fastgelu_impl.hpp
+++ b/profiler/include/profile_gemm_add_add_fastgelu_impl.hpp
@@ -6,17 +6,19 @@
 #include <iomanip>

 #include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 #include "ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp"

+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/array.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/literals.hpp"

 namespace ck {
 namespace profiler {
@@ -45,17 +47,17 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
                                        int StrideD1,
                                        int StrideE)
 {
+    using namespace ck::literals;
+
    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-            if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
+            if constexpr(is_same_v<decltype(layout), tensor_layout::gemm::RowMajor>)
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({stride, 1}));
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
            }
            else
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({1, stride}));
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
            }
        };

@@ -66,11 +68,11 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
    Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
    Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));

-    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
-    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
-    std::cout << "d0_m_n: " << d0_m_n.mDesc << std::endl;
-    std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
-    std::cout << "e_m_n: " << e_m_n_device_result.mDesc << std::endl;
+    std::cout << "a_m_k: " << a_m_k.GetDesc() << std::endl;
+    std::cout << "b_k_n: " << b_k_n.GetDesc() << std::endl;
+    std::cout << "d0_m_n: " << d0_m_n.GetDesc() << std::endl;
+    std::cout << "d1_m_n: " << d1_m_n.GetDesc() << std::endl;
+    std::cout << "e_m_n: " << e_m_n_device_result.GetDesc() << std::endl;

    switch(init_method)
    {
@@ -121,8 +123,7 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
    // run reference
    if(do_verification)
    {
-        Tensor<AccDataType> c_m_n(HostTensorDescriptor(
-            std::vector<std::size_t>{static_cast<std::size_t>(M), static_cast<std::size_t>(N)}));
+        Tensor<AccDataType> c_m_n(HostTensorDescriptor({M, N}));

        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                                BDataType,
@@ -149,16 +150,16 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
        }
    }

-    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
-    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
-    DeviceMem d0_m_n_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize());
-    DeviceMem d1_m_n_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpaceSize());
-    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
+    DeviceMem a_device_buf(a_m_k.GetMemorySize());
+    DeviceMem b_device_buf(b_k_n.GetMemorySize());
+    DeviceMem d0_m_n_device_buf(d0_m_n.GetMemorySize());
+    DeviceMem d1_m_n_device_buf(d1_m_n.GetMemorySize());
+    DeviceMem e_device_buf(e_m_n_device_result.GetMemorySize());

-    a_device_buf.ToDevice(a_m_k.mData.data());
-    b_device_buf.ToDevice(b_k_n.mData.data());
-    d0_m_n_device_buf.ToDevice(d0_m_n.mData.data());
-    d1_m_n_device_buf.ToDevice(d1_m_n.mData.data());
+    a_device_buf.ToDevice(a_m_k.data());
+    b_device_buf.ToDevice(b_k_n.data());
+    d0_m_n_device_buf.ToDevice(d0_m_n.data());
+    d1_m_n_device_buf.ToDevice(d1_m_n.data());

    std::string best_op_name;
    float best_ave_time   = 0;
@@ -170,18 +171,18 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
    // profile device operation instances
    for(auto& op_ptr : op_ptrs)
    {
-        auto argument_ptr = op_ptr->MakeArgumentPointer(
-            a_device_buf.GetDeviceBuffer(),
+        auto argument_ptr =
+            op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
                                        b_device_buf.GetDeviceBuffer(),
-            std::array<const void*, 2>{d0_m_n_device_buf.GetDeviceBuffer(),
-                                       d1_m_n_device_buf.GetDeviceBuffer()},
+                                        ck::utils::to_array({d0_m_n_device_buf.GetDeviceBuffer(),
+                                                             d1_m_n_device_buf.GetDeviceBuffer()}),
                                        e_device_buf.GetDeviceBuffer(),
                                        M,
                                        N,
                                        K,
                                        StrideA,
                                        StrideB,
-            std::array<ck::index_t, 2>{StrideD0, StrideD1},
+                                        ck::utils::to_array({StrideD0, StrideD1}),
                                        StrideE,
                                        a_element_op,
                                        b_element_op,
@@ -199,7 +200,7 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
            float ave_time =
                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

-            std::size_t flop = std::size_t(2) * M * N * K;
+            std::size_t flop = 2_uz * M * N * K;

            std::size_t num_btype =
                sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
@@ -221,10 +222,9 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,

            if(do_verification)
            {
-                e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+                e_device_buf.FromDevice(e_m_n_device_result.data());

-                pass = pass &&
-                       ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData);
+                pass = pass && ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
            }
        }
        else

--- a/profiler/include/profile_gemm_bias_add_reduce_impl.hpp
+++ b/profiler/include/profile_gemm_bias_add_reduce_impl.hpp
@@ -4,17 +4,18 @@
 #pragma once

 #include "ck/ck.hpp"
-#include "ck/utility/reduction_operator.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_gemm_reduce.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/utility/reduction_operator.hpp"

+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/literals.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -74,22 +75,21 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
                                       int StrideC,
                                       int StrideD0)
 {
+    using namespace ck::literals;
+
    auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
-        return HostTensorDescriptor(std::vector<std::size_t>({len}),
-                                    std::vector<std::size_t>({stride}));
+        return HostTensorDescriptor({len}, {stride});
    };

    auto f_host_tensor_descriptor2d =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-            if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
+            if constexpr(is_same_v<decltype(layout), tensor_layout::gemm::RowMajor>)
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({stride, 1}));
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
            }
            else
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({1, stride}));
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
            }
        };

@@ -99,22 +99,18 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
    Tensor<BiasDataType> bias_n(f_host_tensor_descriptor1d(N, 1));
    Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
-    Tensor<ReduceDataType> reduce0_m_host_result(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
-    Tensor<ReduceDataType> reduce1_m_host_result(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
+    Tensor<ReduceDataType> reduce0_m_host_result(HostTensorDescriptor({M}));
+    Tensor<ReduceDataType> reduce1_m_host_result(HostTensorDescriptor({M}));

    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
-    Tensor<ReduceDataType> reduce0_m_device_result(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
-    Tensor<ReduceDataType> reduce1_m_device_result(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
+    Tensor<ReduceDataType> reduce0_m_device_result(HostTensorDescriptor({M}));
+    Tensor<ReduceDataType> reduce1_m_device_result(HostTensorDescriptor({M}));

-    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
-    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
-    std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
-    std::cout << "reduce0_m: " << reduce0_m_host_result.mDesc << std::endl;
-    std::cout << "reduce1_m: " << reduce1_m_host_result.mDesc << std::endl;
+    std::cout << "a_m_k: " << a_m_k.GetDesc() << std::endl;
+    std::cout << "b_k_n: " << b_k_n.GetDesc() << std::endl;
+    std::cout << "c_m_n: " << c_m_n_host_result.GetDesc() << std::endl;
+    std::cout << "reduce0_m: " << reduce0_m_host_result.GetDesc() << std::endl;
+    std::cout << "reduce1_m: " << reduce1_m_host_result.GetDesc() << std::endl;

    std::size_t num_thread = 1;
    switch(init_method)
@@ -217,23 +213,21 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
        }
    }

-    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
-    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
-    DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
-    DeviceMem bias_device_buf(sizeof(BiasDataType) * bias_n.mDesc.GetElementSpaceSize());
-    DeviceMem d0_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize());
-    DeviceMem reduce0_device_buf(sizeof(ReduceDataType) *
-                                 reduce0_m_device_result.mDesc.GetElementSpaceSize());
-    DeviceMem reduce1_device_buf(sizeof(ReduceDataType) *
-                                 reduce1_m_device_result.mDesc.GetElementSpaceSize());
+    DeviceMem a_device_buf(a_m_k.GetMemorySize());
+    DeviceMem b_device_buf(b_k_n.GetMemorySize());
+    DeviceMem c_device_buf(c_m_n_device_result.GetMemorySize());
+    DeviceMem bias_device_buf(bias_n.GetMemorySize());
+    DeviceMem d0_device_buf(d0_m_n.GetMemorySize());
+    DeviceMem reduce0_device_buf(reduce0_m_device_result.GetMemorySize());
+    DeviceMem reduce1_device_buf(reduce1_m_device_result.GetMemorySize());

    std::array<void*, 2> p_reduces = {reduce0_device_buf.GetDeviceBuffer(),
                                      reduce1_device_buf.GetDeviceBuffer()};

-    a_device_buf.ToDevice(a_m_k.mData.data());
-    b_device_buf.ToDevice(b_k_n.mData.data());
-    bias_device_buf.ToDevice(bias_n.mData.data());
-    d0_device_buf.ToDevice(d0_m_n.mData.data());
+    a_device_buf.ToDevice(a_m_k.data());
+    b_device_buf.ToDevice(b_k_n.data());
+    bias_device_buf.ToDevice(bias_n.data());
+    d0_device_buf.ToDevice(d0_m_n.data());

    // add device GEMM instances
    std::vector<ck::tensor_operation::device::instance::DeviceGemmBiasAddReduceNoOpPtr> gemm_ptrs;
@@ -319,7 +313,7 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,

            std::string gemm_name = gemm_ptr->GetTypeString();

-            std::size_t flop = std::size_t(2) * M * N * K + std::size_t(2) * M * N;
+            std::size_t flop = 2_uz * M * N * K + 2_uz * M * N;

            std::size_t num_byte = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
                                   sizeof(CDataType) * M * N + sizeof(BiasDataType) * M * N +
@@ -343,33 +337,29 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,

            if(do_verification)
            {
-                c_device_buf.FromDevice(c_m_n_device_result.mData.data());
-                reduce0_device_buf.FromDevice(reduce0_m_device_result.mData.data());
-                reduce1_device_buf.FromDevice(reduce1_m_device_result.mData.data());
+                c_device_buf.FromDevice(c_m_n_device_result.data());
+                reduce0_device_buf.FromDevice(reduce0_m_device_result.data());
+                reduce1_device_buf.FromDevice(reduce1_m_device_result.data());

-                ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
-                ck::utils::check_err(reduce0_m_device_result.mData, reduce0_m_host_result.mData);
-                ck::utils::check_err(reduce1_m_device_result.mData, reduce1_m_host_result.mData);
+                ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
+                ck::utils::check_err(reduce0_m_device_result, reduce0_m_host_result);
+                ck::utils::check_err(reduce1_m_device_result, reduce1_m_host_result);

                if(do_log)
                {
-                    LogRangeAsType<float>(std::cout << "a : ", a_m_k.mData, ",") << std::endl;
-                    LogRangeAsType<float>(std::cout << "b: ", b_k_n.mData, ",") << std::endl;
-                    LogRangeAsType<float>(std::cout << "c_host: ", c_m_n_host_result.mData, ",")
+                    LogRangeAsType<float>(std::cout << "a : ", a_m_k, ",") << std::endl;
+                    LogRangeAsType<float>(std::cout << "b: ", b_k_n, ",") << std::endl;
+                    LogRangeAsType<float>(std::cout << "c_host: ", c_m_n_host_result, ",")
                        << std::endl;
-                    LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result.mData, ",")
+                    LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result, ",")
                        << std::endl;
-                    LogRangeAsType<float>(
-                        std::cout << "d0_host: ", reduce0_m_host_result.mData, ",")
+                    LogRangeAsType<float>(std::cout << "d0_host: ", reduce0_m_host_result, ",")
                        << std::endl;
-                    LogRangeAsType<float>(
-                        std::cout << "d0_device: ", reduce0_m_device_result.mData, ",")
+                    LogRangeAsType<float>(std::cout << "d0_device: ", reduce0_m_device_result, ",")
                        << std::endl;
-                    LogRangeAsType<float>(
-                        std::cout << "d1_host: ", reduce1_m_host_result.mData, ",")
+                    LogRangeAsType<float>(std::cout << "d1_host: ", reduce1_m_host_result, ",")
                        << std::endl;
-                    LogRangeAsType<float>(
-                        std::cout << "d1_device: ", reduce1_m_device_result.mData, ",")
+                    LogRangeAsType<float>(std::cout << "d1_device: ", reduce1_m_device_result, ",")
                        << std::endl;
                }
            }

--- a/profiler/include/profile_gemm_bilinear_impl.hpp
+++ b/profiler/include/profile_gemm_bilinear_impl.hpp
@@ -6,17 +6,19 @@
 #include <iomanip>

 #include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 #include "ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp"

+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/array.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/literals.hpp"

 namespace ck {
 namespace profiler {
@@ -44,17 +46,17 @@ bool profile_gemm_bilinear_impl(int do_verification,
                                float alpha,
                                float beta)
 {
+    using namespace ck::literals;
+
    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-            if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
+            if constexpr(is_same_v<decltype(layout), tensor_layout::gemm::RowMajor>)
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({stride, 1}));
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
            }
            else
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({1, stride}));
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
            }
        };

@@ -64,10 +66,10 @@ bool profile_gemm_bilinear_impl(int do_verification,
    Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
    Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));

-    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
-    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
-    std::cout << "d_m_n: " << d_m_n.mDesc << std::endl;
-    std::cout << "e_m_n: " << e_m_n_device_result.mDesc << std::endl;
+    std::cout << "a_m_k: " << a_m_k.GetDesc() << std::endl;
+    std::cout << "b_k_n: " << b_k_n.GetDesc() << std::endl;
+    std::cout << "d_m_n: " << d_m_n.GetDesc() << std::endl;
+    std::cout << "e_m_n: " << e_m_n_device_result.GetDesc() << std::endl;

    switch(init_method)
    {
@@ -116,8 +118,7 @@ bool profile_gemm_bilinear_impl(int do_verification,
    // run reference
    if(do_verification)
    {
-        Tensor<AccDataType> c_m_n(HostTensorDescriptor(
-            std::vector<std::size_t>{static_cast<std::size_t>(M), static_cast<std::size_t>(N)}));
+        Tensor<AccDataType> c_m_n(HostTensorDescriptor({M, N}));

        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                                BDataType,
@@ -144,14 +145,14 @@ bool profile_gemm_bilinear_impl(int do_verification,
        }
    }

-    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
-    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
-    DeviceMem d_m_n_device_buf(sizeof(DDataType) * d_m_n.mDesc.GetElementSpaceSize());
-    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
+    DeviceMem a_device_buf(a_m_k.GetMemorySize());
+    DeviceMem b_device_buf(b_k_n.GetMemorySize());
+    DeviceMem d_m_n_device_buf(d_m_n.GetMemorySize());
+    DeviceMem e_device_buf(e_m_n_device_result.GetMemorySize());

-    a_device_buf.ToDevice(a_m_k.mData.data());
-    b_device_buf.ToDevice(b_k_n.mData.data());
-    d_m_n_device_buf.ToDevice(d_m_n.mData.data());
+    a_device_buf.ToDevice(a_m_k.data());
+    b_device_buf.ToDevice(b_k_n.data());
+    d_m_n_device_buf.ToDevice(d_m_n.data());

    std::string best_op_name;
    float best_ave_time   = 0;
@@ -163,17 +164,17 @@ bool profile_gemm_bilinear_impl(int do_verification,
    // profile device operation instances
    for(auto& op_ptr : op_ptrs)
    {
-        auto argument_ptr = op_ptr->MakeArgumentPointer(
-            a_device_buf.GetDeviceBuffer(),
+        auto argument_ptr =
+            op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
                                        b_device_buf.GetDeviceBuffer(),
-            std::array<const void*, 1>{d_m_n_device_buf.GetDeviceBuffer()},
+                                        ck::utils::to_array({d_m_n_device_buf.GetDeviceBuffer()}),
                                        e_device_buf.GetDeviceBuffer(),
                                        M,
                                        N,
                                        K,
                                        StrideA,
                                        StrideB,
-            std::array<ck::index_t, 1>{StrideD},
+                                        ck::utils::to_array({StrideD}),
                                        StrideE,
                                        a_element_op,
                                        b_element_op,
@@ -191,7 +192,7 @@ bool profile_gemm_bilinear_impl(int do_verification,
            float ave_time =
                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

-            std::size_t flop = std::size_t(2) * M * N * K;
+            std::size_t flop = 2_uz * M * N * K;

            std::size_t num_btype =
                sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
@@ -213,10 +214,9 @@ bool profile_gemm_bilinear_impl(int do_verification,

            if(do_verification)
            {
-                e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+                e_device_buf.FromDevice(e_m_n_device_result.data());

-                pass = pass &&
-                       ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData);
+                pass = pass && ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
            }
        }
        else

--- a/profiler/include/profile_gemm_impl.hpp
+++ b/profiler/include/profile_gemm_impl.hpp
@@ -8,17 +8,18 @@
 #include <typeinfo>

 #include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_gemm.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 #include "ck/library/tensor_operation_instance/gpu/gemm.hpp"

+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/literals.hpp"

 namespace ck {
 namespace profiler {
@@ -43,17 +44,17 @@ int profile_gemm_impl(int do_verification,
 {
    bool pass = true;

+    using namespace ck::literals;
+
    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-            if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
+            if constexpr(is_same_v<decltype(layout), tensor_layout::gemm::RowMajor>)
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({stride, 1}));
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
            }
            else
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({1, stride}));
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
            }
        };

@@ -62,9 +63,9 @@ int profile_gemm_impl(int do_verification,
    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));

-    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
-    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
-    std::cout << "c_m_n: " << c_m_n_device_result.mDesc << std::endl;
+    std::cout << "a_m_k: " << a_m_k.GetDesc() << std::endl;
+    std::cout << "b_k_n: " << b_k_n.GetDesc() << std::endl;
+    std::cout << "c_m_n: " << c_m_n_device_result.GetDesc() << std::endl;

    switch(init_method)
    {
@@ -86,12 +87,12 @@ int profile_gemm_impl(int do_verification,
    const auto b_element_op = BElementOp{};
    const auto c_element_op = CElementOp{};

-    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
-    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
-    DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
+    DeviceMem a_device_buf(a_m_k.GetMemorySize());
+    DeviceMem b_device_buf(b_k_n.GetMemorySize());
+    DeviceMem c_device_buf(c_m_n_device_result.GetMemorySize());

-    a_device_buf.ToDevice(a_m_k.mData.data());
-    b_device_buf.ToDevice(b_k_n.mData.data());
+    a_device_buf.ToDevice(a_m_k.data());
+    b_device_buf.ToDevice(b_k_n.data());

    using DeviceOp = ck::tensor_operation::device::DeviceGemm<ALayout,
                                                              BLayout,
@@ -137,10 +138,9 @@ int profile_gemm_impl(int do_verification,
    // profile device op instances
    for(auto& op_ptr : op_ptrs)
    {
-        auto argument_ptr =
-            op_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
-                                        static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
-                                        static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
+        auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+                                                        b_device_buf.GetDeviceBuffer(),
+                                                        c_device_buf.GetDeviceBuffer(),
                                                        M,
                                                        N,
                                                        K,
@@ -163,7 +163,7 @@ int profile_gemm_impl(int do_verification,
            float avg_time =
                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

-            std::size_t flop = std::size_t(2) * M * N * K;
+            std::size_t flop = 2_uz * M * N * K;

            std::size_t num_btype =
                sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
@@ -185,18 +185,17 @@ int profile_gemm_impl(int do_verification,

            if(do_verification)
            {
-                c_device_buf.FromDevice(c_m_n_device_result.mData.data());
+                c_device_buf.FromDevice(c_m_n_device_result.data());

-                pass =
-                    pass & ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
+                pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);

                if(do_log)
                {
-                    LogRangeAsType<float>(std::cout << "a : ", a_m_k.mData, ",") << std::endl;
-                    LogRangeAsType<float>(std::cout << "b: ", b_k_n.mData, ",") << std::endl;
-                    LogRangeAsType<float>(std::cout << "c_host  : ", c_m_n_host_result.mData, ",")
+                    LogRangeAsType<float>(std::cout << "a : ", a_m_k, ",") << std::endl;
+                    LogRangeAsType<float>(std::cout << "b: ", b_k_n, ",") << std::endl;
+                    LogRangeAsType<float>(std::cout << "c_host  : ", c_m_n_host_result, ",")
                        << std::endl;
-                    LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result.mData, ",")
+                    LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result, ",")
                        << std::endl;
                }
            }

--- a/profiler/include/profile_gemm_reduce_impl.hpp
+++ b/profiler/include/profile_gemm_reduce_impl.hpp
@@ -4,17 +4,18 @@
 #pragma once

 #include "ck/ck.hpp"
-#include "ck/utility/reduction_operator.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_gemm_reduce.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/utility/reduction_operator.hpp"

+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/literals.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -73,17 +74,17 @@ bool profile_gemm_reduce_impl(int do_verification,
 {
    bool pass = true;

+    using namespace ck::literals;
+
    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-            if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
+            if constexpr(is_same_v<decltype(layout), tensor_layout::gemm::RowMajor>)
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({stride, 1}));
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
            }
            else
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({1, stride}));
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
            }
        };

@@ -91,22 +92,18 @@ bool profile_gemm_reduce_impl(int do_verification,
    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));

    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-    Tensor<ReduceDataType> reduce0_m_host_result(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
-    Tensor<ReduceDataType> reduce1_m_host_result(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
+    Tensor<ReduceDataType> reduce0_m_host_result(HostTensorDescriptor({M}));
+    Tensor<ReduceDataType> reduce1_m_host_result(HostTensorDescriptor({M}));

    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-    Tensor<ReduceDataType> reduce0_m_device_result(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
-    Tensor<ReduceDataType> reduce1_m_device_result(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
+    Tensor<ReduceDataType> reduce0_m_device_result(HostTensorDescriptor({M}));
+    Tensor<ReduceDataType> reduce1_m_device_result(HostTensorDescriptor({M}));

-    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
-    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
-    std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
-    std::cout << "reduce0_m: " << reduce0_m_host_result.mDesc << std::endl;
-    std::cout << "reduce1_m: " << reduce1_m_host_result.mDesc << std::endl;
+    std::cout << "a_m_k: " << a_m_k.GetDesc() << std::endl;
+    std::cout << "b_k_n: " << b_k_n.GetDesc() << std::endl;
+    std::cout << "c_m_n: " << c_m_n_host_result.GetDesc() << std::endl;
+    std::cout << "reduce0_m: " << reduce0_m_host_result.GetDesc() << std::endl;
+    std::cout << "reduce1_m: " << reduce1_m_host_result.GetDesc() << std::endl;

    std::size_t num_thread = 1;
    switch(init_method)
@@ -189,19 +186,17 @@ bool profile_gemm_reduce_impl(int do_verification,
        }
    }

-    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
-    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
-    DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
-    DeviceMem reduce0_device_buf(sizeof(ReduceDataType) *
-                                 reduce0_m_device_result.mDesc.GetElementSpaceSize());
-    DeviceMem reduce1_device_buf(sizeof(ReduceDataType) *
-                                 reduce1_m_device_result.mDesc.GetElementSpaceSize());
+    DeviceMem a_device_buf(a_m_k.GetMemorySize());
+    DeviceMem b_device_buf(b_k_n.GetMemorySize());
+    DeviceMem c_device_buf(c_m_n_device_result.GetMemorySize());
+    DeviceMem reduce0_device_buf(reduce0_m_device_result.GetMemorySize());
+    DeviceMem reduce1_device_buf(reduce1_m_device_result.GetMemorySize());

    std::array<void*, 2> p_reduces = {reduce0_device_buf.GetDeviceBuffer(),
                                      reduce1_device_buf.GetDeviceBuffer()};

-    a_device_buf.ToDevice(a_m_k.mData.data());
-    b_device_buf.ToDevice(b_k_n.mData.data());
+    a_device_buf.ToDevice(a_m_k.data());
+    b_device_buf.ToDevice(b_k_n.data());

    // add device GEMM instances
    std::vector<ck::tensor_operation::device::instance::DeviceGemmReduceNoOpPtr> gemm_ptrs;
@@ -287,7 +282,7 @@ bool profile_gemm_reduce_impl(int do_verification,

            std::string gemm_name = gemm_ptr->GetTypeString();

-            std::size_t flop = std::size_t(2) * M * N * K;
+            std::size_t flop = 2_uz * M * N * K;

            std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
                                    sizeof(CDataType) * M * N + sizeof(CDataType) * N;
@@ -309,33 +304,29 @@ bool profile_gemm_reduce_impl(int do_verification,

            if(do_verification)
            {
-                c_device_buf.FromDevice(c_m_n_device_result.mData.data());
-                reduce0_device_buf.FromDevice(reduce0_m_device_result.mData.data());
-                reduce1_device_buf.FromDevice(reduce1_m_device_result.mData.data());
+                c_device_buf.FromDevice(c_m_n_device_result.data());
+                reduce0_device_buf.FromDevice(reduce0_m_device_result.data());
+                reduce1_device_buf.FromDevice(reduce1_m_device_result.data());

-                ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
-                ck::utils::check_err(reduce0_m_device_result.mData, reduce0_m_host_result.mData);
-                ck::utils::check_err(reduce1_m_device_result.mData, reduce1_m_host_result.mData);
+                ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
+                ck::utils::check_err(reduce0_m_device_result, reduce0_m_host_result);
+                ck::utils::check_err(reduce1_m_device_result, reduce1_m_host_result);

                if(do_log)
                {
-                    LogRangeAsType<float>(std::cout << "a : ", a_m_k.mData, ",") << std::endl;
-                    LogRangeAsType<float>(std::cout << "b: ", b_k_n.mData, ",") << std::endl;
-                    LogRangeAsType<float>(std::cout << "c_host: ", c_m_n_host_result.mData, ",")
+                    LogRangeAsType<float>(std::cout << "a : ", a_m_k, ",") << std::endl;
+                    LogRangeAsType<float>(std::cout << "b: ", b_k_n, ",") << std::endl;
+                    LogRangeAsType<float>(std::cout << "c_host: ", c_m_n_host_result, ",")
                        << std::endl;
-                    LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result.mData, ",")
+                    LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result, ",")
                        << std::endl;
-                    LogRangeAsType<float>(
-                        std::cout << "d0_host: ", reduce0_m_host_result.mData, ",")
+                    LogRangeAsType<float>(std::cout << "d0_host: ", reduce0_m_host_result, ",")
                        << std::endl;
-                    LogRangeAsType<float>(
-                        std::cout << "d0_device: ", reduce0_m_device_result.mData, ",")
+                    LogRangeAsType<float>(std::cout << "d0_device: ", reduce0_m_device_result, ",")
                        << std::endl;
-                    LogRangeAsType<float>(
-                        std::cout << "d1_host: ", reduce1_m_host_result.mData, ",")
+                    LogRangeAsType<float>(std::cout << "d1_host: ", reduce1_m_host_result, ",")
                        << std::endl;
-                    LogRangeAsType<float>(
-                        std::cout << "d1_device: ", reduce1_m_device_result.mData, ",")
+                    LogRangeAsType<float>(std::cout << "d1_device: ", reduce1_m_device_result, ",")
                        << std::endl;
                }
            }

--- a/profiler/include/profile_gemm_splitk_impl.hpp
+++ b/profiler/include/profile_gemm_splitk_impl.hpp
@@ -8,17 +8,18 @@
 #include <typeinfo>

 #include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_gemm_splitk.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 #include "ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp"

+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/literals.hpp"

 namespace ck {
 namespace profiler {
@@ -44,17 +45,17 @@ bool profile_gemm_splitk_impl(int do_verification,
 {
    bool pass = true;

+    using namespace ck::literals;
+
    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-            if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
+            if constexpr(is_same_v<decltype(layout), tensor_layout::gemm::RowMajor>)
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({stride, 1}));
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
            }
            else
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({1, stride}));
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
            }
        };

@@ -63,9 +64,9 @@ bool profile_gemm_splitk_impl(int do_verification,
    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));

-    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
-    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
-    std::cout << "c_m_n: " << c_m_n_device_result.mDesc << std::endl;
+    std::cout << "a_m_k: " << a_m_k.GetDesc() << std::endl;
+    std::cout << "b_k_n: " << b_k_n.GetDesc() << std::endl;
+    std::cout << "c_m_n: " << c_m_n_device_result.GetDesc() << std::endl;

    switch(init_method)
    {
@@ -87,13 +88,13 @@ bool profile_gemm_splitk_impl(int do_verification,
    const auto b_element_op = BElementOp{};
    const auto c_element_op = CElementOp{};

-    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
-    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
-    DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
+    DeviceMem a_device_buf(a_m_k.GetMemorySize());
+    DeviceMem b_device_buf(b_k_n.GetMemorySize());
+    DeviceMem c_device_buf(c_m_n_device_result.GetMemorySize());

-    a_device_buf.ToDevice(a_m_k.mData.data());
-    b_device_buf.ToDevice(b_k_n.mData.data());
-    c_device_buf.ToDevice(c_m_n_device_result.mData.data());
+    a_device_buf.ToDevice(a_m_k.data());
+    b_device_buf.ToDevice(b_k_n.data());
+    c_device_buf.ToDevice(c_m_n_device_result.data());

    using DeviceOp = ck::tensor_operation::device::DeviceGemmSplitK<ALayout,
                                                                    BLayout,
@@ -139,10 +140,9 @@ bool profile_gemm_splitk_impl(int do_verification,
    // profile device GEMM instances
    for(auto& op_ptr : op_ptrs)
    {
-        auto argument_ptr =
-            op_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
-                                        static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
-                                        static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
+        auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+                                                        b_device_buf.GetDeviceBuffer(),
+                                                        c_device_buf.GetDeviceBuffer(),
                                                        M,
                                                        N,
                                                        K,
@@ -166,7 +166,7 @@ bool profile_gemm_splitk_impl(int do_verification,
            float ave_time =
                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

-            std::size_t flop = std::size_t(2) * M * N * K;
+            std::size_t flop = 2_uz * M * N * K;

            std::size_t num_btype =
                sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
@@ -188,18 +188,17 @@ bool profile_gemm_splitk_impl(int do_verification,

            if(do_verification)
            {
-                c_device_buf.FromDevice(c_m_n_device_result.mData.data());
+                c_device_buf.FromDevice(c_m_n_device_result.data());

-                pass =
-                    pass & ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
+                pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);

                if(do_log)
                {
-                    LogRangeAsType<float>(std::cout << "a : ", a_m_k.mData, ",") << std::endl;
-                    LogRangeAsType<float>(std::cout << "b: ", b_k_n.mData, ",") << std::endl;
-                    LogRangeAsType<float>(std::cout << "c_host  : ", c_m_n_host_result.mData, ",")
+                    LogRangeAsType<float>(std::cout << "a : ", a_m_k, ",") << std::endl;
+                    LogRangeAsType<float>(std::cout << "b: ", b_k_n, ",") << std::endl;
+                    LogRangeAsType<float>(std::cout << "c_host  : ", c_m_n_host_result, ",")
                        << std::endl;
-                    LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result.mData, ",")
+                    LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result, ",")
                        << std::endl;
                }
            }

--- a/profiler/include/profile_grouped_conv_fwd_impl.hpp
+++ b/profiler/include/profile_grouped_conv_fwd_impl.hpp
@@ -8,19 +8,21 @@
 #include <typeinfo>

 #include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 #include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp"

+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/array.hpp"
 #include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/utility/convolution_parameter.hpp"
-#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"

 namespace ck {
 namespace profiler {
@@ -66,7 +68,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
    std::array<ck::index_t, NDimSpatial> input_left_pads{};
    std::array<ck::index_t, NDimSpatial> input_right_pads{};

-    auto copy = [](auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
+    auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };

    copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
    copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
@@ -84,9 +86,9 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
    Tensor<OutDataType> host_output(out_g_n_k_wos_desc);
    Tensor<OutDataType> device_output(out_g_n_k_wos_desc);

-    std::cout << "input: " << input.mDesc << std::endl;
-    std::cout << "weight: " << weight.mDesc << std::endl;
-    std::cout << "output: " << host_output.mDesc << std::endl;
+    std::cout << "input: " << input.GetDesc() << std::endl;
+    std::cout << "weight: " << weight.GetDesc() << std::endl;
+    std::cout << "output: " << host_output.GetDesc() << std::endl;

    switch(init_method)
    {
@@ -100,12 +102,12 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
        weight.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
    }

-    DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpaceSize());
-    DeviceMem wei_device_buf(sizeof(WeiDataType) * weight.mDesc.GetElementSpaceSize());
-    DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpaceSize());
+    DeviceMem in_device_buf(input.GetMemorySize());
+    DeviceMem wei_device_buf(weight.GetMemorySize());
+    DeviceMem out_device_buf(device_output.GetMemorySize());

-    in_device_buf.ToDevice(input.mData.data());
-    wei_device_buf.ToDevice(weight.mData.data());
+    in_device_buf.ToDevice(input.data());
+    wei_device_buf.ToDevice(weight.data());

    // run reference op
    if(do_verification)
@@ -163,19 +165,20 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
    // profile device op instances
    bool pass = true;

+    using ck::utils::empty_array;
+
    for(auto& op_ptr : op_ptrs)
    {
-        auto argument_ptr =
-            op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
+        auto argument_ptr = op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
                                                        wei_device_buf.GetDeviceBuffer(),
-                                        std::array<const void*, 0>{},
+                                                        empty_array(),
                                                        out_device_buf.GetDeviceBuffer(),
                                                        a_g_n_c_wis_lengths,
                                                        a_g_n_c_wis_strides,
                                                        b_g_k_c_xs_lengths,
                                                        b_g_k_c_xs_strides,
-                                        std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
-                                        std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
+                                                        empty_array(),
+                                                        empty_array(),
                                                        e_g_n_k_wos_lengths,
                                                        e_g_n_k_wos_strides,
                                                        conv_filter_strides,
@@ -218,17 +221,17 @@ bool profile_grouped_conv_fwd_impl(int do_verification,

            if(do_verification)
            {
-                out_device_buf.FromDevice(device_output.mData.data());
+                out_device_buf.FromDevice(device_output.data());

-                pass = pass & ck::utils::check_err(device_output.mData, host_output.mData);
+                pass = pass & ck::utils::check_err(device_output, host_output);

                if(do_log)
                {
-                    LogRangeAsType<float>(std::cout << "input : ", input.mData, ",") << std::endl;
-                    LogRangeAsType<float>(std::cout << "weight: ", weight.mData, ",") << std::endl;
-                    LogRangeAsType<float>(std::cout << "host_output  : ", host_output.mData, ",")
+                    LogRangeAsType<float>(std::cout << "input : ", input, ",") << std::endl;
+                    LogRangeAsType<float>(std::cout << "weight: ", weight, ",") << std::endl;
+                    LogRangeAsType<float>(std::cout << "host_output  : ", host_output, ",")
                        << std::endl;
-                    LogRangeAsType<float>(std::cout << "device_output: ", device_output.mData, ",")
+                    LogRangeAsType<float>(std::cout << "device_output: ", device_output, ",")
                        << std::endl;
                }
            }

--- a/profiler/include/profile_grouped_gemm_impl.hpp
+++ b/profiler/include/profile_grouped_gemm_impl.hpp
@@ -6,18 +6,19 @@
 #include <iomanip>

 #include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_grouped_gemm.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 #include "ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp"

+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/literals.hpp"

 namespace ck {
 namespace profiler {
@@ -43,17 +44,17 @@ bool profile_grouped_gemm_impl(int do_verification,

    bool pass = true;

+    using namespace ck::literals;
+
    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-            if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
+            if constexpr(is_same_v<decltype(layout), tensor_layout::gemm::RowMajor>)
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({stride, 1}));
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
            }
            else
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({1, stride}));
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
            }
        };

@@ -79,9 +80,9 @@ bool profile_grouped_gemm_impl(int do_verification,
        c_m_n_device_results.push_back(
            Tensor<CDataType>(f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{})));

-        std::cout << "group: " << i << " a_m_k[" << i << "]:" << a_m_k[i].mDesc << ", b_k_n[" << i
-                  << "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i
-                  << "]:" << c_m_n_device_results[i].mDesc << std::endl;
+        std::cout << "group: " << i << " a_m_k[" << i << "]:" << a_m_k[i].GetDesc() << ", b_k_n["
+                  << i << "]:" << b_k_n[i].GetDesc() << ", c_m_n_device_results[" << i
+                  << "]:" << c_m_n_device_results[i].GetDesc() << std::endl;

        std::size_t num_thread = 1;
        switch(init_method)
@@ -132,17 +133,15 @@ bool profile_grouped_gemm_impl(int do_verification,

    for(std::size_t i = 0; i < group_count; i++)
    {
-        a_device_buf.emplace_back(
-            std::make_unique<DeviceMem>(sizeof(ADataType) * a_m_k[i].mDesc.GetElementSpaceSize()));
-        b_device_buf.emplace_back(
-            std::make_unique<DeviceMem>(sizeof(BDataType) * b_k_n[i].mDesc.GetElementSpaceSize()));
+        a_device_buf.emplace_back(std::make_unique<DeviceMem>(a_m_k[i].GetMemorySize()));
+        b_device_buf.emplace_back(std::make_unique<DeviceMem>(b_k_n[i].GetMemorySize()));

-        c_device_buf.emplace_back(std::make_unique<DeviceMem>(
-            sizeof(CDataType) * c_m_n_device_results[i].mDesc.GetElementSpaceSize()));
+        c_device_buf.emplace_back(
+            std::make_unique<DeviceMem>(c_m_n_device_results[i].GetMemorySize()));

-        a_device_buf[i]->ToDevice(a_m_k[i].mData.data());
-        b_device_buf[i]->ToDevice(b_k_n[i].mData.data());
-        c_device_buf[i]->ToDevice(c_m_n_device_results[i].mData.data());
+        a_device_buf[i]->ToDevice(a_m_k[i].data());
+        b_device_buf[i]->ToDevice(b_k_n[i].data());
+        c_device_buf[i]->ToDevice(c_m_n_device_results[i].data());

        gemm_descs.push_back({Ms[i], Ns[i], Ks[i], StrideAs[i], StrideBs[i], StrideCs[i], {}});

@@ -207,7 +206,7 @@ bool profile_grouped_gemm_impl(int do_verification,
            std::size_t flop = 0, num_btype = 0;
            for(std::size_t i = 0; i < gemm_descs.size(); i++)
            {
-                flop += std::size_t(2) * Ms[i] * Ns[i] * Ks[i];
+                flop += 2_uz * Ms[i] * Ns[i] * Ks[i];

                num_btype += sizeof(ADataType) * Ms[i] * Ks[i] + sizeof(BDataType) * Ks[i] * Ns[i] +
                             sizeof(CDataType) * Ms[i] * Ns[i];
@@ -232,7 +231,7 @@ bool profile_grouped_gemm_impl(int do_verification,
                for(std::size_t i = 0; i < gemm_descs.size(); i++)
                {

-                    c_device_buf[i]->FromDevice(c_m_n_device_results[i].mData.data());
+                    c_device_buf[i]->FromDevice(c_m_n_device_results[i].data());

                    Tensor<CDataType> c_m_n_host_result(
                        f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{}));
@@ -257,19 +256,16 @@ bool profile_grouped_gemm_impl(int do_verification,
                                                              c_element_op);

                    ref_invoker.Run(ref_argument);
-                    pass = pass && ck::utils::check_err(c_m_n_device_results[i].mData,
-                                                        c_m_n_host_result.mData);
+                    pass = pass && ck::utils::check_err(c_m_n_device_results[i], c_m_n_host_result);

                    if(do_log)
                    {
-                        LogRangeAsType<float>(std::cout << "a : ", a_m_k[i].mData, ",")
-                            << std::endl;
-                        LogRangeAsType<float>(std::cout << "b: ", b_k_n[i].mData, ",") << std::endl;
+                        LogRangeAsType<float>(std::cout << "a : ", a_m_k[i], ",") << std::endl;
+                        LogRangeAsType<float>(std::cout << "b: ", b_k_n[i], ",") << std::endl;
                        LogRangeAsType<float>(
-                            std::cout << "c_device: ", c_m_n_device_results[i].mData, ",")
+                            std::cout << "c_device: ", c_m_n_device_results[i], ",")
                            << std::endl;
-                        LogRangeAsType<float>(
-                            std::cout << "c_host  : ", c_m_n_host_result.mData, ",")
+                        LogRangeAsType<float>(std::cout << "c_host  : ", c_m_n_host_result, ",")
                            << std::endl;
                    }
                }

--- a/profiler/include/profile_groupnorm_impl.hpp
+++ b/profiler/include/profile_groupnorm_impl.hpp
@@ -9,11 +9,11 @@

 #include "ck/library/tensor_operation_instance/gpu/layernorm.hpp"

+#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp"

 namespace ck {
 namespace profiler {
@@ -65,14 +65,14 @@ bool profile_groupnorm_impl(int do_verification,
        beta.GenerateTensorValue(GeneratorTensor_3<BetaDataType>{-0.5, 0.5});
    }

-    DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
-    DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
-    DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize());
-    DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize());
+    DeviceMem x_dev(x.GetMemorySize());
+    DeviceMem gamma_dev(gamma.GetMemorySize());
+    DeviceMem beta_dev(beta.GetMemorySize());
+    DeviceMem y_dev(y.GetMemorySize());

-    x_dev.ToDevice(x.mData.data());
-    gamma_dev.ToDevice(gamma.mData.data());
-    beta_dev.ToDevice(beta.mData.data());
+    x_dev.ToDevice(x.data());
+    gamma_dev.ToDevice(gamma.data());
+    beta_dev.ToDevice(beta.data());

    // add device normalization instances
    using DeviceOp = ck::tensor_operation::device::DeviceLayernorm<XDataType,
@@ -116,10 +116,10 @@ bool profile_groupnorm_impl(int do_verification,
    {
        auto argument_ptr = inst_ptr->MakeArgumentPointer(
            length,
-            std::vector<ck::index_t>{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()},
+            std::vector<ck::index_t>{x.GetStrides().begin(), x.GetStrides().end()},
            gammaBetaStride,
            gammaBetaStride,
-            std::vector<ck::index_t>{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()},
+            std::vector<ck::index_t>{y.GetStrides().begin(), y.GetStrides().end()},
            reduce_dim,
            1e-6,
            x_dev.GetDeviceBuffer(),
@@ -141,10 +141,10 @@ bool profile_groupnorm_impl(int do_verification,

        float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

-        std::size_t num_bytes = x.mDesc.GetElementSize() * sizeof(XDataType) +
-                                gamma.mDesc.GetElementSize() * sizeof(GammaDataType) +
-                                beta.mDesc.GetElementSize() * sizeof(BetaDataType) +
-                                y.mDesc.GetElementSize() * sizeof(YDataType);
+        std::size_t num_bytes = x.GetElementSize() * sizeof(XDataType) +
+                                gamma.GetElementSize() * sizeof(GammaDataType) +
+                                beta.GetElementSize() * sizeof(BetaDataType) +
+                                y.GetElementSize() * sizeof(YDataType);

        float gb_per_sec = num_bytes / 1.E6 / avg_time;

@@ -161,16 +161,15 @@ bool profile_groupnorm_impl(int do_verification,

        if(do_verification)
        {
-            y_dev.FromDevice(y.mData.data());
+            y_dev.FromDevice(y.data());

-            bool pass =
-                ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results", 1e-3, 1e-3);
+            bool pass = ck::utils::check_err(y, host_y, "Error: Incorrect results", 1e-3, 1e-3);

            if(do_log)
            {
-                LogRangeAsType<float>(std::cout << "x  : ", x.mData, ",") << std::endl;
-                LogRangeAsType<float>(std::cout << "host_y  : ", host_y.mData, ",") << std::endl;
-                LogRangeAsType<float>(std::cout << "y  : ", y.mData, ",") << std::endl;
+                LogRangeAsType<float>(std::cout << "x  : ", x, ",") << std::endl;
+                LogRangeAsType<float>(std::cout << "host_y  : ", host_y, ",") << std::endl;
+                LogRangeAsType<float>(std::cout << "y  : ", y, ",") << std::endl;
            }

            if(!pass)

--- a/profiler/include/profile_layernorm_impl.hpp
+++ b/profiler/include/profile_layernorm_impl.hpp
@@ -9,11 +9,11 @@

 #include "ck/library/tensor_operation_instance/gpu/layernorm.hpp"

+#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"

 namespace ck {
 namespace profiler {
@@ -72,14 +72,14 @@ void profile_layernorm_impl(int do_verification,
        y.GenerateTensorValue(GeneratorTensor_3<YDataType>{-0.5, 0.5});
    }

-    DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
-    DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
-    DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize());
-    DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize());
+    DeviceMem x_dev(x.GetMemorySize());
+    DeviceMem gamma_dev(gamma.GetMemorySize());
+    DeviceMem beta_dev(beta.GetMemorySize());
+    DeviceMem y_dev(y.GetMemorySize());

-    x_dev.ToDevice(x.mData.data());
-    gamma_dev.ToDevice(gamma.mData.data());
-    beta_dev.ToDevice(beta.mData.data());
+    x_dev.ToDevice(x.data());
+    gamma_dev.ToDevice(gamma.data());
+    beta_dev.ToDevice(beta.data());

    constexpr int NumReduceDim = Rank - 1;

@@ -149,10 +149,10 @@ void profile_layernorm_impl(int do_verification,

        float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

-        std::size_t num_bytes = x.mDesc.GetElementSize() * sizeof(XDataType) +
-                                gamma.mDesc.GetElementSize() * sizeof(GammaDataType) +
-                                beta.mDesc.GetElementSize() * sizeof(BetaDataType) +
-                                y.mDesc.GetElementSize() * sizeof(YDataType);
+        std::size_t num_bytes = x.GetElementSize() * sizeof(XDataType) +
+                                gamma.GetElementSize() * sizeof(GammaDataType) +
+                                beta.GetElementSize() * sizeof(BetaDataType) +
+                                y.GetElementSize() * sizeof(YDataType);

        float gb_per_sec = num_bytes / 1.E6 / avg_time;

@@ -168,16 +168,15 @@ void profile_layernorm_impl(int do_verification,

        if(do_verification)
        {
-            y_dev.FromDevice(y.mData.data());
+            y_dev.FromDevice(y.data());

-            bool pass = ck::utils::check_err(
-                y.mData, host_y.mData, "Error: Incorrect results d1", 1e-3, 1e-3);
+            bool pass = ck::utils::check_err(y, host_y, "Error: Incorrect results d1", 1e-3, 1e-3);

            if(do_log)
            {
-                LogRangeAsType<float>(std::cout << "x  : ", x.mData, ",") << std::endl;
-                LogRangeAsType<float>(std::cout << "host_y  : ", host_y.mData, ",") << std::endl;
-                LogRangeAsType<float>(std::cout << "y  : ", y.mData, ",") << std::endl;
+                LogRangeAsType<float>(std::cout << "x  : ", x, ",") << std::endl;
+                LogRangeAsType<float>(std::cout << "host_y  : ", host_y, ",") << std::endl;
+                LogRangeAsType<float>(std::cout << "y  : ", y, ",") << std::endl;
            }

            if(!pass)

--- a/profiler/include/profile_normalization_impl.hpp
+++ b/profiler/include/profile_normalization_impl.hpp
@@ -6,15 +6,16 @@
 #include <iomanip>

 #include "ck/ck.hpp"
+#include "ck/utility/data_type.hpp"
+#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
-#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "ck/utility/data_type.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -87,7 +88,7 @@ void profile_normalization_impl(int do_verification,

    Tensor<InDataType> in = in_strides.empty() ? Tensor<InDataType>(in_length)
                                               : Tensor<InDataType>(in_length, in_strides);
-    Tensor<OutDataType> out(in.mDesc);
+    Tensor<OutDataType> out(in.GetDesc());

    switch(init_method)
    {
@@ -107,13 +108,13 @@ void profile_normalization_impl(int do_verification,

    Tensor<OutDataType> out_ref(out);

-    DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
-    DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize());
-    in_dev.ToDevice(in.mData.data());
-    out_dev.ToDevice(out.mData.data());
+    DeviceMem in_dev(in.GetMemorySize());
+    DeviceMem out_dev(out.GetMemorySize());
+    in_dev.ToDevice(in.data());
+    out_dev.ToDevice(out.data());

-    std::vector<index_t> i_in_lengths(in.mDesc.GetLengths().begin(), in.mDesc.GetLengths().end());
-    std::vector<index_t> i_in_strides(in.mDesc.GetStrides().begin(), in.mDesc.GetStrides().end());
+    std::vector<index_t> i_in_lengths(in.GetLengths().begin(), in.GetLengths().end());
+    std::vector<index_t> i_in_strides(in.GetStrides().begin(), in.GetStrides().end());

    // add device softmax instances
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
@@ -189,9 +190,8 @@ void profile_normalization_impl(int do_verification,

        float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

-        std::size_t num_bytes =
-            in.mDesc.GetElementSize() * sizeof(InDataType) +
-            (beta == 0.0f ? 1 : 2) * out.mDesc.GetElementSize() * sizeof(OutDataType);
+        std::size_t num_bytes = in.GetElementSize() * sizeof(InDataType) +
+                                (beta == 0.0f ? 1 : 2) * out.GetElementSize() * sizeof(OutDataType);

        float gb_per_sec = num_bytes / 1.E6 / avg_time;

@@ -213,30 +213,27 @@ void profile_normalization_impl(int do_verification,

            ReferenceFactory{}.MakeInvoker().Run({in, out_ref, alpha, beta, reduce_dims});

-            out_dev.FromDevice(out.mData.data());
+            out_dev.FromDevice(out.data());

            bool pass;
-            if(std::is_same<InDataType, int8_t>::value)
+            if constexpr(std::is_same_v<InDataType, int8_t>)
            {
-                pass = ck::utils::check_err(
-                    out.mData, out_ref.mData, "Error: Incorrect results!", 0, 1);
+                pass = ck::utils::check_err(out, out_ref, "Error: Incorrect results!", 0, 1);
                if(do_log)
                {
-                    LogRangeAsType<int>(std::cout << "in  : ", in.mData, ",") << std::endl;
-                    LogRangeAsType<int>(std::cout << "out_ref  : ", out_ref.mData, ",")
-                        << std::endl;
-                    LogRangeAsType<int>(std::cout << "out  : ", out.mData, ",") << std::endl;
+                    LogRangeAsType<int>(std::cout << "in  : ", in, ",") << std::endl;
+                    LogRangeAsType<int>(std::cout << "out_ref  : ", out_ref, ",") << std::endl;
+                    LogRangeAsType<int>(std::cout << "out  : ", out, ",") << std::endl;
                }
            }
            else
            {
-                pass = ck::utils::check_err(out.mData, out_ref.mData);
+                pass = ck::utils::check_err(out, out_ref);
                if(do_log)
                {
-                    LogRangeAsType<float>(std::cout << "in  : ", in.mData, ",") << std::endl;
-                    LogRangeAsType<float>(std::cout << "out_ref  : ", out_ref.mData, ",")
-                        << std::endl;
-                    LogRangeAsType<float>(std::cout << "out  : ", out.mData, ",") << std::endl;
+                    LogRangeAsType<float>(std::cout << "in  : ", in, ",") << std::endl;
+                    LogRangeAsType<float>(std::cout << "out_ref  : ", out_ref, ",") << std::endl;
+                    LogRangeAsType<float>(std::cout << "out  : ", out, ",") << std::endl;
                }
            }


--- a/profiler/include/profile_reduce_impl.hpp
+++ b/profiler/include/profile_reduce_impl.hpp
@@ -3,11 +3,13 @@

 #pragma once

-#include "ck/utility/reduction_enums.hpp"
 #include "ck/tensor_operation/gpu/device/device_reduce.hpp"
+#include "ck/utility/reduction_enums.hpp"

-#include "ck/library/utility/check_err.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp"
+
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_reduction.hpp"
 #include "ck/library/utility/host_common_util.hpp"
@@ -214,11 +216,11 @@ bool profile_reduce_impl_impl(bool do_verification,
        Tensor<int32_t> out_indices_ref(outLengths);
        Tensor<int32_t> out_indices(outLengths);

-        auto inStrides  = in.mDesc.GetStrides();
-        auto outStrides = out.mDesc.GetStrides();
+        auto inStrides  = in.GetStrides();
+        auto outStrides = out.GetStrides();

-        size_t invariant_total_length = out.mDesc.GetElementSize();
-        size_t reduce_total_length    = in.mDesc.GetElementSize() / invariant_total_length;
+        size_t invariant_total_length = out.GetElementSize();
+        size_t reduce_total_length    = in.GetElementSize() / invariant_total_length;

        std::size_t num_thread = 1;

@@ -245,20 +247,21 @@ bool profile_reduce_impl_impl(bool do_verification,
            }

            if(beta != 0.0f)
-                for(size_t i = 0; i < out_ref.mDesc.GetElementSpaceSize(); i++)
-                    out.mData[i] = out_ref.mData[i];
+            {
+                ck::ranges::copy(out_ref, out.begin());
+            }
        };

        // these buffers are usually provided by the user application
-        DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
-        DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize());
+        DeviceMem in_dev(in.GetMemorySize());
+        DeviceMem out_dev(out.GetMemorySize());

-        in_dev.ToDevice(in.mData.data());
+        in_dev.ToDevice(in.data());

        if(beta != 0.0f)
-            out_dev.ToDevice(out.mData.data());
+            out_dev.ToDevice(out.data());

-        size_t indicesSizeInBytes = OutputIndex ? out.mDesc.GetElementSize() * sizeof(int) : 0;
+        size_t indicesSizeInBytes = OutputIndex ? out.GetElementSize() * sizeof(int) : 0;

        DeviceMem out_indices_dev(indicesSizeInBytes);

@@ -331,13 +334,13 @@ bool profile_reduce_impl_impl(bool do_verification,
                          NumReduceDim,
                          PropagateNan,
                          OutputIndex>
-                hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
+                hostReduce(in.GetDesc(), out_ref.GetDesc(), invariantDims, reduceDims);

            hostReduce.Run(alpha,
-                           in.mData.data(),
+                           in.data(),
                           beta,
-                           out_ref.mData.data(),
-                           out_indices_ref.mData.data(),
+                           out_ref.data(),
+                           out_indices_ref.data(),
                           in_elementwise_op,
                           acc_elementwise_op);
        };
@@ -398,14 +401,13 @@ bool profile_reduce_impl_impl(bool do_verification,
            {
                bool single_pass;

-                out_dev.FromDevice(out.mData.data());
-                single_pass = ck::utils::check_err(out.mData, out_ref.mData);
+                out_dev.FromDevice(out.data());
+                single_pass = ck::utils::check_err(out, out_ref);

                if(OutputIndex)
                {
-                    out_indices_dev.FromDevice(out_indices.mData.data());
-                    single_pass = single_pass &&
-                                  ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
+                    out_indices_dev.FromDevice(out_indices.data());
+                    single_pass = single_pass && ck::utils::check_err(out_indices, out_indices_ref);
                };

                if(!single_pass)
@@ -418,18 +420,16 @@ bool profile_reduce_impl_impl(bool do_verification,

            if(do_dumpout)
            {
-                dumpBufferToFile("dump_in.bin", in.mData.data(), in.mDesc.GetElementSize());
-                dumpBufferToFile("dump_out.bin", out.mData.data(), out.mDesc.GetElementSize());
-                dumpBufferToFile(
-                    "dump_out_host.bin", out_ref.mData.data(), out_ref.mDesc.GetElementSize());
+                dumpBufferToFile("dump_in.bin", in.data(), in.GetElementSize());
+                dumpBufferToFile("dump_out.bin", out.data(), out.GetElementSize());
+                dumpBufferToFile("dump_out_host.bin", out_ref.data(), out_ref.GetElementSize());
                if(OutputIndex)
                {
-                    dumpBufferToFile("dump_indices.bin",
-                                     out_indices.mData.data(),
-                                     out_indices.mDesc.GetElementSize());
+                    dumpBufferToFile(
+                        "dump_indices.bin", out_indices.data(), out_indices.GetElementSize());
                    dumpBufferToFile("dump_indices_host.bin",
-                                     out_indices_ref.mData.data(),
-                                     out_indices_ref.mDesc.GetElementSize());
+                                     out_indices_ref.data(),
+                                     out_indices_ref.GetElementSize());
                };
            };
        };

--- a/test/data_type/int4.cpp
+++ b/test/data_type/int4.cpp
@@ -98,8 +98,8 @@ TEST(Int4, CopyAsI8PositiveValue)

    d_src_i4.ToDevice(h_src_i4.data());

-    copy<<<1, 64>>>(reinterpret_cast<const int4_t*>(d_src_i4.GetDeviceBuffer()),
-                    reinterpret_cast<std::int8_t*>(d_dst_i8.GetDeviceBuffer()),
+    copy<<<1, 64>>>(static_cast<const int4_t*>(d_src_i4.GetDeviceBuffer()),
+                    static_cast<std::int8_t*>(d_dst_i8.GetDeviceBuffer()),
                    SIZE);
    hip_check_error(hipDeviceSynchronize());
    d_dst_i8.FromDevice(h_dst_i8.data());
@@ -125,8 +125,8 @@ TEST(Int4, DISABLED_CopyAsI8NegativeValue)

    d_src_i4.ToDevice(h_src_i4.data());

-    copy<<<1, 64>>>(reinterpret_cast<const int4_t*>(d_src_i4.GetDeviceBuffer()),
-                    reinterpret_cast<std::int8_t*>(d_dst_i8.GetDeviceBuffer()),
+    copy<<<1, 64>>>(static_cast<const int4_t*>(d_src_i4.GetDeviceBuffer()),
+                    static_cast<std::int8_t*>(d_dst_i8.GetDeviceBuffer()),
                    SIZE);
    hip_check_error(hipDeviceSynchronize());
    d_dst_i8.FromDevice(h_dst_i8.data());
@@ -152,8 +152,8 @@ TEST(Int4, CopyAsI8NegativeValueStaticCast)

    d_src_i4.ToDevice(h_src_i4.data());

-    copy_with_static_cast<<<1, 64>>>(reinterpret_cast<const int4_t*>(d_src_i4.GetDeviceBuffer()),
-                                     reinterpret_cast<std::int8_t*>(d_dst_i8.GetDeviceBuffer()),
+    copy_with_static_cast<<<1, 64>>>(static_cast<const int4_t*>(d_src_i4.GetDeviceBuffer()),
+                                     static_cast<std::int8_t*>(d_dst_i8.GetDeviceBuffer()),
                                     SIZE);
    hip_check_error(hipDeviceSynchronize());
    d_dst_i8.FromDevice(h_dst_i8.data());

--- a/test/gemm/gemm_util.hpp
+++ b/test/gemm/gemm_util.hpp
@@ -5,11 +5,13 @@

 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/literals.hpp"

 namespace ck {
 namespace gemm_util {
@@ -71,9 +73,9 @@ bool RunDeviceGEMM(DeviceGemmPtr_& gemmPtr,
                   BElementwiseOperation b_element_op,
                   CElementwiseOperation c_element_op)
 {
-    DeviceMem a_m_k_device_buf(sizeof(ADataType) * A.mDesc.GetElementSpaceSize());
-    DeviceMem b_k_n_device_buf(sizeof(BDataType) * B.mDesc.GetElementSpaceSize());
-    DeviceMem c_m_n_device_buf(sizeof(CDataType) * C.mDesc.GetElementSpaceSize());
+    DeviceMem a_m_k_device_buf(A.GetMemorySize());
+    DeviceMem b_k_n_device_buf(B.GetMemorySize());
+    DeviceMem c_m_n_device_buf(C.GetMemorySize());

    auto invoker_ptr = gemmPtr->MakeInvokerPointer();
    auto argument_ptr =
@@ -92,10 +94,10 @@ bool RunDeviceGEMM(DeviceGemmPtr_& gemmPtr,

    if(gemmPtr->IsSupportedArgument(argument_ptr.get()))
    {
-        a_m_k_device_buf.ToDevice(A.mData.data());
-        b_k_n_device_buf.ToDevice(B.mData.data());
+        a_m_k_device_buf.ToDevice(A.data());
+        b_k_n_device_buf.ToDevice(B.data());
        invoker_ptr->Run(argument_ptr.get());
-        c_m_n_device_buf.FromDevice(C.mData.data());
+        c_m_n_device_buf.FromDevice(C.data());

        return true;
    }
@@ -124,17 +126,17 @@ struct TestGemm
 {
    auto PrepareGemmTensor(const ck::gemm_util::GemmParams& params)
    {
+        using namespace ck::literals;
+
        auto f_host_tensor_descriptor =
            [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-                if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+                if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
                {
-                    return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                                std::vector<std::size_t>({stride, 1}));
+                    return HostTensorDescriptor({row, col}, {stride, 1_uz});
                }
                else
                {
-                    return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                                std::vector<std::size_t>({1, stride}));
+                    return HostTensorDescriptor({row, col}, {1_uz, stride});
                }
            };

@@ -204,29 +206,29 @@ struct TestGemm
        {
            // Assert
            bool res = false;
-            if(std::is_same<CDataType, float>::value)
+            if constexpr(std::is_same_v<CDataType, float>)
            {
-                res = ck::utils::check_err(c_device.mData, c_host.mData);
+                res = ck::utils::check_err(c_device, c_host);
                std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
            }
-            else if(std::is_same<CDataType, ck::half_t>::value)
+            else if constexpr(std::is_same_v<CDataType, ck::half_t>)
            {
-                res = ck::utils::check_err(c_device.mData, c_host.mData);
+                res = ck::utils::check_err(c_device, c_host);
                std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
            }
-            else if(std::is_same<CDataType, ck::bhalf_t>::value)
+            else if constexpr(std::is_same_v<CDataType, ck::bhalf_t>)
            {
-                res = ck::utils::check_err(c_device.mData, c_host.mData);
+                res = ck::utils::check_err(c_device, c_host);
                std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
            }
-            else if(std::is_same<CDataType, int8_t>::value)
+            else if constexpr(std::is_same_v<CDataType, int8_t>)
            {
-                res = ck::utils::check_err(c_device.mData, c_host.mData);
+                res = ck::utils::check_err(c_device, c_host);
                std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
            }
-            else if(std::is_same<CDataType, double>::value)
+            else if constexpr(std::is_same_v<CDataType, double>)
            {
-                res = ck::utils::check_err(c_device.mData, c_host.mData);
+                res = ck::utils::check_err(c_device, c_host);
                std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
            }