Commit aea62819 authored by Chaitanya Inumella

Rebase branch 'develop' of https://github.com/ROCmSoftwarePlatform/composable_kernel into contraction_hipTENSOR
parents 75af5450 75ab874e
add_custom_target(test_layernorm)
add_gtest_executable(test_layernorm_fp32 test_layernorm_fp32.cpp)
add_gtest_executable(test_layernorm_fp16 test_layernorm_fp16.cpp)
target_link_libraries(test_layernorm_fp32 PRIVATE utility)
target_link_libraries(test_layernorm_fp16 PRIVATE utility)
add_dependencies(test_layernorm test_layernorm_fp32)
add_dependencies(test_layernorm test_layernorm_fp16)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "test_layernorm_util.hpp"
template <ck::index_t N>
using I = ck::Number<N>;
template <typename Tuple>
class TestLayernormFP16 : public ck::TestLayernorm<Tuple>
{
};
// clang-format off
using KernelTypes = ::testing::Types<
// XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize>
std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>,
std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<8>, I<32>, I<2>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>,
std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<4>, I<64>, I<1>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>,
std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<4>, I<64>, I<2>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>,
std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<2>, I<128>, I<1>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>,
std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<2>, I<128>, I<2>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>,
std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>,
std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<1>, I<256>, I<2>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>
>;
// clang-format on
TYPED_TEST_SUITE(TestLayernormFP16, KernelTypes);
TYPED_TEST(TestLayernormFP16, Test_FP16) { this->Run(); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "test_layernorm_util.hpp"
template <ck::index_t N>
using I = ck::Number<N>;
template <typename Tuple>
class TestLayernormFP32 : public ck::TestLayernorm<Tuple>
{
};
// clang-format off
using KernelTypes = ::testing::Types<
// XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize>
std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>>,
std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<8>, I<32>, I<2>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>>,
std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<4>, I<64>, I<1>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>>,
std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<4>, I<64>, I<2>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>>,
std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<2>, I<128>, I<1>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>>,
std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<2>, I<128>, I<2>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>>,
std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>>,
std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<1>, I<256>, I<2>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>>
>;
// clang-format on
TYPED_TEST_SUITE(TestLayernormFP32, KernelTypes);
TYPED_TEST(TestLayernormFP32, Test_FP32) { this->Run(); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include <iostream>
#include <gtest/gtest.h>
#include "ck/ck.hpp"
#include "ck/utility/number.hpp"
#include "ck/tensor_operation/gpu/device/device_layernorm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
namespace ck {
template <typename Range>
std::string serialize_range(const Range& range)
{
std::stringstream ss;
for(auto& r : range)
{
ss << r << ", ";
}
std::string str = ss.str();
// Guard against an empty range before stripping the trailing ", ".
return str.empty() ? str : std::string(str.begin(), str.end() - 2);
}
template <typename Tuple>
class TestLayernorm : public ::testing::Test
{
protected:
using XDataType = std::tuple_element_t<0, Tuple>;
using GammaDataType = std::tuple_element_t<1, Tuple>;
using BetaDataType = std::tuple_element_t<2, Tuple>;
using AccDataType = std::tuple_element_t<3, Tuple>;
using YDataType = std::tuple_element_t<4, Tuple>;
static constexpr index_t Rank = std::tuple_element_t<5, Tuple>{}.value;
static constexpr index_t NumReduceDim = std::tuple_element_t<6, Tuple>{}.value;
static constexpr index_t BlockSize = std::tuple_element_t<7, Tuple>{}.value;
static constexpr index_t MThreadClusterSize = std::tuple_element_t<8, Tuple>{}.value;
static constexpr index_t KThreadClusterSize = std::tuple_element_t<9, Tuple>{}.value;
static constexpr index_t MThreadSliceSize = std::tuple_element_t<10, Tuple>{}.value;
static constexpr index_t KThreadSliceSize = std::tuple_element_t<11, Tuple>{}.value;
static constexpr index_t XYSrcVectorDim = std::tuple_element_t<12, Tuple>{}.value;
static constexpr index_t XSrcVectorSize = std::tuple_element_t<13, Tuple>{}.value;
static constexpr index_t GammaSrcVectorSize = std::tuple_element_t<14, Tuple>{}.value;
static constexpr index_t BetaSrcVectorSize = std::tuple_element_t<15, Tuple>{}.value;
static constexpr index_t YDstVectorSize = std::tuple_element_t<16, Tuple>{}.value;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ReferenceInstance = tensor_operation::host::ReferenceLayernorm<XDataType,
GammaDataType,
BetaDataType,
YDataType,
AccDataType,
PassThrough,
Rank,
NumReduceDim>;
using DeviceInstance = tensor_operation::device::DeviceLayernorm<XDataType,
GammaDataType,
BetaDataType,
AccDataType,
YDataType,
PassThrough,
Rank,
NumReduceDim,
BlockSize,
MThreadClusterSize,
KThreadClusterSize,
MThreadSliceSize,
KThreadSliceSize,
XYSrcVectorDim,
XSrcVectorSize,
GammaSrcVectorSize,
BetaSrcVectorSize,
YDstVectorSize>;
TestLayernorm() : ref_instance_invoker_(ReferenceInstance{}.MakeInvoker()) {}
void RunSingle(std::vector<index_t> lengths, std::vector<index_t> reduceDims)
{
std::vector<index_t> reduceLength(reduceDims.size());
for(int i = 0; i < NumReduceDim; ++i)
{
reduceLength[i] = lengths[reduceDims[i]];
}
Tensor<XDataType> x(lengths);
Tensor<GammaDataType> gamma(reduceLength);
Tensor<BetaDataType> beta(reduceLength);
Tensor<YDataType> y(lengths);
Tensor<YDataType> y_ref(lengths);
x.GenerateTensorValue(GeneratorTensor_3<XDataType>{0.0, 1.0});
gamma.GenerateTensorValue(GeneratorTensor_3<GammaDataType>{0.0, 1.0});
beta.GenerateTensorValue(GeneratorTensor_3<BetaDataType>{0.0, 1.0});
DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize());
DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize());
x_dev.ToDevice(x.mData.data());
gamma_dev.ToDevice(gamma.mData.data());
beta_dev.ToDevice(beta.mData.data());
auto device_instance = DeviceInstance{};
auto argument_ptr = device_instance.MakeArgumentPointer(
lengths,
std::vector<ck::index_t>{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()},
std::vector<ck::index_t>{gamma.mDesc.GetStrides().begin(),
gamma.mDesc.GetStrides().end()},
std::vector<ck::index_t>{beta.mDesc.GetStrides().begin(),
beta.mDesc.GetStrides().end()},
reduceDims,
1e-4,
x_dev.GetDeviceBuffer(),
gamma_dev.GetDeviceBuffer(),
beta_dev.GetDeviceBuffer(),
y_dev.GetDeviceBuffer(),
PassThrough{});
if(!device_instance.IsSupportedArgument(argument_ptr.get()))
{
return;
}
auto invoker_ptr = device_instance.MakeInvokerPointer();
invoker_ptr->Run(argument_ptr.get());
ref_instance_invoker_.Run(
{x, gamma, beta, y_ref, PassThrough{}, lengths, reduceDims, 1e-4});
y_dev.FromDevice(y.mData.data());
bool pass;
if(std::is_same<XDataType, int8_t>::value)
{
EXPECT_TRUE(pass = ck::utils::check_err(
y.mData, y_ref.mData, "Error: Incorrect results!", 0, 1));
}
else
{
EXPECT_TRUE(pass = ck::utils::check_err(
y.mData, y_ref.mData, "Error: Incorrect results!", 1e-3, 1e-3));
}
if(!pass)
{
FAIL() << "Failure in input lengths = [" << serialize_range(lengths) << "], "
<< "reduce dim = [" << serialize_range(reduceDims) << "].";
}
}
void Run()
{
for(auto length : this->lengths_)
{
this->RunSingle(length, reduceDims_[0]);
}
}
std::vector<std::vector<index_t>> lengths_ = {
{4, 256}, {8, 511}, {9, 1032}, {4, 2048}, {1, 8192}, {4000, 2000}};
std::vector<std::vector<index_t>> reduceDims_ = {{1}};
typename ReferenceInstance::Invoker ref_instance_invoker_;
};
} // namespace ck
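For orientation, the normalization these cases verify is the standard layernorm: each row is centered by its mean, scaled by the inverse standard deviation (with the epsilon passed above, 1e-4), then scaled and shifted by gamma and beta. Below is a minimal host-side sketch for the 2D [M, K] shapes in lengths_; it is illustrative only, not the actual ck::tensor_operation::host::ReferenceLayernorm implementation, and the function name is made up.

// Minimal host-side layernorm for an [M, K] tensor reduced over K — an
// illustrative stand-in for what the reference instance computes.
#include <cmath>
#include <vector>

std::vector<float> naive_layernorm_2d(const std::vector<float>& x,
                                      const std::vector<float>& gamma,
                                      const std::vector<float>& beta,
                                      int M,
                                      int K,
                                      float eps = 1e-4f)
{
    std::vector<float> y(x.size());
    for(int m = 0; m < M; ++m)
    {
        // Mean and (biased) variance over the reduce dimension.
        float mean = 0.f;
        for(int k = 0; k < K; ++k)
            mean += x[m * K + k];
        mean /= K;
        float var = 0.f;
        for(int k = 0; k < K; ++k)
        {
            const float d = x[m * K + k] - mean;
            var += d * d;
        }
        var /= K;
        // Normalize, then apply the per-column scale (gamma) and shift (beta).
        const float inv_std = 1.f / std::sqrt(var + eps);
        for(int k = 0; k < K; ++k)
            y[m * K + k] = gamma[k] * (x[m * K + k] - mean) * inv_std + beta[k];
    }
    return y;
}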
 add_test_executable(test_magic_number_division magic_number_division.cpp)
-target_link_libraries(test_magic_number_division PRIVATE host_tensor)
+target_link_libraries(test_magic_number_division PRIVATE utility)
@@ -9,9 +9,9 @@
 #include "ck/ck.hpp"
 #include "ck/utility/magic_division.hpp"
 #include "ck/library/utility/check_err.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
 __global__ void gpu_magic_number_division(uint32_t magic_multiplier,
                                           uint32_t magic_shift,
...
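For context, the kernel above exercises "magic number" division: integer division by a fixed divisor d is replaced with one widening multiply and one shift, using a precomputed (multiplier, shift) pair like the magic_multiplier/magic_shift arguments in the kernel signature. The following is a minimal standalone sketch of the classic Granlund-Montgomery precomputation; it is illustrative only — the real helper is in ck/utility/magic_division.hpp and its API differs — and MagicDivider is a hypothetical name.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Precompute (multiplier, shift) such that n / d == (n * multiplier) >> shift
// for every uint32_t n. Hypothetical helper, not the ck::magic_division API.
struct MagicDivider
{
    uint64_t multiplier;
    uint32_t shift;

    explicit MagicDivider(uint32_t d)
    {
        uint32_t l = 0; // l = ceil(log2(d))
        while((uint64_t{1} << l) < d)
            ++l;
        shift = 32 + l;
        // multiplier = ceil(2^shift / d); may need more than 64 bits of
        // intermediate precision, so compute in 128-bit (GCC/Clang extension).
        multiplier = static_cast<uint64_t>(((__uint128_t{1} << shift) + d - 1) / d);
    }

    uint32_t divide(uint32_t n) const
    {
        // One widening multiply and one shift instead of an integer divide.
        return static_cast<uint32_t>((__uint128_t{n} * multiplier) >> shift);
    }
};

int main()
{
    for(uint32_t d : {1u, 3u, 7u, 255u, 1000u, 65537u})
    {
        const MagicDivider magic(d);
        for(uint32_t n : {0u, 1u, 2u, 99u, 4096u, 123456789u, 0xFFFFFFFFu})
            assert(magic.divide(n) == n / d);
    }
    std::puts("magic number division matches plain integer division");
    return 0;
}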
 add_test_executable(test_reduce_no_index reduce_no_index.cpp)
 add_test_executable(test_reduce_with_index reduce_with_index.cpp)
-target_link_libraries(test_reduce_no_index PRIVATE host_tensor)
+target_link_libraries(test_reduce_no_index PRIVATE utility)
 target_link_libraries(test_reduce_no_index PRIVATE device_reduce_instance)
-target_link_libraries(test_reduce_with_index PRIVATE host_tensor)
+target_link_libraries(test_reduce_with_index PRIVATE utility)
 target_link_libraries(test_reduce_with_index PRIVATE device_reduce_instance)
@@ -3,7 +3,7 @@
 #include <getopt.h>
-#include "ck/library/host_tensor/host_common_util.hpp"
+#include "ck/library/utility/host_common_util.hpp"
 #include "profiler/include/profile_reduce_impl.hpp"
 using namespace ck;
...
@@ -3,7 +3,7 @@
 #include <getopt.h>
-#include "ck/library/host_tensor/host_common_util.hpp"
+#include "ck/library/utility/host_common_util.hpp"
 #include "profiler/include/profile_reduce_impl.hpp"
 using namespace ck;
...
 add_gtest_executable(test_reference_conv_fwd reference_conv_fwd.cpp)
-target_link_libraries(test_reference_conv_fwd PRIVATE host_tensor conv_util)
+target_link_libraries(test_reference_conv_fwd PRIVATE utility)
@@ -13,74 +13,64 @@
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/conv_util.hpp"
 #include "ck/library/utility/fill.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
 namespace {
 using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
 using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
 using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
-template <ck::index_t NDim,
+template <ck::index_t NDimSpatial,
           typename InDataType  = float,
           typename WeiDataType = float,
           typename OutDataType = float,
-          typename InLayout    = ck::tensor_layout::convolution::NHWC,
-          typename WeiLayout   = ck::tensor_layout::convolution::KYXC,
-          typename OutLayout   = ck::tensor_layout::convolution::NHWK,
+          typename InLayout    = ck::tensor_layout::convolution::GNHWC,
+          typename WeiLayout   = ck::tensor_layout::convolution::GKYXC,
+          typename OutLayout   = ck::tensor_layout::convolution::GNHWK,
           typename FillInputOp   = ck::utils::FillMonotonicSeq<InDataType>,
           typename FillWeightsOp = ck::utils::FillConstant<WeiDataType>>
 Tensor<OutDataType>
-run_reference_convolution_forward(const ck::utils::conv::ConvParams& params,
+run_reference_convolution_forward(const ck::utils::conv::ConvParam& conv_param,
                                   const FillInputOp& fill_input_op     = FillInputOp{},
                                   const FillWeightsOp& fill_weights_op = FillWeightsOp{0.5f})
 {
-    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N_),
-                                        static_cast<std::size_t>(params.C_)};
-    input_dims.insert(std::end(input_dims),
-                      std::begin(params.input_spatial_lengths_),
-                      std::end(params.input_spatial_lengths_));
-    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K_),
-                                         static_cast<std::size_t>(params.C_)};
-    filter_dims.insert(std::end(filter_dims),
-                       std::begin(params.filter_spatial_lengths_),
-                       std::end(params.filter_spatial_lengths_));
-    const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
-    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N_),
-                                         static_cast<std::size_t>(params.K_)};
-    output_dims.insert(std::end(output_dims),
-                       std::begin(output_spatial_lengths),
-                       std::end(output_spatial_lengths));
-    Tensor<InDataType> input(ck::utils::conv::get_host_tensor_descriptor(input_dims, InLayout{}));
-    Tensor<WeiDataType> weights(
-        ck::utils::conv::get_host_tensor_descriptor(filter_dims, WeiLayout{}));
-    Tensor<OutDataType> host_output(
-        ck::utils::conv::get_host_tensor_descriptor(output_dims, OutLayout{}));
+    const auto in_g_n_c_wis_desc =
+        ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
+    const auto wei_g_k_c_xs_desc =
+        ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
+    const auto out_g_n_k_wos_desc =
+        ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
+    Tensor<InDataType> input(in_g_n_c_wis_desc);
+    Tensor<WeiDataType> weights(wei_g_k_c_xs_desc);
+    Tensor<OutDataType> host_output(out_g_n_k_wos_desc);
     fill_input_op(input.begin(), input.end());
     fill_weights_op(weights.begin(), weights.end());
     std::fill(host_output.begin(), host_output.end(), OutDataType(0.f));
-    auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<InDataType,
+    auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
+                                                                 InDataType,
                                                                  WeiDataType,
                                                                  OutDataType,
                                                                  InElementOp,
                                                                  WeiElementOp,
-                                                                 OutElementOp,
-                                                                 NDim>();
+                                                                 OutElementOp>();
     auto ref_invoker  = ref_conv.MakeInvoker();
     auto ref_argument = ref_conv.MakeArgument(input,
                                               weights,
                                               host_output,
-                                              params.conv_filter_strides_,
-                                              params.conv_filter_dilations_,
-                                              params.input_left_pads_,
-                                              params.input_right_pads_,
+                                              conv_param.conv_filter_strides_,
+                                              conv_param.conv_filter_dilations_,
+                                              conv_param.input_left_pads_,
+                                              conv_param.input_right_pads_,
                                               InElementOp{},
                                               WeiElementOp{},
                                               OutElementOp{});
@@ -91,21 +81,29 @@ run_reference_convolution_forward(const ck::utils::conv::ConvParam& conv_param,
 } // anonymous namespace
-TEST(ReferenceConvolutionFWD, Conv2DNHWC)
+// Reference convolution assumes the dimensions of the tensor descriptors are in
+// GNCDHW/GKCZYX/GNKDHW order, regardless of the physical tensor layouts in memory.
+// Some tests below assume the tensor descriptor dimensions can be in a different
+// order and are therefore disabled.
+// TODO: add more tests that comply with the dimension-order assumption of the
+// reference convolution, and add tests for more physical layouts.
+#if 0
+TEST(ReferenceConvolutionFWD, Conv2DGNHWC)
 {
-    ck::utils::conv::ConvParams params;
-    params.N_                      = 1;
-    params.K_                      = 1;
-    params.C_                      = 2;
-    params.filter_spatial_lengths_ = std::vector<ck::index_t>{3, 3};
-    params.input_spatial_lengths_  = std::vector<ck::index_t>{6, 6};
-    params.conv_filter_strides_    = std::vector<ck::index_t>{1, 1};
-    params.conv_filter_dilations_  = std::vector<ck::index_t>{1, 1};
-    params.input_left_pads_        = std::vector<ck::index_t>{0, 0};
-    params.input_right_pads_       = std::vector<ck::index_t>{0, 0};
-    auto out_tensor = run_reference_convolution_forward<2>(params);
-    std::vector<std::size_t> ref_dims{1, 1, 4, 4};
+    ck::utils::conv::ConvParam conv_param(2,
+                                          1,
+                                          1,
+                                          1,
+                                          2,
+                                          std::vector<ck::index_t>{3, 3},
+                                          std::vector<ck::index_t>{6, 6},
+                                          std::vector<ck::index_t>{1, 1},
+                                          std::vector<ck::index_t>{1, 1},
+                                          std::vector<ck::index_t>{0, 0},
+                                          std::vector<ck::index_t>{0, 0});
+    auto out_tensor = run_reference_convolution_forward<2>(conv_param);
+    std::vector<std::size_t> ref_dims{1, 1, 4, 4, 1};
     std::vector<float> ref_data{130.5,
                                 148.5,
                                 166.5,
@@ -127,21 +125,22 @@ TEST(ReferenceConvolutionFWD, Conv2DGNHWC)
     EXPECT_TRUE(ck::utils::check_err(out_tensor.mData, ref_data, "Error: incorrect results!"));
 }
-TEST(ReferenceConvolutionFWD, Conv2DNHWCStridesDilationsPadding)
+TEST(ReferenceConvolutionFWD, Conv2DGNHWCStridesDilationsPadding)
 {
-    ck::utils::conv::ConvParams params;
-    params.N_                      = 1;
-    params.K_                      = 2;
-    params.C_                      = 2;
-    params.filter_spatial_lengths_ = std::vector<ck::index_t>{3, 3};
-    params.input_spatial_lengths_  = std::vector<ck::index_t>{12, 12};
-    params.conv_filter_strides_    = std::vector<ck::index_t>{2, 2};
-    params.conv_filter_dilations_  = std::vector<ck::index_t>{2, 2};
-    params.input_left_pads_        = std::vector<ck::index_t>{1, 1};
-    params.input_right_pads_       = std::vector<ck::index_t>{1, 1};
-    auto out_tensor = run_reference_convolution_forward<2>(params);
-    std::vector<std::size_t> ref_dims = std::vector<std::size_t>{1, 2, 5, 5};
+    ck::utils::conv::ConvParam conv_param(2,
+                                          1,
+                                          1,
+                                          2,
+                                          2,
+                                          std::vector<ck::index_t>{3, 3},
+                                          std::vector<ck::index_t>{12, 12},
+                                          std::vector<ck::index_t>{2, 2},
+                                          std::vector<ck::index_t>{2, 2},
+                                          std::vector<ck::index_t>{1, 1},
+                                          std::vector<ck::index_t>{1, 1});
+    auto out_tensor = run_reference_convolution_forward<2>(conv_param);
+    std::vector<std::size_t> ref_dims = std::vector<std::size_t>{1, 5, 5, 2};
     std::vector<float> ref_data{
         210., 210., 327., 327., 351., 351., 375., 375., 399., 399.,
         459., 459., 706.5, 706.5, 742.5, 742.5, 778.5, 778.5, 814.5, 814.5,
@@ -153,88 +152,88 @@ TEST(ReferenceConvolutionFWD, Conv2DGNHWCStridesDilationsPadding)
     EXPECT_TRUE(ck::utils::check_err(out_tensor.mData, ref_data, "Error: incorrect results!"));
 }
-TEST(ReferenceConvolutionFWD, Conv1DNWC)
+TEST(ReferenceConvolutionFWD, Conv1DGNWC)
 {
-    ck::utils::conv::ConvParams params;
-    params.num_dim_spatial_        = 1;
-    params.N_                      = 1;
-    params.K_                      = 1;
-    params.C_                      = 2;
-    params.filter_spatial_lengths_ = std::vector<ck::index_t>{3};
-    params.input_spatial_lengths_  = std::vector<ck::index_t>{6};
-    params.conv_filter_strides_    = std::vector<ck::index_t>{1};
-    params.conv_filter_dilations_  = std::vector<ck::index_t>{1};
-    params.input_left_pads_        = std::vector<ck::index_t>{0};
-    params.input_right_pads_       = std::vector<ck::index_t>{0};
+    ck::utils::conv::ConvParam conv_param(1,
+                                          1,
+                                          1,
+                                          1,
+                                          2,
+                                          std::vector<ck::index_t>{3},
+                                          std::vector<ck::index_t>{6},
+                                          std::vector<ck::index_t>{1},
+                                          std::vector<ck::index_t>{1},
+                                          std::vector<ck::index_t>{0},
+                                          std::vector<ck::index_t>{0});
     auto out_tensor =
         run_reference_convolution_forward<1,
                                           float,
                                           float,
                                           float,
-                                          ck::tensor_layout::convolution::NWC,
-                                          ck::tensor_layout::convolution::KXC,
-                                          ck::tensor_layout::convolution::NWK>(params);
-    std::vector<std::size_t> ref_dims{1, 1, 4};
+                                          ck::tensor_layout::convolution::GNWC,
+                                          ck::tensor_layout::convolution::GKXC,
+                                          ck::tensor_layout::convolution::GNWK>(conv_param);
+    std::vector<std::size_t> ref_dims{1, 1, 4, 1};
     std::vector<float> ref_data{7.5, 13.5, 19.5, 25.5};
     EXPECT_TRUE(ck::utils::check_err(
         out_tensor.mDesc.GetLengths(), ref_dims, "Error: wrong output tensor dimensions!"));
     EXPECT_TRUE(ck::utils::check_err(out_tensor.mData, ref_data, "Error: incorrect results!"));
 }
-TEST(ReferenceConvolutionFWD, Conv1DNWCStridesDilationsPadding)
+TEST(ReferenceConvolutionFWD, Conv1DGNWCStridesDilationsPadding)
 {
-    ck::utils::conv::ConvParams params;
-    params.num_dim_spatial_        = 1;
-    params.N_                      = 1;
-    params.K_                      = 2;
-    params.C_                      = 2;
-    params.filter_spatial_lengths_ = std::vector<ck::index_t>{3};
-    params.input_spatial_lengths_  = std::vector<ck::index_t>{12};
-    params.conv_filter_strides_    = std::vector<ck::index_t>{2};
-    params.conv_filter_dilations_  = std::vector<ck::index_t>{2};
-    params.input_left_pads_        = std::vector<ck::index_t>{1};
-    params.input_right_pads_       = std::vector<ck::index_t>{1};
+    ck::utils::conv::ConvParam conv_param(1,
                                          1,
                                          1,
                                          2,
                                          2,
+                                          std::vector<ck::index_t>{3},
+                                          std::vector<ck::index_t>{12},
+                                          std::vector<ck::index_t>{2},
+                                          std::vector<ck::index_t>{2},
+                                          std::vector<ck::index_t>{1},
+                                          std::vector<ck::index_t>{1});
    auto out_tensor =
        run_reference_convolution_forward<1,
                                          float,
                                          float,
                                          float,
-                                          ck::tensor_layout::convolution::NWC,
-                                          ck::tensor_layout::convolution::KXC,
-                                          ck::tensor_layout::convolution::NWK>(params);
-    std::vector<std::size_t> ref_dims{1, 2, 5};
+                                          ck::tensor_layout::convolution::GNWC,
+                                          ck::tensor_layout::convolution::GKXC,
+                                          ck::tensor_layout::convolution::GNWK>(conv_param);
+    std::vector<std::size_t> ref_dims{1, 1, 5, 2};
     std::vector<float> ref_data{9., 9., 19.5, 19.5, 31.5, 31.5, 43.5, 43.5, 55.5, 55.5};
     EXPECT_TRUE(ck::utils::check_err(
         out_tensor.mDesc.GetLengths(), ref_dims, "Error: wrong output tensor dimensions!"));
     EXPECT_TRUE(ck::utils::check_err(out_tensor.mData, ref_data, "Error: incorrect results!"));
 }
-TEST(ReferenceConvolutionFWD, Conv1DNWCSameOutputSize)
+TEST(ReferenceConvolutionFWD, Conv1DGNWCSameOutputSize)
 {
-    ck::utils::conv::ConvParams params;
-    params.num_dim_spatial_        = 1;
-    params.N_                      = 2;
-    params.K_                      = 16;
-    params.C_                      = 4;
-    params.filter_spatial_lengths_ = std::vector<ck::index_t>{3};
-    params.input_spatial_lengths_  = std::vector<ck::index_t>{16};
-    params.conv_filter_strides_    = std::vector<ck::index_t>{1};
-    params.conv_filter_dilations_  = std::vector<ck::index_t>{1};
-    params.input_left_pads_        = std::vector<ck::index_t>{1};
-    params.input_right_pads_       = std::vector<ck::index_t>{1};
+    ck::utils::conv::ConvParam conv_param(1,
+                                          1,
+                                          2,
+                                          16,
+                                          4,
+                                          std::vector<ck::index_t>{3},
+                                          std::vector<ck::index_t>{16},
+                                          std::vector<ck::index_t>{1},
+                                          std::vector<ck::index_t>{1},
+                                          std::vector<ck::index_t>{1},
+                                          std::vector<ck::index_t>{1});
    auto out_tensor2 = run_reference_convolution_forward<1,
                                                         float,
                                                         float,
                                                         float,
-                                                         ck::tensor_layout::convolution::NWC,
-                                                         ck::tensor_layout::convolution::KXC,
-                                                         ck::tensor_layout::convolution::NWK>(
-        params, ck::utils::FillMonotonicSeq<float>{0.f, 0.1f});
-    std::vector<std::size_t> ref_dims{2, 16, 16};
+                                                         ck::tensor_layout::convolution::GNWC,
+                                                         ck::tensor_layout::convolution::GKXC,
+                                                         ck::tensor_layout::convolution::GNWK>(
+        conv_param, ck::utils::FillMonotonicSeq<float>{0.f, 0.1f});
+    std::vector<std::size_t> ref_dims{1, 2, 16, 16};
     std::vector<float> ref_data{
         1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4,
         1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4,
@@ -304,30 +303,31 @@ TEST(ReferenceConvolutionFWD, Conv1DGNWCSameOutputSize)
     EXPECT_TRUE(ck::utils::check_err(
         out_tensor2.mDesc.GetLengths(), ref_dims, "Error: wrong output tensor dimensions!"));
     EXPECT_TRUE(ck::utils::check_err(out_tensor2.mData, ref_data, "Error: incorrect results!"));
 }
+#endif
-TEST(ReferenceConvolutionFWD, Conv3DNCDHW)
+TEST(ReferenceConvolutionFWD, Conv3DGNCDHW)
 {
-    ck::utils::conv::ConvParams params;
-    params.num_dim_spatial_        = 3;
-    params.N_                      = 1;
-    params.K_                      = 1;
-    params.C_                      = 2;
-    params.filter_spatial_lengths_ = std::vector<ck::index_t>{3, 3, 3};
-    params.input_spatial_lengths_  = std::vector<ck::index_t>{6, 6, 6};
-    params.conv_filter_strides_    = std::vector<ck::index_t>{1, 1, 1};
-    params.conv_filter_dilations_  = std::vector<ck::index_t>{1, 1, 1};
-    params.input_left_pads_        = std::vector<ck::index_t>{0, 0, 0};
-    params.input_right_pads_       = std::vector<ck::index_t>{0, 0, 0};
+    ck::utils::conv::ConvParam conv_param(3,
+                                          1,
+                                          1,
+                                          1,
+                                          2,
+                                          std::vector<ck::index_t>{3, 3, 3},
+                                          std::vector<ck::index_t>{6, 6, 6},
+                                          std::vector<ck::index_t>{1, 1, 1},
+                                          std::vector<ck::index_t>{1, 1, 1},
+                                          std::vector<ck::index_t>{0, 0, 0},
+                                          std::vector<ck::index_t>{0, 0, 0});
    auto out_tensor = run_reference_convolution_forward<3,
                                                        float,
                                                        float,
                                                        float,
-                                                        ck::tensor_layout::convolution::NCDHW,
-                                                        ck::tensor_layout::convolution::KCZYX,
-                                                        ck::tensor_layout::convolution::NKDHW>(
-        params, ck::utils::FillMonotonicSeq<float>{0.f, 0.1f});
-    std::vector<std::size_t> ref_dims{1, 1, 4, 4, 4};
+                                                        ck::tensor_layout::convolution::GNCDHW,
+                                                        ck::tensor_layout::convolution::GKCZYX,
+                                                        ck::tensor_layout::convolution::GNKDHW>(
+        conv_param, ck::utils::FillMonotonicSeq<float>{0.f, 0.1f});
+    std::vector<std::size_t> ref_dims{1, 1, 1, 4, 4, 4};
     std::vector<float> ref_data{
         407.7, 410.40002, 413.09998, 415.80002, 423.90002, 426.6, 429.30002, 432.,
         440.1, 442.80002, 445.5, 448.2, 456.30002, 459., 461.7, 464.40002,
@@ -344,29 +344,29 @@ TEST(ReferenceConvolutionFWD, Conv3DGNCDHW)
     EXPECT_TRUE(
         ck::utils::check_err(out_tensor.mData, ref_data, "Error [case 1]: incorrect results!"));
 }
-TEST(ReferenceConvolutionFWD, Conv3DNCDHWStridesDilations)
+TEST(ReferenceConvolutionFWD, Conv3DGNCDHWStridesDilations)
 {
-    ck::utils::conv::ConvParams params;
-    params.num_dim_spatial_        = 3;
-    params.N_                      = 1;
-    params.K_                      = 2;
-    params.C_                      = 2;
-    params.filter_spatial_lengths_ = std::vector<ck::index_t>{3, 3, 3};
-    params.input_spatial_lengths_  = std::vector<ck::index_t>{12, 12, 12};
-    params.conv_filter_strides_    = std::vector<ck::index_t>{3, 3, 3};
-    params.conv_filter_dilations_  = std::vector<ck::index_t>{1, 1, 1};
-    params.input_left_pads_        = std::vector<ck::index_t>{0, 0, 0};
-    params.input_right_pads_       = std::vector<ck::index_t>{0, 0, 0};
+    ck::utils::conv::ConvParam conv_param(3,
+                                          1,
+                                          1,
+                                          2,
+                                          2,
+                                          std::vector<ck::index_t>{3, 3, 3},
+                                          std::vector<ck::index_t>{12, 12, 12},
+                                          std::vector<ck::index_t>{3, 3, 3},
+                                          std::vector<ck::index_t>{1, 1, 1},
+                                          std::vector<ck::index_t>{0, 0, 0},
+                                          std::vector<ck::index_t>{0, 0, 0});
    auto out_tensor = run_reference_convolution_forward<3,
                                                        float,
                                                        float,
                                                        float,
-                                                        ck::tensor_layout::convolution::NCDHW,
-                                                        ck::tensor_layout::convolution::KCZYX,
-                                                        ck::tensor_layout::convolution::NKDHW>(
-        params, ck::utils::FillMonotonicSeq<float>{0.f, 0.1f});
-    std::vector<std::size_t> ref_dims{1, 2, 4, 4, 4};
+                                                        ck::tensor_layout::convolution::GNCDHW,
+                                                        ck::tensor_layout::convolution::GKCZYX,
+                                                        ck::tensor_layout::convolution::GNKDHW>(
+        conv_param, ck::utils::FillMonotonicSeq<float>{0.f, 0.1f});
+    std::vector<std::size_t> ref_dims{1, 1, 2, 4, 4, 4};
     std::vector<float> ref_data{
         2756.7002, 2764.7998, 2772.9001, 2781., 2853.9001, 2862., 2870.1, 2878.2002,
         2951.1, 2959.2002, 2967.2998, 2975.4001, 3048.2998, 3056.4001, 3064.5, 3072.6,
...
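As a sanity check on the hard-coded ref_dims above, the output spatial lengths follow the standard convolution size formula (this is the relationship the old ConvParams::GetOutputSpatialLengths encoded). A standalone sketch under that assumption, with a made-up helper name:

#include <cassert>

// Standard convolution output-size formula, applied per spatial dimension.
int conv_out_length(int in, int filter, int stride, int dilation, int pad_left, int pad_right)
{
    const int effective_filter = dilation * (filter - 1) + 1;
    return (in + pad_left + pad_right - effective_filter) / stride + 1;
}

int main()
{
    assert(conv_out_length(6, 3, 1, 1, 0, 0) == 4);   // Conv2DGNHWC: 6x6 -> 4x4
    assert(conv_out_length(12, 3, 2, 2, 1, 1) == 5);  // Conv2DGNHWCStridesDilationsPadding: 12x12 -> 5x5
    assert(conv_out_length(16, 3, 1, 1, 1, 1) == 16); // Conv1DGNWCSameOutputSize: 16 -> 16
    assert(conv_out_length(12, 3, 3, 1, 0, 0) == 4);  // Conv3DGNCDHWStridesDilations: 12^3 -> 4^3
    return 0;
}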
@@ -3,9 +3,9 @@ add_custom_target(test_softmax)
 add_gtest_executable(test_softmax_fp32 test_softmax_fp32.cpp)
 add_gtest_executable(test_softmax_fp16 test_softmax_fp16.cpp)
 add_gtest_executable(test_softmax_int8 test_softmax_int8.cpp)
-target_link_libraries(test_softmax_fp32 PRIVATE host_tensor)
-target_link_libraries(test_softmax_fp16 PRIVATE host_tensor)
-target_link_libraries(test_softmax_int8 PRIVATE host_tensor)
+target_link_libraries(test_softmax_fp32 PRIVATE utility)
+target_link_libraries(test_softmax_fp16 PRIVATE utility)
+target_link_libraries(test_softmax_int8 PRIVATE utility)
 add_dependencies(test_softmax test_softmax_fp32)
 add_dependencies(test_softmax test_softmax_fp16)
 add_dependencies(test_softmax test_softmax_int8)
\ No newline at end of file
@@ -12,8 +12,8 @@
 #include "ck/tensor_operation/gpu/device/device_softmax.hpp"
 #include "ck/library/utility/check_err.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/device_memory.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
 namespace ck {
@@ -80,8 +80,8 @@ class TestSoftmax : public ::testing::Test
 Tensor<OutDataType> out_ref(out);
-DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpace());
-DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpace());
+DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
+DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize());
 in_dev.ToDevice(in.mData.data());
 out_dev.ToDevice(out.mData.data());
...