gaoqiong / composable_kernel

Commit 0c823497, authored Nov 10, 2023 by muozturk

    merge

Parents: 334cfe1c, 68f2b5e7
Showing 15 changed files with 478 additions and 54 deletions (+478 -54).
test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp                 +43  -3
test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_interface_wmma.cpp  +191 -0
test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_interface_xdl.cpp   +0   -0
test/grouped_gemm/test_grouped_gemm_interface.cpp                                 +4   -0
test/normalization/CMakeLists.txt                                                 +0   -21
test/normalization_fwd/CMakeLists.txt                                             +30  -0
test/normalization_fwd/test_groupnorm_fwd_fp16.cpp                                +11  -8
test/normalization_fwd/test_groupnorm_fwd_fp32.cpp                                +11  -8
test/normalization_fwd/test_layernorm2d_fwd_fp16.cpp                              +10  -7
test/normalization_fwd/test_layernorm2d_fwd_fp32.cpp                              +10  -7
test/normalization_fwd/test_layernorm4d_fwd_fp16.cpp                              +48  -0
test/transpose/CMakeLists.txt                                                     +9   -0
test/transpose/test_transpose.cpp                                                 +27  -0
test/transpose/test_transpose_ut_cases.inc                                        +30  -0
test/transpose/test_transpose_util.hpp                                            +54  -0
test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp

@@ -11,6 +11,7 @@
 #include "ck/utility/common_header.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/host_utility/device_prop.hpp"
+#include "profiler/profile_grouped_conv_bwd_weight_impl.hpp"
...
@@ -33,8 +34,9 @@ class TestGroupedConvndBwdWeight : public ::testing::Test
     bool skip_case(const ck::utils::conv::ConvParam& params, const ck::index_t split_k)
     {
-        // Odd K or C values are supported only by DL kernel (only applies to fp16)
-        // DL kernel currently supports only `split_k=1`
+        // Odd K or C values are supported only by DL and WMMA
+        // kernels (only applies to fp16)
+        // DL and WMMA kernels currently support only `split_k=1`
         if constexpr(std::is_same_v<InDataType, ck::half_t>)
         {
             if(split_k != 1 && (params.K_ % 2 != 0 || params.C_ % 2 != 0))
...
@@ -53,6 +55,42 @@ class TestGroupedConvndBwdWeight : public ::testing::Test
             }
         }
+        const bool is_navi3x = ck::get_device_name() == "gfx1100" ||
+                               ck::get_device_name() == "gfx1101" ||
+                               ck::get_device_name() == "gfx1102";
+        if(is_navi3x)
+        {
+            // on navi3x only support for 3d is implemented
+            if constexpr(NDimSpatial{} != 3)
+            {
+                return true;
+            }
+            // on navi3x only support for i8 and fp16 is implemented
+            if constexpr(!((std::is_same_v<InDataType, int8_t> &&
+                            std::is_same_v<WeiDataType, int8_t> &&
+                            std::is_same_v<OutDataType, int8_t>) ||
+                           (std::is_same_v<InDataType, ck::half_t> &&
+                            std::is_same_v<WeiDataType, ck::half_t> &&
+                            std::is_same_v<OutDataType, ck::half_t>)))
+            {
+                return true;
+            }
+            // WMMA kernel is only supported for split_k=1
+            if(split_k != 1)
+            {
+                return true;
+            }
+        }
+        else
+        {
+            // support for i8 is only implemented on navi3x
+            if constexpr(std::is_same_v<InDataType, int8_t> &&
+                         std::is_same_v<WeiDataType, int8_t> &&
+                         std::is_same_v<OutDataType, int8_t>)
+            {
+                return true;
+            }
+        }
         return false;
     }
...
@@ -120,9 +158,11 @@ using KernelTypes3d = ::testing::Types<
     std::tuple<float, float, float, GNDHWC, GKZYXC, GNDHWK, ck::Number<3>>,
     std::tuple<ck::half_t, ck::half_t, ck::half_t, GNDHWC, GKZYXC, GNDHWK, ck::Number<3>>,
     std::tuple<ck::bhalf_t, float, ck::bhalf_t, GNDHWC, GKZYXC, GNDHWK, ck::Number<3>>,
+    std::tuple<int8_t, int8_t, int8_t, GNDHWC, GKZYXC, GNDHWK, ck::Number<3>>,
     std::tuple<float, float, float, NDHWGC, GKZYXC, NDHWGK, ck::Number<3>>,
     std::tuple<ck::half_t, ck::half_t, ck::half_t, NDHWGC, GKZYXC, NDHWGK, ck::Number<3>>,
-    std::tuple<ck::bhalf_t, float, ck::bhalf_t, NDHWGC, GKZYXC, NDHWGK, ck::Number<3>>>;
+    std::tuple<ck::bhalf_t, float, ck::bhalf_t, NDHWGC, GKZYXC, NDHWGK, ck::Number<3>>,
+    std::tuple<int8_t, int8_t, int8_t, NDHWGC, GKZYXC, NDHWGK, ck::Number<3>>>;

 TYPED_TEST_SUITE(TestGroupedConvndBwdWeight1d, KernelTypes1d);
 TYPED_TEST_SUITE(TestGroupedConvndBwdWeight2d, KernelTypes2d);
...
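The new skip_case logic gates coverage by GPU architecture: on navi3x (gfx1100, gfx1101, gfx1102) only 3-D, fp16/int8, split_k = 1 cases run, while int8 is skipped on every other target. A self-contained sketch of the device gate, with a helper name that is illustrative rather than part of this diff:

    // Illustrative helper mirroring the is_navi3x check above; assumes device
    // names of the form "gfx1100" as returned by ck::get_device_name().
    #include <string>

    inline bool is_navi3x_device(const std::string& device_name)
    {
        return device_name == "gfx1100" || device_name == "gfx1101" ||
               device_name == "gfx1102";
    }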
test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_interface_wmma.cpp
0 → 100644

// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.

#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <tuple>
#include <vector>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle.hpp"

#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"

#include <gtest/gtest.h>

using F16 = ck::half_t;
using F32 = float;

using PassThrough = ck::tensor_operation::element_wise::PassThrough;

template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using ConvolutionBackwardWeightSpecialization =
    ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization;

static constexpr auto ConvBwdWeightDefault = ConvolutionBackwardWeightSpecialization::Default;
static constexpr auto Filter1x1Stride1Pad0 =
    ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0;

template <typename Tuple, ConvolutionBackwardWeightSpecialization ConvSpec>
class TestGroupedConvndBwdWeight : public ::testing::Test
{
    protected:
    using OutLayout = std::tuple_element_t<0, Tuple>;
    using WeiLayout = std::tuple_element_t<1, Tuple>;
    using InLayout  = std::tuple_element_t<2, Tuple>;
    static constexpr ck::index_t NDimSpatial = std::tuple_element_t<3, Tuple>{};

    // clang-format off
    // Template arguments: NumDimSpatial, A/B/C layouts, AData/BData/CData/AccData
    // types, A/B/C elementwise operations, conv specialization, then the tile
    // configuration (BlockSize, M/N/KPerBlock, K1, M/NPerWMMA, M/NRepeat,
    // A/B block-transfer cluster lengths/orders/vector widths/LDS padding,
    // and the CShuffle block-transfer parameters).
    using GroupedConvBwdWeightDeviceInstance = ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Wmma_CShuffle
        <NDimSpatial, InLayout, WeiLayout, OutLayout, F16, F16, F16, F32,
         PassThrough, PassThrough, PassThrough, ConvSpec,
         128, 128, 128, 8, 8, 16, 16, 4, 4,
         S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1,
         S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1,
         1, 1, S<1, 32, 1, 4>, 8>;
    // clang-format on

    ck::utils::conv::ConvParam conv_param;

    template <ck::index_t SplitK>
    bool Run()
    {
        const auto in_g_n_c_wis_desc =
            ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(
                conv_param);
        const auto wei_g_k_c_xs_desc =
            ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(
                conv_param);
        const auto out_g_n_k_wos_desc =
            ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(
                conv_param);

        std::array<ck::index_t, NDimSpatial + 3> input_lengths{};
        std::array<ck::index_t, NDimSpatial + 3> filter_lengths{};
        std::array<ck::index_t, NDimSpatial + 3> output_lengths{};
        std::array<ck::index_t, NDimSpatial + 3> input_strides{};
        std::array<ck::index_t, NDimSpatial + 3> weights_strides{};
        std::array<ck::index_t, NDimSpatial + 3> output_strides{};
        std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
        std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
        std::array<ck::index_t, NDimSpatial> input_left_pads{};
        std::array<ck::index_t, NDimSpatial> input_right_pads{};

        auto range_copy = [](const auto& from, auto to) { std::copy(begin(from), end(from), to); };

        range_copy(in_g_n_c_wis_desc.GetLengths(), begin(input_lengths));
        range_copy(in_g_n_c_wis_desc.GetStrides(), begin(input_strides));
        range_copy(wei_g_k_c_xs_desc.GetLengths(), begin(filter_lengths));
        range_copy(wei_g_k_c_xs_desc.GetStrides(), begin(weights_strides));
        range_copy(out_g_n_k_wos_desc.GetLengths(), begin(output_lengths));
        range_copy(out_g_n_k_wos_desc.GetStrides(), begin(output_strides));
        range_copy(conv_param.conv_filter_strides_, begin(conv_filter_strides));
        range_copy(conv_param.conv_filter_dilations_, begin(conv_filter_dilations));
        range_copy(conv_param.input_left_pads_, begin(input_left_pads));
        range_copy(conv_param.input_right_pads_, begin(input_right_pads));

        auto conv     = GroupedConvBwdWeightDeviceInstance{};
        auto argument = conv.MakeArgument(nullptr,
                                          nullptr,
                                          nullptr,
                                          input_lengths,
                                          input_strides,
                                          filter_lengths,
                                          weights_strides,
                                          output_lengths,
                                          output_strides,
                                          conv_filter_strides,
                                          conv_filter_dilations,
                                          input_left_pads,
                                          input_right_pads,
                                          PassThrough{},
                                          PassThrough{},
                                          PassThrough{},
                                          SplitK);
        return conv.IsSupportedArgument(argument);
    }
};

using namespace ck::tensor_layout::convolution;

using KernelTypes3d = ::testing::Types<std::tuple<GNDHWK, GKZYXC, GNDHWC, ck::Number<3>>,
                                       std::tuple<NDHWGK, GKZYXC, NDHWGC, ck::Number<3>>>;

template <typename Tuple>
class TestGroupedConvndBwdWeightFilter1x13d
    : public TestGroupedConvndBwdWeight<Tuple, Filter1x1Stride1Pad0>
{
};

template <typename Tuple>
class TestGroupedConvndBwdWeightDefault3d
    : public TestGroupedConvndBwdWeight<Tuple, ConvBwdWeightDefault>
{
};

TYPED_TEST_SUITE(TestGroupedConvndBwdWeightFilter1x13d, KernelTypes3d);
TYPED_TEST_SUITE(TestGroupedConvndBwdWeightDefault3d, KernelTypes3d);

TYPED_TEST(TestGroupedConvndBwdWeightFilter1x13d, SpecializationCheck)
{
    // Check filter 3x3x3 instead of 1x1x1
    this->conv_param  = {3, 2, 4, 192, 192, {3, 3, 3}, {28, 28, 28},
                         {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}};
    bool is_supported = this->template Run<1>();
    EXPECT_FALSE(is_supported);

    // Check strides 2x2x2 instead of 1x1x1
    this->conv_param = {3, 2, 4, 192, 192, {1, 1, 1}, {28, 28, 28},
                        {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}};
    is_supported     = this->template Run<1>();
    EXPECT_FALSE(is_supported);

    // Check with pad
    this->conv_param = {3, 2, 4, 192, 192, {1, 1, 1}, {28, 28, 28},
                        {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}};
    is_supported     = this->template Run<1>();
    EXPECT_FALSE(is_supported);

    // Supported version
    this->conv_param = {3, 2, 128, 128, 256, {1, 1, 1}, {3, 3, 3},
                        {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}};
    is_supported     = this->template Run<1>();
    EXPECT_TRUE(is_supported);
}

TYPED_TEST(TestGroupedConvndBwdWeightDefault3d, VectorLoadCheck)
{
    // vector load for A
    this->conv_param  = {3, 2, 128, 129, 256, {1, 1, 1}, {7, 7, 7},
                         {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}};
    bool is_supported = this->template Run<1>();
    EXPECT_FALSE(is_supported);

    // vector load for B, E, Ds
    this->conv_param = {3, 2, 128, 128, 257, {1, 1, 1}, {7, 7, 7},
                        {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}};
    is_supported     = this->template Run<1>();
    EXPECT_FALSE(is_supported);
}

TYPED_TEST(TestGroupedConvndBwdWeightDefault3d, SplitKCheck)
{
    // SplitK=1
    this->conv_param  = {3, 2, 128, 128, 256, {1, 1, 1}, {3, 3, 3},
                         {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}};
    bool is_supported = this->template Run<1>();
    EXPECT_TRUE(is_supported);

    // SplitK=2
    this->conv_param = {3, 2, 128, 128, 256, {1, 1, 1}, {3, 3, 3},
                       {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}};
    is_supported     = this->template Run<2>();
    EXPECT_FALSE(is_supported);
}
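Note that these interface tests never launch a kernel: MakeArgument above is called with null device pointers because IsSupportedArgument only validates shapes, strides, layouts, and the SplitK value. A condensed sketch of that probe pattern (the helper name is illustrative, not part of this diff):

    // Hedged sketch: check whether a CK device op accepts a configuration
    // without allocating device memory; works for any op exposing
    // MakeArgument/IsSupportedArgument as used above.
    #include <utility>

    template <typename DeviceOp, typename... Args>
    bool probe_support(DeviceOp op, Args&&... args)
    {
        auto argument = op.MakeArgument(std::forward<Args>(args)...);
        return op.IsSupportedArgument(argument);
    }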
test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_interface.cpp → test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_interface_xdl.cpp

File moved.
test/grouped_gemm/test_grouped_gemm_interface.cpp

...
@@ -108,6 +108,10 @@ TEST_F(TestGGemmSplitKInterface_MKNKMN, KLoops)
     // kloops % 2
+    Ks = std::vector<int>{256, 512, 320, 768};
+    EXPECT_FALSE(DefaultGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, kbatch));
+    Ks = std::vector<int>{256, 512, 384, 768};
+    EXPECT_TRUE(DefaultGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, kbatch));
...
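The added pair exercises the main-loop parity constraint flagged by the `// kloops % 2` comment: K is consumed in KPerBlock-sized chunks, and an odd iteration count is rejected by the split-K path. Worked arithmetic under an assumed KPerBlock of 64 (the real tile size comes from DefaultGGemmInstance and is not visible in this diff):

    // Assumption for illustration only: KPerBlock = 64.
    constexpr int KPerBlock = 64;
    static_assert((320 / KPerBlock) % 2 == 1, "K=320 -> 5 kloops (odd), rejected");
    static_assert((384 / KPerBlock) % 2 == 0, "K=384 -> 6 kloops (even), accepted");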
test/normalization/CMakeLists.txt
deleted 100644 → 0

add_custom_target(test_normalization)

add_gtest_executable(test_layernorm2d_fp32 test_layernorm2d_fp32.cpp)
if(result EQUAL 0)
    target_link_libraries(test_layernorm2d_fp32 PRIVATE utility device_normalization_instance)
    add_dependencies(test_normalization test_layernorm2d_fp32)
endif()

add_gtest_executable(test_groupnorm_fp32 test_groupnorm_fp32.cpp)
if(result EQUAL 0)
    target_link_libraries(test_groupnorm_fp32 PRIVATE utility device_normalization_instance)
    add_dependencies(test_normalization test_groupnorm_fp32)
endif()

add_gtest_executable(test_layernorm2d_fp16 test_layernorm2d_fp16.cpp)
if(result EQUAL 0)
    target_link_libraries(test_layernorm2d_fp16 PRIVATE utility device_normalization_instance)
    add_dependencies(test_normalization test_layernorm2d_fp16)
endif()

add_gtest_executable(test_groupnorm_fp16 test_groupnorm_fp16.cpp)
if(result EQUAL 0)
    target_link_libraries(test_groupnorm_fp16 PRIVATE utility device_normalization_instance)
    add_dependencies(test_normalization test_groupnorm_fp16)
endif()
test/normalization_fwd/CMakeLists.txt
0 → 100644

add_custom_target(test_normalization_fwd)

add_gtest_executable(test_layernorm2d_fwd_fp32 test_layernorm2d_fwd_fp32.cpp)
if(result EQUAL 0)
    target_link_libraries(test_layernorm2d_fwd_fp32 PRIVATE utility device_normalization_fwd_instance)
    add_dependencies(test_normalization_fwd test_layernorm2d_fwd_fp32)
endif()

add_gtest_executable(test_groupnorm_fwd_fp32 test_groupnorm_fwd_fp32.cpp)
if(result EQUAL 0)
    target_link_libraries(test_groupnorm_fwd_fp32 PRIVATE utility device_normalization_fwd_instance)
    add_dependencies(test_normalization_fwd test_groupnorm_fwd_fp32)
endif()

add_gtest_executable(test_layernorm2d_fwd_fp16 test_layernorm2d_fwd_fp16.cpp)
if(result EQUAL 0)
    target_link_libraries(test_layernorm2d_fwd_fp16 PRIVATE utility device_normalization_fwd_instance)
    add_dependencies(test_normalization_fwd test_layernorm2d_fwd_fp16)
endif()

add_gtest_executable(test_layernorm4d_fwd_fp16 test_layernorm4d_fwd_fp16.cpp)
if(result EQUAL 0)
    target_link_libraries(test_layernorm4d_fwd_fp16 PRIVATE utility device_normalization_fwd_instance)
    add_dependencies(test_normalization_fwd test_layernorm4d_fwd_fp16)
endif()

add_gtest_executable(test_groupnorm_fwd_fp16 test_groupnorm_fwd_fp16.cpp)
if(result EQUAL 0)
    target_link_libraries(test_groupnorm_fwd_fp16 PRIVATE utility device_normalization_fwd_instance)
    add_dependencies(test_normalization_fwd test_groupnorm_fwd_fp16)
endif()
test/normalization/test_groupnorm_fp16.cpp → test/normalization_fwd/test_groupnorm_fwd_fp16.cpp

...
@@ -2,7 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include "gtest/gtest.h"
-#include "profiler/profile_groupnorm_impl.hpp"
+#include "profiler/profile_groupnorm_fwd_impl.hpp"

 using F16 = ck::half_t;
 using F32 = float;
...
@@ -12,11 +12,12 @@ template <typename Tuple>
 class TestGroupnorm : public ::testing::Test
 {
     protected:
-    using XDataType       = std::tuple_element_t<0, Tuple>;
-    using GammaDataType   = std::tuple_element_t<1, Tuple>;
-    using BetaDataType    = std::tuple_element_t<2, Tuple>;
-    using ComputeDataType = std::tuple_element_t<3, Tuple>;
-    using YDataType       = std::tuple_element_t<4, Tuple>;
+    using XDataType              = std::tuple_element_t<0, Tuple>;
+    using GammaDataType          = std::tuple_element_t<1, Tuple>;
+    using BetaDataType           = std::tuple_element_t<2, Tuple>;
+    using ComputeDataType        = std::tuple_element_t<3, Tuple>;
+    using YDataType              = std::tuple_element_t<4, Tuple>;
+    using SaveMeanInvStdDataType = std::tuple_element_t<5, Tuple>;

     void Run()
     {
...
@@ -37,7 +38,9 @@ class TestGroupnorm : public ::testing::Test
-            GammaDataType, BetaDataType, ComputeDataType, YDataType>(true, 2, false, false, length);
+            GammaDataType, BetaDataType, ComputeDataType, YDataType,
+            SaveMeanInvStdDataType, true>(true, 2, false, false, length);
         EXPECT_TRUE(success);
     }
 }
...
@@ -45,7 +48,7 @@ class TestGroupnorm : public ::testing::Test
 using KernelTypes = ::testing::Types<
     // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType>
-    std::tuple<F16, F16, F16, F32, F16>>;
+    std::tuple<F16, F16, F16, F32, F16, F32>>;

 TYPED_TEST_SUITE(TestGroupnorm, KernelTypes);
 TYPED_TEST(TestGroupnorm, Test_FP16) { this->Run(); }
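The new sixth tuple element, SaveMeanInvStdDataType, reflects that the forward normalization profilers can now also emit the per-group mean and inverse standard deviation. A host-side sketch of the two saved quantities (an illustration of the math, not the CK device API):

    // For each normalization group: save_mean = E[x] and
    // save_inv_std = 1 / sqrt(Var[x] + eps).
    #include <cmath>
    #include <vector>

    void save_stats(const std::vector<float>& x, float eps, float& save_mean, float& save_inv_std)
    {
        float mean = 0.f;
        for(float v : x)
            mean += v;
        mean /= x.size();

        float var = 0.f;
        for(float v : x)
            var += (v - mean) * (v - mean);
        var /= x.size();

        save_mean    = mean;
        save_inv_std = 1.f / std::sqrt(var + eps);
    }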
test/normalization/test_groupnorm_fp32.cpp → test/normalization_fwd/test_groupnorm_fwd_fp32.cpp

...
@@ -2,7 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include "gtest/gtest.h"
-#include "profiler/profile_groupnorm_impl.hpp"
+#include "profiler/profile_groupnorm_fwd_impl.hpp"

 using F16 = ck::half_t;
 using F32 = float;
...
@@ -12,11 +12,12 @@ template <typename Tuple>
 class TestGroupnorm : public ::testing::Test
 {
     protected:
-    using XDataType       = std::tuple_element_t<0, Tuple>;
-    using GammaDataType   = std::tuple_element_t<1, Tuple>;
-    using BetaDataType    = std::tuple_element_t<2, Tuple>;
-    using ComputeDataType = std::tuple_element_t<3, Tuple>;
-    using YDataType       = std::tuple_element_t<4, Tuple>;
+    using XDataType              = std::tuple_element_t<0, Tuple>;
+    using GammaDataType          = std::tuple_element_t<1, Tuple>;
+    using BetaDataType           = std::tuple_element_t<2, Tuple>;
+    using ComputeDataType        = std::tuple_element_t<3, Tuple>;
+    using YDataType              = std::tuple_element_t<4, Tuple>;
+    using SaveMeanInvStdDataType = std::tuple_element_t<5, Tuple>;

     void Run()
     {
...
@@ -35,7 +36,9 @@ class TestGroupnorm : public ::testing::Test
-            GammaDataType, BetaDataType, ComputeDataType, YDataType>(true, 2, false, false, length);
+            GammaDataType, BetaDataType, ComputeDataType, YDataType,
+            SaveMeanInvStdDataType, true>(true, 2, false, false, length);
         EXPECT_TRUE(success);
     }
 }
...
@@ -43,7 +46,7 @@ class TestGroupnorm : public ::testing::Test
 using KernelTypes = ::testing::Types<
     // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType>
-    std::tuple<F32, F32, F32, F32, F32>>;
+    std::tuple<F32, F32, F32, F32, F32, F32>>;

 TYPED_TEST_SUITE(TestGroupnorm, KernelTypes);
 TYPED_TEST(TestGroupnorm, Test_FP32) { this->Run(); }
test/normalization/test_layernorm2d_fp16.cpp → test/normalization_fwd/test_layernorm2d_fwd_fp16.cpp

...
@@ -2,7 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include "gtest/gtest.h"
-#include "profiler/profile_layernorm_impl.hpp"
+#include "profiler/profile_layernorm_fwd_impl.hpp"

 using F16 = ck::half_t;
 using F32 = float;
...
@@ -12,11 +12,12 @@ template <typename Tuple>
 class TestLayernorm2d : public ::testing::Test
 {
     protected:
-    using XDataType       = std::tuple_element_t<0, Tuple>;
-    using GammaDataType   = std::tuple_element_t<1, Tuple>;
-    using BetaDataType    = std::tuple_element_t<2, Tuple>;
-    using ComputeDataType = std::tuple_element_t<3, Tuple>;
-    using YDataType       = std::tuple_element_t<4, Tuple>;
+    using XDataType              = std::tuple_element_t<0, Tuple>;
+    using GammaDataType          = std::tuple_element_t<1, Tuple>;
+    using BetaDataType           = std::tuple_element_t<2, Tuple>;
+    using ComputeDataType        = std::tuple_element_t<3, Tuple>;
+    using YDataType              = std::tuple_element_t<4, Tuple>;
+    using SaveMeanInvStdDataType = std::tuple_element_t<5, Tuple>;

     void Run()
     {
...
@@ -31,6 +32,8 @@ class TestLayernorm2d : public ::testing::Test
             BetaDataType, ComputeDataType, YDataType, SaveMeanInvStdDataType, true, 2>(
             true, 2, false, false, length);
         EXPECT_TRUE(success);
     }
...
@@ -39,7 +42,7 @@ class TestLayernorm2d : public ::testing::Test
 using KernelTypes = ::testing::Types<
     // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType>
-    std::tuple<F16, F16, F16, F32, F16>>;
+    std::tuple<F16, F16, F16, F32, F16, F32>>;

 TYPED_TEST_SUITE(TestLayernorm2d, KernelTypes);
 TYPED_TEST(TestLayernorm2d, Test_FP16) { this->Run(); }
test/normalization/test_layernorm2d_fp32.cpp → test/normalization_fwd/test_layernorm2d_fwd_fp32.cpp

...
@@ -2,7 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include "gtest/gtest.h"
-#include "profiler/profile_layernorm_impl.hpp"
+#include "profiler/profile_layernorm_fwd_impl.hpp"

 using F16 = ck::half_t;
 using F32 = float;
...
@@ -12,11 +12,12 @@ template <typename Tuple>
 class TestLayernorm2d : public ::testing::Test
 {
     protected:
-    using XDataType       = std::tuple_element_t<0, Tuple>;
-    using GammaDataType   = std::tuple_element_t<1, Tuple>;
-    using BetaDataType    = std::tuple_element_t<2, Tuple>;
-    using ComputeDataType = std::tuple_element_t<3, Tuple>;
-    using YDataType       = std::tuple_element_t<4, Tuple>;
+    using XDataType              = std::tuple_element_t<0, Tuple>;
+    using GammaDataType          = std::tuple_element_t<1, Tuple>;
+    using BetaDataType           = std::tuple_element_t<2, Tuple>;
+    using ComputeDataType        = std::tuple_element_t<3, Tuple>;
+    using YDataType              = std::tuple_element_t<4, Tuple>;
+    using SaveMeanInvStdDataType = std::tuple_element_t<5, Tuple>;

     void Run()
     {
...
@@ -31,6 +32,8 @@ class TestLayernorm2d : public ::testing::Test
             BetaDataType, ComputeDataType, YDataType, SaveMeanInvStdDataType, true, 2>(
             true, 2, false, false, length);
         EXPECT_TRUE(success);
     }
...
@@ -39,7 +42,7 @@ class TestLayernorm2d : public ::testing::Test
 using KernelTypes = ::testing::Types<
     // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType>
-    std::tuple<F32, F32, F32, F32, F32>>;
+    std::tuple<F32, F32, F32, F32, F32, F32>>;

 TYPED_TEST_SUITE(TestLayernorm2d, KernelTypes);
 TYPED_TEST(TestLayernorm2d, Test_FP32) { this->Run(); }
test/normalization_fwd/test_layernorm4d_fwd_fp16.cpp
0 → 100644

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#include "gtest/gtest.h"
#include "profiler/profile_layernorm_fwd_impl.hpp"

using F16 = ck::half_t;
using F32 = float;

using ck::index_t;

template <typename Tuple>
class TestLayernorm4d : public ::testing::Test
{
    protected:
    using XDataType              = std::tuple_element_t<0, Tuple>;
    using GammaDataType          = std::tuple_element_t<1, Tuple>;
    using BetaDataType           = std::tuple_element_t<2, Tuple>;
    using ComputeDataType        = std::tuple_element_t<3, Tuple>;
    using YDataType              = std::tuple_element_t<4, Tuple>;
    using SaveMeanInvStdDataType = std::tuple_element_t<5, Tuple>;

    void Run()
    {
        // [N, D], reduce D
        std::vector<std::vector<ck::index_t>> lengths = {
            {1, 1, 1, 1}, {7, 7, 7, 7}, {256, 16, 16, 8}};

        for(auto length : lengths)
        {
            bool success = ck::profiler::profile_layernorm_impl<XDataType,
                                                                GammaDataType,
                                                                BetaDataType,
                                                                ComputeDataType,
                                                                YDataType,
                                                                SaveMeanInvStdDataType,
                                                                true,
                                                                4>(true, 2, false, false, length);
            EXPECT_TRUE(success);
        }
    }
};

using KernelTypes = ::testing::Types<
    // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType>
    std::tuple<F16, F16, F16, F32, F16, F32>>;

TYPED_TEST_SUITE(TestLayernorm4d, KernelTypes);
TYPED_TEST(TestLayernorm4d, Test_FP16) { this->Run(); }
test/transpose/CMakeLists.txt
0 → 100644

list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
set(target 0)
foreach(gpu IN LISTS GPU_TARGETS)
    if(gpu IN_LIST gpu_list AND target EQUAL 0)
        add_gtest_executable(test_transpose test_transpose.cpp)
        target_link_libraries(test_transpose PRIVATE utility device_transpose_instance)
        set(target 1)
    endif()
endforeach()
test/transpose/test_transpose.cpp
0 → 100644

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include <tuple>

#include "gtest/gtest.h"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "test_transpose_util.hpp"

using F16 = ck::half_t;
using F32 = float;

template <typename Tuple>
class TestTranspose : public ::testing::Test
{
};

// clang-format off
using KernelTypes = ::testing::Types<
    std::tuple<F16, F16>,
    std::tuple<F32, F32>
    >;
// clang-format on

TYPED_TEST_SUITE(TestTranspose, KernelTypes);
//#include "test_transpose_ut_cases.inc"
test/transpose/test_transpose_ut_cases.inc
0 → 100644

#pragma once

TYPED_TEST(TestTranspose, Test1)
{
    // for 16, 8, 16, 32, 8
    std::vector<int> Ms{1, 2, 3, 4, 5, 6};
    std::vector<index_t> lengths{16, 8, 16, 32, 8};
    /**constexpr int N = 16;
    constexpr int C = 8;
    constexpr int D = 16;
    constexpr int H = 32;
    constexpr int W = 8;**/
    this->Run();
}

TYPED_TEST(TestTranspose, Test2)
{
    std::vector<int> Ms{127, 255, 312, 799, 1573};
    std::vector<index_t> lengths{16, 8, 16, 32, 16};
    /**constexpr int N = 16;
    constexpr int C = 8;
    constexpr int D = 16;
    constexpr int H = 32;
    constexpr int W = 8;**/
    this->Run();
}
test/transpose/test_transpose_util.hpp
0 → 100644

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <string>
#include <sstream>
#include <tuple>
#include <vector>

#include <gtest/gtest.h>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "include/ck/utility/data_type.hpp"
#include "profiler/profile_transpose_impl.hpp"

namespace ck {
namespace test {

template <typename Tuple>
class TestTranspose : public testing::Test
{
    using F32 = float;

    protected:
    using ADataType = std::tuple_element_t<0, Tuple>;
    using BDataType = std::tuple_element_t<1, Tuple>;

    public:
    static constexpr bool verify_     = true;
    static constexpr int init_method_ = 1; // decimal value initialization
    static constexpr bool log_        = false;
    static constexpr bool bench_      = false; // measure kernel performance

    std::vector<std::vector<index_t>> lengths_ = {{16, 32, 16, 32, 16}, {16, 8, 16, 32, 8}};

    void Run()
    {
        for(auto length : this->lengths_)
        {
            this->RunSingle(length);
        }
    }

    void RunSingle(const std::vector<index_t>& length)
    {
        bool pass = ck::profiler::profile_transpose_impl<ADataType, BDataType, 5>(
            verify_, init_method_, log_, bench_, length);
        EXPECT_TRUE(pass);
    }
};

} // namespace test
} // namespace ck
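Once the commented-out include in test_transpose.cpp is restored, cases such as those in test_transpose_ut_cases.inc drive this fixture. A minimal hypothetical case (not part of this diff) showing the intended flow:

    // Hypothetical case using the fixture above: override the shapes under
    // test, then run verification across every length set.
    TYPED_TEST(TestTranspose, SmallShapes)
    {
        this->lengths_ = {{4, 4, 4, 4, 4}}; // one rank-5 shape
        this->Run();                        // checks profile_transpose_impl per shape
    }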