Merge branch 'develop' into dl_conv_multiple_d

f0224f2a · letaoqin · befc2638 · 0e9c88ce · f0224f2a · f0224f2a
Commit f0224f2a authored Nov 29, 2022 by letaoqin
11 changed files
--- a/test/batched_gemm_softmax_gemm_permute/CMakeLists.txt
+++ b/test/batched_gemm_softmax_gemm_permute/CMakeLists.txt
 add_custom_target(test_batched_gemm_softmax_gemm_permute)

 add_gtest_executable(test_batched_gemm_softmax_gemm_permute_fp16 test_batched_gemm_softmax_gemm_permute_fp16.cpp)
+add_gtest_executable(test_batched_gemm_softmax_gemm_permute_bf16 test_batched_gemm_softmax_gemm_permute_bf16.cpp)
 target_link_libraries(test_batched_gemm_softmax_gemm_permute_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
-add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_fp16)
\ No newline at end of file
+target_link_libraries(test_batched_gemm_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
+add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_fp16)
+add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_bf16)
\ No newline at end of file
--- a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_bf16.cpp
+++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_bf16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gtest/gtest.h"
+#include "test_batched_gemm_softmax_gemm_permute_util.hpp"
+
+// BF16 flavour of the shared typed-test fixture. It adds no members of its own; it only
+// gives the BF16 suite a distinct fixture name on top of the common base.
+template <class Tuple>
+class TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16
+    : public TestBatchedGemmMaskingScaleSoftmaxGemmPermute<Tuple>
+{
+};
+
+// Masking modes wrapped as types so they can be carried inside a std::tuple.
+using MaskOutUpperTriangle_t =
+    ck::integral_constant<MaskingSpecialization, MaskingSpecialization::MaskOutUpperTriangle>;
+using MaskDisabled_t =
+    ck::integral_constant<MaskingSpecialization, MaskingSpecialization::MaskDisabled>;
+
+// Compile-time integers used for the leading tuple slots.
+using I2_t = ck::Number<2>;
+using I1_t = ck::Number<1>;
+
+// clang-format off
+// One tuple per kernel configuration; the two cases differ only in the masking mode.
+// NOTE(review): the slot order is decoded by the base fixture, which is not shown here —
+// presumably <NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, A, B0, B1, C, Acc0Bias, Acc1Bias, Mask>.
+using Bf16MaskDisabledCase =
+    std::tuple<I2_t, I1_t, I1_t, I1_t, I1_t, BF16, BF16, BF16, BF16, ck::Tuple<>, ck::Tuple<>, MaskDisabled_t>;
+using Bf16MaskOutUpperTriangleCase =
+    std::tuple<I2_t, I1_t, I1_t, I1_t, I1_t, BF16, BF16, BF16, BF16, ck::Tuple<>, ck::Tuple<>, MaskOutUpperTriangle_t>;
+
+using KernelTypes = ::testing::Types<Bf16MaskDisabledCase, Bf16MaskOutUpperTriangleCase>;
+// clang-format on
+
+TYPED_TEST_SUITE(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, KernelTypes);
+
+// Runs the fixture without overriding lengths_, i.e. with the fixture's own problem sizes.
+TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Test_BF16)
+{
+    this->Run();
+}
+
+// Single problem whose first entry (136) is the only one that is not 32 or 128.
+// NOTE(review): entries are presumably {M, N, K, O, G0, G1} — confirm against the fixture.
+TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Test_BF16_PadM)
+{
+    using ProblemSizes = std::vector<std::vector<int>>;
+
+    this->lengths_ = ProblemSizes{{136, 128, 32, 128, 2, 3}};
+    this->Run();
+}
+
+// Single problem whose second entry (136) is the only one that is not 32 or 128.
+TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Test_BF16_PadN)
+{
+    using ProblemSizes = std::vector<std::vector<int>>;
+
+    this->lengths_ = ProblemSizes{{128, 136, 32, 128, 3, 2}};
+    this->Run();
+}
+
+// Two problems that vary only the third entry (40, then 136) and the trailing batch sizes.
+TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Test_BF16_PadK)
+{
+    using ProblemSizes = std::vector<std::vector<int>>;
+
+    this->lengths_ = ProblemSizes{{128, 128, 40, 128, 2, 4}, {128, 128, 136, 128, 4, 2}};
+    this->Run();
+}
+
+// Single problem whose fourth entry (136) is the only one that is not 32 or 128.
+TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Test_BF16_PadO)
+{
+    using ProblemSizes = std::vector<std::vector<int>>;
+
+    this->lengths_ = ProblemSizes{{128, 128, 32, 136, 1, 3}};
+    this->Run();
+}
+
+// Single problem with an odd first entry (129).
+TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Test_BF16_OddM)
+{
+    using ProblemSizes = std::vector<std::vector<int>>;
+
+    this->lengths_ = ProblemSizes{{129, 128, 32, 128, 2, 3}};
+    this->Run();
+}
+
+// Single problem with an odd second entry (129).
+TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Test_BF16_OddN)
+{
+    using ProblemSizes = std::vector<std::vector<int>>;
+
+    this->lengths_ = ProblemSizes{{128, 129, 32, 128, 4, 3}};
+    this->Run();
+}
+
+// Two problems with an odd third entry (33, then 129).
+TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Test_BF16_OddK)
+{
+    using ProblemSizes = std::vector<std::vector<int>>;
+
+    this->lengths_ = ProblemSizes{{128, 128, 33, 128, 2, 3}, {128, 128, 129, 128, 2, 3}};
+    this->Run();
+}
+
+// Single problem with an odd fourth entry (129). Kernels whose B1Layout is RowMajor are
+// expected not to support an odd O size.
+TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Test_BF16_OddO)
+{
+    using ProblemSizes = std::vector<std::vector<int>>;
+
+    this->lengths_ = ProblemSizes{{128, 128, 32, 129, 2, 3}};
+    this->Run();
+}
+
+// Benchmark-only case (the DISABLED_ prefix keeps googletest from running it by default):
+// timing is switched on and result verification is switched off.
+TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, DISABLED_Bench_BF16_IrregularK)
+{
+    using ProblemSizes = std::vector<std::vector<int>>;
+
+    this->bench_  = true;
+    this->verify_ = false;
+
+    // Third and fourth entries use 160 / 80 / 40 rather than the 64 / 128 of the regular bench.
+    this->lengths_ = ProblemSizes{
+        {256, 256, 160, 160, 1, 16},
+        {256, 64, 160, 64, 1, 16},
+        {1024, 1024, 80, 80, 1, 16},
+        {1024, 64, 80, 64, 1, 16},
+        {4096, 4096, 40, 40, 1, 16},
+        {4096, 64, 40, 64, 1, 16},
+    };
+    this->Run();
+}
+
+// Benchmark-only case (disabled by default via the DISABLED_ prefix): square leading sizes
+// from 256 to 4096, each paired with 64 and 128 for the third and fourth entries.
+TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, DISABLED_Bench_BF16)
+{
+    using ProblemSizes = std::vector<std::vector<int>>;
+
+    // Time the kernels, skip the correctness check.
+    this->bench_  = true;
+    this->verify_ = false;
+
+    this->lengths_ = ProblemSizes{
+        {256, 256, 64, 64, 48, 16},
+        {256, 256, 128, 128, 48, 16},
+        {512, 512, 64, 64, 48, 16},
+        {512, 512, 128, 128, 48, 16},
+        {1024, 1024, 64, 64, 48, 16},
+        {1024, 1024, 128, 128, 48, 16},
+        {2048, 2048, 64, 64, 48, 16},
+        {2048, 2048, 128, 128, 48, 16},
+        {4096, 4096, 64, 64, 48, 16},
+        {4096, 4096, 128, 128, 48, 16},
+    };
+    this->Run();
+}
+
+using ck::tensor_operation::device::GemmSpecialization;
+
+TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteInterface, GemmSpecializationSizeMatch)
+{
+    // 120 is not a multiple of the 128 / 32 tile sizes in the wrapper's name, so a dimension
+    // of that length requires padding; 128 is a multiple of both and does not.
+    constexpr int kUnaligned = 120;
+    constexpr int kAligned   = 128;
+
+    // Arguments are IsSupported(M, N, K, O). Each case passes the unaligned length for
+    // exactly the dimensions named in its specialization and expects the kernel to accept it.
+    // clang-format off
+    EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::Default>{}.IsSupported(kAligned, kAligned, kAligned, kAligned));
+    EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::MPadding>{}.IsSupported(kUnaligned, kAligned, kAligned, kAligned));
+    EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::NPadding>{}.IsSupported(kAligned, kUnaligned, kAligned, kAligned));
+    EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::KPadding>{}.IsSupported(kAligned, kAligned, kUnaligned, kAligned));
+    EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::MNPadding>{}.IsSupported(kUnaligned, kUnaligned, kAligned, kAligned));
+    EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::MKPadding>{}.IsSupported(kUnaligned, kAligned, kUnaligned, kAligned));
+    EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::NKPadding>{}.IsSupported(kAligned, kUnaligned, kUnaligned, kAligned));
+    EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::MNKPadding>{}.IsSupported(kUnaligned, kUnaligned, kUnaligned, kAligned));
+    EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::OPadding>{}.IsSupported(kAligned, kAligned, kAligned, kUnaligned));
+    EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::MOPadding>{}.IsSupported(kUnaligned, kAligned, kAligned, kUnaligned));
+    EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::NOPadding>{}.IsSupported(kAligned, kUnaligned, kAligned, kUnaligned));
+    EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::KOPadding>{}.IsSupported(kAligned, kAligned, kUnaligned, kUnaligned));
+    EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::MNOPadding>{}.IsSupported(kUnaligned, kUnaligned, kAligned, kUnaligned));
+    EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::MKOPadding>{}.IsSupported(kUnaligned, kAligned, kUnaligned, kUnaligned));
+    EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::NKOPadding>{}.IsSupported(kAligned, kUnaligned, kUnaligned, kUnaligned));
+    EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::MNKOPadding>{}.IsSupported(kUnaligned, kUnaligned, kUnaligned, kUnaligned));
+    // clang-format on
+}
+
+TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteInterface, GemmSpecializationSizeMismatch)
+{
+    constexpr int kAligned = 128;
+
+    // Arguments are IsSupported(M, N, K, O); every case below must be rejected.
+    // clang-format off
+    // An unaligned length (120) in a dimension the specialization does not name: K under
+    // Default, then O under MNKPadding.
+    EXPECT_FALSE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::Default>{}.IsSupported(kAligned, kAligned, 120, kAligned));
+    EXPECT_FALSE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::MNKPadding>{}.IsSupported(kAligned, kAligned, kAligned, 120));
+    // Even with every dimension padded, the kernel can't support an odd K size: SrcVectorDim == KDim,
+    // so SizeKRaw % ABSrcScalarPerVector == 0 must hold. K = 130 is even yet is rejected as well,
+    // i.e. the requirement is divisibility by the vector width, not merely evenness.
+    EXPECT_FALSE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::MNKOPadding>{}.IsSupported(kAligned, kAligned, 129, kAligned));
+    EXPECT_FALSE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::MNKOPadding>{}.IsSupported(kAligned, kAligned, 130, kAligned));
+    // Likewise for an odd O size: SrcVectorDim == ODim, so SizeORaw % B1SrcScalarPerVector == 0 must hold.
+    EXPECT_FALSE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::MNKOPadding>{}.IsSupported(kAligned, kAligned, kAligned, 129));
+    // clang-format on
+}
+
+// Assorted problem sizes (49, 1020, 576, ...) that are not covered by the Pad*/Odd* cases.
+TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, AdhocTest)
+{
+    using ProblemSizes = std::vector<std::vector<int>>;
+
+    this->lengths_ = ProblemSizes{
+        {49, 49, 64, 64, 4, 6},
+        {64, 49, 64, 64, 4, 6},
+        {1020, 1020, 64, 128, 4, 6},
+        {576, 576, 64, 64, 4, 6},
+    };
+    this->Run();
+}
--- a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp
+++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp
@@ -16,7 +16,8 @@ using ck::tensor_operation::device::TensorSpecialization;
 template <ck::index_t N>
 using I = ck::Number<N>;

-using F16 = ck::half_t;
+using F16  = ck::half_t;
+using BF16 = ck::bhalf_t;

 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;
@@ -63,7 +64,7 @@ struct TestBatchedGemmMaskingScaleSoftmaxGemmPermute : public ::testing::Test
                                                                         ck::Tuple<>,
                                                                         ck::Tuple<>,
                                                                         MaskingType::value>(
-                verify_, 1, false, bench_, M, N, K, O, G0, G1);
+                verify_, 2, false, bench_, M, N, K, O, G0, G1);

        EXPECT_TRUE(pass);
    }
@@ -224,3 +225,144 @@ struct DeviceInstanceWrapper_G2M1N1K1O1_TNTT_FP16_M128_N128_K32_O128
        return gemm.IsSupportedArgument(argument);
    }
 };
+
+// Wraps one fixed BF16 instance of DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle so the
+// interface tests can ask whether a given (M, N, K, O) problem is accepted under a particular
+// GemmSpecialization. Everything except GemmSpec is hard-coded, including the masking mode
+// (MaskOutUpperTriangle).
+template <GemmSpecialization GemmSpec>
+struct DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128
+{
+    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+    using Scale       = ck::tensor_operation::element_wise::Scale;
+
+    template <ck::index_t... Is>
+    using S = ck::Sequence<Is...>;
+
+    using ADataType        = BF16;
+    using B0DataType       = BF16;
+    using B1DataType       = BF16;
+    using AccDataType      = float;
+    using CShuffleDataType = BF16;
+    using CDataType        = BF16;
+
+    using AElementOp    = PassThrough;
+    using B0ElementOp   = PassThrough;
+    using Acc0ElementOp = Scale;
+    using B1ElementOp   = PassThrough;
+    using CElementOp    = PassThrough;
+
+    // NOTE(review): the five leading integers presumably are NumDimG, NumDimM, NumDimN,
+    // NumDimK and NumDimO, matching the "G2M1N1K1O1" part of the struct name — confirm
+    // against the device op's parameter list.
+    using DeviceGemmGemmInstance =
+        ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<
+            2,
+            1,
+            1,
+            1,
+            1,
+            ADataType,
+            B0DataType,
+            B1DataType,
+            CDataType,
+            ck::Tuple<>,
+            ck::Tuple<>,
+            AccDataType,
+            CShuffleDataType,
+            AElementOp,
+            B0ElementOp,
+            Acc0ElementOp,
+            B1ElementOp,
+            CElementOp,
+            GemmSpec,
+            TensorSpecialization::Default, // ATensorSpec
+            TensorSpecialization::Default, // B0TensorSpec
+            TensorSpecialization::Default, // B1TensorSpec
+            TensorSpecialization::Default, // CTensorSpec
+            1,
+            256,
+            128,         // MPerBlock
+            128,         // NPerBlock
+            32,          // KPerBlock
+            128,         // Gemm1NPerBlock
+            32,          // Gemm1KPerBlock
+            8,           // AK1
+            8,           // BK1
+            2,           // B1K1
+            32,          // MPerXDL
+            32,          // NPerXDL
+            1,           // MXdlPerWave
+            4,           // NXdlPerWave
+            4,           // Gemm1NXdlPerWave
+            S<4, 64, 1>, // ABlockTransfer
+            S<1, 0, 2>,
+            S<1, 0, 2>,
+            2,
+            8,
+            8,
+            true,
+            S<4, 64, 1>, // BBlockTransfer
+            S<1, 0, 2>,
+            S<1, 0, 2>,
+            2,
+            8,
+            8,
+            true,
+            S<8, 32, 1>, // B1BlockTransfer
+            S<0, 2, 1>,
+            S<0, 2, 1>,
+            1,
+            4,
+            2,
+            false,
+            1,              // CShuffleMXdlPerWavePerShuffle
+            2,              // CShuffleNXdlPerWavePerShuffle
+            S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+            8,              // CShuffleBlockTransferScalarPerVector_NPerBlock
+            MaskingSpecialization::MaskOutUpperTriangle>; // masking mode (fixed)
+
+    // Returns whether the wrapped device op accepts a problem of the given sizes.
+    //
+    // Builds an argument with both batch dimensions set to 1 and null data pointers (nothing
+    // is launched; only the lengths and strides are meaningful) and forwards it to
+    // IsSupportedArgument().
+    bool IsSupported(int M, int N, int K, int O)
+    {
+        const int G0 = 1;
+        const int G1 = 1;
+
+        // Lengths are always listed as {G0, G1, <rows>, <cols>}; the strides encode the memory
+        // layout named in each comment.
+
+        // A layout [G0, M, G1, K]
+        std::vector<ck::index_t> a_gs_ms_ks_lengths{G0, G1, M, K};
+        std::vector<ck::index_t> a_gs_ms_ks_strides{M * G1 * K, K, G1 * K, 1};
+
+        // B0 layout [G0, N, G1, K]
+        std::vector<ck::index_t> b0_gs_ns_ks_lengths{G0, G1, N, K};
+        std::vector<ck::index_t> b0_gs_ns_ks_strides{N * G1 * K, K, G1 * K, 1};
+
+        // B1 layout [G0, N, G1, O]
+        std::vector<ck::index_t> b1_gs_os_ns_lengths{G0, G1, O, N};
+        std::vector<ck::index_t> b1_gs_os_ns_strides{N * G1 * O, O, 1, G1 * O};
+
+        // C layout [G0, M, G1, O]
+        std::vector<ck::index_t> c_gs_ms_os_lengths{G0, G1, M, O};
+        std::vector<ck::index_t> c_gs_ms_os_strides{M * G1 * O, O, G1 * O, 1};
+
+        auto gemm     = DeviceGemmGemmInstance{};
+        auto argument = gemm.MakeArgument(static_cast<ADataType*>(nullptr),
+                                          static_cast<B0DataType*>(nullptr),
+                                          static_cast<B1DataType*>(nullptr),
+                                          static_cast<CDataType*>(nullptr),
+                                          {}, // p_acc0_biases
+                                          {}, // p_acc1_biases
+                                          a_gs_ms_ks_lengths,
+                                          a_gs_ms_ks_strides,
+                                          b0_gs_ns_ks_lengths,
+                                          b0_gs_ns_ks_strides,
+                                          b1_gs_os_ns_lengths,
+                                          b1_gs_os_ns_strides,
+                                          c_gs_ms_os_lengths,
+                                          c_gs_ms_os_strides,
+                                          {},             // acc0_biases_gs_ms_ns_lengths
+                                          {},             // acc0_biases_gs_ms_ns_strides
+                                          {},             // acc1_biases_gs_ms_os_lengths
+                                          {},             // acc1_biases_gs_ms_os_strides
+                                          PassThrough{},  // a_element_op
+                                          PassThrough{},  // b0_element_op
+                                          Scale{1.f},     // acc0_element_op
+                                          PassThrough{},  // b1_element_op
+                                          PassThrough{}); // c_element_op
+
+        return gemm.IsSupportedArgument(argument);
+    }
+};
--- a/test/batchnorm_fwd/CMakeLists.txt
+++ b/test/batchnorm_fwd/CMakeLists.txt
+add_gtest_executable(test_batchnorm_fwd_rank_4 batchnorm_fwd_rank_4.cpp)
+target_link_libraries(test_batchnorm_fwd_rank_4 PRIVATE utility device_batchnorm_instance)
--- a/test/batchnorm_fwd/batchnorm_fwd_rank_4.cpp
+++ b/test/batchnorm_fwd/batchnorm_fwd_rank_4.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+#include <initializer_list>
+#include <iostream>
+#include <limits>
+#include <tuple>
+#include <vector>
+#include <gtest/gtest.h>
+
+#include "profiler/include/profile_batchnorm_forward_impl.hpp"
+
+// Short aliases for the element types named in the type lists of this file.
+using F64  = double;
+using F32  = float;
+using F16  = ck::half_t;
+using BF16 = ck::bhalf_t;
+using I8   = int8_t;
+
+// Typed fixture that drives the rank-4 batchnorm-forward profiler over a fixed list of
+// tensor shapes. Tuple supplies, in order, the X, Y, accumulator, scale, bias and
+// mean/variance data types.
+template <typename Tuple>
+class TestBatchNormFwdRank4 : public ::testing::Test
+{
+    private:
+    // Both values are forwarded unchanged to every profiler call.
+    const double epsilon       = std::numeric_limits<float>::epsilon();
+    const double averageFactor = 0.1;
+
+    protected:
+    using XDataType       = std::tuple_element_t<0, Tuple>;
+    using YDataType       = std::tuple_element_t<1, Tuple>;
+    using AccDataType     = std::tuple_element_t<2, Tuple>;
+    using ScaleDataType   = std::tuple_element_t<3, Tuple>;
+    using BiasDataType    = std::tuple_element_t<4, Tuple>;
+    using MeanVarDataType = std::tuple_element_t<5, Tuple>;
+
+    // Shapes handed to the profiler one at a time, including the degenerate {1, 1, 1, 1}.
+    std::vector<std::vector<size_t>> list_of_lengths = {
+        {128, 16, 3, 1024}, {128, 16, 6, 512}, {1, 1, 1, 1}, {4, 4, 4, 4}, {32, 32, 32, 32}};
+    // Dimensions to reduce over; each test sets this before calling Run().
+    std::vector<int> reduceDims;
+
+    // Profiles every shape in list_of_lengths twice — once with the two trailing boolean
+    // options enabled and once with them disabled — and records a test failure for any shape
+    // where a profiler call reports failure.
+    //
+    // NumReduceDim must equal reduceDims.size(); otherwise the test is aborted before any
+    // profiling happens.
+    template <int NumReduceDim>
+    void Run()
+    {
+        // reduceDims does not change inside the loop, so validate it once up front. The cast
+        // keeps the comparison between like-signed operands.
+        ASSERT_EQ(reduceDims.size(), static_cast<std::size_t>(NumReduceDim));
+
+        for(auto& inOutLengths : list_of_lengths)
+        {
+            // NOTE(review): the four leading literals and the two forwarded flags are
+            // positional profiler options whose names are not visible in this file —
+            // presumably (do_verification, init_method, do_dumpout, time_kernel) and
+            // (updateMovingAverage, saveMeanAndInvVariance); confirm against
+            // profile_batchnorm_forward_impl.
+            const auto run_profiler = [&](bool first_option, bool second_option) {
+                return ck::profiler::profile_batchnorm_forward_impl<XDataType,
+                                                                    YDataType,
+                                                                    AccDataType,
+                                                                    ScaleDataType,
+                                                                    BiasDataType,
+                                                                    MeanVarDataType,
+                                                                    4,
+                                                                    NumReduceDim>(true,
+                                                                                  3,
+                                                                                  false,
+                                                                                  false,
+                                                                                  inOutLengths,
+                                                                                  reduceDims,
+                                                                                  first_option,
+                                                                                  second_option,
+                                                                                  epsilon,
+                                                                                  averageFactor);
+            };
+
+            // && short-circuits: the second configuration is not run if the first one fails.
+            const bool pass = run_profiler(true, true) && run_profiler(false, false);
+
+            EXPECT_TRUE(pass);
+        }
+    }
+};
+
+// Each configuration is <X, Y, Acc, Scale, Bias, MeanVar>, matching the order in which the
+// fixture reads the tuple elements.
+using Fp16Config = std::tuple<F16, F16, F32, F16, F16, F32>;
+using Fp32Config = std::tuple<F32, F32, F32, F32, F32, F32>;
+using Bf16Config = std::tuple<BF16, BF16, F32, BF16, BF16, F32>;
+using Fp64Config = std::tuple<F64, F64, F64, F64, F64, F64>;
+
+using KernelTypes = ::testing::Types<Fp16Config, Fp32Config, Bf16Config, Fp64Config>;
+
+TYPED_TEST_SUITE(TestBatchNormFwdRank4, KernelTypes);
+
+// NHWC: reduce over dimensions 0, 1 and 2 (N, H, W), leaving the trailing channel dimension.
+TYPED_TEST(TestBatchNormFwdRank4, nhwc)
+{
+    this->reduceDims = std::vector<int>{0, 1, 2};
+    this->template Run<3>();
+}
+
+// NCHW: reduce over dimensions 0, 2 and 3 (N, H, W), leaving the channel dimension at index 1.
+TYPED_TEST(TestBatchNormFwdRank4, nchw)
+{
+    this->reduceDims = std::vector<int>{0, 2, 3};
+    this->template Run<3>();
+}
--- a/test/convnd_bwd_weight/CMakeLists.txt
+++ b/test/convnd_bwd_weight/CMakeLists.txt
-add_gtest_executable(test_convnd_bwd_weight convnd_bwd_weight.cpp) 
-target_link_libraries(test_convnd_bwd_weight PRIVATE utility device_conv1d_bwd_weight_instance device_conv2d_bwd_weight_instance  device_conv3d_bwd_weight_instance)
--- a/test/gemm/gemm_util.hpp
+++ b/test/gemm/gemm_util.hpp
@@ -9,6 +9,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

 namespace ck {
@@ -128,15 +129,15 @@ struct TestGemm
    {
        auto f_host_tensor_descriptor =
            [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+                using namespace ck::literals;
+
                if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
                {
-                    return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                                std::vector<std::size_t>({stride, 1}));
+                    return HostTensorDescriptor({row, col}, {stride, 1_uz});
                }
                else
                {
-                    return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                                std::vector<std::size_t>({1, stride}));
+                    return HostTensorDescriptor({row, col}, {1_uz, stride});
                }
            };

@@ -229,27 +230,27 @@ struct TestGemm
            bool res = false;
            if(std::is_same<CDataType, float>::value)
            {
-                res = ck::utils::check_err(c_device.mData, c_host.mData);
+                res = ck::utils::check_err(c_device, c_host);
                std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
            }
            else if(std::is_same<CDataType, ck::half_t>::value)
            {
-                res = ck::utils::check_err(c_device.mData, c_host.mData);
+                res = ck::utils::check_err(c_device, c_host);
                std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
            }
            else if(std::is_same<CDataType, ck::bhalf_t>::value)
            {
-                res = ck::utils::check_err(c_device.mData, c_host.mData);
+                res = ck::utils::check_err(c_device, c_host);
                std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
            }
            else if(std::is_same<CDataType, int8_t>::value)
            {
-                res = ck::utils::check_err(c_device.mData, c_host.mData);
+                res = ck::utils::check_err(c_device, c_host);
                std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
            }
            else if(std::is_same<CDataType, double>::value)
            {
-                res = ck::utils::check_err(c_device.mData, c_host.mData);
+                res = ck::utils::check_err(c_device, c_host);
                std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
            }


--- a/test/gemm_split_k/gemm_split_k.cpp
+++ b/test/gemm_split_k/gemm_split_k.cpp
@@ -16,6 +16,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

 #include "ck/library/utility/host_gemm.hpp"
@@ -93,15 +94,15 @@ int test_gemm(const gemmArgs& args)

    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, bool row_major) {
+            using namespace ck::literals;
+
            if(row_major)
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({stride, 1}));
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
            }
            else
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({1, stride}));
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
            }
        };


--- a/test/grouped_convnd_bwd_weight/CMakeLists.txt
+++ b/test/grouped_convnd_bwd_weight/CMakeLists.txt
+add_gtest_executable(test_grouped_convnd_bwd_weight grouped_convnd_bwd_weight.cpp) 
+target_link_libraries(test_grouped_convnd_bwd_weight PRIVATE utility device_grouped_conv1d_bwd_weight_instance device_grouped_conv2d_bwd_weight_instance device_grouped_conv3d_bwd_weight_instance)
--- a/test/convnd_bwd_weight/convnd_bwd_weight.cpp
+++ b/test/convnd_bwd_weight/convnd_bwd_weight.cpp
@@ -4,14 +4,15 @@
 #include <cstdlib>
 #include <iostream>
 #include <initializer_list>
-#include <vector>
 #include <tuple>
+#include <vector>
+
 #include <gtest/gtest.h>

-#include "profiler/include/profile_conv_bwd_weight_impl.hpp"
+#include "profiler/include/profile_grouped_conv_bwd_weight_impl.hpp"

 template <typename Tuple>
-class TestConvndBwdWeight : public ::testing::Test
+class TestGroupedConvndBwdWeight : public ::testing::Test
 {
    protected:
    using DataType = std::tuple_element_t<0, Tuple>;
@@ -25,20 +26,20 @@ class TestConvndBwdWeight : public ::testing::Test
        {
            bool pass;
            EXPECT_FALSE(conv_params.empty());
-            pass = ck::profiler::profile_conv_bwd_weight_impl<
+            pass = ck::profiler::profile_grouped_conv_bwd_weight_impl<
                NDimSpatial,
                ck::tuple_element_t<NDimSpatial - 1,
-                                    ck::Tuple<ck::tensor_layout::convolution::NWC,
-                                              ck::tensor_layout::convolution::NHWC,
-                                              ck::tensor_layout::convolution::NDHWC>>,
+                                    ck::Tuple<ck::tensor_layout::convolution::GNWC,
+                                              ck::tensor_layout::convolution::GNHWC,
+                                              ck::tensor_layout::convolution::GNDHWC>>,
                ck::tuple_element_t<NDimSpatial - 1,
-                                    ck::Tuple<ck::tensor_layout::convolution::KXC,
-                                              ck::tensor_layout::convolution::KYXC,
-                                              ck::tensor_layout::convolution::KZYXC>>,
+                                    ck::Tuple<ck::tensor_layout::convolution::GKXC,
+                                              ck::tensor_layout::convolution::GKYXC,
+                                              ck::tensor_layout::convolution::GKZYXC>>,
                ck::tuple_element_t<NDimSpatial - 1,
-                                    ck::Tuple<ck::tensor_layout::convolution::NWK,
-                                              ck::tensor_layout::convolution::NHWK,
-                                              ck::tensor_layout::convolution::NDHWK>>,
+                                    ck::Tuple<ck::tensor_layout::convolution::GNWK,
+                                              ck::tensor_layout::convolution::GNHWK,
+                                              ck::tensor_layout::convolution::GNDHWK>>,
                DataType,
                DataType,
                DataType>(true,  // do_verification
@@ -54,37 +55,37 @@ class TestConvndBwdWeight : public ::testing::Test

 using KernelTypes =
    ::testing::Types<std::tuple<float>, std::tuple<ck::half_t>, std::tuple<ck::bhalf_t>>;
-TYPED_TEST_SUITE(TestConvndBwdWeight, KernelTypes);
+TYPED_TEST_SUITE(TestGroupedConvndBwdWeight, KernelTypes);

-TYPED_TEST(TestConvndBwdWeight, Test1D)
+TYPED_TEST(TestGroupedConvndBwdWeight, Test1D)
 {
    this->conv_params.clear();
-    this->conv_params.push_back({1, 1, 128, 128, 256, {1}, {14}, {2}, {1}, {0}, {0}});
-    this->conv_params.push_back({1, 1, 128, 128, 256, {3}, {28}, {1}, {1}, {1}, {1}});
-    this->conv_params.push_back({1, 1, 128, 128, 256, {1}, {3}, {1}, {1}, {0}, {0}});
+    this->conv_params.push_back({1, 4, 128, 128, 256, {1}, {14}, {2}, {1}, {0}, {0}});
+    this->conv_params.push_back({1, 4, 128, 128, 256, {3}, {28}, {1}, {1}, {1}, {1}});
+    this->conv_params.push_back({1, 4, 128, 128, 256, {1}, {3}, {1}, {1}, {0}, {0}});
    this->template Run<1>();
 }

-TYPED_TEST(TestConvndBwdWeight, Test2D)
+TYPED_TEST(TestGroupedConvndBwdWeight, Test2D)
 {
    this->conv_params.clear();
    this->conv_params.push_back(
-        {2, 1, 128, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}});
+        {2, 4, 128, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}});
    this->conv_params.push_back(
-        {2, 1, 32, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
+        {2, 4, 32, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
    this->conv_params.push_back(
-        {2, 1, 128, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}});
+        {2, 4, 128, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}});
    this->template Run<2>();
 }

-TYPED_TEST(TestConvndBwdWeight, Test3D)
+TYPED_TEST(TestGroupedConvndBwdWeight, Test3D)
 {
    this->conv_params.clear();
    this->conv_params.push_back(
-        {3, 1, 128, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
+        {3, 4, 128, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
    this->conv_params.push_back(
-        {3, 1, 32, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
+        {3, 4, 32, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
    this->conv_params.push_back(
-        {3, 1, 128, 128, 256, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
+        {3, 4, 128, 128, 256, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
    this->template Run<3>();
 }
--- a/test/reference_conv_fwd/reference_conv_fwd.cpp
+++ b/test/reference_conv_fwd/reference_conv_fwd.cpp
@@ -12,6 +12,7 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"

+#include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/fill.hpp"
 #include "ck/library/utility/host_tensor.hpp"
@@ -54,7 +55,7 @@ run_reference_convolution_forward(const ck::utils::conv::ConvParam& conv_param,

    fill_input_op(input.begin(), input.end());
    fill_weights_op(weights.begin(), weights.end());
-    std::fill(host_output.begin(), host_output.end(), OutDataType(0.f));
+    ck::ranges::fill<OutDataType>(host_output, 0.f);

    auto ref_conv     = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
                                                                 InDataType,
@@ -122,7 +123,7 @@ TEST(ReferenceConvolutionFWD, Conv2DGNHWC)
                                508.5};
    EXPECT_TRUE(ck::utils::check_err(
        out_tensor.mDesc.GetLengths(), ref_dims, "Error: wrong output tensor dimensions!"));
-    EXPECT_TRUE(ck::utils::check_err(out_tensor.mData, ref_data, "Error: incorrect results!"));
+    EXPECT_TRUE(ck::utils::check_err(out_tensor, ref_data, "Error: incorrect results!"));
 }

 TEST(ReferenceConvolutionFWD, Conv2DGNHWCStridesDilationsPadding)
@@ -149,7 +150,7 @@ TEST(ReferenceConvolutionFWD, Conv2DGNHWCStridesDilationsPadding)
        1323., 1323., 2002.5, 2002.5, 2038.5, 2038.5, 2074.5, 2074.5, 2110.5, 2110.5};
    EXPECT_TRUE(ck::utils::check_err(
        out_tensor.mDesc.GetLengths(), ref_dims, "Error: wrong output tensor dimensions!"));
-    EXPECT_TRUE(ck::utils::check_err(out_tensor.mData, ref_data, "Error: incorrect results!"));
+    EXPECT_TRUE(ck::utils::check_err(out_tensor, ref_data, "Error: incorrect results!"));
 }

 TEST(ReferenceConvolutionFWD, Conv1DGNWC)
@@ -178,7 +179,7 @@ TEST(ReferenceConvolutionFWD, Conv1DGNWC)
    std::vector<float> ref_data{7.5, 13.5, 19.5, 25.5};
    EXPECT_TRUE(ck::utils::check_err(
        out_tensor.mDesc.GetLengths(), ref_dims, "Error: wrong output tensor dimensions!"));
-    EXPECT_TRUE(ck::utils::check_err(out_tensor.mData, ref_data, "Error: incorrect results!"));
+    EXPECT_TRUE(ck::utils::check_err(out_tensor, ref_data, "Error: incorrect results!"));
 }

 TEST(ReferenceConvolutionFWD, Conv1DGNWCStridesDilationsPadding)
@@ -207,7 +208,7 @@ TEST(ReferenceConvolutionFWD, Conv1DGNWCStridesDilationsPadding)
    std::vector<float> ref_data{9., 9., 19.5, 19.5, 31.5, 31.5, 43.5, 43.5, 55.5, 55.5};
    EXPECT_TRUE(ck::utils::check_err(
        out_tensor.mDesc.GetLengths(), ref_dims, "Error: wrong output tensor dimensions!"));
-    EXPECT_TRUE(ck::utils::check_err(out_tensor.mData, ref_data, "Error: incorrect results!"));
+    EXPECT_TRUE(ck::utils::check_err(out_tensor, ref_data, "Error: incorrect results!"));
 }

 TEST(ReferenceConvolutionFWD, Conv1DGNWCSameOutputSize)
@@ -301,7 +302,7 @@ TEST(ReferenceConvolutionFWD, Conv1DGNWCSameOutputSize)
        49.4,      49.4,      49.4,      49.4,      49.4,      49.4,      49.4,      49.4};
    EXPECT_TRUE(ck::utils::check_err(
        out_tensor2.mDesc.GetLengths(), ref_dims, "Error: wrong output tensor dimensions!"));
-    EXPECT_TRUE(ck::utils::check_err(out_tensor2.mData, ref_data, "Error: incorrect results!"));
+    EXPECT_TRUE(ck::utils::check_err(out_tensor2, ref_data, "Error: incorrect results!"));
 }
 #endif

@@ -340,8 +341,7 @@ TEST(ReferenceConvolutionFWD, Conv3DGNCDHW)
    EXPECT_TRUE(ck::utils::check_err(out_tensor.mDesc.GetLengths(),
                                     ref_dims,
                                     "Error [case 1]: wrong output tensor dimensions!"));
-    EXPECT_TRUE(
-        ck::utils::check_err(out_tensor.mData, ref_data, "Error [case 1]: incorrect results!"));
+    EXPECT_TRUE(ck::utils::check_err(out_tensor, ref_data, "Error [case 1]: incorrect results!"));
 }

 TEST(ReferenceConvolutionFWD, Conv3DGNCDHWStridesDilations)
@@ -388,5 +388,5 @@ TEST(ReferenceConvolutionFWD, Conv3DGNCDHWStridesDilations)
                                     ref_dims,
                                     "Error [case 2]: wrong output tensor dimensions!"));
    EXPECT_TRUE(ck::utils::check_err(
-        out_tensor.mData, ref_data, "Error [case 2]: incorrect results!", 1e-4f, 1e-6f));
+        out_tensor, ref_data, "Error [case 2]: incorrect results!", 1e-4f, 1e-6f));
 }