Commit ec2b30b5 authored by Bartlomiej Kocot

Merge branch 'develop' of github.com:ROCmSoftwarePlatform/composable_kernel into barkocot/grouped-conv-weight-fp16-c1-k1
parents 822a1110 37a8c1f7
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <vector>
#include <unordered_map>
#include "profiler/data_type_enum.hpp"
#include "profiler/profile_max_pool3d_bwd_impl.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "profiler_operation_registry.hpp"
using ck::index_t;
struct maxPoolbwdArgParser
{
std::unordered_map<std::string, std::vector<int>> long_opts = {{"length", {}},
{"wsize", {}},
{"wstride", {}},
{"wdilation", {}},
{"pad1", {}},
{"pad2", {}}};
bool parse_opt(int argc, char* argv[], const std::string& key, int i)
{
if(std::string("--") + key == argv[i])
{
int pos = i;
while(++i < argc && argv[i][0] != '-') {}
int end = i;
for(int j = pos + 1; j < end; j++)
{
long_opts[key].push_back(std::stoi(argv[j]));
}
return true;
}
return false;
}
void operator()(int argc, char* argv[])
{
for(auto& kv : long_opts)
{
for(int i = 1; i < argc; i++)
{
if(parse_opt(argc, argv, kv.first, i))
break;
}
}
}
};
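// Usage sketch (illustrative comment, not part of the profiler itself): for
//   ckProfiler max_pool3d_bwd ... --wsize 2 2 2 --pad1 1 1 1
// parse_opt("wsize", i) matches "--wsize", then consumes integer tokens until it
// reaches the next token starting with '-', leaving long_opts["wsize"] == {2, 2, 2}.
// Note this also means a negative value would terminate the list early.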
void print_help_max_pool3d_bwd()
{
std::cout << "arg1: data type (0: fp16; 1: fp32; 5: bf16)\n"
<< "arg2: verification (0: no; 1: yes)\n"
<< "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n"
<< "arg4: print tensor value (0: no; 1: yes)\n"
<< "arg5: time kernel (0=no, 1=yes)\n"
<< "--length: input tensor length for NCDHW(e.g, --length 2 32 30 30 30) \n"
<< "--wsize: window size for ZYX (e.g, --wsize 2 2 2) \n"
<< "--wstride: window stride for DHW (e.g, --wstride 2 2 2) \n"
<< "--wdilation: window dilation for DHW (e.g, --wdilation 1 1 1) \n"
<< "--pad1: left side of padding in DHW (e.g, --pad1 1 1 1) \n"
<< "--pad2: right side of padding in DHW (e.g, --pad2 1 1 1) \n"
<< "eg: ckProfiler max_pool3d_bwd 0 1 2 0 1 --length 2 32 30 30 30 --wsize 2 2 2 "
"--wstride 2 2 2 --wdilation 1 1 1 --pad1 1 1 1 --pad2 1 1 1"
<< std::endl;
}
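// For reference (a hedged sketch of the standard pooling shape rule, not code
// from this commit): each output spatial length is
//   out = (in + pad1 + pad2 - wdilation * (wsize - 1) - 1) / wstride + 1
// so the default --length 2 32 30 30 30 with --wsize 2, --wstride 2,
// --wdilation 1 and pads 1/1 yields (30 + 1 + 1 - 1 - 1) / 2 + 1 = 16 per axis.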
int profile_max_pool3d_bwd(int argc, char* argv[])
{
ck::DataTypeEnum data_type = ck::DataTypeEnum::Half;
bool do_verification = true;
int init_method = 0;
bool do_log = false;
bool time_kernel = true;
std::vector<index_t> in_length = {2, 32, 30, 30, 30};
std::vector<index_t> wsize = {2, 2, 2};
std::vector<index_t> wstride = {2, 2, 2};
std::vector<index_t> wdilation = {1, 1, 1};
std::vector<index_t> pad1 = {1, 1, 1};
std::vector<index_t> pad2 = {1, 1, 1};
if(argc != 2 && argc != 33)
{
print_help_max_pool3d_bwd();
return 0;
}
else if(argc == 33)
{
data_type = static_cast<ck::DataTypeEnum>(std::stoi(argv[2]));
do_verification = std::stoi(argv[3]);
init_method = std::stoi(argv[4]);
do_log = std::stoi(argv[5]);
time_kernel = std::stoi(argv[6]);
// parse the long options
maxPoolbwdArgParser arg_parser;
arg_parser(argc, argv);
in_length = arg_parser.long_opts["length"];
wsize = arg_parser.long_opts["wsize"];
wstride = arg_parser.long_opts["wstride"];
wdilation = arg_parser.long_opts["wdilation"];
pad1 = arg_parser.long_opts["pad1"];
pad2 = arg_parser.long_opts["pad2"];
}
#ifdef CK_ENABLE_FP16
using F16 = ck::half_t;
#endif
#ifdef CK_ENABLE_BF16
using BF16 = ck::bhalf_t;
#endif
#ifdef CK_ENABLE_FP32
using F32 = float;
#endif
using I32 = int32_t;
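// Dispatch on the runtime data type. The vacuous "if(false);" head lets every
// branch below be an "else if" guarded by its own #ifdef, so the chain stays
// syntactically well-formed no matter which data types were compiled in.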
if(false)
;
#ifdef CK_ENABLE_FP16
else if(data_type == ck::DataTypeEnum::Half)
{
ck::profiler::profile_max_pool3d_bwd_impl<F16, F16, I32, F16, F16, false>(do_verification,
init_method,
do_log,
time_kernel,
in_length,
wsize,
wstride,
wdilation,
pad1,
pad2);
}
#endif
#ifdef CK_ENABLE_BF16
else if(data_type == ck::DataTypeEnum::BFloat16)
{
ck::profiler::profile_max_pool3d_bwd_impl<BF16, BF16, I32, BF16, BF16, false>(
do_verification,
init_method,
do_log,
time_kernel,
in_length,
wsize,
wstride,
wdilation,
pad1,
pad2);
}
#endif
#ifdef CK_ENABLE_FP32
else if(data_type == ck::DataTypeEnum::Float)
{
ck::profiler::profile_max_pool3d_bwd_impl<F32, F32, I32, F32, F32, false>(do_verification,
init_method,
do_log,
time_kernel,
in_length,
wsize,
wstride,
wdilation,
pad1,
pad2);
}
#endif
else
{
throw std::runtime_error("not implemented yet");
}
return 0;
}
REGISTER_PROFILER_OPERATION("max_pool3d_bwd", "max_pool3d bwd", profile_max_pool3d_bwd);
@@ -51,7 +51,7 @@ struct maxPoolFwdArgParser
void print_help_max_pool3d_fwd()
{
std::cout << "arg1: data type (0: fp16; 1: fp32)\n"
std::cout << "arg1: data type (0: fp16; 1: fp32; 5: bf16)\n"
<< "arg2: verification (0: no; 1: yes)\n"
<< "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n"
<< "arg4: print tensor value (0: no; 1: yes)\n"
@@ -109,8 +109,15 @@ int profile_max_pool3d_fwd(int argc, char* argv[])
pad2 = arg_parser.long_opts["pad2"];
}
-using F16 = ck::half_t;
-using F32 = float;
+#ifdef CK_ENABLE_FP16
+using F16 = ck::half_t;
+#endif
+#ifdef CK_ENABLE_BF16
+using BF16 = ck::bhalf_t;
+#endif
+#ifdef CK_ENABLE_FP32
+using F32 = float;
+#endif
using I32 = int32_t;
using NDHWC = ck::tensor_layout::convolution::NDHWC;
@@ -120,7 +127,10 @@ int profile_max_pool3d_fwd(int argc, char* argv[])
constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
#endif
-if(data_type == ck::DataTypeEnum::Half)
+if(false)
+    ;
+#ifdef CK_ENABLE_FP16
+else if(data_type == ck::DataTypeEnum::Half)
{
if(return_index)
ck::profiler::
@@ -149,6 +159,51 @@ int profile_max_pool3d_fwd(int argc, char* argv[])
pad1,
pad2);
}
#endif
#ifdef CK_ENABLE_BF16
else if(data_type == ck::DataTypeEnum::BFloat16)
{
if(return_index)
ck::profiler::profile_pool3d_fwd_impl<BF16,
BF16,
BF16,
I32,
NDHWC,
NDHWC,
ReduceOpId,
false,
true>(do_verification,
init_method,
do_log,
time_kernel,
in_length,
wsize,
wstride,
wdilation,
pad1,
pad2);
else
ck::profiler::profile_pool3d_fwd_impl<BF16,
BF16,
BF16,
I32,
NDHWC,
NDHWC,
ReduceOpId,
false,
false>(do_verification,
init_method,
do_log,
time_kernel,
in_length,
wsize,
wstride,
wdilation,
pad1,
pad2);
}
#endif
#ifdef CK_ENABLE_FP32
else if(data_type == ck::DataTypeEnum::Float)
{
if(return_index)
@@ -178,6 +233,7 @@ int profile_max_pool3d_fwd(int argc, char* argv[])
pad1,
pad2);
}
#endif
else
{
throw std::runtime_error("not implemented yet");
......
@@ -16,4 +16,3 @@ cmake
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
-D USE_BITINT_EXTENSION_INT4=OFF \
${MY_PROJECT_SOURCE}
@@ -57,9 +57,10 @@ add_subdirectory(data_type)
add_subdirectory(elementwise_normalization)
add_subdirectory(batchnorm)
add_subdirectory(contraction)
-add_subdirectory(pool_fwd)
+add_subdirectory(pool)
add_subdirectory(batched_gemm_multi_d)
add_subdirectory(grouped_convnd_bwd_data)
add_subdirectory(image_to_column)
if(GPU_TARGETS MATCHES "gfx11")
add_subdirectory(wmma_op)
endif()
add_gtest_executable(test_image_to_column test_image_to_column.cpp)
target_link_libraries(test_image_to_column PRIVATE utility device_image_to_column_instance)
add_gtest_executable(test_image_to_column_interface test_image_to_column_interface.cpp)
target_link_libraries(test_image_to_column_interface PRIVATE utility)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <tuple>
#include <vector>
#include <gtest/gtest.h>
#include "profiler/profile_image_to_column_impl.hpp"
template <typename Tuple>
class TestImageToColumn : public ::testing::Test
{
protected:
using InDataType = std::tuple_element_t<0, Tuple>;
using OutDataType = std::tuple_element_t<1, Tuple>;
using InLayout = std::tuple_element_t<2, Tuple>;
std::vector<ck::utils::conv::ConvParam> conv_params;
template <ck::index_t NDimSpatial>
void Run()
{
EXPECT_FALSE(conv_params.empty());
bool pass = true;
for(auto& param : conv_params)
{
pass = pass && ck::profiler::profile_image_to_column_impl<NDimSpatial,
InLayout,
InDataType,
OutDataType>(
true, // do_verification
1, // init_method: integer value
false, // do_log
false, // time_kernel
param);
}
EXPECT_TRUE(pass);
}
};
using namespace ck::tensor_layout::convolution;
using KernelTypes1d = ::testing::Types<std::tuple<float, float, GNWC>,
std::tuple<ck::bhalf_t, ck::bhalf_t, GNWC>,
std::tuple<ck::half_t, ck::half_t, GNWC>,
std::tuple<int8_t, int8_t, GNWC>>;
using KernelTypes2d = ::testing::Types<std::tuple<float, float, GNHWC>,
std::tuple<ck::bhalf_t, ck::bhalf_t, GNHWC>,
std::tuple<ck::half_t, ck::half_t, GNHWC>,
std::tuple<int8_t, int8_t, GNHWC>>;
using KernelTypes3d = ::testing::Types<std::tuple<float, float, GNDHWC>,
std::tuple<ck::bhalf_t, ck::bhalf_t, GNDHWC>,
std::tuple<ck::half_t, ck::half_t, GNDHWC>,
std::tuple<int8_t, int8_t, GNDHWC>>;
template <typename Tuple>
class TestImageToColumn1d : public TestImageToColumn<Tuple>
{
};
template <typename Tuple>
class TestImageToColumn2d : public TestImageToColumn<Tuple>
{
};
template <typename Tuple>
class TestImageToColumn3d : public TestImageToColumn<Tuple>
{
};
TYPED_TEST_SUITE(TestImageToColumn1d, KernelTypes1d);
TYPED_TEST_SUITE(TestImageToColumn2d, KernelTypes2d);
TYPED_TEST_SUITE(TestImageToColumn3d, KernelTypes3d);
TYPED_TEST(TestImageToColumn1d, Test1D)
{
this->conv_params.clear();
this->conv_params.push_back({1, 1, 4, 1, 192, {3}, {28}, {1}, {1}, {1}, {1}});
this->conv_params.push_back({1, 1, 64, 1, 64, {3}, {14}, {1}, {1}, {1}, {1}});
this->conv_params.push_back({1, 1, 64, 1, 64, {1}, {7}, {2}, {1}, {0}, {0}});
this->conv_params.push_back({1, 1, 64, 1, 64, {1}, {3}, {1}, {1}, {0}, {0}});
// ScalarPerVector should be 1
this->conv_params.push_back({1, 1, 4, 1, 1, {3}, {28}, {1}, {1}, {1}, {1}});
// stride != 1
this->conv_params.push_back({1, 1, 1, 1, 4, {3}, {28}, {2}, {1}, {1}, {1}});
// dilation != 1
this->conv_params.push_back({1, 1, 1, 1, 4, {3}, {28}, {1}, {2}, {1}, {1}});
this->template Run<1>();
}
TYPED_TEST(TestImageToColumn2d, Test2D)
{
this->conv_params.clear();
this->conv_params.push_back(
{2, 1, 4, 1, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
this->conv_params.push_back(
{2, 1, 64, 1, 64, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
this->conv_params.push_back({2, 1, 64, 1, 64, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}});
this->conv_params.push_back({2, 1, 64, 1, 64, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}});
this->template Run<2>();
}
TYPED_TEST(TestImageToColumn3d, Test3D)
{
this->conv_params.clear();
this->conv_params.push_back(
{3, 1, 16, 1, 64, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
this->conv_params.push_back(
{3, 1, 2, 1, 64, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
this->conv_params.push_back(
{3, 1, 32, 1, 64, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
this->template Run<3>();
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <tuple>
#include <vector>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include <gtest/gtest.h>
using DataType = float;
using InLayout = ck::tensor_layout::convolution::GNWC;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
template <ck::index_t ScalarPerVector, bool IsCPacked>
class TestImageToColumnInterface : public ::testing::Test
{
protected:
static constexpr ck::index_t NDimSpatial = 1;
// clang-format off
using DeviceImgToColInstance = ck::tensor_operation::device::DeviceImageToColumnImpl
//#####################| Num| InLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar|
//#####################| Dim| | | | Size| Block| Block| Cluster| Per|
//#####################| Spatial| | | | | | | Lengths| Vector|
//#####################| | | | | | | | | |
< NDimSpatial, InLayout, DataType, DataType, 256, 128, 128, S<16, 16>,ScalarPerVector>;
// clang-format on
ck::utils::conv::ConvParam conv_param;
bool Run()
{
const auto N = conv_param.N_;
const auto C = conv_param.C_;
const auto FakeC =
conv_param.C_ / 2; // halved C simulates an input whose C dimension is not packed
const ck::index_t NDoHoWo =
N *
ck::accumulate_n<ck::index_t>(
conv_param.output_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
const ck::index_t CZYX =
C *
ck::accumulate_n<ck::index_t>(
conv_param.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
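// Worked example (hedged, based on the first 1D interface test below): with
// N = 1, C = 1, filter {3}, input {3}, stride 1 and no padding, the output
// spatial length is 1, so NDoHoWo = 1 and CZYX = 3, i.e. a 1 x 3 column matrix.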
const auto in_desc =
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(
conv_param);
const auto out_desc = HostTensorDescriptor({NDoHoWo, CZYX});
std::array<ck::index_t, NDimSpatial> input_spatial_lengths{};
std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{};
std::array<ck::index_t, NDimSpatial> output_spatial_lengths{};
std::array<ck::index_t, NDimSpatial + 3> input_g_n_c_wis_strides{};
std::array<ck::index_t, 2> output_m_k_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
std::array<ck::index_t, NDimSpatial> input_left_pads{};
std::array<ck::index_t, NDimSpatial> input_right_pads{};
auto copy = [](const auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
copy(conv_param.input_spatial_lengths_, input_spatial_lengths);
copy(conv_param.filter_spatial_lengths_, filter_spatial_lengths);
copy(conv_param.output_spatial_lengths_, output_spatial_lengths);
copy(in_desc.GetStrides(), input_g_n_c_wis_strides);
copy(out_desc.GetStrides(), output_m_k_strides);
copy(conv_param.conv_filter_strides_, conv_filter_strides);
copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
copy(conv_param.input_left_pads_, input_left_pads);
copy(conv_param.input_right_pads_, input_right_pads);
auto img2col = DeviceImgToColInstance{};
auto argument = img2col.MakeArgument(nullptr,
nullptr,
N,
IsCPacked ? C : FakeC,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
input_g_n_c_wis_strides,
output_m_k_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads);
return img2col.IsSupportedArgument(argument);
}
};
class TestImageToColumnInterface1ScalarPerVector : public TestImageToColumnInterface<1, true>
{
};
class TestImageToColumnInterface4ScalarPerVector : public TestImageToColumnInterface<4, true>
{
};
class TestImageToColumnInterface4ScalarPerVectorFakeC : public TestImageToColumnInterface<4, false>
{
};
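// The cases below exercise the vectorized-load constraint (a reading inferred
// from the expectations, not stated in the kernel code): with ScalarPerVector = 1
// any length is trivially divisible, so every argument is supported; with
// ScalarPerVector = 4 the contiguous run being loaded (C * X, C * pad, or plain C
// once stride/dilation break contiguity) must be a multiple of 4; with FakeC the
// input is treated as non-packed, so only C itself can be vectorized.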
TEST_F(TestImageToColumnInterface1ScalarPerVector, X1ScalarPerVector)
{
// vector load C * X % ScalarPerVector
this->conv_param = {1, 1, 1, 1, 1, {3}, {3}, {1}, {1}, {0}, {0}};
bool is_supported = this->Run();
EXPECT_TRUE(is_supported);
// vector load C * left_pad_x % ScalarPerVector
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {3}, {0}};
is_supported = this->Run();
EXPECT_TRUE(is_supported);
// vector load C * right_pad_x % ScalarPerVector
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {0}, {3}};
is_supported = this->Run();
EXPECT_TRUE(is_supported);
// vector load C % ScalarPerVector, right_pad and stride
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {0}, {3}};
is_supported = this->Run();
EXPECT_TRUE(is_supported);
// vector load C % ScalarPerVector, left_pad and stride
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {3}, {0}};
is_supported = this->Run();
EXPECT_TRUE(is_supported);
// vector load C % ScalarPerVector, dilation
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {2}, {0}, {0}};
is_supported = this->Run();
EXPECT_TRUE(is_supported);
// C = 4
this->conv_param = {1, 1, 1, 1, 4, {3}, {3}, {1}, {1}, {3}, {3}};
is_supported = this->Run();
EXPECT_TRUE(is_supported);
}
TEST_F(TestImageToColumnInterface4ScalarPerVector, X4ScalarPerVector)
{
// vector load C * X % ScalarPerVector
this->conv_param = {1, 1, 1, 1, 1, {3}, {3}, {1}, {1}, {0}, {0}};
bool is_supported = this->Run();
EXPECT_FALSE(is_supported);
// vector load C * left_pad_x % ScalarPerVector
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {3}, {0}};
is_supported = this->Run();
EXPECT_FALSE(is_supported);
// vector load C * right_pad_x % ScalarPerVector
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {0}, {3}};
is_supported = this->Run();
EXPECT_FALSE(is_supported);
// vector load C % ScalarPerVector, right_pad and stride
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {0}, {3}};
is_supported = this->Run();
EXPECT_FALSE(is_supported);
// vector load C % ScalarPerVector, left_pad and stride
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {3}, {0}};
is_supported = this->Run();
EXPECT_FALSE(is_supported);
// vector load C % ScalarPerVector, dilation
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {2}, {0}, {0}};
is_supported = this->Run();
EXPECT_FALSE(is_supported);
// C = 4
this->conv_param = {1, 1, 1, 1, 4, {3}, {3}, {1}, {1}, {3}, {3}};
is_supported = this->Run();
EXPECT_TRUE(is_supported);
}
TEST_F(TestImageToColumnInterface4ScalarPerVectorFakeC, X4ScalarPerVectorFakeC)
{
// C = 3 (FakeC = 1)
this->conv_param = {1, 1, 1, 1, 3, {4}, {3}, {1}, {1}, {0}, {0}};
bool is_supported = this->Run();
EXPECT_FALSE(is_supported);
// C = 8 (FakeC = 4)
this->conv_param = {1, 1, 1, 1, 8, {4}, {3}, {1}, {1}, {0}, {0}};
is_supported = this->Run();
EXPECT_TRUE(is_supported);
}
-add_custom_target(test_pool_fwd)
+add_custom_target(test_pool)
add_gtest_executable(test_avg_pool3d_bwd test_avg_pool3d_bwd.cpp)
add_gtest_executable(test_max_pool3d_bwd test_max_pool3d_bwd.cpp)
add_gtest_executable(test_avg_pool3d_fwd test_avg_pool3d_fwd.cpp)
add_gtest_executable(test_max_pool3d_fwd test_max_pool3d_fwd.cpp)
target_link_libraries(test_avg_pool3d_bwd PRIVATE utility device_avg_pool3d_bwd_instance)
target_link_libraries(test_max_pool3d_bwd PRIVATE utility device_max_pool_bwd_instance)
target_link_libraries(test_avg_pool3d_fwd PRIVATE utility device_pool3d_fwd_instance)
target_link_libraries(test_max_pool3d_fwd PRIVATE utility device_pool3d_fwd_instance)
-add_dependencies(test_pool_fwd test_avg_pool3d_fwd)
-add_dependencies(test_pool_fwd test_max_pool3d_fwd)
+add_dependencies(test_pool test_avg_pool3d_bwd)
+add_dependencies(test_pool test_max_pool3d_bwd)
+add_dependencies(test_pool test_avg_pool3d_fwd)
+add_dependencies(test_pool test_max_pool3d_fwd)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "profiler/profile_avg_pool3d_bwd_impl.hpp"
#include "test_pool_fwd_common.hpp"
template <typename Tuple>
class TestAvgPool3dBwd : public ::testing::Test
{
protected:
using DOutDataType = std::tuple_element_t<0, Tuple>;
using DInDataType = std::tuple_element_t<1, Tuple>;
using ComputeDataType = std::tuple_element_t<2, Tuple>;
using DOutLayout = std::tuple_element_t<3, Tuple>;
using DInLayout = std::tuple_element_t<4, Tuple>;
std::vector<PoolingParam> params;
void Run()
{
for(const auto& param : params)
{
bool success =
ck::profiler::profile_avg_pool3d_bwd_impl<DOutDataType,
DInDataType,
ComputeDataType,
DOutLayout,
DInLayout>(true, // do_verification
2, // init_method: decimal value
false, // do_log
false, // time_kernel
param.length_,
param.window_spatial_lengths_,
param.window_strides_,
param.window_dilations_,
param.input_left_pads_,
param.input_right_pads_);
EXPECT_TRUE(success);
}
}
};
#if defined(CK_ENABLE_FP16) && defined(CK_ENABLE_BF16) && defined(CK_ENABLE_FP32)
using KernelTypes = ::testing::Types<std::tuple<F16, F16, F32, NDHWC, NDHWC>,
std::tuple<BF16, BF16, F32, NDHWC, NDHWC>,
std::tuple<F32, F32, F32, NDHWC, NDHWC>>;
#elif defined(CK_ENABLE_FP16) && defined(CK_ENABLE_FP32)
using KernelTypes = ::testing::Types<std::tuple<F16, F16, F32, NDHWC, NDHWC>,
std::tuple<F32, F32, F32, NDHWC, NDHWC>>;
#elif defined(CK_ENABLE_BF16) && defined(CK_ENABLE_FP32)
using KernelTypes = ::testing::Types<std::tuple<BF16, BF16, F32, NDHWC, NDHWC>,
std::tuple<F32, F32, F32, NDHWC, NDHWC>>;
#elif defined(CK_ENABLE_FP16) && defined(CK_ENABLE_BF16)
using KernelTypes = ::testing::Types<std::tuple<F16, F16, F32, NDHWC, NDHWC>,
std::tuple<BF16, BF16, F32, NDHWC, NDHWC>>;
#elif defined(CK_ENABLE_FP16)
using KernelTypes = ::testing::Types<std::tuple<F16, F16, F32, NDHWC, NDHWC>>;
#elif defined(CK_ENABLE_BF16)
using KernelTypes = ::testing::Types<std::tuple<BF16, BF16, F32, NDHWC, NDHWC>>;
#elif defined(CK_ENABLE_FP32)
using KernelTypes = ::testing::Types<std::tuple<F32, F32, F32, NDHWC, NDHWC>>;
#endif
TYPED_TEST_SUITE(TestAvgPool3dBwd, KernelTypes);
TYPED_TEST(TestAvgPool3dBwd, Test_Pool)
{
// length, window_length, window_stride, window_dilation, left_pad, right_pad
this->params = {{{1, 1, 1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
{{2, 16, 64, 64, 64}, {4, 4, 4}, {4, 4, 4}, {2, 2, 2}, {0, 0, 0}, {0, 0, 0}},
{{2, 32, 30, 30, 30}, {2, 2, 2}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}};
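// For the second parameter set above, each spatial output length follows the
// standard pooling rule: (64 + 0 + 0 - 2 * (4 - 1) - 1) / 4 + 1 = 15.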
this->Run();
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "profiler/profile_max_pool3d_bwd_impl.hpp"
#include "test_pool_fwd_common.hpp"
template <typename Tuple>
class TestMaxPool3dBwd : public ::testing::Test
{
protected:
using DOutDataType = std::tuple_element_t<0, Tuple>;
using DInDataType = std::tuple_element_t<1, Tuple>;
using IndexDataType = std::tuple_element_t<2, Tuple>;
using InDataType = DInDataType;
using OutDataType = DOutDataType;
std::vector<PoolingParam> params;
void Run()
{
for(const auto& param : params)
{
bool success =
ck::profiler::profile_max_pool3d_bwd_impl<InDataType,
OutDataType,
IndexDataType,
DOutDataType,
DInDataType,
false>(true, // do_verification
2, // init_method: decimal value
false, // do_log
false, // time_kernel
param.length_,
param.window_spatial_lengths_,
param.window_strides_,
param.window_dilations_,
param.input_left_pads_,
param.input_right_pads_);
EXPECT_TRUE(success);
}
}
};
#if defined(CK_ENABLE_FP16) && defined(CK_ENABLE_BF16) && defined(CK_ENABLE_FP32)
using KernelTypes = ::testing::Types<std::tuple<F16, F16, I32, NDHWC, NDHWC>,
std::tuple<BF16, BF16, I32, NDHWC, NDHWC>,
std::tuple<F32, F32, I32, NDHWC, NDHWC>>;
#elif defined(CK_ENABLE_FP16) && defined(CK_ENABLE_FP32)
using KernelTypes = ::testing::Types<std::tuple<F16, F16, I32, NDHWC, NDHWC>,
std::tuple<F32, F32, I32, NDHWC, NDHWC>>;
#elif defined(CK_ENABLE_BF16) && defined(CK_ENABLE_FP32)
using KernelTypes = ::testing::Types<std::tuple<BF16, BF16, I32, NDHWC, NDHWC>,
std::tuple<F32, F32, I32, NDHWC, NDHWC>>;
#elif defined(CK_ENABLE_FP16) && defined(CK_ENABLE_BF16)
using KernelTypes = ::testing::Types<std::tuple<F16, F16, I32, NDHWC, NDHWC>,
std::tuple<BF16, BF16, I32, NDHWC, NDHWC>>;
#elif defined(CK_ENABLE_FP16)
using KernelTypes = ::testing::Types<std::tuple<F16, F16, I32, NDHWC, NDHWC>>;
#elif defined(CK_ENABLE_BF16)
using KernelTypes = ::testing::Types<std::tuple<BF16, BF16, I32, NDHWC, NDHWC>>;
#elif defined(CK_ENABLE_FP32)
using KernelTypes = ::testing::Types<std::tuple<F32, F32, I32, NDHWC, NDHWC>>;
#endif
TYPED_TEST_SUITE(TestMaxPool3dBwd, KernelTypes);
TYPED_TEST(TestMaxPool3dBwd, Test_Pool)
{
// length, window_length, window_stride, window_dilation, left_pad, right_pad
this->params = {{{1, 1, 1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
{{2, 16, 64, 64, 64}, {4, 4, 4}, {4, 4, 4}, {2, 2, 2}, {0, 0, 0}, {0, 0, 0}},
{{2, 32, 30, 30, 30}, {2, 2, 2}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}};
// this->params = {{{2, 32, 30, 30, 30}, {2, 2, 2}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1,
// 1}}};
this->Run();
}
@@ -4,10 +4,12 @@
#include "gtest/gtest.h"
#include "ck/ck.hpp"
-using F16 = ck::half_t;
-using F32 = float;
-using I32 = int32_t;
+using F16 = ck::half_t;
+using BF16 = ck::bhalf_t;
+using F32 = float;
+using I32 = int32_t;
using ck::index_t;
using NDHWC = ck::tensor_layout::convolution::NDHWC;
struct PoolingParam
{
......